1use color_eyre::eyre;
5use derive_more::Display;
6use indicatif::{ProgressBar, ProgressStyle};
7use owo_colors::OwoColorize;
8use rayon::prelude::*;
9use reqwest::blocking::Client;
10use reqwest::header::HeaderMap;
11use serde::{Deserialize, Serialize};
12use serde_json::Result;
13use std::fmt::Debug;
14use std::fs::File;
15use std::io::{copy, Cursor};
16use std::path::PathBuf;
17use tracing::{debug, error, info, trace};
18use uriparse::URI;
19
20pub mod analyzer;
21pub mod constants;
22pub mod doctor;
23pub mod powerpoint;
24pub mod schema;
25pub mod util;
26
27use crate::util::*;
28
/// File names that are skipped when downloading a bucket.
///
/// A remote path is ignored when it ends with any of these entries.
pub const IGNORE: [&str; 5] = [
    ".gitignore",
    ".gitlab-ci.yml",
    ".gitkeep",
    ".DS_Store",
    "README.md",
];
36
37#[derive(Clone, Debug, Display, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Ord)]
39#[serde(rename_all = "lowercase")]
40pub enum EntryType {
41 #[display("tree")]
45 Tree,
46 #[display("blob")]
50 Blob,
51}
52#[derive(Clone, Debug, Display, Serialize, Deserialize)]
54#[serde(tag = "provider", rename_all = "lowercase")]
55pub enum Repository {
56 #[display("github")]
60 GitHub {
61 uri: String,
63 },
64 #[display("gitlab")]
68 GitLab {
69 id: u64,
73 uri: String,
75 },
76}
77#[derive(Clone, Debug, Serialize, Deserialize)]
102pub struct BucketsConfig {
103 pub buckets: Vec<Bucket>,
105}
106#[derive(Clone, Debug, Serialize, Deserialize)]
108#[serde(rename_all = "camelCase")]
109pub struct Bucket {
110 pub name: String,
114 pub description: Option<String>,
118 #[serde(alias = "repository")]
122 pub code_repository: Repository,
123}
124#[derive(Clone, Debug, Serialize, Deserialize)]
128pub struct GitlabTreeEntry {
129 pub id: String,
133 pub name: String,
135 #[serde(rename = "type")]
137 pub entry_type: EntryType,
138 pub path: String,
142 pub mode: String,
144}
145impl Bucket {
146 fn parse(response: reqwest::blocking::Response) -> Vec<String> {
148 let content = response.text().unwrap();
149 let data: Result<Vec<GitlabTreeEntry>> = serde_json::from_str(&content);
150 debug!("=> {} {} GitLab tree entries", Label::found(), data.as_ref().unwrap().len());
151 match data {
152 | Ok(entries) => entries
153 .into_iter()
154 .filter(GitlabTreeEntry::is_blob)
155 .map(GitlabTreeEntry::get_path)
156 .collect(),
157 | Err(_) => {
158 error!("=> {} Failed to process GitLab tree entries", Label::fail());
159 vec![]
160 }
161 }
162 }
163 fn get_domain(&self) -> String {
165 fn default_domain(repository: Repository) -> String {
166 match repository {
167 | Repository::GitHub { .. } => "github.com".to_string(),
168 | Repository::GitLab { .. } => "gitlab.com".to_string(),
169 }
170 }
171 match &self.code_repository {
172 | Repository::GitHub { uri } => match URI::try_from(uri.as_str()) {
173 | Ok(uri) => uri.host().unwrap().to_string(),
174 | Err(_) => default_domain(self.code_repository.clone()),
175 },
176 | Repository::GitLab { uri, .. } => match URI::try_from(uri.as_str()) {
177 | Ok(uri) => uri.host().unwrap().to_string(),
178 | Err(_) => default_domain(self.code_repository.clone()),
179 },
180 }
181 }
182 fn get_tree(&self, directory: &str, page: u32) -> eyre::Result<reqwest::blocking::Response, reqwest::Error> {
183 let url = self.get_tree_url(directory, page);
184 reqwest::blocking::get(url)
185 }
186 fn get_tree_url(&self, directory: &str, page: u32) -> String {
187 let id = match &self.code_repository {
188 | Repository::GitHub { .. } => todo!(),
189 | Repository::GitLab { id, .. } => id.to_string(),
190 };
191 let per_page = 100;
192 let url = format!(
193 "https://{}/api/v4/projects/{}/repository/tree?&per_page={}&page={}&recursive=true&path={}",
194 self.get_domain(),
195 id,
196 per_page,
197 page,
198 directory
199 );
200 debug!(url = url.as_str(), "=> {}", Label::using());
201 url
202 }
203 pub fn download_files(self: Bucket, output: PathBuf) -> usize {
207 match self.code_repository {
208 | Repository::GitHub { ref uri, .. } => todo!("Add support for GitHub repositories like {uri}"),
209 | Repository::GitLab { ref uri, .. } => {
210 info!("=> Downloading research data from {}...", uri.clone());
211 let paths = self
212 .clone()
213 .get_file_paths("")
214 .into_iter()
215 .filter(|path| !IGNORE.iter().any(|x| path.ends_with(x)))
216 .collect::<Vec<String>>();
217 let progress = ProgressBar::new(paths.len() as u64);
218 progress.set_style(ProgressStyle::with_template(Label::PROGRESS_BAR_TEMPLATE).unwrap());
219 let client = Client::new();
220 paths.par_iter().for_each(|path| {
221 let url = format!("{}/-/raw/main/{}", uri, path);
222 progress.set_message(format!("Downloading {}", path));
223 let folder = format!("{}/{}", output.display(), get_parent(path.clone()));
224 std::fs::create_dir_all(folder.clone()).unwrap();
225 if let Ok(mut file) = File::create(format!("{}/{}", output.display(), path)) {
226 match client.get(url).send() {
227 | Ok(response) => match response.bytes() {
228 | Ok(bytes) => {
229 let mut content = Cursor::new(bytes);
230 let _ = copy(&mut content, &mut file);
231 }
232 | Err(why) => {
233 error!(path, "=> {} Failed to convert to bytes - {why}", Label::fail());
234 }
235 },
236 | Err(why) => {
237 error!(path, "=> {} Failed to download - {why}", Label::fail());
238 }
239 }
240 };
241 progress.inc(1);
242 });
243 let total_data: usize = paths.clone().into_iter().filter(|path| path.to_lowercase().ends_with(".json")).count();
244 let total_images: usize = paths
245 .into_iter()
246 .filter(|path| path.to_lowercase().ends_with(".png") || path.to_lowercase().ends_with(".jpg"))
247 .count();
248 let total = total_data + total_images;
249 let message = if total_data != total_images {
250 let recommendation = if total_data > total_images {
251 "Do you need to add some images?"
252 } else {
253 "Do you need to add some JSON files?"
254 };
255 format!(
256 " ({} data file{}, {} image{} - {})",
257 total_data.yellow(),
258 get_suffix(total_data),
259 total_images.yellow(),
260 get_suffix(total_images),
261 recommendation.italic(),
262 )
263 } else {
264 "".to_string()
265 };
266 progress.set_style(ProgressStyle::with_template("{msg}").unwrap());
267 progress.finish_with_message(format!(
268 " {}Downloaded {} {} file{}{}",
269 if total > 0 { Label::CHECKMARK } else { Label::CAUTION },
270 if total > 0 {
271 total.green().to_string()
272 } else {
273 total.yellow().to_string()
274 },
275 self.clone().name.to_uppercase(),
276 get_suffix(total),
277 message,
278 ));
279 total
280 }
281 }
282 }
283 fn get_file_paths(self: Bucket, directory: &str) -> Vec<String> {
284 const FIRST_PAGE: u32 = 1;
285 match self.code_repository {
286 | Repository::GitHub { .. } => todo!(),
287 | Repository::GitLab { .. } => {
288 fn get_page_count(response: &reqwest::blocking::Response) -> u32 {
289 fn parse_header(headers: &HeaderMap, key: &str) -> u32 {
290 match headers.get(key) {
291 | Some(val) if !val.is_empty() => {
292 let value = val.to_str().unwrap().parse::<u32>().unwrap();
293 debug!("=> {} {} = {}", Label::using(), key, value);
294 value
295 }
296 | Some(_) | None => 0,
297 }
298 }
299 let headers = response.headers();
300 parse_header(headers, "x-total-pages")
301 }
302 match self.get_tree(directory, FIRST_PAGE) {
303 | Ok(response) if response.status().is_success() => (FIRST_PAGE..=get_page_count(&response))
304 .into_par_iter()
305 .map(|page| self.clone().get_file_paths_for_page(directory, page))
306 .reduce(std::vec::Vec::new, |a, b| [a, b].concat()),
307 | Ok(_) | Err(_) => {
308 let url = self.get_tree_url(directory, FIRST_PAGE);
309 debug!(url, "=> {}", Label::using());
310 error!(
311 "=> {} Failed to get file paths for {} bucket",
312 Label::fail(),
313 self.name.to_uppercase().red(),
314 );
315 vec![]
316 }
317 }
318 }
319 }
320 }
321 fn get_file_paths_for_page(self: Bucket, directory: &str, page: u32) -> Vec<String> {
322 match self.get_tree(directory, page) {
323 | Ok(response) if response.status().is_success() => match self.get_tree(directory, page) {
324 | Ok(response) if response.status().is_success() => Bucket::parse(response),
325 | Ok(_) | Err(_) => {
326 let url = self.get_tree_url(directory, 1);
327 error!(url, page, "=> {} Failed to get paths", Label::fail());
328 vec![]
329 }
330 },
331 | Ok(_) | Err(_) => {
332 let url = self.get_tree_url(directory, page);
333 error!(url, page, "=> {} Failed to get paths", Label::fail());
334 vec![]
335 }
336 }
337 }
338}
339impl BucketsConfig {
340 pub fn read_json(path: PathBuf) -> Result<BucketsConfig> {
342 let content = match read_file(path.clone()) {
343 | Ok(value) if !value.is_empty() => value,
344 | Ok(_) | Err(_) => {
345 error!(
346 path = path.to_str().unwrap(),
347 "=> {} Bucket configuration content is not valid",
348 Label::fail()
349 );
350 "{}".to_owned()
351 }
352 };
353 let data: Result<BucketsConfig> = serde_json::from_str(&content);
354 let label = match data {
355 | Ok(_) => Label::using(),
356 | Err(_) => Label::invalid(),
357 };
358 trace!("=> {} Bucket configuration = {:#?}", label, data.dimmed());
359 data
360 }
361}
362impl GitlabTreeEntry {
363 fn get_path(self) -> String {
364 self.path
365 }
366 fn is_blob(&self) -> bool {
367 self.entry_type.eq(&EntryType::Blob)
368 }
369}
370
371#[cfg(test)]
372mod tests;