1use color_eyre::eyre;
2use derive_more::Display;
3use indicatif::{ProgressBar, ProgressStyle};
4use owo_colors::OwoColorize;
5use rayon::prelude::*;
6use reqwest::blocking::Client;
7use reqwest::header::HeaderMap;
8use serde::{Deserialize, Serialize};
9use serde_json::Result;
10use std::fmt::Debug;
11use std::fs::File;
12use std::io::{copy, Cursor};
13use std::path::PathBuf;
14use tracing::{debug, error, info, trace};
15use uriparse::URI;
16
17pub mod analyzer;
18pub mod constants;
19pub mod doctor;
20pub mod powerpoint;
21pub mod schema;
22pub mod util;
23
24use crate::schema::SchemaType;
25use crate::util::*;
26
/// Schema types that [`Bucket::download_files`] iterates over, in this order.
const SCHEMA_TYPES: [SchemaType; 3] = [SchemaType::Project, SchemaType::Organization, SchemaType::Highlight];
28
/// Kind of an entry in a repository tree listing.
///
/// Serialized and deserialized in lowercase (`"tree"` / `"blob"`), and displayed
/// with the same lowercase text.
#[derive(Clone, Debug, Display, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Ord)]
#[serde(rename_all = "lowercase")]
pub enum EntryType {
    /// Displayed as `tree`; entries of this kind are dropped by `GitlabTreeEntry::is_blob`.
    #[display("tree")]
    Tree,
    /// Displayed as `blob`; the only kind that `GitlabTreeEntry::is_blob` accepts.
    #[display("blob")]
    Blob,
}
/// Code repository that backs a [`Bucket`].
///
/// Serialized as an internally tagged object: the `provider` field carries the
/// lowercase variant name (`"github"` or `"gitlab"`).
#[derive(Clone, Debug, Display, Serialize, Deserialize)]
#[serde(tag = "provider", rename_all = "lowercase")]
pub enum Repository {
    /// GitHub repository; `uri` is parsed to obtain the host name.
    #[display("github")]
    GitHub { uri: String },
    /// GitLab repository.
    #[display("gitlab")]
    GitLab {
        // Project identifier inserted into the repository tree API URL.
        id: u64,
        // Repository URI: parsed to obtain the host name and used as the prefix
        // of raw file download URLs.
        uri: String,
    },
}
/// Top-level bucket configuration, as deserialized by [`BucketsConfig::read_json`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BucketsConfig {
    /// Buckets declared in the configuration.
    pub buckets: Vec<Bucket>,
}
/// A named collection of files stored in a code repository.
///
/// Field names are (de)serialized in camelCase.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Bucket {
    /// Bucket name; shown in upper case in download and error messages.
    pub name: String,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Repository that holds the bucket's files.
    ///
    /// Serialized as `codeRepository`; `repository` is also accepted when deserializing.
    #[serde(alias = "repository")]
    pub code_repository: Repository,
}
/// One element of the JSON array that `Bucket::parse` deserializes from a
/// repository tree response.
// NOTE(review): presumably mirrors the GitLab "list repository tree" entry schema; confirm against the API docs.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GitlabTreeEntry {
    /// Entry identifier as provided in the response.
    pub id: String,
    /// Entry name as provided in the response.
    pub name: String,
    /// Entry kind; (de)serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub entry_type: EntryType,
    /// Path of the entry; this is the value collected for blob entries.
    pub path: String,
    /// Mode string as provided in the response.
    pub mode: String,
}
81impl Bucket {
82 fn parse(response: reqwest::blocking::Response) -> Vec<String> {
83 let content = response.text().unwrap();
84 let data: Result<Vec<GitlabTreeEntry>> = serde_json::from_str(&content);
85 debug!("=> {} {} GitLab tree entries", Label::found(), data.as_ref().unwrap().len());
86 match data {
87 | Ok(entries) => entries
88 .into_iter()
89 .filter(GitlabTreeEntry::is_blob)
90 .map(GitlabTreeEntry::get_path)
91 .collect(),
92 | Err(_) => {
93 error!("=> {} Failed to process GitLab tree entries", Label::fail());
94 vec![]
95 }
96 }
97 }
98 fn get_domain(&self) -> String {
99 fn default_domain(repository: Repository) -> String {
100 match repository {
101 | Repository::GitHub { .. } => "github.com".to_string(),
102 | Repository::GitLab { .. } => "gitlab.com".to_string(),
103 }
104 }
105 match &self.code_repository {
106 | Repository::GitHub { uri } => match URI::try_from(uri.as_str()) {
107 | Ok(uri) => uri.host().unwrap().to_string(),
108 | Err(_) => default_domain(self.code_repository.clone()),
109 },
110 | Repository::GitLab { uri, .. } => match URI::try_from(uri.as_str()) {
111 | Ok(uri) => uri.host().unwrap().to_string(),
112 | Err(_) => default_domain(self.code_repository.clone()),
113 },
114 }
115 }
116 fn get_tree(&self, schema_type: SchemaType, page: u32) -> eyre::Result<reqwest::blocking::Response, reqwest::Error> {
117 let url = self.get_tree_url(schema_type, page);
118 reqwest::blocking::get(url)
119 }
120 fn get_tree_url(&self, schema_type: SchemaType, page: u32) -> String {
121 let id = match &self.code_repository {
122 | Repository::GitHub { .. } => todo!(),
123 | Repository::GitLab { id, .. } => id.to_string(),
124 };
125 let per_page = 100;
126 let url = format!(
127 "https://{}/api/v4/projects/{}/repository/tree?&per_page={}&page={}&recursive=true&path={}",
128 self.get_domain(),
129 id,
130 per_page,
131 page,
132 schema_type
133 );
134 debug!(url = url.as_str(), "=> {}", Label::using());
135 url
136 }
137 pub fn download_files(self: Bucket, output: PathBuf) -> usize {
138 fn get_suffix(value: usize) -> String {
139 (if value == 1 { "" } else { "s" }).to_string()
140 }
141 let counts = SCHEMA_TYPES.into_iter().map(|schema_type| match self.code_repository {
142 | Repository::GitHub { .. } => todo!(),
143 | Repository::GitLab { ref uri, .. } => {
144 const IGNORE: [&str; 4] = [".gitignore", ".gitkeep", ".DS_Store", "README.md"];
145 info!("=> Downloading {} research data from {}...", schema_type, uri.clone());
146 let paths = self
147 .clone()
148 .get_file_paths(schema_type.clone())
149 .into_iter()
150 .filter(|path| !IGNORE.iter().any(|x| path.ends_with(x)))
151 .collect::<Vec<String>>();
152 let progress = ProgressBar::new(paths.len() as u64);
153 progress.set_style(ProgressStyle::with_template(Label::PROGRESS_BAR_TEMPLATE).unwrap());
154 let client = Client::new();
155 paths.par_iter().for_each(|path| {
156 let url = format!("{}/-/raw/main/{}", uri, path);
157 progress.set_message(format!("Downloading {}", path));
158 let folder = format!("{}/{}", output.display(), get_parent(path.clone()));
159 std::fs::create_dir_all(folder.clone()).unwrap();
160 let mut file = File::create(format!("{}/{}", output.display(), path)).unwrap();
161 let response = client.get(url).send().unwrap();
162 let mut content = Cursor::new(response.bytes().unwrap());
163 let _ = copy(&mut content, &mut file);
164 progress.inc(1);
165 });
166 let total_data: usize = paths.clone().into_iter().filter(|path| path.to_lowercase().ends_with(".json")).count();
167 let total_images: usize = paths
168 .into_iter()
169 .filter(|path| path.to_lowercase().ends_with(".png") || path.to_lowercase().ends_with(".jpg"))
170 .count();
171 let total = total_data + total_images;
172 let message = if total_data != total_images {
173 let recommendation = if total_data > total_images {
174 "Do you need to add some images?"
175 } else {
176 "Do you need to add some JSON files?"
177 };
178 format!(
179 " ({} data file{}, {} image{} - {})",
180 total_data.yellow(),
181 get_suffix(total_data),
182 total_images.yellow(),
183 get_suffix(total_images),
184 recommendation.italic(),
185 )
186 } else {
187 "".to_string()
188 };
189 progress.set_style(ProgressStyle::with_template("{msg}").unwrap());
190 progress.finish_with_message(format!(
191 " {}Downloaded {} {} {} file{}{}",
192 if total > 0 { Label::CHECKMARK } else { Label::CAUTION },
193 if total > 0 {
194 total.green().to_string()
195 } else {
196 total.yellow().to_string()
197 },
198 self.clone().name.to_uppercase(),
199 schema_type,
200 get_suffix(total),
201 message,
202 ));
203 total
204 }
205 });
206 counts.into_iter().sum()
207 }
208 fn get_file_paths(self: Bucket, schema_type: SchemaType) -> Vec<String> {
210 const FIRST_PAGE: u32 = 1;
211 match self.code_repository {
212 | Repository::GitHub { .. } => todo!(),
213 | Repository::GitLab { .. } => {
214 fn get_page_count(response: &reqwest::blocking::Response) -> u32 {
215 fn parse_header(headers: &HeaderMap, key: &str) -> u32 {
216 match headers.get(key) {
217 | Some(val) if !val.is_empty() => {
218 let value = val.to_str().unwrap().parse::<u32>().unwrap();
219 debug!("=> {} {} = {}", Label::using(), key, value);
220 value
221 }
222 | Some(_) | None => 0,
223 }
224 }
225 let headers = response.headers();
226 parse_header(headers, "x-total-pages")
227 }
228 match self.get_tree(schema_type.clone(), FIRST_PAGE) {
229 | Ok(response) if response.status().is_success() => (FIRST_PAGE..=get_page_count(&response))
230 .into_par_iter()
231 .map(|page| self.clone().get_file_paths_for_page(schema_type.clone(), page))
232 .reduce(std::vec::Vec::new, |a, b| [a, b].concat()),
233 | Ok(_) | Err(_) => {
234 let url = self.get_tree_url(schema_type.clone(), FIRST_PAGE);
235 debug!(url, "=> {}", Label::using());
236 error!(
237 "=> {} Failed to get file paths for {} {} bucket",
238 Label::fail(),
239 self.name.to_uppercase().red(),
240 schema_type.red()
241 );
242 vec![]
243 }
244 }
245 }
246 }
247 }
248 fn get_file_paths_for_page(self: Bucket, schema_type: SchemaType, page: u32) -> Vec<String> {
249 match self.get_tree(schema_type.clone(), page) {
250 | Ok(response) if response.status().is_success() => match self.get_tree(schema_type.clone(), page) {
251 | Ok(response) if response.status().is_success() => Bucket::parse(response),
252 | Ok(_) | Err(_) => {
253 let url = self.get_tree_url(schema_type.clone(), 1);
254 error!(url, page, "=> {} Failed to get paths", Label::fail());
255 vec![]
256 }
257 },
258 | Ok(_) | Err(_) => {
259 let url = self.get_tree_url(schema_type, page);
260 error!(url, page, "=> {} Failed to get paths", Label::fail());
261 vec![]
262 }
263 }
264 }
265}
266impl BucketsConfig {
267 pub fn read_json(path: PathBuf) -> Result<BucketsConfig> {
268 let content = match read_file(path.clone()) {
269 | Ok(value) if !value.is_empty() => value,
270 | Ok(_) | Err(_) => {
271 error!(
272 path = path.to_str().unwrap(),
273 "=> {} Bucket configuration content is not valid",
274 Label::fail()
275 );
276 "{}".to_owned()
277 }
278 };
279 let data: Result<BucketsConfig> = serde_json::from_str(&content);
280 let label = match data {
281 | Ok(_) => Label::using(),
282 | Err(_) => Label::invalid(),
283 };
284 trace!("=> {} Bucket configuration = {:#?}", label, data.dimmed());
285 data
286 }
287}
288impl GitlabTreeEntry {
289 fn get_path(self) -> String {
290 self.path
291 }
292 fn is_blob(&self) -> bool {
293 self.entry_type.eq(&EntryType::Blob)
294 }
295}
296
297#[cfg(test)]
298mod tests;