acorn_lib/
lib.rs

1use color_eyre::eyre;
2use derive_more::Display;
3use indicatif::{ProgressBar, ProgressStyle};
4use owo_colors::OwoColorize;
5use rayon::prelude::*;
6use reqwest::blocking::Client;
7use reqwest::header::HeaderMap;
8use serde::{Deserialize, Serialize};
9use serde_json::Result;
10use std::fmt::Debug;
11use std::fs::File;
12use std::io::{copy, Cursor};
13use std::path::PathBuf;
14use tracing::{debug, error, info, trace};
15use uriparse::URI;
16
17pub mod analyzer;
18pub mod constants;
19pub mod doctor;
20pub mod powerpoint;
21pub mod schema;
22pub mod util;
23
24use crate::schema::SchemaType;
25use crate::util::*;
26
27const SCHEMA_TYPES: [SchemaType; 3] = [SchemaType::Project, SchemaType::Organization, SchemaType::Highlight];
28
29#[derive(Clone, Debug, Display, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Ord)]
30#[serde(rename_all = "lowercase")]
31pub enum EntryType {
32    #[display("tree")]
33    Tree,
34    #[display("blob")]
35    Blob,
36}
37#[derive(Clone, Debug, Display, Serialize, Deserialize)]
38#[serde(tag = "provider", rename_all = "lowercase")]
39pub enum Repository {
40    #[display("github")]
41    GitHub { uri: String },
42    #[display("gitlab")]
43    GitLab {
44        /// Integer ID of GitLab project
45        ///
46        /// See <https://docs.gitlab.com/api/projects/#get-a-single-project> for more information
47        id: u64,
48        uri: String,
49    },
50}
51#[derive(Clone, Debug, Serialize, Deserialize)]
52pub struct BucketsConfig {
53    pub buckets: Vec<Bucket>,
54}
55#[derive(Clone, Debug, Serialize, Deserialize)]
56#[serde(rename_all = "camelCase")]
57pub struct Bucket {
58    /// Bucket name
59    ///
60    /// See <https://schema.org/name>
61    pub name: String,
62    /// Bucket description
63    ///
64    /// See <https://schema.org/description>
65    pub description: Option<String>,
66    /// Code repository data of bucket
67    ///
68    /// See <https://schema.org/codeRepository>
69    #[serde(alias = "repository")]
70    pub code_repository: Repository,
71}
72#[derive(Clone, Debug, Serialize, Deserialize)]
73pub struct GitlabTreeEntry {
74    pub id: String,
75    pub name: String,
76    #[serde(rename = "type")]
77    pub entry_type: EntryType,
78    pub path: String,
79    pub mode: String,
80}
81impl Bucket {
82    fn parse(response: reqwest::blocking::Response) -> Vec<String> {
83        let content = response.text().unwrap();
84        let data: Result<Vec<GitlabTreeEntry>> = serde_json::from_str(&content);
85        debug!("=> {} {} GitLab tree entries", Label::found(), data.as_ref().unwrap().len());
86        match data {
87            | Ok(entries) => entries
88                .into_iter()
89                .filter(GitlabTreeEntry::is_blob)
90                .map(GitlabTreeEntry::get_path)
91                .collect(),
92            | Err(_) => {
93                error!("=> {} Failed to process GitLab tree entries", Label::fail());
94                vec![]
95            }
96        }
97    }
98    fn get_domain(&self) -> String {
99        fn default_domain(repository: Repository) -> String {
100            match repository {
101                | Repository::GitHub { .. } => "github.com".to_string(),
102                | Repository::GitLab { .. } => "gitlab.com".to_string(),
103            }
104        }
105        match &self.code_repository {
106            | Repository::GitHub { uri } => match URI::try_from(uri.as_str()) {
107                | Ok(uri) => uri.host().unwrap().to_string(),
108                | Err(_) => default_domain(self.code_repository.clone()),
109            },
110            | Repository::GitLab { uri, .. } => match URI::try_from(uri.as_str()) {
111                | Ok(uri) => uri.host().unwrap().to_string(),
112                | Err(_) => default_domain(self.code_repository.clone()),
113            },
114        }
115    }
116    fn get_tree(&self, schema_type: SchemaType, page: u32) -> eyre::Result<reqwest::blocking::Response, reqwest::Error> {
117        let url = self.get_tree_url(schema_type, page);
118        reqwest::blocking::get(url)
119    }
120    fn get_tree_url(&self, schema_type: SchemaType, page: u32) -> String {
121        let id = match &self.code_repository {
122            | Repository::GitHub { .. } => todo!(),
123            | Repository::GitLab { id, .. } => id.to_string(),
124        };
125        let per_page = 100;
126        let url = format!(
127            "https://{}/api/v4/projects/{}/repository/tree?&per_page={}&page={}&recursive=true&path={}",
128            self.get_domain(),
129            id,
130            per_page,
131            page,
132            schema_type
133        );
134        debug!(url = url.as_str(), "=> {}", Label::using());
135        url
136    }
137    pub fn download_files(self: Bucket, output: PathBuf) -> usize {
138        fn get_suffix(value: usize) -> String {
139            (if value == 1 { "" } else { "s" }).to_string()
140        }
141        let counts = SCHEMA_TYPES.into_iter().map(|schema_type| match self.code_repository {
142            | Repository::GitHub { .. } => todo!(),
143            | Repository::GitLab { ref uri, .. } => {
144                const IGNORE: [&str; 4] = [".gitignore", ".gitkeep", ".DS_Store", "README.md"];
145                info!("=> Downloading {} research data from {}...", schema_type, uri.clone());
146                let paths = self
147                    .clone()
148                    .get_file_paths(schema_type.clone())
149                    .into_iter()
150                    .filter(|path| !IGNORE.iter().any(|x| path.ends_with(x)))
151                    .collect::<Vec<String>>();
152                let progress = ProgressBar::new(paths.len() as u64);
153                progress.set_style(ProgressStyle::with_template(Label::PROGRESS_BAR_TEMPLATE).unwrap());
154                let client = Client::new();
155                paths.par_iter().for_each(|path| {
156                    let url = format!("{}/-/raw/main/{}", uri, path);
157                    progress.set_message(format!("Downloading {}", path));
158                    let folder = format!("{}/{}", output.display(), get_parent(path.clone()));
159                    std::fs::create_dir_all(folder.clone()).unwrap();
160                    let mut file = File::create(format!("{}/{}", output.display(), path)).unwrap();
161                    let response = client.get(url).send().unwrap();
162                    let mut content = Cursor::new(response.bytes().unwrap());
163                    let _ = copy(&mut content, &mut file);
164                    progress.inc(1);
165                });
166                let total_data: usize = paths.clone().into_iter().filter(|path| path.to_lowercase().ends_with(".json")).count();
167                let total_images: usize = paths
168                    .into_iter()
169                    .filter(|path| path.to_lowercase().ends_with(".png") || path.to_lowercase().ends_with(".jpg"))
170                    .count();
171                let total = total_data + total_images;
172                let message = if total_data != total_images {
173                    let recommendation = if total_data > total_images {
174                        "Do you need to add some images?"
175                    } else {
176                        "Do you need to add some JSON files?"
177                    };
178                    format!(
179                        " ({} data file{}, {} image{} - {})",
180                        total_data.yellow(),
181                        get_suffix(total_data),
182                        total_images.yellow(),
183                        get_suffix(total_images),
184                        recommendation.italic(),
185                    )
186                } else {
187                    "".to_string()
188                };
189                progress.set_style(ProgressStyle::with_template("{msg}").unwrap());
190                progress.finish_with_message(format!(
191                    "  {}Downloaded {} {} {} file{}{}",
192                    if total > 0 { Label::CHECKMARK } else { Label::CAUTION },
193                    if total > 0 {
194                        total.green().to_string()
195                    } else {
196                        total.yellow().to_string()
197                    },
198                    self.clone().name.to_uppercase(),
199                    schema_type,
200                    get_suffix(total),
201                    message,
202                ));
203                total
204            }
205        });
206        counts.into_iter().sum()
207    }
208    // https://docs.gitlab.com/ee/api/repositories.html
209    fn get_file_paths(self: Bucket, schema_type: SchemaType) -> Vec<String> {
210        const FIRST_PAGE: u32 = 1;
211        match self.code_repository {
212            | Repository::GitHub { .. } => todo!(),
213            | Repository::GitLab { .. } => {
214                fn get_page_count(response: &reqwest::blocking::Response) -> u32 {
215                    fn parse_header(headers: &HeaderMap, key: &str) -> u32 {
216                        match headers.get(key) {
217                            | Some(val) if !val.is_empty() => {
218                                let value = val.to_str().unwrap().parse::<u32>().unwrap();
219                                debug!("=> {} {} = {}", Label::using(), key, value);
220                                value
221                            }
222                            | Some(_) | None => 0,
223                        }
224                    }
225                    let headers = response.headers();
226                    parse_header(headers, "x-total-pages")
227                }
228                match self.get_tree(schema_type.clone(), FIRST_PAGE) {
229                    | Ok(response) if response.status().is_success() => (FIRST_PAGE..=get_page_count(&response))
230                        .into_par_iter()
231                        .map(|page| self.clone().get_file_paths_for_page(schema_type.clone(), page))
232                        .reduce(std::vec::Vec::new, |a, b| [a, b].concat()),
233                    | Ok(_) | Err(_) => {
234                        let url = self.get_tree_url(schema_type.clone(), FIRST_PAGE);
235                        debug!(url, "=> {}", Label::using());
236                        error!(
237                            "=> {} Failed to get file paths for {} {} bucket",
238                            Label::fail(),
239                            self.name.to_uppercase().red(),
240                            schema_type.red()
241                        );
242                        vec![]
243                    }
244                }
245            }
246        }
247    }
248    fn get_file_paths_for_page(self: Bucket, schema_type: SchemaType, page: u32) -> Vec<String> {
249        match self.get_tree(schema_type.clone(), page) {
250            | Ok(response) if response.status().is_success() => match self.get_tree(schema_type.clone(), page) {
251                | Ok(response) if response.status().is_success() => Bucket::parse(response),
252                | Ok(_) | Err(_) => {
253                    let url = self.get_tree_url(schema_type.clone(), 1);
254                    error!(url, page, "=> {} Failed to get paths", Label::fail());
255                    vec![]
256                }
257            },
258            | Ok(_) | Err(_) => {
259                let url = self.get_tree_url(schema_type, page);
260                error!(url, page, "=> {} Failed to get paths", Label::fail());
261                vec![]
262            }
263        }
264    }
265}
266impl BucketsConfig {
267    pub fn read_json(path: PathBuf) -> Result<BucketsConfig> {
268        let content = match read_file(path.clone()) {
269            | Ok(value) if !value.is_empty() => value,
270            | Ok(_) | Err(_) => {
271                error!(
272                    path = path.to_str().unwrap(),
273                    "=> {} Bucket configuration content is not valid",
274                    Label::fail()
275                );
276                "{}".to_owned()
277            }
278        };
279        let data: Result<BucketsConfig> = serde_json::from_str(&content);
280        let label = match data {
281            | Ok(_) => Label::using(),
282            | Err(_) => Label::invalid(),
283        };
284        trace!("=> {} Bucket configuration = {:#?}", label, data.dimmed());
285        data
286    }
287}
288impl GitlabTreeEntry {
289    fn get_path(self) -> String {
290        self.path
291    }
292    fn is_blob(&self) -> bool {
293        self.entry_type.eq(&EntryType::Blob)
294    }
295}
296
297#[cfg(test)]
298mod tests;