acorn_lib/
lib.rs

1//! # ACORN Library
2//!
3//! This library provides functions for working with [ACORN](https://acorn.ornl.gov) data and supports the ACORN CLI.
4use color_eyre::eyre;
5use derive_more::Display;
6use indicatif::{ProgressBar, ProgressStyle};
7use owo_colors::OwoColorize;
8use rayon::prelude::*;
9use reqwest::blocking::Client;
10use reqwest::header::HeaderMap;
11use serde::{Deserialize, Serialize};
12use serde_json::Result;
13use std::fmt::Debug;
14use std::fs::File;
15use std::io::{copy, Cursor};
16use std::path::PathBuf;
17use tracing::{debug, error, info, trace};
18use uriparse::URI;
19
20pub mod analyzer;
21pub mod constants;
22pub mod doctor;
23pub mod powerpoint;
24pub mod schema;
25pub mod util;
26
27use crate::util::*;
28
/// Files to ignore
///
/// A path is skipped when it ends with any of these names:
///
/// - `.gitignore`
/// - `.gitlab-ci.yml`
/// - `.gitkeep`
/// - `.DS_Store`
/// - `README.md`
pub const IGNORE: [&str; 5] = [".gitignore", ".gitlab-ci.yml", ".gitkeep", ".DS_Store", "README.md"];
36
37/// Type for GitLab tree entry
38#[derive(Clone, Debug, Display, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Ord)]
39#[serde(rename_all = "lowercase")]
40pub enum EntryType {
41    /// List of files and directories
42    ///
43    /// See <https://docs.gitlab.com/api/repositories/#list-repository-tree>
44    #[display("tree")]
45    Tree,
46    /// Base64 enoded content
47    ///
48    /// See <https://docs.gitlab.com/api/repositories/#get-a-blob-from-repository>
49    #[display("blob")]
50    Blob,
51}
52/// Git hosting repository data
53#[derive(Clone, Debug, Display, Serialize, Deserialize)]
54#[serde(tag = "provider", rename_all = "lowercase")]
55pub enum Repository {
56    /// GitHub
57    ///
58    /// See <https://docs.github.com/en/rest/reference/repos>
59    #[display("github")]
60    GitHub {
61        /// Repository URI
62        uri: String,
63    },
64    /// GitLab
65    ///
66    /// See <https://docs.gitlab.com/api/repositories/#list-repository-tree>
67    #[display("gitlab")]
68    GitLab {
69        /// Integer ID of GitLab project
70        ///
71        /// See <https://docs.gitlab.com/api/projects/#get-a-single-project> for more information
72        id: u64,
73        /// Repository URI
74        uri: String,
75    },
76}
77/// Struct for buckets configuration
78///
79/// ### Example buckets.json
80/// ```json
81/// {
82///     "buckets": [
83///         {
84///             "name": "example",
85///             "repository": {
86///                 "provider": "github",
87///                 "uri": "https://github.com/username/example"
88///             }
89///         },
90///         {
91///             "name": "example",
92///             "repository": {
93///                 "provider": "gitlab",
94///                 "id": 12345,
95///                 "uri": "https://gitlab.com/username/example"
96///             }
97///         }
98///     ]
99/// }
100/// ```
101#[derive(Clone, Debug, Serialize, Deserialize)]
102pub struct BucketsConfig {
103    /// List of buckets
104    pub buckets: Vec<Bucket>,
105}
106/// Struct for bucket data
107#[derive(Clone, Debug, Serialize, Deserialize)]
108#[serde(rename_all = "camelCase")]
109pub struct Bucket {
110    /// Bucket name
111    ///
112    /// See <https://schema.org/name>
113    pub name: String,
114    /// Bucket description
115    ///
116    /// See <https://schema.org/description>
117    pub description: Option<String>,
118    /// Code repository data of bucket
119    ///
120    /// See <https://schema.org/codeRepository>
121    #[serde(alias = "repository")]
122    pub code_repository: Repository,
123}
124/// Struct for GitLab tree entry
125///
126/// See <https://docs.gitlab.com/api/repositories/#list-repository-tree>
127#[derive(Clone, Debug, Serialize, Deserialize)]
128pub struct GitlabTreeEntry {
129    /// Integer ID of GitLab project
130    ///
131    /// See <https://docs.gitlab.com/api/projects/#get-a-single-project>
132    pub id: String,
133    /// Name of tree entry
134    pub name: String,
135    /// Type of tree entry
136    #[serde(rename = "type")]
137    pub entry_type: EntryType,
138    /// Path of tree entry
139    ///
140    /// The path inside the repository. Used to get content of subdirectories.
141    pub path: String,
142    /// Mode of tree entry
143    pub mode: String,
144}
145impl Bucket {
146    /// Parse GitLab tree entries
147    fn parse(response: reqwest::blocking::Response) -> Vec<String> {
148        let content = response.text().unwrap();
149        let data: Result<Vec<GitlabTreeEntry>> = serde_json::from_str(&content);
150        debug!("=> {} {} GitLab tree entries", Label::found(), data.as_ref().unwrap().len());
151        match data {
152            | Ok(entries) => entries
153                .into_iter()
154                .filter(GitlabTreeEntry::is_blob)
155                .map(GitlabTreeEntry::get_path)
156                .collect(),
157            | Err(_) => {
158                error!("=> {} Failed to process GitLab tree entries", Label::fail());
159                vec![]
160            }
161        }
162    }
163    /// Get hosting domain from bucket struct
164    fn get_domain(&self) -> String {
165        fn default_domain(repository: Repository) -> String {
166            match repository {
167                | Repository::GitHub { .. } => "github.com".to_string(),
168                | Repository::GitLab { .. } => "gitlab.com".to_string(),
169            }
170        }
171        match &self.code_repository {
172            | Repository::GitHub { uri } => match URI::try_from(uri.as_str()) {
173                | Ok(uri) => uri.host().unwrap().to_string(),
174                | Err(_) => default_domain(self.code_repository.clone()),
175            },
176            | Repository::GitLab { uri, .. } => match URI::try_from(uri.as_str()) {
177                | Ok(uri) => uri.host().unwrap().to_string(),
178                | Err(_) => default_domain(self.code_repository.clone()),
179            },
180        }
181    }
182    fn get_tree(&self, directory: &str, page: u32) -> eyre::Result<reqwest::blocking::Response, reqwest::Error> {
183        let url = self.get_tree_url(directory, page);
184        reqwest::blocking::get(url)
185    }
186    fn get_tree_url(&self, directory: &str, page: u32) -> String {
187        let id = match &self.code_repository {
188            | Repository::GitHub { .. } => todo!(),
189            | Repository::GitLab { id, .. } => id.to_string(),
190        };
191        let per_page = 100;
192        let url = format!(
193            "https://{}/api/v4/projects/{}/repository/tree?&per_page={}&page={}&recursive=true&path={}",
194            self.get_domain(),
195            id,
196            per_page,
197            page,
198            directory
199        );
200        debug!(url = url.as_str(), "=> {}", Label::using());
201        url
202    }
203    /// Download files from bucket to local directory
204    ///
205    /// Ignores files listed in [`IGNORE`]
206    pub fn download_files(self: Bucket, output: PathBuf) -> usize {
207        match self.code_repository {
208            | Repository::GitHub { ref uri, .. } => todo!("Add support for GitHub repositories like {uri}"),
209            | Repository::GitLab { ref uri, .. } => {
210                info!("=> Downloading research data from {}...", uri.clone());
211                let paths = self
212                    .clone()
213                    .get_file_paths("")
214                    .into_iter()
215                    .filter(|path| !IGNORE.iter().any(|x| path.ends_with(x)))
216                    .collect::<Vec<String>>();
217                let progress = ProgressBar::new(paths.len() as u64);
218                progress.set_style(ProgressStyle::with_template(Label::PROGRESS_BAR_TEMPLATE).unwrap());
219                let client = Client::new();
220                paths.par_iter().for_each(|path| {
221                    let url = format!("{}/-/raw/main/{}", uri, path);
222                    progress.set_message(format!("Downloading {}", path));
223                    let folder = format!("{}/{}", output.display(), get_parent(path.clone()));
224                    std::fs::create_dir_all(folder.clone()).unwrap();
225                    if let Ok(mut file) = File::create(format!("{}/{}", output.display(), path)) {
226                        match client.get(url).send() {
227                            | Ok(response) => match response.bytes() {
228                                | Ok(bytes) => {
229                                    let mut content = Cursor::new(bytes);
230                                    let _ = copy(&mut content, &mut file);
231                                }
232                                | Err(why) => {
233                                    error!(path, "=> {} Failed to convert to bytes - {why}", Label::fail());
234                                }
235                            },
236                            | Err(why) => {
237                                error!(path, "=> {} Failed to download - {why}", Label::fail());
238                            }
239                        }
240                    };
241                    progress.inc(1);
242                });
243                let total_data: usize = paths.clone().into_iter().filter(|path| path.to_lowercase().ends_with(".json")).count();
244                let total_images: usize = paths
245                    .into_iter()
246                    .filter(|path| path.to_lowercase().ends_with(".png") || path.to_lowercase().ends_with(".jpg"))
247                    .count();
248                let total = total_data + total_images;
249                let message = if total_data != total_images {
250                    let recommendation = if total_data > total_images {
251                        "Do you need to add some images?"
252                    } else {
253                        "Do you need to add some JSON files?"
254                    };
255                    format!(
256                        " ({} data file{}, {} image{} - {})",
257                        total_data.yellow(),
258                        get_suffix(total_data),
259                        total_images.yellow(),
260                        get_suffix(total_images),
261                        recommendation.italic(),
262                    )
263                } else {
264                    "".to_string()
265                };
266                progress.set_style(ProgressStyle::with_template("{msg}").unwrap());
267                progress.finish_with_message(format!(
268                    "  {}Downloaded {} {} file{}{}",
269                    if total > 0 { Label::CHECKMARK } else { Label::CAUTION },
270                    if total > 0 {
271                        total.green().to_string()
272                    } else {
273                        total.yellow().to_string()
274                    },
275                    self.clone().name.to_uppercase(),
276                    get_suffix(total),
277                    message,
278                ));
279                total
280            }
281        }
282    }
283    fn get_file_paths(self: Bucket, directory: &str) -> Vec<String> {
284        const FIRST_PAGE: u32 = 1;
285        match self.code_repository {
286            | Repository::GitHub { .. } => todo!(),
287            | Repository::GitLab { .. } => {
288                fn get_page_count(response: &reqwest::blocking::Response) -> u32 {
289                    fn parse_header(headers: &HeaderMap, key: &str) -> u32 {
290                        match headers.get(key) {
291                            | Some(val) if !val.is_empty() => {
292                                let value = val.to_str().unwrap().parse::<u32>().unwrap();
293                                debug!("=> {} {} = {}", Label::using(), key, value);
294                                value
295                            }
296                            | Some(_) | None => 0,
297                        }
298                    }
299                    let headers = response.headers();
300                    parse_header(headers, "x-total-pages")
301                }
302                match self.get_tree(directory, FIRST_PAGE) {
303                    | Ok(response) if response.status().is_success() => (FIRST_PAGE..=get_page_count(&response))
304                        .into_par_iter()
305                        .map(|page| self.clone().get_file_paths_for_page(directory, page))
306                        .reduce(std::vec::Vec::new, |a, b| [a, b].concat()),
307                    | Ok(_) | Err(_) => {
308                        let url = self.get_tree_url(directory, FIRST_PAGE);
309                        debug!(url, "=> {}", Label::using());
310                        error!(
311                            "=> {} Failed to get file paths for {} bucket",
312                            Label::fail(),
313                            self.name.to_uppercase().red(),
314                        );
315                        vec![]
316                    }
317                }
318            }
319        }
320    }
321    fn get_file_paths_for_page(self: Bucket, directory: &str, page: u32) -> Vec<String> {
322        match self.get_tree(directory, page) {
323            | Ok(response) if response.status().is_success() => match self.get_tree(directory, page) {
324                | Ok(response) if response.status().is_success() => Bucket::parse(response),
325                | Ok(_) | Err(_) => {
326                    let url = self.get_tree_url(directory, 1);
327                    error!(url, page, "=> {} Failed to get paths", Label::fail());
328                    vec![]
329                }
330            },
331            | Ok(_) | Err(_) => {
332                let url = self.get_tree_url(directory, page);
333                error!(url, page, "=> {} Failed to get paths", Label::fail());
334                vec![]
335            }
336        }
337    }
338}
339impl BucketsConfig {
340    /// Read buckets configuration using Serde and [`BucketsConfig`] struct
341    pub fn read_json(path: PathBuf) -> Result<BucketsConfig> {
342        let content = match read_file(path.clone()) {
343            | Ok(value) if !value.is_empty() => value,
344            | Ok(_) | Err(_) => {
345                error!(
346                    path = path.to_str().unwrap(),
347                    "=> {} Bucket configuration content is not valid",
348                    Label::fail()
349                );
350                "{}".to_owned()
351            }
352        };
353        let data: Result<BucketsConfig> = serde_json::from_str(&content);
354        let label = match data {
355            | Ok(_) => Label::using(),
356            | Err(_) => Label::invalid(),
357        };
358        trace!("=> {} Bucket configuration = {:#?}", label, data.dimmed());
359        data
360    }
361}
362impl GitlabTreeEntry {
363    fn get_path(self) -> String {
364        self.path
365    }
366    fn is_blob(&self) -> bool {
367        self.entry_type.eq(&EntryType::Blob)
368    }
369}
370
371#[cfg(test)]
372mod tests;