1use crate::constants::*;
5use crate::util::{find_first, Label};
6use clap::ValueEnum;
7use derive_more::Display;
8use dotenvy::dotenv;
9use fancy_regex::Regex;
10use lazy_static::lazy_static;
11use std::collections::HashMap;
12use tracing::{debug, trace, warn};
13
14lazy_static! {
15 pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
17 pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
19 pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
21 pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
23 pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
25 pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
27 pub static ref SINGLE_SYLLABIC_ONE : Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
29 pub static ref SINGLE_SYLLABIC_TWO : Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
31 pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
33 pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
35 pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
37 pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
39 pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
41 ("child", "children"),
42 ("cow", "cattle"),
43 ("foot", "feet"),
44 ("goose", "geese"),
45 ("man", "men"),
46 ("move", "moves"),
47 ("person", "people"),
48 ("radius", "radii"),
49 ("sex", "sexes"),
50 ("tooth", "teeth"),
51 ("woman", "women"),
52 ].into_iter().collect();
53 pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.clone().into_iter().map(|(k, v)| (v, k)).collect();
57 pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
61 ("ayo", 2),
62 ("australian", 3),
63 ("dionysius", 5),
64 ("disbursement", 3),
65 ("discouragement", 4),
66 ("disenfranchisement", 5),
67 ("disengagement", 4),
68 ("disgraceful", 3),
69 ("diskette", 2),
70 ("displacement", 3),
71 ("distasteful", 3),
72 ("distinctiveness", 4),
73 ("distraction", 3),
74 ("geoffrion", 4),
75 ("mcquaid", 2),
76 ("mcquaide", 2),
77 ("mcquaig", 2),
78 ("mcquain", 2),
79 ("nonbusiness", 3),
80 ("nonetheless", 3),
81 ("nonmanagement", 4),
82 ("outplacement", 3),
83 ("outrageously", 4),
84 ("postponement", 3),
85 ("preemption", 3),
86 ("preignition", 4),
87 ("preinvasion", 4),
88 ("preisler", 3),
89 ("preoccupation", 5),
90 ("prevette", 2),
91 ("probusiness", 3),
92 ("procurement", 3),
93 ("pronouncement", 3),
94 ("sidewater", 3),
95 ("sidewinder", 3),
96 ("ungerer", 3),
97 ].into_iter().collect();
98 pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
100 ("abalone", 4),
101 ("abare", 3),
102 ("abbruzzese", 4),
103 ("abed", 2),
104 ("aborigine", 5),
105 ("abruzzese", 4),
106 ("acreage", 3),
107 ("adame", 3),
108 ("adieu", 2),
109 ("adobe", 3),
110 ("anemone", 4),
111 ("anyone", 3),
112 ("apache", 3),
113 ("aphrodite", 4),
114 ("apostrophe", 4),
115 ("ariadne", 4),
116 ("cafe", 2),
117 ("café", 2),
118 ("calliope", 4),
119 ("catastrophe", 4),
120 ("chile", 2),
121 ("chloe", 2),
122 ("circe", 2),
123 ("cliche", 2),
124 ("cliché", 2),
125 ("contrariety", 4),
126 ("coyote", 3),
127 ("daphne", 2),
128 ("epitome", 4),
129 ("eurydice", 4),
130 ("euterpe", 3),
131 ("every", 2),
132 ("everywhere", 3),
133 ("forever", 3),
134 ("gethsemane", 4),
135 ("guacamole", 4),
136 ("hermione", 4),
137 ("hyperbole", 4),
138 ("jesse", 2),
139 ("jukebox", 2),
140 ("karate", 3),
141 ("machete", 3),
142 ("maybe", 2),
143 ("naive", 2),
144 ("newlywed", 3),
145 ("ninety", 2),
146 ("penelope", 4),
147 ("people", 2),
148 ("persephone", 4),
149 ("phoebe", 2),
150 ("pulse", 1),
151 ("queue", 1),
152 ("recipe", 3),
153 ("reptilian", 4),
154 ("resumé", 2),
155 ("riverbed", 3),
156 ("scotia", 3),
157 ("sesame", 3),
158 ("shoreline", 2),
159 ("simile", 3),
160 ("snuffleupagus", 5),
161 ("sometimes", 2),
162 ("syncope", 3),
163 ("tamale", 3),
164 ("waterbed", 3),
165 ("wednesday", 2),
166 ("viceroyship", 3),
167 ("yosemite", 4),
168 ("zoë", 2),
169 ].into_iter().collect();
170}
/// Regex rewrite rules `(pattern, replacement)` applied in order to turn an
/// English plural into its singular; `singular_form` uses the first pattern
/// that matches.
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    // "mice"/"lice" -> "mouse"/"louse". The class is [ml]; the previous
    // [m|l] made '|' a literal member of the character class.
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    // Identity rule: keeps "series" from reaching the generic "s$" fallback.
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    // Identity rule: keeps "news" from reaching the generic "s$" fallback.
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    // Fallback: strip a trailing "s".
    (r#"s$"#, r#""#),
];
/// Nouns whose singular and plural forms are identical (zero-plural,
/// plural-only, or uncountable nouns); `singular_form` returns these
/// unchanged. The list is not sorted, so lookups are linear scans.
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
/// Readability formulas supported by this module; the `Display` strings are
/// the short names also accepted by `from_string` and the CLI (`ValueEnum`).
#[derive(Clone, Copy, Debug, Default, Display, PartialEq, ValueEnum)]
pub enum ReadabilityType {
    /// Automated Readability Index.
    #[display("ari")]
    ARI,
    /// Coleman-Liau Index.
    #[display("cli")]
    CLI,
    /// Flesch-Kincaid Grade Level — the default metric.
    #[default]
    #[display("fkgl")]
    FKGL,
    /// Flesch Reading Ease Score.
    #[display("fres")]
    FRES,
    /// Gunning Fog Index.
    #[display("gfi")]
    GFI,
    /// LIX (Läsbarhetsindex).
    #[display("lix")]
    Lix,
    /// Simple Measure of Gobbledygook.
    #[display("smog")]
    SMOG,
}
353impl ReadabilityType {
354 pub fn calculate(self, text: &str) -> f64 {
356 match self {
357 | ReadabilityType::ARI => automated_readability_index(text),
358 | ReadabilityType::CLI => coleman_liau_index(text),
359 | ReadabilityType::FKGL => flesch_kincaid_grade_level(text),
360 | ReadabilityType::FRES => flesch_reading_ease_score(text),
361 | ReadabilityType::GFI => gunning_fog_index(text),
362 | ReadabilityType::Lix => lix(text),
363 | ReadabilityType::SMOG => smog(text),
364 }
365 }
366 pub fn from_string(value: &str) -> ReadabilityType {
368 match value.to_lowercase().replace("-", " ").as_str() {
369 | "ari" | "automated readability index" => ReadabilityType::ARI,
370 | "cli" | "coleman liau index" => ReadabilityType::CLI,
371 | "fkgl" | "flesch kincaid grade level" => ReadabilityType::FKGL,
372 | "fres" | "flesch reading ease score" => ReadabilityType::FRES,
373 | "gfi" | "gunning fog index" => ReadabilityType::GFI,
374 | "lix" => ReadabilityType::Lix,
375 | "smog" | "simple measure of gobbledygook" => ReadabilityType::SMOG,
376 | _ => {
377 warn!(value, "=> {} Unknown Readability Type", Label::using());
378 ReadabilityType::default()
379 }
380 }
381 }
382 pub fn maximum_allowed(self) -> f64 {
384 match self {
385 | ReadabilityType::ARI => MAX_ALLOWED_ARI,
386 | ReadabilityType::CLI => MAX_ALLOWED_CLI,
387 | ReadabilityType::FKGL => MAX_ALLOWED_FKGL,
388 | ReadabilityType::FRES => MAX_ALLOWED_FRES,
389 | ReadabilityType::GFI => MAX_ALLOWED_GFI,
390 | ReadabilityType::Lix => MAX_ALLOWED_LIX,
391 | ReadabilityType::SMOG => MAX_ALLOWED_SMOG,
392 }
393 }
394 pub fn maximum_allowed_from_env(self) -> Option<f64> {
396 match dotenv() {
397 | Ok(_) => {
398 let variables = dotenvy::vars().collect::<Vec<(String, String)>>();
399 let pair = match self {
400 | ReadabilityType::ARI => find_first(variables, "MAX_ALLOWED_ARI"),
401 | ReadabilityType::CLI => find_first(variables, "MAX_ALLOWED_CLI"),
402 | ReadabilityType::FKGL => find_first(variables, "MAX_ALLOWED_FKGL"),
403 | ReadabilityType::FRES => find_first(variables, "MAX_ALLOWED_FRES"),
404 | ReadabilityType::GFI => find_first(variables, "MAX_ALLOWED_GFI"),
405 | ReadabilityType::Lix => find_first(variables, "MAX_ALLOWED_LIX"),
406 | ReadabilityType::SMOG => find_first(variables, "MAX_ALLOWED_SMOG"),
407 };
408 match pair {
409 | Some((_, value)) => Some(value.parse::<f64>().unwrap()),
410 | None => None,
411 }
412 }
413 | Err(_) => None,
414 }
415 }
416}
417pub fn complex_word_count(text: &str) -> u32 {
421 words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
422}
/// Counts ASCII letters (a-z, A-Z) in `text`.
///
/// Equivalent to the previous whitespace + `NON_ALPHABETIC`-regex filter,
/// but avoids allocating a `String` and running a regex match per character.
pub fn letter_count(text: &str) -> u32 {
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Counts words longer than six characters (the "long word" measure of LIX).
///
/// Measures length in Unicode scalar values rather than bytes, so accented
/// words are judged by their visible length (`word.len()` counted bytes).
pub fn long_word_count(text: &str) -> u32 {
    text.split_whitespace().filter(|word| word.chars().count() > 6).count() as u32
}
/// Counts sentences by splitting on '.'.
///
/// Segments that are empty or whitespace-only are ignored, so a trailing
/// period (e.g. "One. Two. ") no longer inflates the count.
/// NOTE(review): '!' and '?' are not treated as sentence terminators here.
pub fn sentence_count(text: &str) -> u32 {
    text.split('.').filter(|s| !s.trim().is_empty()).count() as u32
}
/// Splits `text` on Unicode whitespace and returns the owned word tokens.
pub fn words(text: &str) -> Vec<String> {
    text.split_whitespace().map(|token| token.to_string()).collect()
}
/// Counts whitespace-separated words in `text`.
pub fn word_count(text: &str) -> u32 {
    // Counting the split iterator directly avoids the `Vec<String>`
    // allocation `words` would perform just to take its length.
    text.split_whitespace().count() as u32
}
451pub fn automated_readability_index(text: &str) -> f64 {
460 let letters = letter_count(text);
461 let words = word_count(text);
462 let sentences = sentence_count(text);
463 debug!(letters, words, sentences, "=> {}", Label::using());
464 let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
465 format!("{score:.2}").parse().unwrap()
466}
467pub fn coleman_liau_index(text: &str) -> f64 {
471 let letters = letter_count(text);
472 let words = word_count(text);
473 let sentences = sentence_count(text);
474 debug!(letters, words, sentences, "=> {}", Label::using());
475 let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
476 format!("{score:.2}").parse().unwrap()
477}
478pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
490 let words = word_count(text);
491 let sentences = sentence_count(text);
492 let syllables = syllable_count(text);
493 debug!(words, sentences, syllables, "=> {}", Label::using());
494 let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
495 format!("{score:.2}").parse().unwrap()
496}
497pub fn flesch_reading_ease_score(text: &str) -> f64 {
505 let words = word_count(text);
506 let sentences = sentence_count(text);
507 let syllables = syllable_count(text);
508 debug!(words, sentences, syllables, "=> {}", Label::using());
509 let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
510 format!("{score:.2}").parse().unwrap()
511}
512pub fn gunning_fog_index(text: &str) -> f64 {
520 let words = word_count(text);
521 let complex_words = complex_word_count(text);
522 let sentences = sentence_count(text);
523 let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
524 format!("{score:.2}").parse().unwrap()
525}
526pub fn lix(text: &str) -> f64 {
536 let words = word_count(text);
537 let sentences = sentence_count(text);
538 let long_words = long_word_count(text);
539 let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
540 format!("{score:.2}").parse().unwrap()
541}
542pub fn smog(text: &str) -> f64 {
552 let sentences = sentence_count(text);
553 let complex_words = complex_word_count(text);
554 let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
555 format!("{score:.2}").parse().unwrap()
556}
557pub fn singular_form(word: &str) -> String {
561 match word.to_lowercase().as_str() {
562 | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
563 | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
564 | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
565 | Some(value) => value.to_string(),
566 | None => value.to_string(),
567 },
568 | value => {
569 let pair = PLURAL_TO_SINGULAR
570 .iter()
571 .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
572 | Ok(true) => true,
573 | Ok(false) | Err(_) => false,
574 });
575 match pair {
576 | Some((pattern, replacement)) => {
577 trace!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
578 let re = Regex::new(pattern).unwrap();
579 re.replace_all(value, *replacement).to_string()
580 }
581 | None => value.to_string(),
582 }
583 }
584 }
585}
/// Estimates the total number of syllables in `text`.
///
/// Whitespace-separated words are normalized via `tokenize`, then each token
/// is scored by, in order: empty tokens count 0; tokens under three bytes
/// count 1; tokens found in `PROBLEMATIC_WORDS` or `NEED_TO_BE_FIXED`
/// (directly or via their singular form) use the tabulated count; anything
/// else falls through to a regex heuristic over affixes and vowel groups.
pub fn syllable_count(text: &str) -> usize {
    // Scores a single normalized token.
    fn syllables(word: String) -> usize {
        // The singular form serves as a fallback key for the lookup tables.
        let singular = singular_form(&word);
        match word.as_str() {
            // Punctuation-only input tokenizes to "" — no syllables.
            | "" => 0,
            // Very short words are assumed monosyllabic.
            | value if value.len() < 3 => 1,
            // Known-problematic words, looked up directly…
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            // …or via their singular form.
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            // Words with tabulated corrections, looked up directly…
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            // …or via their singular form.
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ => {
                // General case: count multi-syllable affixes and strip them,
                // apply corrective patterns, then count remaining vowel runs.
                let mut count: isize = 0;
                let mut input = word;
                // Three-syllable suffixes.
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                // Two-syllable affixes.
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                // One-syllable affixes.
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                // Patterns the vowel-run count overestimates…
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                // …and patterns it underestimates.
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                // Each maximal vowel run contributes one syllable.
                // (`VOWEL` matches *non*-vowel runs, so splitting on it yields
                // vowel groups; empty split pieces are skipped.)
                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
                count as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(syllables).sum()
}
/// Normalizes a word into lowercase, ASCII-alphabetic tokens.
///
/// Accented `é`/`ë` are rewritten as a separate trailing `e` token (matching
/// lookup-table entries such as "cafe" and "zoe"), the word is split on
/// hyphens, and every remaining non-ASCII-alphabetic character is dropped.
pub fn tokenize(value: &str) -> Vec<String> {
    value
        .replace('é', "-e")
        .replace('ë', "-e")
        .split('-')
        // Keep only ASCII letters, lowercased. Equivalent to stripping with
        // the `NON_ALPHABETIC` regex, without a regex pass per token.
        .map(|x| {
            x.chars()
                .filter(char::is_ascii_alphabetic)
                .map(|c| c.to_ascii_lowercase())
                .collect::<String>()
        })
        .collect::<Vec<_>>()
}