1use crate::constants::*;
5use crate::util::{find_first, Label};
6use clap::ValueEnum;
7use derive_more::Display;
8use dotenvy::dotenv;
9use fancy_regex::Regex;
10use lazy_static::lazy_static;
11use std::collections::HashMap;
12use tracing::{debug, trace, warn};
13
14lazy_static! {
15 pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
17 pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
19 pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
21 pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
23 pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
25 pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
27 pub static ref SINGLE_SYLLABIC_ONE : Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
29 pub static ref SINGLE_SYLLABIC_TWO : Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
31 pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
33 pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
35 pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
37 pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
39 pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
41 ("child", "children"),
42 ("cow", "cattle"),
43 ("foot", "feet"),
44 ("goose", "geese"),
45 ("man", "men"),
46 ("move", "moves"),
47 ("person", "people"),
48 ("radius", "radii"),
49 ("sex", "sexes"),
50 ("tooth", "teeth"),
51 ("woman", "women"),
52 ].into_iter().collect();
53 pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.clone().into_iter().map(|(k, v)| (v, k)).collect();
57 pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
61 ("ayo", 2),
62 ("australian", 3),
63 ("dionysius", 5),
64 ("disbursement", 3),
65 ("discouragement", 4),
66 ("disenfranchisement", 5),
67 ("disengagement", 4),
68 ("disgraceful", 3),
69 ("diskette", 2),
70 ("displacement", 3),
71 ("distasteful", 3),
72 ("distinctiveness", 4),
73 ("distraction", 3),
74 ("geoffrion", 4),
75 ("mcquaid", 2),
76 ("mcquaide", 2),
77 ("mcquaig", 2),
78 ("mcquain", 2),
79 ("nonbusiness", 3),
80 ("nonetheless", 3),
81 ("nonmanagement", 4),
82 ("outplacement", 3),
83 ("outrageously", 4),
84 ("postponement", 3),
85 ("preemption", 3),
86 ("preignition", 4),
87 ("preinvasion", 4),
88 ("preisler", 3),
89 ("preoccupation", 5),
90 ("prevette", 2),
91 ("probusiness", 3),
92 ("procurement", 3),
93 ("pronouncement", 3),
94 ("sidewater", 3),
95 ("sidewinder", 3),
96 ("ungerer", 3),
97 ].into_iter().collect();
98 pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
100 ("abalone", 4),
101 ("abare", 3),
102 ("abbruzzese", 4),
103 ("abed", 2),
104 ("aborigine", 5),
105 ("abruzzese", 4),
106 ("acreage", 3),
107 ("adame", 3),
108 ("adieu", 2),
109 ("adobe", 3),
110 ("anemone", 4),
111 ("anyone", 3),
112 ("apache", 3),
113 ("aphrodite", 4),
114 ("apostrophe", 4),
115 ("ariadne", 4),
116 ("cafe", 2),
117 ("café", 2),
118 ("calliope", 4),
119 ("catastrophe", 4),
120 ("chile", 2),
121 ("chloe", 2),
122 ("circe", 2),
123 ("cliche", 2),
124 ("cliché", 2),
125 ("contrariety", 4),
126 ("coyote", 3),
127 ("daphne", 2),
128 ("epitome", 4),
129 ("eurydice", 4),
130 ("euterpe", 3),
131 ("every", 2),
132 ("everywhere", 3),
133 ("forever", 3),
134 ("gethsemane", 4),
135 ("guacamole", 4),
136 ("hermione", 4),
137 ("hyperbole", 4),
138 ("jesse", 2),
139 ("jukebox", 2),
140 ("karate", 3),
141 ("machete", 3),
142 ("maybe", 2),
143 ("naive", 2),
144 ("newlywed", 3),
145 ("ninety", 2),
146 ("penelope", 4),
147 ("people", 2),
148 ("persephone", 4),
149 ("phoebe", 2),
150 ("pulse", 1),
151 ("queue", 1),
152 ("recipe", 3),
153 ("reptilian", 4),
154 ("resumé", 2),
155 ("riverbed", 3),
156 ("scotia", 3),
157 ("sesame", 3),
158 ("shoreline", 2),
159 ("simile", 3),
160 ("snuffleupagus", 5),
161 ("sometimes", 2),
162 ("syncope", 3),
163 ("tamale", 3),
164 ("waterbed", 3),
165 ("wednesday", 2),
166 ("viceroyship", 3),
167 ("yosemite", 4),
168 ("zoë", 2),
169 ].into_iter().collect();
170}
/// Ordered (pattern, replacement) pairs converting an English plural to its
/// singular form. The first matching pattern wins, so specific rules must
/// precede the catch-all trailing-`s` rule.
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    // FIX: the class was previously written `[m|l]`, which also matched a
    // literal `|`; only `m` and `l` ("mice"/"lice") are intended.
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    (r#"s$"#, r#""#),
];
/// Nouns whose singular and plural forms are identical (mass nouns,
/// plurale-tantum items such as "scissors", and invariant animal names).
/// `singular_form` returns these unchanged. Lookup is a linear `contains`
/// scan, so ordering is not significant.
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
/// Readability formulas supported by this crate. Each variant's `Display`
/// form is the lowercase abbreviation used on the CLI (via `ValueEnum`).
#[derive(Clone, Copy, Debug, Default, Display, PartialEq, ValueEnum)]
pub enum ReadabilityType {
    // Automated Readability Index.
    #[display("ari")]
    ARI,
    // Coleman-Liau Index.
    #[display("cli")]
    CLI,
    // Flesch-Kincaid Grade Level — the default metric.
    #[default]
    #[display("fkgl")]
    FKGL,
    // Flesch Reading Ease Score.
    #[display("fres")]
    FRES,
    // Gunning Fog Index.
    #[display("gfi")]
    GFI,
    // Lix (läsbarhetsindex).
    #[display("lix")]
    Lix,
    // Simple Measure of Gobbledygook.
    #[display("smog")]
    SMOG,
}
341impl ReadabilityType {
342 pub fn calculate(self, text: &str) -> f64 {
344 match self {
345 | ReadabilityType::ARI => automated_readability_index(text),
346 | ReadabilityType::CLI => coleman_liau_index(text),
347 | ReadabilityType::FKGL => flesch_kincaid_grade_level(text),
348 | ReadabilityType::FRES => flesch_reading_ease_score(text),
349 | ReadabilityType::GFI => gunning_fog_index(text),
350 | ReadabilityType::Lix => lix(text),
351 | ReadabilityType::SMOG => smog(text),
352 }
353 }
354 pub fn from_string(value: &str) -> ReadabilityType {
356 match value.to_lowercase().replace("-", " ").as_str() {
357 | "ari" | "automated readability index" => ReadabilityType::ARI,
358 | "cli" | "coleman liau index" => ReadabilityType::CLI,
359 | "fkgl" | "flesch kincaid grade level" => ReadabilityType::FKGL,
360 | "fres" | "flesch reading ease score" => ReadabilityType::FRES,
361 | "gfi" | "gunning fog index" => ReadabilityType::GFI,
362 | "lix" => ReadabilityType::Lix,
363 | "smog" | "simple measure of gobbledygook" => ReadabilityType::SMOG,
364 | _ => {
365 warn!(value, "=> {} Unknown Readability Type", Label::using());
366 ReadabilityType::default()
367 }
368 }
369 }
370 pub fn maximum_allowed(self) -> f64 {
372 match dotenv() {
373 | Ok(_) => {
374 let variables = dotenvy::vars().collect::<Vec<(String, String)>>();
375 let pair = match self {
376 | ReadabilityType::ARI => find_first(variables, "MAX_ALLOWED_ARI"),
377 | ReadabilityType::CLI => find_first(variables, "MAX_ALLOWED_CLI"),
378 | ReadabilityType::FKGL => find_first(variables, "MAX_ALLOWED_FKGL"),
379 | ReadabilityType::FRES => find_first(variables, "MAX_ALLOWED_FRES"),
380 | ReadabilityType::GFI => find_first(variables, "MAX_ALLOWED_GFI"),
381 | ReadabilityType::Lix => find_first(variables, "MAX_ALLOWED_LIX"),
382 | ReadabilityType::SMOG => find_first(variables, "MAX_ALLOWED_SMOG"),
383 };
384 match pair {
385 | Some((_, value)) => value.parse::<f64>().unwrap(),
386 | None => MAX_ALLOWED_ARI,
387 }
388 }
389 | Err(_) => match self {
390 | ReadabilityType::ARI => MAX_ALLOWED_ARI,
391 | ReadabilityType::CLI => MAX_ALLOWED_CLI,
392 | ReadabilityType::FKGL => MAX_ALLOWED_FKGL,
393 | ReadabilityType::FRES => MAX_ALLOWED_FRES,
394 | ReadabilityType::GFI => MAX_ALLOWED_GFI,
395 | ReadabilityType::Lix => MAX_ALLOWED_LIX,
396 | ReadabilityType::SMOG => MAX_ALLOWED_SMOG,
397 },
398 }
399 }
400}
401pub fn complex_word_count(text: &str) -> u32 {
405 words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
406}
/// Number of ASCII letters (a-z, A-Z) in `text`.
///
/// Equivalent to the previous regex-based version (whitespace and anything
/// matching `[^a-zA-Z]` excluded), but without allocating a `String` and
/// running `NON_ALPHABETIC` for every character.
pub fn letter_count(text: &str) -> u32 {
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Number of "long" words in `text` — words longer than six characters,
/// as used by the Lix formula.
///
/// Counts Unicode scalar values rather than bytes: the previous `word.len()`
/// measured UTF-8 bytes, so a six-letter accented word such as "résumé"
/// (8 bytes) was wrongly classified as long.
pub fn long_word_count(text: &str) -> u32 {
    text.split_whitespace()
        .filter(|word| word.chars().count() > 6)
        .count() as u32
}
/// Number of sentences in `text`, approximated by terminal punctuation.
///
/// Splits on `.`, `!` and `?` (the previous version only recognised `.`,
/// so exclamations and questions were folded into the preceding sentence)
/// and ignores whitespace-only fragments (so a trailing space after the
/// final period no longer counts as an extra sentence).
pub fn sentence_count(text: &str) -> u32 {
    text.split(|c: char| matches!(c, '.' | '!' | '?'))
        .filter(|s| !s.trim().is_empty())
        .count() as u32
}
/// Splits `text` on whitespace and returns the tokens as owned strings.
pub fn words(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for token in text.split_whitespace() {
        tokens.push(token.to_owned());
    }
    tokens
}
/// Number of whitespace-separated words in `text`.
pub fn word_count(text: &str) -> u32 {
    // Counting the splits directly avoids materialising the token vector.
    text.split_whitespace().count() as u32
}
435pub fn automated_readability_index(text: &str) -> f64 {
444 let letters = letter_count(text);
445 let words = word_count(text);
446 let sentences = sentence_count(text);
447 debug!(letters, words, sentences, "=> {}", Label::using());
448 let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
449 format!("{score:.2}").parse().unwrap()
450}
451pub fn coleman_liau_index(text: &str) -> f64 {
455 let letters = letter_count(text);
456 let words = word_count(text);
457 let sentences = sentence_count(text);
458 debug!(letters, words, sentences, "=> {}", Label::using());
459 let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
460 format!("{score:.2}").parse().unwrap()
461}
462pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
471 let words = word_count(text);
472 let sentences = sentence_count(text);
473 let syllables = syllable_count(text);
474 debug!(words, sentences, syllables, "=> {}", Label::using());
475 let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
476 format!("{score:.2}").parse().unwrap()
477}
478pub fn flesch_reading_ease_score(text: &str) -> f64 {
486 let words = word_count(text);
487 let sentences = sentence_count(text);
488 let syllables = syllable_count(text);
489 debug!(words, sentences, syllables, "=> {}", Label::using());
490 let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
491 format!("{score:.2}").parse().unwrap()
492}
493pub fn gunning_fog_index(text: &str) -> f64 {
501 let words = word_count(text);
502 let complex_words = complex_word_count(text);
503 let sentences = sentence_count(text);
504 let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
505 format!("{score:.2}").parse().unwrap()
506}
507pub fn lix(text: &str) -> f64 {
517 let words = word_count(text);
518 let sentences = sentence_count(text);
519 let long_words = long_word_count(text);
520 let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
521 format!("{score:.2}").parse().unwrap()
522}
523pub fn smog(text: &str) -> f64 {
533 let sentences = sentence_count(text);
534 let complex_words = complex_word_count(text);
535 let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
536 format!("{score:.2}").parse().unwrap()
537}
538pub fn singular_form(word: &str) -> String {
542 match word.to_lowercase().as_str() {
543 | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
544 | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
545 | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
546 | Some(value) => value.to_string(),
547 | None => value.to_string(),
548 },
549 | value => {
550 let pair = PLURAL_TO_SINGULAR
551 .iter()
552 .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
553 | Ok(true) => true,
554 | Ok(false) | Err(_) => false,
555 });
556 match pair {
557 | Some((pattern, replacement)) => {
558 trace!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
559 let re = Regex::new(pattern).unwrap();
560 re.replace_all(value, *replacement).to_string()
561 }
562 | None => value.to_string(),
563 }
564 }
565 }
566}
/// Total syllable count for `text`: tokenizes on whitespace (plus the
/// accent handling in `tokenize`) and sums a per-word heuristic count.
pub fn syllable_count(text: &str) -> usize {
    // Heuristic syllable count for a single lowercase alphabetic token.
    // Dictionary lookups (exact word, then its singular form) take priority;
    // otherwise affix regexes add/remove counts around a vowel-group scan.
    fn syllables(word: String) -> usize {
        let singular = singular_form(&word);
        match word.as_str() {
            | "" => 0,
            // NOTE: length is in bytes; tokens here are ASCII after `tokenize`,
            // so this is effectively "fewer than three letters" => one syllable.
            | value if value.len() < 3 => 1,
            // Known-problematic words: use the dictionary value directly.
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            // ... also try the singular form of the word.
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            // Words the regex heuristic is known to miscount.
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            // Fallback heuristic. Order matters: each affix match is counted
            // and then STRIPPED from `input` so later patterns cannot see it.
            | _ => {
                let mut count: isize = 0;
                let mut input = word;
                // Three-syllable suffixes, counted then removed.
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                // Two-syllable affixes, counted then removed.
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                // One-syllable affixes, counted then removed.
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                // Corrections: patterns the vowel-group scan over-counts...
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                // ...and patterns it under-counts.
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                // Base count: one syllable per vowel group. `VOWEL` matches
                // consonant runs, so splitting leaves the vowel clusters;
                // fancy_regex's split yields Result items, hence the unwrap.
                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
                count as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(syllables).sum()
}
/// Normalises one whitespace-delimited token into lowercase ASCII-alphabetic
/// sub-tokens.
///
/// `é`/`ë` are rewritten to `-e` so the accented vowel starts a new sub-token
/// (matching the dictionary entries such as "caf"/"e" for "café"); the token
/// is then split on `-` and each segment stripped to lowercase ASCII letters.
///
/// The character filter is equivalent to the previous
/// `NON_ALPHABETIC.replace_all(x, "")` + `to_lowercase()` (survivors are all
/// ASCII), without running a regex per segment.
pub fn tokenize(value: &str) -> Vec<String> {
    value
        .replace("é", "-e")
        .replace("ë", "-e")
        .split('-')
        .map(|segment| {
            segment
                .chars()
                .filter(|c| c.is_ascii_alphabetic())
                .map(|c| c.to_ascii_lowercase())
                .collect()
        })
        .collect()
}