1use crate::util::Label;
5use fancy_regex::Regex;
6use lazy_static::lazy_static;
7use std::collections::HashMap;
8use tracing::debug;
9
10lazy_static! {
11 pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
13 pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
15 pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
17 pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
19 pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
21 pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
23 pub static ref SINGLE_SYLLABIC_ONE : Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
25 pub static ref SINGLE_SYLLABIC_TWO : Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
27 pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
29 pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
31 pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
33 pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
35 pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
37 ("child", "children"),
38 ("cow", "cattle"),
39 ("foot", "feet"),
40 ("goose", "geese"),
41 ("man", "men"),
42 ("move", "moves"),
43 ("person", "people"),
44 ("radius", "radii"),
45 ("sex", "sexes"),
46 ("tooth", "teeth"),
47 ("woman", "women"),
48 ].into_iter().collect();
49 pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.clone().into_iter().map(|(k, v)| (v, k)).collect();
53 pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
57 ("ayo", 2),
58 ("australian", 3),
59 ("dionysius", 5),
60 ("disbursement", 3),
61 ("discouragement", 4),
62 ("disenfranchisement", 5),
63 ("disengagement", 4),
64 ("disgraceful", 3),
65 ("diskette", 2),
66 ("displacement", 3),
67 ("distasteful", 3),
68 ("distinctiveness", 4),
69 ("distraction", 3),
70 ("geoffrion", 4),
71 ("mcquaid", 2),
72 ("mcquaide", 2),
73 ("mcquaig", 2),
74 ("mcquain", 2),
75 ("nonbusiness", 3),
76 ("nonetheless", 3),
77 ("nonmanagement", 4),
78 ("outplacement", 3),
79 ("outrageously", 4),
80 ("postponement", 3),
81 ("preemption", 3),
82 ("preignition", 4),
83 ("preinvasion", 4),
84 ("preisler", 3),
85 ("preoccupation", 5),
86 ("prevette", 2),
87 ("probusiness", 3),
88 ("procurement", 3),
89 ("pronouncement", 3),
90 ("sidewater", 3),
91 ("sidewinder", 3),
92 ("ungerer", 3),
93 ].into_iter().collect();
94 pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
96 ("abalone", 4),
97 ("abare", 3),
98 ("abbruzzese", 4),
99 ("abed", 2),
100 ("aborigine", 5),
101 ("abruzzese", 4),
102 ("acreage", 3),
103 ("adame", 3),
104 ("adieu", 2),
105 ("adobe", 3),
106 ("anemone", 4),
107 ("anyone", 3),
108 ("apache", 3),
109 ("aphrodite", 4),
110 ("apostrophe", 4),
111 ("ariadne", 4),
112 ("cafe", 2),
113 ("café", 2),
114 ("calliope", 4),
115 ("catastrophe", 4),
116 ("chile", 2),
117 ("chloe", 2),
118 ("circe", 2),
119 ("cliche", 2),
120 ("cliché", 2),
121 ("contrariety", 4),
122 ("coyote", 3),
123 ("daphne", 2),
124 ("epitome", 4),
125 ("eurydice", 4),
126 ("euterpe", 3),
127 ("every", 2),
128 ("everywhere", 3),
129 ("forever", 3),
130 ("gethsemane", 4),
131 ("guacamole", 4),
132 ("hermione", 4),
133 ("hyperbole", 4),
134 ("jesse", 2),
135 ("jukebox", 2),
136 ("karate", 3),
137 ("machete", 3),
138 ("maybe", 2),
139 ("naive", 2),
140 ("newlywed", 3),
141 ("ninety", 2),
142 ("penelope", 4),
143 ("people", 2),
144 ("persephone", 4),
145 ("phoebe", 2),
146 ("pulse", 1),
147 ("queue", 1),
148 ("recipe", 3),
149 ("reptilian", 4),
150 ("resumé", 2),
151 ("riverbed", 3),
152 ("scotia", 3),
153 ("sesame", 3),
154 ("shoreline", 2),
155 ("simile", 3),
156 ("snuffleupagus", 5),
157 ("sometimes", 2),
158 ("syncope", 3),
159 ("tamale", 3),
160 ("waterbed", 3),
161 ("wednesday", 2),
162 ("viceroyship", 3),
163 ("yosemite", 4),
164 ("zoë", 2),
165 ].into_iter().collect();
166}
/// Ordered plural -> singular rewrite rules; the first pattern that matches
/// wins, and `${n}` in the replacement refers to capture group `n`.
/// Identity rules such as `(s)eries$` and `(n)ews$` are deliberate no-ops
/// that stop the generic trailing-`s` rule at the end from firing.
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    // Fixed: was `([m|l])ice$` — unanchored, with a literal '|' inside the
    // character class — so any word ending in "lice"/"mice" was rewritten
    // (e.g. "slice" -> "slouse").  Anchoring limits it to "mice" and "lice".
    (r#"^(m|l)ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    (r#"s$"#, r#""#),
];
/// Nouns whose singular and plural spellings are identical — uncountable,
/// mass, or zero-plural nouns ("sheep", "information", "scissors").
/// `singular_form` returns these unchanged before any suffix rule runs.
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
311pub fn complex_word_count(text: &str) -> u32 {
315 words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
316}
/// Counts ASCII letters (`a-z`, `A-Z`) in `text`.
///
/// Improved: the original tested every character against the
/// `NON_ALPHABETIC` regex (plus a redundant whitespace check, since the
/// regex already rejects whitespace); `char::is_ascii_alphabetic` accepts
/// exactly the same characters without a per-char regex match.
/// Note: accented letters such as 'é' are not counted — same as before.
pub fn letter_count(text: &str) -> u32 {
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Counts words longer than six letters (the LIX "long word" definition).
///
/// Fixed: the original compared `word.len()`, which measures UTF-8 bytes,
/// so a six-letter word with one accented character ("café's") was wrongly
/// counted as long; `chars().count()` measures characters. Also avoids
/// materializing an intermediate `Vec<String>`.
pub fn long_word_count(text: &str) -> u32 {
    text.split_whitespace()
        .filter(|word| word.chars().count() > 6)
        .count() as u32
}
/// Counts sentences by splitting on terminal punctuation.
///
/// Fixed: the original split only on '.' — sentences ending in '!' or '?'
/// were merged into their neighbor — and counted whitespace-only trailing
/// fragments (e.g. `"Done. "`) as an extra sentence. Fragments that are
/// empty after trimming are now ignored.
pub fn sentence_count(text: &str) -> u32 {
    text.split(|c: char| matches!(c, '.' | '!' | '?'))
        .filter(|fragment| !fragment.trim().is_empty())
        .count() as u32
}
/// Splits `text` on whitespace and returns the pieces as owned strings.
/// Punctuation stays attached to its word.
pub fn words(text: &str) -> Vec<String> {
    text.split_whitespace().map(str::to_owned).collect()
}
/// Counts whitespace-separated words in `text`.
///
/// Improved: the original built a `Vec<String>` (one allocation per word)
/// just to take its length; counting the iterator directly is equivalent
/// and allocation-free.
pub fn word_count(text: &str) -> u32 {
    text.split_whitespace().count() as u32
}
345pub fn automated_readability_index(text: &str) -> f64 {
354 let letters = letter_count(text);
355 let words = word_count(text);
356 let sentences = sentence_count(text);
357 debug!(letters, words, sentences, "=> {}", Label::using());
358 let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
359 format!("{score:.2}").parse().unwrap()
360}
361pub fn coleman_liau_index(text: &str) -> f64 {
365 let letters = letter_count(text);
366 let words = word_count(text);
367 let sentences = sentence_count(text);
368 debug!(letters, words, sentences, "=> {}", Label::using());
369 let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
370 format!("{score:.2}").parse().unwrap()
371}
372pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
381 let words = word_count(text);
382 let sentences = sentence_count(text);
383 let syllables = syllable_count(text);
384 debug!(words, sentences, syllables, "=> {}", Label::using());
385 let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
386 format!("{score:.2}").parse().unwrap()
387}
388pub fn flesch_reading_ease_score(text: &str) -> f64 {
396 let words = word_count(text);
397 let sentences = sentence_count(text);
398 let syllables = syllable_count(text);
399 debug!(words, sentences, syllables, "=> {}", Label::using());
400 let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
401 format!("{score:.2}").parse().unwrap()
402}
403pub fn gunning_fog_index(text: &str) -> f64 {
411 let words = word_count(text);
412 let complex_words = complex_word_count(text);
413 let sentences = sentence_count(text);
414 let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
415 format!("{score:.2}").parse().unwrap()
416}
417pub fn lix(text: &str) -> f64 {
427 let words = word_count(text);
428 let sentences = sentence_count(text);
429 let long_words = long_word_count(text);
430 let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
431 format!("{score:.2}").parse().unwrap()
432}
433pub fn smog(text: &str) -> f64 {
443 let sentences = sentence_count(text);
444 let complex_words = complex_word_count(text);
445 let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
446 format!("{score:.2}").parse().unwrap()
447}
448pub fn singular_form(word: &str) -> String {
452 match word.to_lowercase().as_str() {
453 | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
454 | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
455 | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
456 | Some(value) => value.to_string(),
457 | None => value.to_string(),
458 },
459 | value => {
460 let pair = PLURAL_TO_SINGULAR
461 .iter()
462 .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
463 | Ok(true) => true,
464 | Ok(false) | Err(_) => false,
465 });
466 match pair {
467 | Some((pattern, replacement)) => {
468 debug!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
469 let re = Regex::new(pattern).unwrap();
470 re.replace_all(value, *replacement).to_string()
471 }
472 | None => value.to_string(),
473 }
474 }
475 }
476}
/// Estimates the total syllable count of `text` by summing per-token counts.
///
/// Tokens come from `tokenize` (lowercased, ASCII-letters only). Each token
/// is checked against the exception tables — both as-is and in singular form
/// — before falling back to the regex-based heuristic pipeline.
pub fn syllable_count(text: &str) -> usize {
    // Heuristic syllable estimate for one normalized token.
    fn syllables(word: String) -> usize {
        // Singular form so plural spellings can hit the exception tables too.
        let singular = singular_form(&word);
        match word.as_str() {
            | "" => 0,
            // Words shorter than three characters count as one syllable.
            | value if value.len() < 3 => 1,
            // Exception tables take precedence over the heuristics below.
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ => {
                let mut input = word;
                let mut count: isize = 0;
                // Score and strip affixes with known syllable counts
                // (3-, 2-, then 1-syllable), so the remainder can be scored
                // by vowel-group counting below.
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                // Patterns that look like an extra vowel group but are
                // actually a single syllable (e.g. silent-e clusters).
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                // Patterns that vowel-group counting undercounts by one.
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                // One syllable per vowel run: VOWEL matches runs of
                // NON-vowels, so splitting on it leaves the vowel groups.
                // NOTE(review): the `unwrap` assumes fancy_regex::split never
                // errors here, and the final cast assumes the running count
                // ends non-negative — confirm both hold for all inputs.
                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
                count as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(syllables).sum()
}
/// Normalizes one whitespace-delimited token into lowercase ASCII-letter
/// pieces. 'é'/'ë' become a separate "e" piece (via a '-' marker), existing
/// hyphens split the token, and all other non-ASCII-letter characters are
/// dropped from each piece.
pub fn tokenize(value: &str) -> Vec<String> {
    let dashed = value.replace('é', "-e").replace('ë', "-e");
    dashed
        .split('-')
        .map(|piece| {
            piece
                .chars()
                .filter(|c| c.is_ascii_alphabetic())
                .collect::<String>()
                .to_lowercase()
        })
        .collect()
}