use crate::util::Label;
use fancy_regex::Regex;
use lazy_static::lazy_static;
use std::collections::HashMap;
use tracing::debug;

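// The regexes below drive the heuristic syllable counter in `get_syllable_count`:
// TRIPLE/DOUBLE/SINGLE match affixes worth three/two/one syllables, the
// *_SYLLABIC_* patterns correct common over- and under-counts, and VOWEL matches
// the consonant runs used to split what remains into vowel groups.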
lazy_static! {
    pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
    pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
    pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
    pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
    pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
    pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
    pub static ref SINGLE_SYLLABIC_ONE: Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
    pub static ref SINGLE_SYLLABIC_TWO: Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
    pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
    pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
    pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
    pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
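    /// Irregular singular → plural noun pairs that bypass the regex-based rules.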
    pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
        ("child", "children"),
        ("cow", "cattle"),
        ("foot", "feet"),
        ("goose", "geese"),
        ("man", "men"),
        ("move", "moves"),
        ("person", "people"),
        ("radius", "radii"),
        ("sex", "sexes"),
        ("tooth", "teeth"),
        ("woman", "women"),
    ].into_iter().collect();
    pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.clone().into_iter().map(|(k, v)| (v, k)).collect();
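    /// Hard-coded syllable counts for words the affix heuristics are known to miscount.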
    pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
        ("ayo", 2),
        ("australian", 3),
        ("dionysius", 5),
        ("disbursement", 3),
        ("discouragement", 4),
        ("disenfranchisement", 5),
        ("disengagement", 4),
        ("disgraceful", 3),
        ("diskette", 2),
        ("displacement", 3),
        ("distasteful", 3),
        ("distinctiveness", 4),
        ("distraction", 3),
        ("geoffrion", 4),
        ("mcquaid", 2),
        ("mcquaide", 2),
        ("mcquaig", 2),
        ("mcquain", 2),
        ("nonbusiness", 3),
        ("nonetheless", 3),
        ("nonmanagement", 4),
        ("outplacement", 3),
        ("outrageously", 4),
        ("postponement", 3),
        ("preemption", 3),
        ("preignition", 4),
        ("preinvasion", 4),
        ("preisler", 3),
        ("preoccupation", 5),
        ("prevette", 2),
        ("probusiness", 3),
        ("procurement", 3),
        ("pronouncement", 3),
        ("sidewater", 3),
        ("sidewinder", 3),
        ("ungerer", 3),
    ].into_iter().collect();
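    /// Hard-coded syllable counts consulted before any pattern-based counting.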
    pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
        ("abalone", 4),
        ("abare", 3),
        ("abbruzzese", 4),
        ("abed", 2),
        ("aborigine", 5),
        ("abruzzese", 4),
        ("acreage", 3),
        ("adame", 3),
        ("adieu", 2),
        ("adobe", 3),
        ("anemone", 4),
        ("anyone", 3),
        ("apache", 3),
        ("aphrodite", 4),
        ("apostrophe", 4),
        ("ariadne", 4),
        ("cafe", 2),
        ("café", 2),
        ("calliope", 4),
        ("catastrophe", 4),
        ("chile", 2),
        ("chloe", 2),
        ("circe", 2),
        ("cliche", 2),
        ("cliché", 2),
        ("contrariety", 4),
        ("coyote", 3),
        ("daphne", 2),
        ("epitome", 4),
        ("eurydice", 4),
        ("euterpe", 3),
        ("every", 2),
        ("everywhere", 3),
        ("forever", 3),
        ("gethsemane", 4),
        ("guacamole", 4),
        ("hermione", 4),
        ("hyperbole", 4),
        ("jesse", 2),
        ("jukebox", 2),
        ("karate", 3),
        ("machete", 3),
        ("maybe", 2),
        ("naive", 2),
        ("newlywed", 3),
        ("ninety", 2),
        ("penelope", 4),
        ("people", 2),
        ("persephone", 4),
        ("phoebe", 2),
        ("pulse", 1),
        ("queue", 1),
        ("recipe", 3),
        ("reptilian", 4),
        ("resumé", 2),
        ("riverbed", 3),
        ("scotia", 3),
        ("sesame", 3),
        ("shoreline", 2),
        ("simile", 3),
        ("snuffleupagus", 5),
        ("sometimes", 2),
        ("syncope", 3),
        ("tamale", 3),
        ("waterbed", 3),
        ("wednesday", 2),
        ("viceroyship", 3),
        ("yosemite", 4),
        ("zoë", 2),
    ].into_iter().collect();
}
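/// Ordered (pattern, replacement) pairs; the first pattern that matches a word is used to derive its singular form.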
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    (r#"s$"#, r#""#),
];
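/// Words treated as having identical singular and plural forms.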
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
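/// Counts words with more than two syllables (the "complex" words used by the fog and SMOG indices).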
pub fn get_complex_word_count(text: &str) -> u32 {
    get_words(text).iter().filter(|word| get_syllable_count(word) > 2).count() as u32
}
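/// Counts ASCII alphabetic characters, ignoring whitespace, digits, and punctuation.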
pub fn get_letter_count(text: &str) -> u32 {
    text.chars()
        .filter(|c| !(c.is_whitespace() || NON_ALPHABETIC.is_match(&c.to_string()).unwrap_or_default()))
        .count() as u32
}
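/// Counts words longer than six characters (the "long word" threshold used by LIX).
/// Punctuation attached to a word counts toward its length.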
pub fn get_long_word_count(text: &str) -> u32 {
    get_words(text).iter().filter(|word| word.len() > 6).count() as u32
}
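/// Counts sentences by splitting on '.' and ignoring empty fragments; other terminators such as '?' and '!' are not recognised.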
pub fn get_sentence_count(text: &str) -> u32 {
    text.split('.').filter(|s| !s.is_empty()).count() as u32
}
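/// Splits text on whitespace into owned words; punctuation stays attached to its word.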
pub fn get_words(text: &str) -> Vec<String> {
    text.split_whitespace().map(String::from).collect()
}
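/// Counts whitespace-separated words.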
pub fn get_word_count(text: &str) -> u32 {
    get_words(text).len() as u32
}
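/// Automated Readability Index: 4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43, rounded to two decimals.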
pub fn automated_readability_index(text: &str) -> f64 {
    let letters = get_letter_count(text);
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
    format!("{:.2}", score).parse().unwrap()
}
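/// Coleman-Liau index: 0.0588 * L - 0.296 * S - 15.8, where L is letters per 100 words and S is sentences per 100 words.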
pub fn coleman_liau_index(text: &str) -> f64 {
    let letters = get_letter_count(text);
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
    format!("{:.2}", score).parse().unwrap()
}
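/// Flesch-Kincaid grade level: 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59.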
pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let syllables = get_syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
    format!("{:.2}", score).parse().unwrap()
}
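/// Flesch reading ease: 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words).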
pub fn flesch_reading_ease_score(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let syllables = get_syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
    format!("{:.2}", score).parse().unwrap()
}
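/// Gunning fog index: 0.4 * ((words / sentences) + 100 * (complex words / words)).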
pub fn gunning_fog_index(text: &str) -> f64 {
    let words = get_word_count(text);
    let complex_words = get_complex_word_count(text);
    let sentences = get_sentence_count(text);
    let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
    format!("{:.2}", score).parse().unwrap()
}
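/// LIX readability score: (words / sentences) + 100 * (long words / words).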
pub fn lix(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let long_words = get_long_word_count(text);
    let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
    format!("{:.2}", score).parse().unwrap()
}
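/// SMOG grade: 1.0430 * sqrt(30 * (complex words / sentences)) + 3.1291.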
pub fn smog(text: &str) -> f64 {
    let sentences = get_sentence_count(text);
    let complex_words = get_complex_word_count(text);
    let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
    format!("{:.2}", score).parse().unwrap()
}
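/// Returns the singular form of a word (lowercased): words that are their own plural and
/// irregular nouns are handled first, then the `PLURAL_TO_SINGULAR` rules are applied in order.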
pub fn get_singular_form(word: &str) -> String {
    match word.to_lowercase().as_str() {
        | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
            | Some(value) => value.to_string(),
            | None => value.to_string(),
        },
        | value => {
            let pair = PLURAL_TO_SINGULAR
                .iter()
                .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
                    | Ok(true) => true,
                    | Ok(false) | Err(_) => false,
                });
            match pair {
                | Some((pattern, replacement)) => {
                    debug!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
                    let re = Regex::new(pattern).unwrap();
                    re.replace_all(value, *replacement).to_string()
                }
                | None => value.to_string(),
            }
        }
    }
}
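/// Estimates the syllable count of a text by tokenizing it and summing per-word counts:
/// lookup tables are tried first, then the affix and vowel-group heuristics.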
pub fn get_syllable_count(text: &str) -> usize {
    fn get_syllables(word: String) -> usize {
        let singular = get_singular_form(&word);
        match word.as_str() {
            | "" => 0,
            | value if value.len() < 3 => 1,
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ => {
                let mut input = word;
                let mut count: isize = 0;
                // Affixes counted as three, two, and one syllable respectively; each is stripped after counting.
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                // Corrections for patterns the vowel-group count would otherwise over- or under-count.
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                // Count the remaining vowel groups (VOWEL matches the consonant runs between them).
                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
                count as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(get_syllables).sum()
}
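/// Splits a raw token into lowercase alphabetic sub-tokens: 'é' and 'ë' are replaced with "-e",
/// the result is split on '-', and non-alphabetic characters are stripped from each piece.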
pub fn tokenize(value: &str) -> Vec<String> {
    value
        .replace("é", "-e")
        .replace("ë", "-e")
        .split('-')
        .map(|x| NON_ALPHABETIC.replace_all(x, "").to_lowercase())
        .collect::<Vec<_>>()
}
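
// Minimal illustrative sanity checks: a sketch exercising the public helpers with values
// that follow directly from the lookup tables and rules defined above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn counts_words_sentences_and_letters() {
        assert_eq!(get_word_count("the quick brown fox jumps"), 5);
        assert_eq!(get_sentence_count("First sentence. Second sentence."), 2);
        assert_eq!(get_letter_count("abc def!"), 6);
    }

    #[test]
    fn singular_forms() {
        // "geese" is resolved via IRREGULAR_NOUNS_INVERTED, "matrices" via the PLURAL_TO_SINGULAR rules.
        assert_eq!(get_singular_form("geese"), "goose");
        assert_eq!(get_singular_form("matrices"), "matrix");
    }

    #[test]
    fn syllables_from_lookup_and_heuristics() {
        // "people" comes straight from PROBLEMATIC_WORDS; "cat" falls through to the vowel-group count.
        assert_eq!(get_syllable_count("people"), 2);
        assert_eq!(get_syllable_count("cat"), 1);
    }
}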