acorn_lib/analyzer/readability.rs

use crate::util::Label;
use fancy_regex::Regex;
use lazy_static::lazy_static;
use std::collections::HashMap;
use tracing::debug;

lazy_static! {
    /// Apostrophe
    pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
    /// Non-alphabetic
    pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
    /// Runs of non-vowels, used with `split` to isolate vowel groups
    pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
    /// ### Match single syllable pre- and suffixes
    pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
    /// ### Match double syllable pre- and suffixes
    pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
    /// ### Match triple syllable suffixes
    pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
    /// ### Match syllables counted as two, but should be one
    pub static ref SINGLE_SYLLABIC_ONE: Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
    /// ### Match two-syllable words counted as two, but should be one
    pub static ref SINGLE_SYLLABIC_TWO: Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
    /// ### Match syllables counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
    /// ### Match two-syllable words counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
    /// ### Match three-syllable words counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
    /// ### Match four-syllable words counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
    /// Nouns with irregular singular/plural forms
    pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
        ("child", "children"),
        ("cow", "cattle"),
        ("foot", "feet"),
        ("goose", "geese"),
        ("man", "men"),
        ("move", "moves"),
        ("person", "people"),
        ("radius", "radii"),
        ("sex", "sexes"),
        ("tooth", "teeth"),
        ("woman", "women"),
    ].into_iter().collect();
    /// Nouns with irregular plural/singular forms
    ///
    /// Inverted version of [IRREGULAR_NOUNS]
    pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.iter().map(|(k, v)| (*v, *k)).collect();
    /// ### Nouns that need to be fixed when counting syllables
    ///
    /// Values are the correct syllable counts, used directly as overrides
    pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
        ("ayo", 2),
        ("australian", 3),
        ("dionysius", 5),
        ("disbursement", 3),
        ("discouragement", 4),
        ("disenfranchisement", 5),
        ("disengagement", 4),
        ("disgraceful", 3),
        ("diskette", 2),
        ("displacement", 3),
        ("distasteful", 3),
        ("distinctiveness", 4),
        ("distraction", 3),
        ("geoffrion", 4),
        ("mcquaid", 2),
        ("mcquaide", 2),
        ("mcquaig", 2),
        ("mcquain", 2),
        ("nonbusiness", 3),
        ("nonetheless", 3),
        ("nonmanagement", 4),
        ("outplacement", 3),
        ("outrageously", 4),
        ("postponement", 3),
        ("preemption", 3),
        ("preignition", 4),
        ("preinvasion", 4),
        ("preisler", 3),
        ("preoccupation", 5),
        ("prevette", 2),
        ("probusiness", 3),
        ("procurement", 3),
        ("pronouncement", 3),
        ("sidewater", 3),
        ("sidewinder", 3),
        ("ungerer", 3),
    ].into_iter().collect();
    /// ### Nouns with problematic syllable counts
    pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
        ("abalone", 4),
        ("abare", 3),
        ("abbruzzese", 4),
        ("abed", 2),
        ("aborigine", 5),
        ("abruzzese", 4),
        ("acreage", 3),
        ("adame", 3),
        ("adieu", 2),
        ("adobe", 3),
        ("anemone", 4),
        ("anyone", 3),
        ("apache", 3),
        ("aphrodite", 4),
        ("apostrophe", 4),
        ("ariadne", 4),
        ("cafe", 2),
        ("café", 2),
        ("calliope", 4),
        ("catastrophe", 4),
        ("chile", 2),
        ("chloe", 2),
        ("circe", 2),
        ("cliche", 2),
        ("cliché", 2),
        ("contrariety", 4),
        ("coyote", 3),
        ("daphne", 2),
        ("epitome", 4),
        ("eurydice", 4),
        ("euterpe", 3),
        ("every", 2),
        ("everywhere", 3),
        ("forever", 3),
        ("gethsemane", 4),
        ("guacamole", 4),
        ("hermione", 4),
        ("hyperbole", 4),
        ("jesse", 2),
        ("jukebox", 2),
        ("karate", 3),
        ("machete", 3),
        ("maybe", 2),
        ("naive", 2),
        ("newlywed", 3),
        ("ninety", 2),
        ("penelope", 4),
        ("people", 2),
        ("persephone", 4),
        ("phoebe", 2),
        ("pulse", 1),
        ("queue", 1),
        ("recipe", 3),
        ("reptilian", 4),
        ("resumé", 2),
        ("riverbed", 3),
        ("scotia", 3),
        ("sesame", 3),
        ("shoreline", 2),
        ("simile", 3),
        ("snuffleupagus", 5),
        ("sometimes", 2),
        ("syncope", 3),
        ("tamale", 3),
        ("waterbed", 3),
        ("wednesday", 2),
        ("viceroyship", 3),
        ("yosemite", 4),
        ("zoë", 2),
    ].into_iter().collect();
}
/// Plural to singular regex patterns
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    (r#"s$"#, r#""#),
];
/// ### Nouns with the same singular and plural forms
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
/// Count the number of "complex words"[^complex] in a given text
///
/// [^complex]: Words with 3 or more syllables
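///
/// # Examples
///
/// A usage sketch (the `acorn_lib::analyzer::readability` path is assumed from
/// the file location; syllable counts come from the heuristics in this module):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_complex_word_count;
///
/// // "dictionary" counts as three or more syllables; "cat" does not
/// assert_eq!(get_complex_word_count("cat dictionary"), 1);
/// ```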
pub fn get_complex_word_count(text: &str) -> u32 {
    get_words(text).iter().filter(|word| get_syllable_count(word) > 2).count() as u32
}
/// Count the number of ASCII letters in a given text
///
/// Does NOT count whitespace, punctuation, or digits
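///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_letter_count;
///
/// // 10 ASCII letters; the space and period are ignored
/// assert_eq!(get_letter_count("Hello world."), 10);
/// ```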
pub fn get_letter_count(text: &str) -> u32 {
    // Keep only ASCII letters; NON_ALPHABETIC ([^a-zA-Z]) already excludes
    // whitespace, so a single character test is equivalent
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Count the number of "long words"[^long] in a given text
///
/// [^long]: Words with more than 6 letters
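///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_long_word_count;
///
/// // "elephants" (9 letters) is long; "cat" is not
/// assert_eq!(get_long_word_count("cat elephants"), 1);
/// ```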
pub fn get_long_word_count(text: &str) -> u32 {
    // NOTE: length is measured in bytes and includes any attached punctuation
    get_words(text).iter().filter(|word| word.len() > 6).count() as u32
}
/// Count the number of sentences in a given text
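///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_sentence_count;
///
/// assert_eq!(get_sentence_count("First sentence. Second one? Third!"), 3);
/// ```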
pub fn get_sentence_count(text: &str) -> u32 {
    // Treat '.', '!', and '?' as sentence terminators and ignore blank fragments
    text.split(['.', '!', '?']).filter(|s| !s.trim().is_empty()).count() as u32
}
/// Get list of words in a given text
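///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_words;
///
/// // Splits on whitespace only; punctuation stays attached to words
/// assert_eq!(get_words("Hello, world!"), vec!["Hello,", "world!"]);
/// ```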
pub fn get_words(text: &str) -> Vec<String> {
    text.split_whitespace().map(String::from).collect()
}
/// Count the number of words in a given text
///
/// See [`get_words`]
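///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_word_count;
///
/// assert_eq!(get_word_count("one two three"), 3);
/// ```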
pub fn get_word_count(text: &str) -> u32 {
    get_words(text).len() as u32
}
/// Automated Readability Index (ARI)
///
/// The formula was derived from a large dataset of texts used in US schools.
/// The result is a number that corresponds with a US grade level.
///
/// Requires counting letters, words, and sentences
///
/// See <https://en.wikipedia.org/wiki/Automated_readability_index> for more information
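///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::automated_readability_index;
///
/// // 35 letters, 9 words, 1 sentence:
/// // 4.71 * (35 / 9) + 0.5 * (9 / 1) - 21.43 ≈ 1.39
/// let score = automated_readability_index("The quick brown fox jumps over the lazy dog.");
/// assert!(score < 2.0);
/// ```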
pub fn automated_readability_index(text: &str) -> f64 {
    let letters = get_letter_count(text);
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
    // Round to two decimal places via string formatting
    format!("{:.2}", score).parse().unwrap()
}
/// Coleman-Liau Index (CLI)
///
/// Requires counting letters, words, and sentences
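///
/// See <https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index> for more information
///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::coleman_liau_index;
///
/// // 0.0588 * L - 0.296 * S - 15.8, where L is letters per 100 words
/// // and S is sentences per 100 words
/// let score = coleman_liau_index("The quick brown fox jumps over the lazy dog.");
/// ```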
pub fn coleman_liau_index(text: &str) -> f64 {
    let letters = get_letter_count(text);
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
    format!("{:.2}", score).parse().unwrap()
}
/// Flesch-Kincaid Grade Level (FKGL)
///
/// Arguably the most popular readability test.
/// The result is a number that corresponds with a US grade level.
///
/// Requires counting words, sentences, and syllables
///
/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
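///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; the syllable count is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::flesch_kincaid_grade_level;
///
/// // 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
/// let grade = flesch_kincaid_grade_level("The quick brown fox jumps over the lazy dog.");
/// assert!(grade < 6.0); // a simple sentence maps to a low US grade level
/// ```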
pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let syllables = get_syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
    format!("{:.2}", score).parse().unwrap()
}
/// Flesch Reading Ease Score (FRES)
///
/// FRES ranges from 100 (very easy) down to 0 (extremely difficult)
///
/// Requires counting words, sentences, and syllables
///
/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
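///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; the syllable count is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::flesch_reading_ease_score;
///
/// // 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
/// let score = flesch_reading_ease_score("The quick brown fox jumps over the lazy dog.");
/// assert!(score > 90.0); // simple text scores near the "very easy" end
/// ```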
pub fn flesch_reading_ease_score(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let syllables = get_syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
    format!("{:.2}", score).parse().unwrap()
}
/// Gunning Fog Index (GFI)
///
/// Estimates the years of formal education a person needs to understand the text on the first reading
///
/// Requires counting words, sentences, and "complex words" (see [get_complex_word_count])
///
/// See <https://en.wikipedia.org/wiki/Gunning_fog_index> for more information
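///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; complex-word detection is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::gunning_fog_index;
///
/// // 0.4 * ((words / sentences) + 100 * (complex words / words))
/// // 9 words, 1 sentence, 0 complex words => 0.4 * 9 = 3.6
/// let score = gunning_fog_index("The quick brown fox jumps over the lazy dog.");
/// assert!((score - 3.6).abs() < 0.01);
/// ```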
pub fn gunning_fog_index(text: &str) -> f64 {
    let words = get_word_count(text);
    let complex_words = get_complex_word_count(text);
    let sentences = get_sentence_count(text);
    let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
    format!("{:.2}", score).parse().unwrap()
}
/// Lix
///
/// Indicates the difficulty of reading a text
///
/// Requires counting words, sentences, and long words (see [get_long_word_count])
///
/// "Lix" is an abbreviation of *läsbarhetsindex*, which means "readability index" in Swedish
///
/// See <https://en.wikipedia.org/wiki/Lix_(readability_test)> for more information
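///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::lix;
///
/// // (words / sentences) + 100 * (long words / words)
/// // 9 words, 1 sentence, 0 long words => 9.0
/// let score = lix("The quick brown fox jumps over the lazy dog.");
/// assert!((score - 9.0).abs() < 0.01);
/// ```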
pub fn lix(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let long_words = get_long_word_count(text);
    let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
    format!("{:.2}", score).parse().unwrap()
}
/// Simple Measure of Gobbledygook (SMOG)
///
/// Estimates the years of education needed to understand a piece of writing
///
/// **Caution**: the SMOG formula was normalized on 30-sentence samples
///
/// Requires counting sentences and "complex words" (see [get_complex_word_count])
///
/// See <https://en.wikipedia.org/wiki/SMOG> for more information
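///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; complex-word detection is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::smog;
///
/// // 1.0430 * sqrt(30 * (complex words / sentences)) + 3.1291
/// let score = smog("Reading comprehension requires considerable concentration. Simple texts help.");
/// ```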
pub fn smog(text: &str) -> f64 {
    let sentences = get_sentence_count(text);
    let complex_words = get_complex_word_count(text);
    let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
    format!("{:.2}", score).parse().unwrap()
}
/// Get the singular form of a word (e.g. "people" -> "person")
///
/// Adapted from the PHP library [Text-Statistics](https://github.com/DaveChild/Text-Statistics)
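///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_singular_form;
///
/// assert_eq!(get_singular_form("people"), "person");   // irregular plural
/// assert_eq!(get_singular_form("sheep"), "sheep");     // same singular and plural
/// assert_eq!(get_singular_form("matrices"), "matrix"); // regex rule
/// ```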
pub fn get_singular_form(word: &str) -> String {
    match word.to_lowercase().as_str() {
        | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
            | Some(value) => value.to_string(),
            | None => value.to_string(),
        },
        | value => {
            let pair = PLURAL_TO_SINGULAR
                .iter()
                .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
                    | Ok(true) => true,
                    | Ok(false) | Err(_) => false,
                });
            match pair {
                | Some((pattern, replacement)) => {
                    debug!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
                    let re = Regex::new(pattern).unwrap();
                    re.replace_all(value, *replacement).to_string()
                }
                | None => value.to_string(),
            }
        }
    }
}
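/// Count the number of syllables in a given text
///
/// Tokenizes the text, then applies prefix/suffix adjustments, the exception
/// lists above, and a vowel-group count to each token
///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; counts are heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_syllable_count;
///
/// // "sim-ple" (2) + "ex-am-ple" (3)
/// assert_eq!(get_syllable_count("simple example"), 5);
/// ```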
pub fn get_syllable_count(text: &str) -> usize {
    fn get_syllables(word: String) -> usize {
        let singular = get_singular_form(&word);
        match word.as_str() {
            | "" => 0,
            | value if value.len() < 3 => 1,
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ => {
                let mut input = word;
                let mut count: isize = 0;
                // TODO: Combine SINGLE, DOUBLE, and TRIPLE regex operations
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                count += VOWEL.split(&input).filter(|x| x.as_ref().map_or(false, |s| !s.is_empty())).count() as isize;
                // Clamp at one syllable so negative adjustments cannot wrap when cast to usize
                count.max(1) as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(get_syllables).sum()
}
// TODO: Expand acronyms into words
/// Break text into tokens
///
/// Currently replaces `é` and `ë` with `-e`, splits on hyphens, and removes non-alphabetic characters.
///
/// This function is a good entry point for adding support for the nuances of "scientific" texts
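///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::tokenize;
///
/// // "Café-style" => "Caf-e-style" => ["caf", "e", "style"]
/// assert_eq!(tokenize("Café-style"), vec!["caf", "e", "style"]);
/// ```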
pub fn tokenize(value: &str) -> Vec<String> {
    value
        .replace("é", "-e")
        .replace("ë", "-e")
        .split('-')
        .map(|x| NON_ALPHABETIC.replace_all(x, "").to_lowercase())
        .collect::<Vec<_>>()
}