acorn_lib/analyzer/readability.rs

//! # Readability utilities
//!
//! Analyze readability of prose using modern readability metrics.
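//!
//! A minimal usage sketch (this doctest assumes the crate is importable as `acorn_lib`, per the module path above):
//!
//! ```
//! use acorn_lib::analyzer::readability::{flesch_kincaid_grade_level, flesch_reading_ease_score};
//!
//! let text = "The quick brown fox jumps over the lazy dog. It barely made a sound.";
//! // Every metric in this module returns a score rounded to two decimal places
//! println!("FKGL: {}", flesch_kincaid_grade_level(text));
//! println!("FRES: {}", flesch_reading_ease_score(text));
//! ```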
use crate::util::Label;
use fancy_regex::Regex;
use lazy_static::lazy_static;
use std::collections::HashMap;
use tracing::debug;

lazy_static! {
    /// Apostrophe
    pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
    /// Non-alphabetic
    pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
    /// Runs of non-vowel characters (used to split words into vowel groups)
    pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
    /// ### Match single syllable pre- and suffixes
    pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
    /// ### Match double syllable pre- and suffixes
    pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
    /// ### Match triple syllable suffixes
    pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
    /// ### Match sequences counted as two syllables that should be one (pattern one)
    pub static ref SINGLE_SYLLABIC_ONE: Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
    /// ### Match sequences counted as two syllables that should be one (pattern two)
    pub static ref SINGLE_SYLLABIC_TWO: Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
    /// ### Match sequences counted as one syllable that should be two (pattern one)
    pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
    /// ### Match sequences counted as one syllable that should be two (pattern two)
    pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
    /// ### Match sequences counted as one syllable that should be two (pattern three)
    pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
    /// ### Match sequences counted as one syllable that should be two (pattern four)
    pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
    /// Nouns with irregular singular/plural forms
    pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
        ("child", "children"),
        ("cow", "cattle"),
        ("foot", "feet"),
        ("goose", "geese"),
        ("man", "men"),
        ("move", "moves"),
        ("person", "people"),
        ("radius", "radii"),
        ("sex", "sexes"),
        ("tooth", "teeth"),
        ("woman", "women"),
    ].into_iter().collect();
    /// Nouns with irregular plural/singular forms
    ///
    /// Inverted version of [IRREGULAR_NOUNS]
    pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.iter().map(|(&k, &v)| (v, k)).collect();
    /// ### Nouns that need to be fixed when counting syllables
    ///
    /// Values are the correct syllable counts, looked up directly by [syllable_count]
    pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
        ("ayo", 2),
        ("australian", 3),
        ("dionysius", 5),
        ("disbursement", 3),
        ("discouragement", 4),
        ("disenfranchisement", 5),
        ("disengagement", 4),
        ("disgraceful", 3),
        ("diskette", 2),
        ("displacement", 3),
        ("distasteful", 3),
        ("distinctiveness", 4),
        ("distraction", 3),
        ("geoffrion", 4),
        ("mcquaid", 2),
        ("mcquaide", 2),
        ("mcquaig", 2),
        ("mcquain", 2),
        ("nonbusiness", 3),
        ("nonetheless", 3),
        ("nonmanagement", 4),
        ("outplacement", 3),
        ("outrageously", 4),
        ("postponement", 3),
        ("preemption", 3),
        ("preignition", 4),
        ("preinvasion", 4),
        ("preisler", 3),
        ("preoccupation", 5),
        ("prevette", 2),
        ("probusiness", 3),
        ("procurement", 3),
        ("pronouncement", 3),
        ("sidewater", 3),
        ("sidewinder", 3),
        ("ungerer", 3),
    ].into_iter().collect();
    /// ### Nouns with problematic syllable counts
    pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
        ("abalone", 4),
        ("abare", 3),
        ("abbruzzese", 4),
        ("abed", 2),
        ("aborigine", 5),
        ("abruzzese", 4),
        ("acreage", 3),
        ("adame", 3),
        ("adieu", 2),
        ("adobe", 3),
        ("anemone", 4),
        ("anyone", 3),
        ("apache", 3),
        ("aphrodite", 4),
        ("apostrophe", 4),
        ("ariadne", 4),
        ("cafe", 2),
        ("café", 2),
        ("calliope", 4),
        ("catastrophe", 4),
        ("chile", 2),
        ("chloe", 2),
        ("circe", 2),
        ("cliche", 2),
        ("cliché", 2),
        ("contrariety", 4),
        ("coyote", 3),
        ("daphne", 2),
        ("epitome", 4),
        ("eurydice", 4),
        ("euterpe", 3),
        ("every", 2),
        ("everywhere", 3),
        ("forever", 3),
        ("gethsemane", 4),
        ("guacamole", 4),
        ("hermione", 4),
        ("hyperbole", 4),
        ("jesse", 2),
        ("jukebox", 2),
        ("karate", 3),
        ("machete", 3),
        ("maybe", 2),
        ("naive", 2),
        ("newlywed", 3),
        ("ninety", 2),
        ("penelope", 4),
        ("people", 2),
        ("persephone", 4),
        ("phoebe", 2),
        ("pulse", 1),
        ("queue", 1),
        ("recipe", 3),
        ("reptilian", 4),
        ("resumé", 2),
        ("riverbed", 3),
        ("scotia", 3),
        ("sesame", 3),
        ("shoreline", 2),
        ("simile", 3),
        ("snuffleupagus", 5),
        ("sometimes", 2),
        ("syncope", 3),
        ("tamale", 3),
        ("waterbed", 3),
        ("wednesday", 2),
        ("viceroyship", 3),
        ("yosemite", 4),
        ("zoë", 2),
    ].into_iter().collect();
}
/// Plural to singular regex patterns
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    (r#"s$"#, r#""#),
];
/// ### Nouns with the same singular and plural forms
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
/// Count the number of "complex words"[^complex] in a given text
///
/// [^complex]: Words with 3 or more syllables
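///
/// # Example
///
/// A quick sketch (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::complex_word_count;
///
/// // "aborigine" has five syllables; the other words have fewer than three
/// assert_eq!(complex_word_count("the aborigine waved"), 1);
/// ```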
pub fn complex_word_count(text: &str) -> u32 {
    words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
}
/// Count the number of letters in a given text
///
/// Counts only ASCII alphabetic characters; whitespace, digits, and punctuation are ignored
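///
/// # Example
///
/// Expected behavior (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::letter_count;
///
/// assert_eq!(letter_count("Hello, world!"), 10);
/// ```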
pub fn letter_count(text: &str) -> u32 {
    // Equivalent to filtering out matches of NON_ALPHABETIC, without per-character regex calls
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Count the number of "long words"[^long] in a given text
///
/// [^long]: Words with more than 6 letters (length is measured in bytes and includes any attached punctuation)
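///
/// # Example
///
/// Expected behavior (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::long_word_count;
///
/// // Only "extraordinary" has more than six letters
/// assert_eq!(long_word_count("the cat is extraordinary"), 1);
/// ```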
pub fn long_word_count(text: &str) -> u32 {
    words(text).iter().filter(|word| word.len() > 6).count() as u32
}
/// Count the number of sentences in a given text
///
/// Sentences are delimited by periods (`.`); question marks and exclamation points are not treated as sentence boundaries
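///
/// # Example
///
/// Expected behavior (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::sentence_count;
///
/// assert_eq!(sentence_count("One sentence. And another."), 2);
/// ```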
pub fn sentence_count(text: &str) -> u32 {
    // Trim fragments so trailing whitespace after the final period is not counted as a sentence
    text.split('.').filter(|s| !s.trim().is_empty()).count() as u32
}
/// Get list of words in a given text
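///
/// Words are split on whitespace; punctuation stays attached.
///
/// # Example
///
/// Expected behavior (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::words;
///
/// assert_eq!(words("Hello, world!"), vec!["Hello,", "world!"]);
/// ```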
pub fn words(text: &str) -> Vec<String> {
    text.split_whitespace().map(String::from).collect()
}
/// Count the number of words in a given text
///
/// See [`words`]
pub fn word_count(text: &str) -> u32 {
    words(text).len() as u32
}
/// Automated Readability Index (ARI)
///
/// The formula was derived from a large dataset of texts used in US schools.
/// The result is a number that corresponds with a US grade level.
///
/// Requires counting letters, words, and sentences
///
/// See <https://en.wikipedia.org/wiki/Automated_readability_index> for more information
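///
/// Formula (as implemented below): `4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43`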
pub fn automated_readability_index(text: &str) -> f64 {
    let letters = letter_count(text);
    let words = word_count(text);
    let sentences = sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
    // Round to two decimal places via string formatting
    format!("{score:.2}").parse().unwrap()
}
/// Coleman-Liau Index (CLI)
///
/// Like [ARI](automated_readability_index), the result approximates a US grade level.
///
/// Requires counting letters, words, and sentences
///
/// See <https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index> for more information
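///
/// Formula (as implemented below): `0.0588 * L - 0.296 * S - 15.8`, where `L` is letters per 100 words and `S` is sentences per 100 words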
pub fn coleman_liau_index(text: &str) -> f64 {
    let letters = letter_count(text);
    let words = word_count(text);
    let sentences = sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
    format!("{score:.2}").parse().unwrap()
}
/// Flesch-Kincaid Grade Level (FKGL)
///
/// Arguably the most popular readability test.
/// The result is a number that corresponds with a US grade level.
///
/// Requires counting words, sentences, and syllables
///
/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
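///
/// Formula (as implemented below): `0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59`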
pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
    let words = word_count(text);
    let sentences = sentence_count(text);
    let syllables = syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
    format!("{score:.2}").parse().unwrap()
}
/// Flesch Reading Ease Score (FRES)
///
/// Scores range from 100 (very easy) down to 0 (extremely difficult)
///
/// Requires counting words, sentences, and syllables
///
/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
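///
/// Formula (as implemented below): `206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)`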
pub fn flesch_reading_ease_score(text: &str) -> f64 {
    let words = word_count(text);
    let sentences = sentence_count(text);
    let syllables = syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
    format!("{score:.2}").parse().unwrap()
}
/// Gunning Fog Index (GFI)
///
/// Estimates the years of formal education a person needs to understand the text on the first reading
///
/// Requires counting words, sentences, and "complex words" (see [complex_word_count])
///
/// See <https://en.wikipedia.org/wiki/Gunning_fog_index> for more information
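///
/// Formula (as implemented below): `0.4 * ((words / sentences) + 100 * (complex_words / words))`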
pub fn gunning_fog_index(text: &str) -> f64 {
    let words = word_count(text);
    let complex_words = complex_word_count(text);
    let sentences = sentence_count(text);
    debug!(words, complex_words, sentences, "=> {}", Label::using());
    let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
    format!("{score:.2}").parse().unwrap()
}
/// Lix (läsbarhetsindex)
///
/// Indicates the difficulty of reading a text
///
/// "Lix" is an abbreviation of *läsbarhetsindex*, which means "readability index" in Swedish
///
/// Requires counting words, sentences, and long words (see [long_word_count])
///
/// See <https://en.wikipedia.org/wiki/Lix_(readability_test)> for more information
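///
/// Formula (as implemented below): `(words / sentences) + 100 * (long_words / words)`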
pub fn lix(text: &str) -> f64 {
    let words = word_count(text);
    let sentences = sentence_count(text);
    let long_words = long_word_count(text);
    debug!(words, sentences, long_words, "=> {}", Label::using());
    let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
    format!("{score:.2}").parse().unwrap()
}
/// Simple Measure of Gobbledygook (SMOG)
///
/// Estimates the years of education needed to understand a piece of writing
///
/// **Caution**: the SMOG formula was normalized on 30-sentence samples
///
/// Requires counting sentences and "complex words" (see [complex_word_count])
///
/// See <https://en.wikipedia.org/wiki/SMOG> for more information
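///
/// Formula (as implemented below): `1.0430 * sqrt(30 * (complex_words / sentences)) + 3.1291`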
pub fn smog(text: &str) -> f64 {
    let sentences = sentence_count(text);
    let complex_words = complex_word_count(text);
    debug!(sentences, complex_words, "=> {}", Label::using());
    let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
    format!("{score:.2}").parse().unwrap()
}
/// Get the singular form of a word (e.g. "people" -> "person")
///
/// Adapted from the PHP library, [Text-Statistics](https://github.com/DaveChild/Text-Statistics)
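///
/// # Example
///
/// Expected behavior (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::singular_form;
///
/// assert_eq!(singular_form("people"), "person");
/// assert_eq!(singular_form("matrices"), "matrix");
/// assert_eq!(singular_form("sheep"), "sheep");
/// ```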
pub fn singular_form(word: &str) -> String {
    match word.to_lowercase().as_str() {
        | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
            | Some(value) => value.to_string(),
            | None => value.to_string(),
        },
        | value => {
            let pair = PLURAL_TO_SINGULAR
                .iter()
                .find(|(pattern, _)| Regex::new(pattern).unwrap().is_match(value).unwrap_or(false));
            match pair {
                | Some((pattern, replacement)) => {
                    debug!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
                    let re = Regex::new(pattern).unwrap();
                    re.replace_all(value, *replacement).to_string()
                }
                | None => value.to_string(),
            }
        }
    }
}
/// Count the number of syllables in a given text
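///
/// # Example
///
/// A quick sketch (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::syllable_count;
///
/// assert_eq!(syllable_count("hello"), 2);
/// assert_eq!(syllable_count("apostrophe"), 4); // direct lookup in PROBLEMATIC_WORDS
/// ```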
pub fn syllable_count(text: &str) -> usize {
    fn syllables(word: String) -> usize {
        let singular = singular_form(&word);
        match word.as_str() {
            | "" => 0,
            | value if value.len() < 3 => 1,
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ => {
                let mut input = word;
                let mut count: isize = 0;
                // TODO: Combine SINGLE, DOUBLE, and TRIPLE regex operations
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                count += VOWEL.split(&input).filter(|x| x.as_ref().map_or(false, |s| !s.is_empty())).count() as isize;
                // Guard against the subtractive corrections driving the count below zero,
                // which would wrap around when cast to usize
                count.max(0) as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(syllables).sum()
}
// TODO: Expand acronyms into words
/// Break text into tokens
///
/// Currently replaces `é` and `ë` with `-e`, splits on hyphens, and removes non-alphabetic characters.
///
/// This function is a good entry point for adding support for the nuances of "scientific" texts
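///
/// # Example
///
/// Expected behavior (doctest assumes the crate is importable as `acorn_lib`):
///
/// ```
/// use acorn_lib::analyzer::readability::tokenize;
///
/// assert_eq!(tokenize("mother-in-law"), vec!["mother", "in", "law"]);
/// assert_eq!(tokenize("café"), vec!["caf", "e"]);
/// ```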
pub fn tokenize(value: &str) -> Vec<String> {
    value
        .replace("é", "-e")
        .replace("ë", "-e")
        .split('-')
        .map(|x| NON_ALPHABETIC.replace_all(x, "").to_lowercase())
        .collect::<Vec<_>>()
}