acorn_lib/analyzer/readability.rs

use crate::util::Label;
use fancy_regex::Regex;
use lazy_static::lazy_static;
use std::collections::HashMap;
use tracing::debug;

lazy_static! {
    /// Apostrophe
    pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
    /// Non-alphabetic
    pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
    /// Runs of non-vowels, used with `split` to isolate vowel groups
    pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
    /// ### Match single syllable pre- and suffixes
    pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
    /// ### Match double syllable pre- and suffixes
    pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
    /// ### Match triple syllable suffixes
    pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
    /// ### Match syllables counted as two, but should be one
    pub static ref SINGLE_SYLLABIC_ONE: Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
    /// ### Match two-syllable words counted as two, but should be one
    pub static ref SINGLE_SYLLABIC_TWO: Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
    /// ### Match syllables counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
    /// ### Match two-syllable words counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
    /// ### Match three-syllable words counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
    /// ### Match four-syllable words counted as one, but should be two
    pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
    /// Nouns with irregular singular/plural forms
    pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
        ("child", "children"),
        ("cow", "cattle"),
        ("foot", "feet"),
        ("goose", "geese"),
        ("man", "men"),
        ("move", "moves"),
        ("person", "people"),
        ("radius", "radii"),
        ("sex", "sexes"),
        ("tooth", "teeth"),
        ("woman", "women"),
    ].into_iter().collect();
    /// Nouns with irregular plural/singular forms
    ///
    /// Inverted version of [IRREGULAR_NOUNS]
    pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.iter().map(|(k, v)| (*v, *k)).collect();
    /// ### Nouns that need to be fixed when counting syllables
    ///
    /// Values are the correct syllable counts, used directly as overrides
    pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
        ("ayo", 2),
        ("australian", 3),
        ("dionysius", 5),
        ("disbursement", 3),
        ("discouragement", 4),
        ("disenfranchisement", 5),
        ("disengagement", 4),
        ("disgraceful", 3),
        ("diskette", 2),
        ("displacement", 3),
        ("distasteful", 3),
        ("distinctiveness", 4),
        ("distraction", 3),
        ("geoffrion", 4),
        ("mcquaid", 2),
        ("mcquaide", 2),
        ("mcquaig", 2),
        ("mcquain", 2),
        ("nonbusiness", 3),
        ("nonetheless", 3),
        ("nonmanagement", 4),
        ("outplacement", 3),
        ("outrageously", 4),
        ("postponement", 3),
        ("preemption", 3),
        ("preignition", 4),
        ("preinvasion", 4),
        ("preisler", 3),
        ("preoccupation", 5),
        ("prevette", 2),
        ("probusiness", 3),
        ("procurement", 3),
        ("pronouncement", 3),
        ("sidewater", 3),
        ("sidewinder", 3),
        ("ungerer", 3),
    ].into_iter().collect();
    /// ### Nouns with problematic syllable counts
    pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
        ("abalone", 4),
        ("abare", 3),
        ("abbruzzese", 4),
        ("abed", 2),
        ("aborigine", 5),
        ("abruzzese", 4),
        ("acreage", 3),
        ("adame", 3),
        ("adieu", 2),
        ("adobe", 3),
        ("anemone", 4),
        ("anyone", 3),
        ("apache", 3),
        ("aphrodite", 4),
        ("apostrophe", 4),
        ("ariadne", 4),
        ("cafe", 2),
        ("café", 2),
        ("calliope", 4),
        ("catastrophe", 4),
        ("chile", 2),
        ("chloe", 2),
        ("circe", 2),
        ("cliche", 2),
        ("cliché", 2),
        ("contrariety", 4),
        ("coyote", 3),
        ("daphne", 2),
        ("epitome", 4),
        ("eurydice", 4),
        ("euterpe", 3),
        ("every", 2),
        ("everywhere", 3),
        ("forever", 3),
        ("gethsemane", 4),
        ("guacamole", 4),
        ("hermione", 4),
        ("hyperbole", 4),
        ("jesse", 2),
        ("jukebox", 2),
        ("karate", 3),
        ("machete", 3),
        ("maybe", 2),
        ("naive", 2),
        ("newlywed", 3),
        ("ninety", 2),
        ("penelope", 4),
        ("people", 2),
        ("persephone", 4),
        ("phoebe", 2),
        ("pulse", 1),
        ("queue", 1),
        ("recipe", 3),
        ("reptilian", 4),
        ("resumé", 2),
        ("riverbed", 3),
        ("scotia", 3),
        ("sesame", 3),
        ("shoreline", 2),
        ("simile", 3),
        ("snuffleupagus", 5),
        ("sometimes", 2),
        ("syncope", 3),
        ("tamale", 3),
        ("waterbed", 3),
        ("wednesday", 2),
        ("viceroyship", 3),
        ("yosemite", 4),
        ("zoë", 2),
    ].into_iter().collect();
}
/// Plural to singular regex patterns
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    (r#"s$"#, r#""#),
];
/// ### Nouns with the same singular and plural forms
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
/// Count the number of "complex words"[^complex] in a given text
///
/// [^complex]: Words with 3 or more syllables
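///
/// # Examples
///
/// A usage sketch (the `acorn_lib::analyzer::readability` path is assumed from
/// the file location; syllable counts come from the heuristics in this module):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_complex_word_count;
///
/// // "dictionary" counts as three or more syllables; "cat" does not
/// assert_eq!(get_complex_word_count("cat dictionary"), 1);
/// ```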
pub fn get_complex_word_count(text: &str) -> u32 {
    get_words(text).iter().filter(|word| get_syllable_count(word) > 2).count() as u32
}
/// Count the number of ASCII letters in a given text
///
/// Does NOT count whitespace, punctuation, or digits
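///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_letter_count;
///
/// // 10 ASCII letters; the space and period are ignored
/// assert_eq!(get_letter_count("Hello world."), 10);
/// ```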
pub fn get_letter_count(text: &str) -> u32 {
    // Keep only ASCII letters; NON_ALPHABETIC ([^a-zA-Z]) already excludes
    // whitespace, so a single character test is equivalent
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Count the number of "long words"[^long] in a given text
///
/// [^long]: Words with more than 6 letters
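///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_long_word_count;
///
/// // "elephants" (9 letters) is long; "cat" is not
/// assert_eq!(get_long_word_count("cat elephants"), 1);
/// ```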
pub fn get_long_word_count(text: &str) -> u32 {
    // NOTE: length is measured in bytes and includes any attached punctuation
    get_words(text).iter().filter(|word| word.len() > 6).count() as u32
}
/// Count the number of sentences in a given text
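///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_sentence_count;
///
/// assert_eq!(get_sentence_count("First sentence. Second one? Third!"), 3);
/// ```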
pub fn get_sentence_count(text: &str) -> u32 {
    // Treat '.', '!', and '?' as sentence terminators and ignore blank fragments
    text.split(['.', '!', '?']).filter(|s| !s.trim().is_empty()).count() as u32
}
/// Get list of words in a given text
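///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_words;
///
/// // Splits on whitespace only; punctuation stays attached to words
/// assert_eq!(get_words("Hello, world!"), vec!["Hello,", "world!"]);
/// ```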
pub fn get_words(text: &str) -> Vec<String> {
    text.split_whitespace().map(String::from).collect()
}
/// Count the number of words in a given text
///
/// See [`get_words`]
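///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_word_count;
///
/// assert_eq!(get_word_count("one two three"), 3);
/// ```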
pub fn get_word_count(text: &str) -> u32 {
    get_words(text).len() as u32
}
/// Automated Readability Index (ARI)
///
/// The formula was derived from a large dataset of texts used in US schools.
/// The result is a number that corresponds with a US grade level.
///
/// Requires counting letters, words, and sentences
///
/// See <https://en.wikipedia.org/wiki/Automated_readability_index> for more information
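///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::automated_readability_index;
///
/// // 35 letters, 9 words, 1 sentence:
/// // 4.71 * (35 / 9) + 0.5 * (9 / 1) - 21.43 ≈ 1.39
/// let score = automated_readability_index("The quick brown fox jumps over the lazy dog.");
/// assert!(score < 2.0);
/// ```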
pub fn automated_readability_index(text: &str) -> f64 {
    let letters = get_letter_count(text);
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
    // Round to two decimal places via string formatting
    format!("{:.2}", score).parse().unwrap()
}
/// Coleman-Liau Index (CLI)
///
/// Requires counting letters, words, and sentences
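///
/// See <https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index> for more information
///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::coleman_liau_index;
///
/// // 0.0588 * L - 0.296 * S - 15.8, where L is letters per 100 words
/// // and S is sentences per 100 words
/// let score = coleman_liau_index("The quick brown fox jumps over the lazy dog.");
/// ```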
pub fn coleman_liau_index(text: &str) -> f64 {
    let letters = get_letter_count(text);
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    debug!(letters, words, sentences, "=> {}", Label::using());
    let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
    format!("{:.2}", score).parse().unwrap()
}
/// Flesch-Kincaid Grade Level (FKGL)
///
/// Arguably the most popular readability test.
/// The result is a number that corresponds with a US grade level.
///
/// Requires counting words, sentences, and syllables
///
/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
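///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; the syllable count is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::flesch_kincaid_grade_level;
///
/// // 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
/// let grade = flesch_kincaid_grade_level("The quick brown fox jumps over the lazy dog.");
/// assert!(grade < 6.0); // a simple sentence maps to a low US grade level
/// ```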
pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let syllables = get_syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
    format!("{:.2}", score).parse().unwrap()
}
/// Flesch Reading Ease Score (FRES)
///
/// FRES ranges from 100 (very easy) down to 0 (extremely difficult)
///
/// Requires counting words, sentences, and syllables
///
/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
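///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; the syllable count is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::flesch_reading_ease_score;
///
/// // 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
/// let score = flesch_reading_ease_score("The quick brown fox jumps over the lazy dog.");
/// assert!(score > 90.0); // simple text scores near the "very easy" end
/// ```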
pub fn flesch_reading_ease_score(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let syllables = get_syllable_count(text);
    debug!(words, sentences, syllables, "=> {}", Label::using());
    let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
    format!("{:.2}", score).parse().unwrap()
}
/// Gunning Fog Index (GFI)
///
/// Estimates the years of formal education a person needs to understand the text on the first reading
///
/// Requires counting words, sentences, and "complex words" (see [get_complex_word_count])
///
/// See <https://en.wikipedia.org/wiki/Gunning_fog_index> for more information
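///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; complex-word detection is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::gunning_fog_index;
///
/// // 0.4 * ((words / sentences) + 100 * (complex words / words))
/// // 9 words, 1 sentence, 0 complex words => 0.4 * 9 = 3.6
/// let score = gunning_fog_index("The quick brown fox jumps over the lazy dog.");
/// assert!((score - 3.6).abs() < 0.01);
/// ```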
pub fn gunning_fog_index(text: &str) -> f64 {
    let words = get_word_count(text);
    let complex_words = get_complex_word_count(text);
    let sentences = get_sentence_count(text);
    let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
    format!("{:.2}", score).parse().unwrap()
}
/// Lix
///
/// Indicates the difficulty of reading a text
///
/// Requires counting words, sentences, and long words (see [get_long_word_count])
///
/// "Lix" is an abbreviation of *läsbarhetsindex*, which means "readability index" in Swedish
///
/// See <https://en.wikipedia.org/wiki/Lix_(readability_test)> for more information
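///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::lix;
///
/// // (words / sentences) + 100 * (long words / words)
/// // 9 words, 1 sentence, 0 long words => 9.0
/// let score = lix("The quick brown fox jumps over the lazy dog.");
/// assert!((score - 9.0).abs() < 0.01);
/// ```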
pub fn lix(text: &str) -> f64 {
    let words = get_word_count(text);
    let sentences = get_sentence_count(text);
    let long_words = get_long_word_count(text);
    let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
    format!("{:.2}", score).parse().unwrap()
}
/// Simple Measure of Gobbledygook (SMOG)
///
/// Estimates the years of education needed to understand a piece of writing
///
/// **Caution**: the SMOG formula was normalized on 30-sentence samples
///
/// Requires counting sentences and "complex words" (see [get_complex_word_count])
///
/// See <https://en.wikipedia.org/wiki/SMOG> for more information
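///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; complex-word detection is heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::smog;
///
/// // 1.0430 * sqrt(30 * (complex words / sentences)) + 3.1291
/// let score = smog("Reading comprehension requires considerable concentration. Simple texts help.");
/// ```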
pub fn smog(text: &str) -> f64 {
    let sentences = get_sentence_count(text);
    let complex_words = get_complex_word_count(text);
    let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
    format!("{:.2}", score).parse().unwrap()
}
/// Get the singular form of a word (e.g. "people" -> "person")
///
/// Adapted from the PHP library [Text-Statistics](https://github.com/DaveChild/Text-Statistics)
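///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_singular_form;
///
/// assert_eq!(get_singular_form("people"), "person");   // irregular plural
/// assert_eq!(get_singular_form("sheep"), "sheep");     // same singular and plural
/// assert_eq!(get_singular_form("matrices"), "matrix"); // regex rule
/// ```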
pub fn get_singular_form(word: &str) -> String {
    match word.to_lowercase().as_str() {
        | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
        | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
            | Some(value) => value.to_string(),
            | None => value.to_string(),
        },
        | value => {
            let pair = PLURAL_TO_SINGULAR
                .iter()
                .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
                    | Ok(true) => true,
                    | Ok(false) | Err(_) => false,
                });
            match pair {
                | Some((pattern, replacement)) => {
                    debug!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
                    let re = Regex::new(pattern).unwrap();
                    re.replace_all(value, *replacement).to_string()
                }
                | None => value.to_string(),
            }
        }
    }
}
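/// Count the number of syllables in a given text
///
/// Tokenizes the text, then applies prefix/suffix adjustments, the exception
/// lists above, and a vowel-group count to each token
///
/// # Examples
///
/// A usage sketch (module path assumed from the file location; counts are heuristic):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::get_syllable_count;
///
/// // "sim-ple" (2) + "ex-am-ple" (3)
/// assert_eq!(get_syllable_count("simple example"), 5);
/// ```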
pub fn get_syllable_count(text: &str) -> usize {
    fn get_syllables(word: String) -> usize {
        let singular = get_singular_form(&word);
        match word.as_str() {
            | "" => 0,
            | value if value.len() < 3 => 1,
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ => {
                let mut input = word;
                let mut count: isize = 0;
                // TODO: Combine SINGLE, DOUBLE, and TRIPLE regex operations
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                count += VOWEL.split(&input).filter(|x| x.as_ref().map_or(false, |s| !s.is_empty())).count() as isize;
                // Clamp at one syllable so negative adjustments cannot wrap when cast to usize
                count.max(1) as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(get_syllables).sum()
}
// TODO: Expand acronyms into words
/// Break text into tokens
///
/// Currently replaces `é` and `ë` with `-e`, splits on hyphens, and removes non-alphabetic characters.
///
/// This function is a good entry point for adding support for the nuances of "scientific" texts
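///
/// # Examples
///
/// A usage sketch (module path assumed from the file location):
///
/// ```ignore
/// use acorn_lib::analyzer::readability::tokenize;
///
/// // "Café-style" => "Caf-e-style" => ["caf", "e", "style"]
/// assert_eq!(tokenize("Café-style"), vec!["caf", "e", "style"]);
/// ```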
pub fn tokenize(value: &str) -> Vec<String> {
    value
        .replace("é", "-e")
        .replace("ë", "-e")
        .split('-')
        .map(|x| NON_ALPHABETIC.replace_all(x, "").to_lowercase())
        .collect::<Vec<_>>()
}