acorn_lib/analyzer/
readability.rs

//! # Readability utilities
//!
//! Analyze readability of prose using modern readability metrics.
use crate::constants::*;
use crate::util::{find_first, Label};
use clap::ValueEnum;
use derive_more::Display;
use dotenvy::dotenv;
use fancy_regex::Regex;
use lazy_static::lazy_static;
use std::collections::HashMap;
use tracing::{debug, trace, warn};
13
14lazy_static! {
15    /// Apostrophe
16    pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
17    /// Non-alphabetic
18    pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
19    /// Vowels
20    pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
21    /// ###  Match single syllable pre- and suffixes
22    pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
23    /// ### Match double syllable pre- and suffixes
24    pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
25    /// ### Match triple syllabble suffixes
26    pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
27    /// ### Match syllables counted as two, but should be one
28    pub static ref SINGLE_SYLLABIC_ONE : Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
29    /// ### Match two-syllable words counted as two, but should be one
30    pub static ref SINGLE_SYLLABIC_TWO : Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
31    /// ### Match syllables counted as one, but should be two
32    pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
33    /// ### Match two-syllable words counted as one, but should be two
34    pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
35    /// ### Match three-syllable words counted as one, but should be two
36    pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
37    /// ### Match four-syllable words counted as one, but should be two
38    pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
39    /// Nouns with irregular singular/plural forms
40    pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
41        ("child", "children"),
42        ("cow", "cattle"),
43        ("foot", "feet"),
44        ("goose", "geese"),
45        ("man", "men"),
46        ("move", "moves"),
47        ("person", "people"),
48        ("radius", "radii"),
49        ("sex", "sexes"),
50        ("tooth", "teeth"),
51        ("woman", "women"),
52    ].into_iter().collect();
53    /// Nouns with irregular plural/singular forms
54    ///
55    /// Inverted version of [IRREGULAR_NOUNS]
56    pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.clone().into_iter().map(|(k, v)| (v, k)).collect();
57    /// ### Nouns that need to be fixed when counting syllables
58    ///
59    /// All counts are (correct - 1)
60    pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
61        ("ayo", 2),
62        ("australian", 3),
63        ("dionysius", 5),
64        ("disbursement", 3),
65        ("discouragement", 4),
66        ("disenfranchisement", 5),
67        ("disengagement", 4),
68        ("disgraceful", 3),
69        ("diskette", 2),
70        ("displacement", 3),
71        ("distasteful", 3),
72        ("distinctiveness", 4),
73        ("distraction", 3),
74        ("geoffrion", 4),
75        ("mcquaid", 2),
76        ("mcquaide", 2),
77        ("mcquaig", 2),
78        ("mcquain", 2),
79        ("nonbusiness", 3),
80        ("nonetheless", 3),
81        ("nonmanagement", 4),
82        ("outplacement", 3),
83        ("outrageously", 4),
84        ("postponement", 3),
85        ("preemption", 3),
86        ("preignition", 4),
87        ("preinvasion", 4),
88        ("preisler", 3),
89        ("preoccupation", 5),
90        ("prevette", 2),
91        ("probusiness", 3),
92        ("procurement", 3),
93        ("pronouncement", 3),
94        ("sidewater", 3),
95        ("sidewinder", 3),
96        ("ungerer", 3),
97    ].into_iter().collect();
98    /// ### Nouns with problematic syllable counts
99    pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
100        ("abalone", 4),
101        ("abare", 3),
102        ("abbruzzese", 4),
103        ("abed", 2),
104        ("aborigine", 5),
105        ("abruzzese", 4),
106        ("acreage", 3),
107        ("adame", 3),
108        ("adieu", 2),
109        ("adobe", 3),
110        ("anemone", 4),
111        ("anyone", 3),
112        ("apache", 3),
113        ("aphrodite", 4),
114        ("apostrophe", 4),
115        ("ariadne", 4),
116        ("cafe", 2),
117        ("café", 2),
118        ("calliope", 4),
119        ("catastrophe", 4),
120        ("chile", 2),
121        ("chloe", 2),
122        ("circe", 2),
123        ("cliche", 2),
124        ("cliché", 2),
125        ("contrariety", 4),
126        ("coyote", 3),
127        ("daphne", 2),
128        ("epitome", 4),
129        ("eurydice", 4),
130        ("euterpe", 3),
131        ("every", 2),
132        ("everywhere", 3),
133        ("forever", 3),
134        ("gethsemane", 4),
135        ("guacamole", 4),
136        ("hermione", 4),
137        ("hyperbole", 4),
138        ("jesse", 2),
139        ("jukebox", 2),
140        ("karate", 3),
141        ("machete", 3),
142        ("maybe", 2),
143        ("naive", 2),
144        ("newlywed", 3),
145        ("ninety", 2),
146        ("penelope", 4),
147        ("people", 2),
148        ("persephone", 4),
149        ("phoebe", 2),
150        ("pulse", 1),
151        ("queue", 1),
152        ("recipe", 3),
153        ("reptilian", 4),
154        ("resumé", 2),
155        ("riverbed", 3),
156        ("scotia", 3),
157        ("sesame", 3),
158        ("shoreline", 2),
159        ("simile", 3),
160        ("snuffleupagus", 5),
161        ("sometimes", 2),
162        ("syncope", 3),
163        ("tamale", 3),
164        ("waterbed", 3),
165        ("wednesday", 2),
166        ("viceroyship", 3),
167        ("yosemite", 4),
168        ("zoë", 2),
169    ].into_iter().collect();
170}
/// Plural to singular regex patterns, tried in order; first match wins
///
/// FIX: the mouse/louse class is `[ml]` — `[m|l]` wrongly included a literal `|`.
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    (r#"s$"#, r#""#),
];
/// ### Nouns whose singular and plural forms are identical
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
315/// Readability Type
316#[derive(Clone, Copy, Debug, Default, Display, PartialEq, ValueEnum)]
317pub enum ReadabilityType {
318    /// Automated Readability Index (ARI)
319    #[display("ari")]
320    ARI,
321    /// Coleman-Liau Index (CLI)
322    #[display("cli")]
323    CLI,
324    /// Flesch-Kincaid Grade Level (FKGL)
325    #[default]
326    #[display("fkgl")]
327    FKGL,
328    /// Flesch Reading Ease (FRES)
329    #[display("fres")]
330    FRES,
331    /// Gunning Fog Index (GFI)
332    #[display("gfi")]
333    GFI,
334    /// Lix (abbreviation of Swedish läsbarhetsindex)
335    #[display("lix")]
336    Lix,
337    /// SMOG Index (SMOG)
338    #[display("smog")]
339    SMOG,
340}
341impl ReadabilityType {
342    /// Calculate Readability for a given text and readability type
343    pub fn calculate(self, text: &str) -> f64 {
344        match self {
345            | ReadabilityType::ARI => automated_readability_index(text),
346            | ReadabilityType::CLI => coleman_liau_index(text),
347            | ReadabilityType::FKGL => flesch_kincaid_grade_level(text),
348            | ReadabilityType::FRES => flesch_reading_ease_score(text),
349            | ReadabilityType::GFI => gunning_fog_index(text),
350            | ReadabilityType::Lix => lix(text),
351            | ReadabilityType::SMOG => smog(text),
352        }
353    }
354    /// Get Readability Type from string
355    pub fn from_string(value: &str) -> ReadabilityType {
356        match value.to_lowercase().replace("-", " ").as_str() {
357            | "ari" | "automated readability index" => ReadabilityType::ARI,
358            | "cli" | "coleman liau index" => ReadabilityType::CLI,
359            | "fkgl" | "flesch kincaid grade level" => ReadabilityType::FKGL,
360            | "fres" | "flesch reading ease score" => ReadabilityType::FRES,
361            | "gfi" | "gunning fog index" => ReadabilityType::GFI,
362            | "lix" => ReadabilityType::Lix,
363            | "smog" | "simple measure of gobbledygook" => ReadabilityType::SMOG,
364            | _ => {
365                warn!(value, "=> {} Unknown Readability Type", Label::using());
366                ReadabilityType::default()
367            }
368        }
369    }
370    /// Get maximum allowed value for a given readability type
371    pub fn maximum_allowed(self) -> f64 {
372        match dotenv() {
373            | Ok(_) => {
374                let variables = dotenvy::vars().collect::<Vec<(String, String)>>();
375                let pair = match self {
376                    | ReadabilityType::ARI => find_first(variables, "MAX_ALLOWED_ARI"),
377                    | ReadabilityType::CLI => find_first(variables, "MAX_ALLOWED_CLI"),
378                    | ReadabilityType::FKGL => find_first(variables, "MAX_ALLOWED_FKGL"),
379                    | ReadabilityType::FRES => find_first(variables, "MAX_ALLOWED_FRES"),
380                    | ReadabilityType::GFI => find_first(variables, "MAX_ALLOWED_GFI"),
381                    | ReadabilityType::Lix => find_first(variables, "MAX_ALLOWED_LIX"),
382                    | ReadabilityType::SMOG => find_first(variables, "MAX_ALLOWED_SMOG"),
383                };
384                match pair {
385                    | Some((_, value)) => value.parse::<f64>().unwrap(),
386                    | None => MAX_ALLOWED_ARI,
387                }
388            }
389            | Err(_) => match self {
390                | ReadabilityType::ARI => MAX_ALLOWED_ARI,
391                | ReadabilityType::CLI => MAX_ALLOWED_CLI,
392                | ReadabilityType::FKGL => MAX_ALLOWED_FKGL,
393                | ReadabilityType::FRES => MAX_ALLOWED_FRES,
394                | ReadabilityType::GFI => MAX_ALLOWED_GFI,
395                | ReadabilityType::Lix => MAX_ALLOWED_LIX,
396                | ReadabilityType::SMOG => MAX_ALLOWED_SMOG,
397            },
398        }
399    }
400}
401/// Count the number of "complex words"[^complex] in a given text
402///
403/// [^complex]: Words with 3 or more syllables
404pub fn complex_word_count(text: &str) -> u32 {
405    words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
406}
/// Count the number of letters in a given text
///
/// Does NOT count white space or punctuation — only ASCII letters are counted
/// (equivalent to rejecting everything the `[^a-zA-Z]` pattern matches, but
/// without running a regex per character or silently swallowing regex errors).
pub fn letter_count(text: &str) -> u32 {
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Count the number of "long words"[^long] in a given text
///
/// [^long]: Words with more than 6 letters
///
/// Uses `chars().count()` rather than byte length so that words containing
/// multi-byte characters (e.g. "résumé", 6 letters / 8 bytes) are not
/// miscounted as long.
pub fn long_word_count(text: &str) -> u32 {
    text.split_whitespace().filter(|word| word.chars().count() > 6).count() as u32
}
/// Count the number of sentences in a given text
///
/// Sentences are delimited by terminal punctuation (`.`, `!`, `?`); segments
/// that are empty or whitespace-only are not counted, so trailing punctuation
/// or whitespace does not inflate the count. (Previously only `.` was treated
/// as a terminator and a trailing space after a final `.` counted as an extra
/// sentence.)
pub fn sentence_count(text: &str) -> u32 {
    text.split(|c: char| matches!(c, '.' | '!' | '?'))
        .filter(|s| !s.trim().is_empty())
        .count() as u32
}
/// Get list of words in a given text
///
/// Words are the whitespace-delimited tokens of `text`, returned as owned strings.
pub fn words(text: &str) -> Vec<String> {
    text.split_whitespace().map(ToOwned::to_owned).collect()
}
/// Count the number of words in a given text
///
/// Counts whitespace-delimited tokens directly (same tokenization as [`words`])
/// without allocating an intermediate `Vec<String>` just to take its length.
pub fn word_count(text: &str) -> u32 {
    text.split_whitespace().count() as u32
}
435/// Automated Readability Index (ARI)
436///
437/// The formula was derived from a large dataset of texts used in US schools.
438/// The result is a number that corresponds with a US grade level.
439///
440/// Requires counting letters, words, and sentences
441///
442/// See <https://en.wikipedia.org/wiki/Automated_readability_index> for more information
443pub fn automated_readability_index(text: &str) -> f64 {
444    let letters = letter_count(text);
445    let words = word_count(text);
446    let sentences = sentence_count(text);
447    debug!(letters, words, sentences, "=> {}", Label::using());
448    let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
449    format!("{score:.2}").parse().unwrap()
450}
451/// Coleman-Liau Index (CLI)
452///
453/// Requires counting letters, words, and sentences
454pub fn coleman_liau_index(text: &str) -> f64 {
455    let letters = letter_count(text);
456    let words = word_count(text);
457    let sentences = sentence_count(text);
458    debug!(letters, words, sentences, "=> {}", Label::using());
459    let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
460    format!("{score:.2}").parse().unwrap()
461}
462/// Flesch-Kincaid Grade Level (FKGL)
463///
464/// Arguably the most popular readability test.
465/// The result is a number that corresponds with a US grade level.
466///
467/// Requires counting words, sentences, and syllables
468///
469/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
470pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
471    let words = word_count(text);
472    let sentences = sentence_count(text);
473    let syllables = syllable_count(text);
474    debug!(words, sentences, syllables, "=> {}", Label::using());
475    let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
476    format!("{score:.2}").parse().unwrap()
477}
478/// Flesch Reading Ease Score (FRES)
479///
480/// FRES range is 100 (very easy) - 0 (extremely difficult)
481///
482/// Requires counting words, sentences, and syllables
483///
484/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
485pub fn flesch_reading_ease_score(text: &str) -> f64 {
486    let words = word_count(text);
487    let sentences = sentence_count(text);
488    let syllables = syllable_count(text);
489    debug!(words, sentences, syllables, "=> {}", Label::using());
490    let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
491    format!("{score:.2}").parse().unwrap()
492}
493/// Gunning Fog Index (GFI)
494///
495/// Estimates the years of formal education a person needs to understand the text on the first reading
496///
497/// Requires counting words, sentences, and "complex words" (see [complex_word_count])
498///
499/// See <https://en.wikipedia.org/wiki/Gunning_fog_index> for more information
500pub fn gunning_fog_index(text: &str) -> f64 {
501    let words = word_count(text);
502    let complex_words = complex_word_count(text);
503    let sentences = sentence_count(text);
504    let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
505    format!("{score:.2}").parse().unwrap()
506}
507/// Lix (abbreviation of Swedish läsbarhetsindex)
508///
509/// Indicates the difficulty of reading a text
510///
511/// Requires counting words, sentences, and long words (see [long_word_count])
512///
513/// "Lix" is an abbreviation of *läsbarhetsindex*, which means "readability index" in Swedish
514///
515/// See <https://en.wikipedia.org/wiki/Lix_(readability_test)> for more information
516pub fn lix(text: &str) -> f64 {
517    let words = word_count(text);
518    let sentences = sentence_count(text);
519    let long_words = long_word_count(text);
520    let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
521    format!("{score:.2}").parse().unwrap()
522}
523/// Simple Measure of Gobbledygook (SMOG)
524///
525/// Estimates the years of education needed to understand a piece of writing
526///
527/// **Caution**: SMOG formula was normalized on 30-sentence samples
528///
529/// Requires counting sentences, and "complex words" (see [complex_word_count])
530///
531/// See <https://en.wikipedia.org/wiki/SMOG> for more information
532pub fn smog(text: &str) -> f64 {
533    let sentences = sentence_count(text);
534    let complex_words = complex_word_count(text);
535    let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
536    format!("{score:.2}").parse().unwrap()
537}
538/// Get the singular form of a word (e.g. "people" -> "person")
539///
540/// Adapted from the PHP library, [Text-Statistics](https://github.com/DaveChild/Text-Statistics)
541pub fn singular_form(word: &str) -> String {
542    match word.to_lowercase().as_str() {
543        | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
544        | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
545        | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
546            | Some(value) => value.to_string(),
547            | None => value.to_string(),
548        },
549        | value => {
550            let pair = PLURAL_TO_SINGULAR
551                .iter()
552                .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
553                    | Ok(true) => true,
554                    | Ok(false) | Err(_) => false,
555                });
556            match pair {
557                | Some((pattern, replacement)) => {
558                    trace!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
559                    let re = Regex::new(pattern).unwrap();
560                    re.replace_all(value, *replacement).to_string()
561                }
562                | None => value.to_string(),
563            }
564        }
565    }
566}
567/// Count the number of syllables in a given text
568pub fn syllable_count(text: &str) -> usize {
569    fn syllables(word: String) -> usize {
570        let singular = singular_form(&word);
571        match word.as_str() {
572            | "" => 0,
573            | value if value.len() < 3 => 1,
574            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
575                | Some(x) => *x,
576                | None => 0,
577            },
578            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
579                | Some(x) => *x,
580                | None => 0,
581            },
582            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
583                | Some(x) => *x,
584                | None => 0,
585            },
586            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
587                | Some(x) => *x,
588                | None => 0,
589            },
590            | _ => {
591                let mut count: isize = 0;
592                let mut input = word;
593                count += 3 * TRIPLE.find_iter(&input).count() as isize;
594                input = TRIPLE.replace_all(&input, "").to_string();
595                count += 2 * DOUBLE.find_iter(&input).count() as isize;
596                input = DOUBLE.replace_all(&input, "").to_string();
597                count += SINGLE.find_iter(&input).count() as isize;
598                input = SINGLE.replace_all(&input, "").to_string();
599                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
600                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
601                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
602                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
603                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
604                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
605                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
606                count as usize
607            }
608        }
609    }
610    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
611    tokens.into_iter().map(syllables).sum()
612}
613// TODO: Expand acronyms into words
614/// Break text into tokens
615///
616/// Currently replaces `é` and `ë` with `-e`, splits on hyphens, and removes non-alphabetic characters.
617///
618/// This function is a good entry point for adding support for the nuacnces of 'scientific" texts
619pub fn tokenize(value: &str) -> Vec<String> {
620    value
621        .replace("é", "-e")
622        .replace("ë", "-e")
623        .split('-')
624        .map(|x| NON_ALPHABETIC.replace_all(x, "").to_lowercase())
625        .collect::<Vec<_>>()
626}