acorn_lib/analyzer/readability/
mod.rs

1//! # Readability utilities
2//!
//! Analyze readability of prose using modern readability metrics.
4use crate::util::constants::{
5    MAX_ALLOWED_ARI, MAX_ALLOWED_CLI, MAX_ALLOWED_FKGL, MAX_ALLOWED_FRES, MAX_ALLOWED_GFI, MAX_ALLOWED_LIX, MAX_ALLOWED_SMOG,
6};
7use crate::util::find_first;
8use crate::util::Label;
9use derive_more::Display;
10use dotenvy::dotenv;
11use fancy_regex::Regex;
12use tracing::warn;
13use tracing::{debug, trace};
14
15pub mod constants;
16use constants::{
17    DOUBLE, DOUBLE_SYLLABIC_FOUR, DOUBLE_SYLLABIC_ONE, DOUBLE_SYLLABIC_THREE, DOUBLE_SYLLABIC_TWO, IRREGULAR_NOUNS, IRREGULAR_NOUNS_INVERTED,
18    NEED_TO_BE_FIXED, NON_ALPHABETIC, PLURAL_TO_SINGULAR, PROBLEMATIC_WORDS, SAME_SINGULAR_PLURAL, SINGLE, SINGLE_SYLLABIC_ONE, SINGLE_SYLLABIC_TWO,
19    TRIPLE, VOWEL,
20};
21
/// Readability Type
///
/// Selects which readability metric to compute. Each variant is displayed as its
/// lowercase abbreviation (e.g. `ReadabilityType::ARI` renders as `"ari"`).
///
/// The default variant is [`ReadabilityType::FKGL`].
#[derive(Clone, Copy, Debug, Default, Display, PartialEq)]
pub enum ReadabilityType {
    /// Automated Readability Index (ARI)
    ///
    /// See [`automated_readability_index`]
    #[display("ari")]
    ARI,
    /// Coleman-Liau Index (CLI)
    ///
    /// See [`coleman_liau_index`]
    #[display("cli")]
    CLI,
    /// Flesch-Kincaid Grade Level (FKGL)
    ///
    /// This is the default readability type.
    ///
    /// See [`flesch_kincaid_grade_level`]
    #[default]
    #[display("fkgl")]
    FKGL,
    /// Flesch Reading Ease Score (FRES)
    ///
    /// See [`flesch_reading_ease_score`]
    #[display("fres")]
    FRES,
    /// Gunning Fog Index (GFI)
    ///
    /// See [`gunning_fog_index`]
    #[display("gfi")]
    GFI,
    /// Lix (abbreviation of Swedish läsbarhetsindex)
    ///
    /// See [`lix`]
    #[display("lix")]
    Lix,
    /// SMOG Index (SMOG)
    ///
    /// See [`smog`]
    #[display("smog")]
    SMOG,
}
62impl From<ReadabilityType> for String {
63    fn from(value: ReadabilityType) -> Self {
64        value.to_string()
65    }
66}
67impl From<String> for ReadabilityType {
68    fn from(value: String) -> Self {
69        ReadabilityType::from_string(&value)
70    }
71}
72impl From<&str> for ReadabilityType {
73    fn from(value: &str) -> Self {
74        ReadabilityType::from_string(value)
75    }
76}
77impl ReadabilityType {
78    /// Calculate Readability for a given text and readability type
79    pub fn calculate(self, text: &str) -> f64 {
80        match self {
81            | ReadabilityType::ARI => automated_readability_index(text),
82            | ReadabilityType::CLI => coleman_liau_index(text),
83            | ReadabilityType::FKGL => flesch_kincaid_grade_level(text),
84            | ReadabilityType::FRES => flesch_reading_ease_score(text),
85            | ReadabilityType::GFI => gunning_fog_index(text),
86            | ReadabilityType::Lix => lix(text),
87            | ReadabilityType::SMOG => smog(text),
88        }
89    }
90    /// Get Readability Type from string
91    pub fn from_string(value: &str) -> ReadabilityType {
92        match value.to_lowercase().replace("-", " ").as_str() {
93            | "ari" | "automated readability index" => ReadabilityType::ARI,
94            | "cli" | "coleman liau index" => ReadabilityType::CLI,
95            | "fkgl" | "flesch kincaid grade level" => ReadabilityType::FKGL,
96            | "fres" | "flesch reading ease score" => ReadabilityType::FRES,
97            | "gfi" | "gunning fog index" => ReadabilityType::GFI,
98            | "lix" => ReadabilityType::Lix,
99            | "smog" | "simple measure of gobbledygook" => ReadabilityType::SMOG,
100            | _ => {
101                warn!(value, "=> {} Unknown Readability Type", Label::using());
102                ReadabilityType::default()
103            }
104        }
105    }
106    /// Get maximum allowed value for a given readability type
107    pub fn maximum_allowed(self) -> f64 {
108        match self {
109            | ReadabilityType::ARI => MAX_ALLOWED_ARI,
110            | ReadabilityType::CLI => MAX_ALLOWED_CLI,
111            | ReadabilityType::FKGL => MAX_ALLOWED_FKGL,
112            | ReadabilityType::FRES => MAX_ALLOWED_FRES,
113            | ReadabilityType::GFI => MAX_ALLOWED_GFI,
114            | ReadabilityType::Lix => MAX_ALLOWED_LIX,
115            | ReadabilityType::SMOG => MAX_ALLOWED_SMOG,
116        }
117    }
118    /// Get maximum allowed value for a given readability type, from environment file
119    pub fn maximum_allowed_from_env(self) -> Option<f64> {
120        match dotenv() {
121            | Ok(_) => {
122                let variables = dotenvy::vars().collect::<Vec<(String, String)>>();
123                let pair = match self {
124                    | ReadabilityType::ARI => find_first(variables, "MAX_ALLOWED_ARI"),
125                    | ReadabilityType::CLI => find_first(variables, "MAX_ALLOWED_CLI"),
126                    | ReadabilityType::FKGL => find_first(variables, "MAX_ALLOWED_FKGL"),
127                    | ReadabilityType::FRES => find_first(variables, "MAX_ALLOWED_FRES"),
128                    | ReadabilityType::GFI => find_first(variables, "MAX_ALLOWED_GFI"),
129                    | ReadabilityType::Lix => find_first(variables, "MAX_ALLOWED_LIX"),
130                    | ReadabilityType::SMOG => find_first(variables, "MAX_ALLOWED_SMOG"),
131                };
132                match pair {
133                    | Some((_, value)) => Some(value.parse::<f64>().unwrap()),
134                    | None => None,
135                }
136            }
137            | Err(_) => None,
138        }
139    }
140}
141/// Count the number of "complex words"[^complex] in a given text
142///
143/// [^complex]: Words with 3 or more syllables
144pub fn complex_word_count(text: &str) -> u32 {
145    words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
146}
147/// Count the number of letters in a given text
148///
149/// Does NOT count white space or punctuation
150pub fn letter_count(text: &str) -> u32 {
151    text.chars()
152        .filter(|c| !(c.is_whitespace() || NON_ALPHABETIC.is_match(&c.to_string()).unwrap_or_default()))
153        .count() as u32
154}
/// Count the number of "long words"[^long] in a given text
///
/// Words are separated by white space (same splitting as [`words`]).
/// Only alphabetic characters contribute to the length of a word, so attached punctuation
/// (e.g. the period in `"people."`) and multi-byte letters (e.g. `ñ`) do not inflate the count.
///
/// [^long]: Words with more than 6 letters
pub fn long_word_count(text: &str) -> u32 {
    text.split_whitespace()
        // Count characters, not bytes, and ignore anything that is not a letter
        .filter(|word| word.chars().filter(|c| c.is_alphabetic()).count() > 6)
        .count() as u32
}
/// Count the number of sentences in a given text
///
/// Text is split on sentence terminators (`.`, `!`, and `?`). A fragment only counts as a
/// sentence when it contains at least one alphanumeric character, so trailing white space,
/// closing quotes, and repeated terminators (e.g. `...`) do not produce extra sentences.
///
/// Text without any terminator counts as a single sentence; empty text has zero sentences.
pub fn sentence_count(text: &str) -> u32 {
    text.split(|c: char| matches!(c, '.' | '!' | '?'))
        .filter(|fragment| fragment.chars().any(char::is_alphanumeric))
        .count() as u32
}
/// Get list of words in a given text
///
/// Words are the white space separated pieces of the text; punctuation is left attached.
pub fn words(text: &str) -> Vec<String> {
    let mut list = Vec::new();
    for word in text.split_whitespace() {
        list.push(word.to_owned());
    }
    list
}
169/// Count the number of words in a given text
170///
171/// See [`words`]
172pub fn word_count(text: &str) -> u32 {
173    words(text).len() as u32
174}
175/// Automated Readability Index (ARI)
176///
177/// The formula was derived from a large dataset of texts used in US schools.
178/// The result is a number that corresponds with a US grade level.
179///
180/// Requires counting letters, words, and sentences
181///
182/// See <https://en.wikipedia.org/wiki/Automated_readability_index> for more information
183pub fn automated_readability_index(text: &str) -> f64 {
184    let letters = letter_count(text);
185    let words = word_count(text);
186    let sentences = sentence_count(text);
187    debug!(letters, words, sentences, "=> {}", Label::using());
188    let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
189    format!("{score:.2}").parse().unwrap()
190}
191/// Coleman-Liau Index (CLI)
192///
193/// Requires counting letters, words, and sentences
194pub fn coleman_liau_index(text: &str) -> f64 {
195    let letters = letter_count(text);
196    let words = word_count(text);
197    let sentences = sentence_count(text);
198    debug!(letters, words, sentences, "=> {}", Label::using());
199    let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
200    format!("{score:.2}").parse().unwrap()
201}
202/// Flesch-Kincaid Grade Level (FKGL)[^cite]
203///
204/// Arguably the most popular readability test.
205///
206/// The result is a number that corresponds with a US grade level.
207///
208/// Requires counting words, sentences, and syllables
209///
210/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
211///
212/// [^cite]: Flesch, R. (1948). A new readability yardstick. Journal of Applied Psychology, 32(3), 221–233. <https://doi.org/10.1037/h0057532>
213pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
214    let words = word_count(text);
215    let sentences = sentence_count(text);
216    let syllables = syllable_count(text);
217    debug!(words, sentences, syllables, "=> {}", Label::using());
218    let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
219    format!("{score:.2}").parse().unwrap()
220}
221/// Flesch Reading Ease Score (FRES)
222///
223/// FRES range is 100 (very easy) - 0 (extremely difficult)
224///
225/// Requires counting words, sentences, and syllables
226///
227/// See <https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests> for more information
228pub fn flesch_reading_ease_score(text: &str) -> f64 {
229    let words = word_count(text);
230    let sentences = sentence_count(text);
231    let syllables = syllable_count(text);
232    debug!(words, sentences, syllables, "=> {}", Label::using());
233    let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
234    format!("{score:.2}").parse().unwrap()
235}
236/// Gunning Fog Index (GFI)
237///
238/// Estimates the years of formal education a person needs to understand the text on the first reading
239///
240/// Requires counting words, sentences, and "complex words" (see [complex_word_count])
241///
242/// See <https://en.wikipedia.org/wiki/Gunning_fog_index> for more information
243pub fn gunning_fog_index(text: &str) -> f64 {
244    let words = word_count(text);
245    let complex_words = complex_word_count(text);
246    let sentences = sentence_count(text);
247    let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
248    format!("{score:.2}").parse().unwrap()
249}
250/// Lix (abbreviation of Swedish läsbarhetsindex)
251///
252/// Indicates the difficulty of reading a text
253///
254/// Requires counting words, sentences, and long words (see [long_word_count])
255///
256/// "Lix" is an abbreviation of *läsbarhetsindex*, which means "readability index" in Swedish
257///
258/// See <https://en.wikipedia.org/wiki/Lix_(readability_test)> for more information
259pub fn lix(text: &str) -> f64 {
260    let words = word_count(text);
261    let sentences = sentence_count(text);
262    let long_words = long_word_count(text);
263    let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
264    format!("{score:.2}").parse().unwrap()
265}
266/// Simple Measure of Gobbledygook (SMOG)
267///
268/// Estimates the years of education needed to understand a piece of writing
269///
270/// **Caution**: SMOG formula was normalized on 30-sentence samples
271///
272/// Requires counting sentences, and "complex words" (see [complex_word_count])
273///
274/// See <https://en.wikipedia.org/wiki/SMOG> for more information
275pub fn smog(text: &str) -> f64 {
276    let sentences = sentence_count(text);
277    let complex_words = complex_word_count(text);
278    let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
279    format!("{score:.2}").parse().unwrap()
280}
281/// Get the singular form of a word (e.g. "people" -> "person")
282///
283/// Adapted from the PHP library, [Text-Statistics](https://github.com/DaveChild/Text-Statistics)
284pub fn singular_form(word: &str) -> String {
285    match word.to_lowercase().as_str() {
286        | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
287        | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
288        | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
289            | Some(value) => value.to_string(),
290            | None => value.to_string(),
291        },
292        | value => {
293            let pair = PLURAL_TO_SINGULAR
294                .iter()
295                .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
296                    | Ok(true) => true,
297                    | Ok(false) | Err(_) => false,
298                });
299            match pair {
300                | Some((pattern, replacement)) => {
301                    trace!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
302                    let re = Regex::new(pattern).unwrap();
303                    re.replace_all(value, *replacement).to_string()
304                }
305                | None => value.to_string(),
306            }
307        }
308    }
309}
310/// Count the number of syllables in a given text
311/// ### Example
312/// ```rust
313/// use acorn_lib::analyzer::readability::syllable_count;
314///
315/// let sentence = "The quick brown fox jumps over the lazy dog.";
316/// assert_eq!(syllable_count(sentence), 11);
317/// ```
318pub fn syllable_count(text: &str) -> usize {
319    fn syllables(word: String) -> usize {
320        let singular = singular_form(&word);
321        match word.as_str() {
322            | "" => 0,
323            | value if value.len() < 3 => 1,
324            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
325                | Some(x) => *x,
326                | None => 0,
327            },
328            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
329                | Some(x) => *x,
330                | None => 0,
331            },
332            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
333                | Some(x) => *x,
334                | None => 0,
335            },
336            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
337                | Some(x) => *x,
338                | None => 0,
339            },
340            | _ => {
341                let mut count: isize = 0;
342                let mut input = word;
343                count += 3 * TRIPLE.find_iter(&input).count() as isize;
344                input = TRIPLE.replace_all(&input, "").to_string();
345                count += 2 * DOUBLE.find_iter(&input).count() as isize;
346                input = DOUBLE.replace_all(&input, "").to_string();
347                count += SINGLE.find_iter(&input).count() as isize;
348                input = SINGLE.replace_all(&input, "").to_string();
349                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
350                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
351                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
352                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
353                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
354                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
355                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
356                count as usize
357            }
358        }
359    }
360    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
361    tokens.into_iter().map(syllables).sum()
362}
363// TODO: Expand acronyms into words
364/// Break text into tokens
365///
366/// Currently replaces `é` and `ë` with `-e`, splits on hyphens, and removes non-alphabetic characters.
367///
368/// This function is a good entry point for adding support for the nuacnces of 'scientific" texts
369pub fn tokenize(value: &str) -> Vec<String> {
370    value
371        .replace("é", "-e")
372        .replace("ë", "-e")
373        .split('-')
374        .map(|x| NON_ALPHABETIC.replace_all(x, "").to_lowercase())
375        .collect::<Vec<_>>()
376}
377
378#[cfg(test)]
379mod tests;