acorn_lib/analyzer/readability/
mod.rs1use crate::util::constants::{
5 MAX_ALLOWED_ARI, MAX_ALLOWED_CLI, MAX_ALLOWED_FKGL, MAX_ALLOWED_FRES, MAX_ALLOWED_GFI, MAX_ALLOWED_LIX, MAX_ALLOWED_SMOG,
6};
7use crate::util::find_first;
8use crate::util::Label;
9use derive_more::Display;
10use dotenvy::dotenv;
11use fancy_regex::Regex;
12use tracing::warn;
13use tracing::{debug, trace};
14
15pub mod constants;
16use constants::{
17 DOUBLE, DOUBLE_SYLLABIC_FOUR, DOUBLE_SYLLABIC_ONE, DOUBLE_SYLLABIC_THREE, DOUBLE_SYLLABIC_TWO, IRREGULAR_NOUNS, IRREGULAR_NOUNS_INVERTED,
18 NEED_TO_BE_FIXED, NON_ALPHABETIC, PLURAL_TO_SINGULAR, PROBLEMATIC_WORDS, SAME_SINGULAR_PLURAL, SINGLE, SINGLE_SYLLABIC_ONE, SINGLE_SYLLABIC_TWO,
19 TRIPLE, VOWEL,
20};
21
22#[derive(Clone, Copy, Debug, Default, Display, PartialEq)]
24pub enum ReadabilityType {
25 #[display("ari")]
29 ARI,
30 #[display("cli")]
34 CLI,
35 #[default]
39 #[display("fkgl")]
40 FKGL,
41 #[display("fres")]
45 FRES,
46 #[display("gfi")]
50 GFI,
51 #[display("lix")]
55 Lix,
56 #[display("smog")]
60 SMOG,
61}
62impl From<ReadabilityType> for String {
63 fn from(value: ReadabilityType) -> Self {
64 value.to_string()
65 }
66}
67impl From<String> for ReadabilityType {
68 fn from(value: String) -> Self {
69 ReadabilityType::from_string(&value)
70 }
71}
72impl From<&str> for ReadabilityType {
73 fn from(value: &str) -> Self {
74 ReadabilityType::from_string(value)
75 }
76}
77impl ReadabilityType {
78 pub fn calculate(self, text: &str) -> f64 {
80 match self {
81 | ReadabilityType::ARI => automated_readability_index(text),
82 | ReadabilityType::CLI => coleman_liau_index(text),
83 | ReadabilityType::FKGL => flesch_kincaid_grade_level(text),
84 | ReadabilityType::FRES => flesch_reading_ease_score(text),
85 | ReadabilityType::GFI => gunning_fog_index(text),
86 | ReadabilityType::Lix => lix(text),
87 | ReadabilityType::SMOG => smog(text),
88 }
89 }
90 pub fn from_string(value: &str) -> ReadabilityType {
92 match value.to_lowercase().replace("-", " ").as_str() {
93 | "ari" | "automated readability index" => ReadabilityType::ARI,
94 | "cli" | "coleman liau index" => ReadabilityType::CLI,
95 | "fkgl" | "flesch kincaid grade level" => ReadabilityType::FKGL,
96 | "fres" | "flesch reading ease score" => ReadabilityType::FRES,
97 | "gfi" | "gunning fog index" => ReadabilityType::GFI,
98 | "lix" => ReadabilityType::Lix,
99 | "smog" | "simple measure of gobbledygook" => ReadabilityType::SMOG,
100 | _ => {
101 warn!(value, "=> {} Unknown Readability Type", Label::using());
102 ReadabilityType::default()
103 }
104 }
105 }
106 pub fn maximum_allowed(self) -> f64 {
108 match self {
109 | ReadabilityType::ARI => MAX_ALLOWED_ARI,
110 | ReadabilityType::CLI => MAX_ALLOWED_CLI,
111 | ReadabilityType::FKGL => MAX_ALLOWED_FKGL,
112 | ReadabilityType::FRES => MAX_ALLOWED_FRES,
113 | ReadabilityType::GFI => MAX_ALLOWED_GFI,
114 | ReadabilityType::Lix => MAX_ALLOWED_LIX,
115 | ReadabilityType::SMOG => MAX_ALLOWED_SMOG,
116 }
117 }
118 pub fn maximum_allowed_from_env(self) -> Option<f64> {
120 match dotenv() {
121 | Ok(_) => {
122 let variables = dotenvy::vars().collect::<Vec<(String, String)>>();
123 let pair = match self {
124 | ReadabilityType::ARI => find_first(variables, "MAX_ALLOWED_ARI"),
125 | ReadabilityType::CLI => find_first(variables, "MAX_ALLOWED_CLI"),
126 | ReadabilityType::FKGL => find_first(variables, "MAX_ALLOWED_FKGL"),
127 | ReadabilityType::FRES => find_first(variables, "MAX_ALLOWED_FRES"),
128 | ReadabilityType::GFI => find_first(variables, "MAX_ALLOWED_GFI"),
129 | ReadabilityType::Lix => find_first(variables, "MAX_ALLOWED_LIX"),
130 | ReadabilityType::SMOG => find_first(variables, "MAX_ALLOWED_SMOG"),
131 };
132 match pair {
133 | Some((_, value)) => Some(value.parse::<f64>().unwrap()),
134 | None => None,
135 }
136 }
137 | Err(_) => None,
138 }
139 }
140}
141pub fn complex_word_count(text: &str) -> u32 {
145 words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
146}
147pub fn letter_count(text: &str) -> u32 {
151 text.chars()
152 .filter(|c| !(c.is_whitespace() || NON_ALPHABETIC.is_match(&c.to_string()).unwrap_or_default()))
153 .count() as u32
154}
/// Count the whitespace-separated words in `text` that are longer than six characters.
///
/// Length is measured in Unicode scalar values rather than UTF-8 bytes, so
/// words containing non-ASCII letters (e.g. "å", "ö") are not over-counted.
pub fn long_word_count(text: &str) -> u32 {
    text.split_whitespace().filter(|word| word.chars().count() > 6).count() as u32
}
/// Count the sentences in `text`.
///
/// Only `.` is treated as a sentence terminator. Fragments that are empty or
/// contain nothing but whitespace (for example the trailing newline after the
/// final period) are not counted.
pub fn sentence_count(text: &str) -> u32 {
    text.split('.').filter(|fragment| !fragment.trim().is_empty()).count() as u32
}
/// Split `text` on whitespace and return each piece as an owned `String`.
pub fn words(text: &str) -> Vec<String> {
    let mut pieces: Vec<String> = Vec::new();
    for piece in text.split_whitespace() {
        pieces.push(piece.to_owned());
    }
    pieces
}
169pub fn word_count(text: &str) -> u32 {
173 words(text).len() as u32
174}
175pub fn automated_readability_index(text: &str) -> f64 {
184 let letters = letter_count(text);
185 let words = word_count(text);
186 let sentences = sentence_count(text);
187 debug!(letters, words, sentences, "=> {}", Label::using());
188 let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
189 format!("{score:.2}").parse().unwrap()
190}
191pub fn coleman_liau_index(text: &str) -> f64 {
195 let letters = letter_count(text);
196 let words = word_count(text);
197 let sentences = sentence_count(text);
198 debug!(letters, words, sentences, "=> {}", Label::using());
199 let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
200 format!("{score:.2}").parse().unwrap()
201}
202pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
214 let words = word_count(text);
215 let sentences = sentence_count(text);
216 let syllables = syllable_count(text);
217 debug!(words, sentences, syllables, "=> {}", Label::using());
218 let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
219 format!("{score:.2}").parse().unwrap()
220}
221pub fn flesch_reading_ease_score(text: &str) -> f64 {
229 let words = word_count(text);
230 let sentences = sentence_count(text);
231 let syllables = syllable_count(text);
232 debug!(words, sentences, syllables, "=> {}", Label::using());
233 let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
234 format!("{score:.2}").parse().unwrap()
235}
236pub fn gunning_fog_index(text: &str) -> f64 {
244 let words = word_count(text);
245 let complex_words = complex_word_count(text);
246 let sentences = sentence_count(text);
247 let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
248 format!("{score:.2}").parse().unwrap()
249}
250pub fn lix(text: &str) -> f64 {
260 let words = word_count(text);
261 let sentences = sentence_count(text);
262 let long_words = long_word_count(text);
263 let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
264 format!("{score:.2}").parse().unwrap()
265}
266pub fn smog(text: &str) -> f64 {
276 let sentences = sentence_count(text);
277 let complex_words = complex_word_count(text);
278 let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
279 format!("{score:.2}").parse().unwrap()
280}
281pub fn singular_form(word: &str) -> String {
285 match word.to_lowercase().as_str() {
286 | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
287 | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
288 | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
289 | Some(value) => value.to_string(),
290 | None => value.to_string(),
291 },
292 | value => {
293 let pair = PLURAL_TO_SINGULAR
294 .iter()
295 .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
296 | Ok(true) => true,
297 | Ok(false) | Err(_) => false,
298 });
299 match pair {
300 | Some((pattern, replacement)) => {
301 trace!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
302 let re = Regex::new(pattern).unwrap();
303 re.replace_all(value, *replacement).to_string()
304 }
305 | None => value.to_string(),
306 }
307 }
308 }
309}
310pub fn syllable_count(text: &str) -> usize {
319 fn syllables(word: String) -> usize {
320 let singular = singular_form(&word);
321 match word.as_str() {
322 | "" => 0,
323 | value if value.len() < 3 => 1,
324 | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
325 | Some(x) => *x,
326 | None => 0,
327 },
328 | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
329 | Some(x) => *x,
330 | None => 0,
331 },
332 | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
333 | Some(x) => *x,
334 | None => 0,
335 },
336 | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
337 | Some(x) => *x,
338 | None => 0,
339 },
340 | _ => {
341 let mut count: isize = 0;
342 let mut input = word;
343 count += 3 * TRIPLE.find_iter(&input).count() as isize;
344 input = TRIPLE.replace_all(&input, "").to_string();
345 count += 2 * DOUBLE.find_iter(&input).count() as isize;
346 input = DOUBLE.replace_all(&input, "").to_string();
347 count += SINGLE.find_iter(&input).count() as isize;
348 input = SINGLE.replace_all(&input, "").to_string();
349 count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
350 count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
351 count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
352 count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
353 count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
354 count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
355 count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
356 count as usize
357 }
358 }
359 }
360 let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
361 tokens.into_iter().map(syllables).sum()
362}
363pub fn tokenize(value: &str) -> Vec<String> {
370 value
371 .replace("é", "-e")
372 .replace("ë", "-e")
373 .split('-')
374 .map(|x| NON_ALPHABETIC.replace_all(x, "").to_lowercase())
375 .collect::<Vec<_>>()
376}
377
378#[cfg(test)]
379mod tests;