Skip to content

Commit

Permalink
Merge pull request #33 from chris-ha458/refactor_1
Browse files Browse the repository at this point in the history
Further refactors
  • Loading branch information
nickspring authored Oct 8, 2023
2 parents 9250db9 + 8b24cd4 commit d0d32ae
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 184 deletions.
218 changes: 38 additions & 180 deletions src/md.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
use crate::consts::{COMMON_SAFE_ASCII_CHARACTERS, UTF8_MAXIMAL_ALLOCATION};
use crate::utils::unicode_range;
use cached::proc_macro::cached;
use cached::UnboundCache;
use log::trace;
use log::{log_enabled, trace};
use ordered_float::OrderedFloat;
use unic::char::property::EnumeratedCharProperty;
use unic::ucd::{GeneralCategory, Name};

pub(crate) mod plugins;
pub(crate) mod structs;

use plugins::{
ArchaicUpperLowerPlugin, CjkInvalidStopPlugin, MessDetectorPlugin, SuperWeirdWordPlugin,
SuspiciousDuplicateAccentPlugin, SuspiciousRangePlugin, TooManyAccentuatedPlugin,
TooManySymbolOrPunctuationPlugin, UnprintablePlugin,
};
use structs::MessDetectorChar;

use self::structs::MessDetectorCharFlags;

//
// Mess detection module
//
Expand All @@ -41,9 +35,9 @@ pub(crate) fn mess_ratio(
];

let length = decoded_sequence.chars().count();
let mut mean_mess_ratio: f32 = 0.0;
let intermediary_mean_mess_ratio_calc: usize = match length {
0..=510 => 32,
let mut mean_mess_ratio: Option<f32> = None;
let early_calc_period: usize = match length {
..=510 => 32,
511..=1023 => 64,
_ => 128,
};
Expand All @@ -59,187 +53,51 @@ pub(crate) fn mess_ratio(
.filter(|detector| detector.eligible(&mess_char))
.for_each(|detector| detector.feed(&mess_char));

if (index > 0 && index.rem_euclid(intermediary_mean_mess_ratio_calc) == 0)
|| index == length
{
mean_mess_ratio = detectors.iter().map(|x| x.ratio()).sum();
if mean_mess_ratio >= maximum_threshold {
if index.rem_euclid(early_calc_period) == early_calc_period - 1 {
let early_mess_ratio: f32 = detectors.iter().map(|x| x.ratio()).sum();
if early_mess_ratio >= maximum_threshold {
mean_mess_ratio = Some(early_mess_ratio);
break;
}
}
}
let return_ratio = mean_mess_ratio.unwrap_or(detectors.iter().map(|x| x.ratio()).sum());

trace!(
"Mess-detector extended-analysis start: \
intermediary_mean_mess_ratio_calc={}, \
mean_mess_ratio={}, \
maximum_threshold={}",
intermediary_mean_mess_ratio_calc,
mean_mess_ratio,
maximum_threshold,
);

/*if decoded_sequence.len() > 16 {
if log_enabled!(log::Level::Trace) {
trace!(
"Chunk: {} ..... {}",
&decoded_sequence[..decoded_sequence
.char_indices()
.nth(16)
.map(|(i, _)| i)
.unwrap_or(decoded_sequence.chars().count())],
&decoded_sequence[decoded_sequence
.char_indices()
.nth(decoded_sequence.chars().count() - 16)
.map(|(i, _)| i)
.unwrap_or(decoded_sequence.chars().count())..],
"Mess-detector extended-analysis start: \
early_calc_period={}, \
mean_mess_ratio={}, \
maximum_threshold={}",
early_calc_period,
return_ratio,
maximum_threshold,
);
}
*/

for detector in detectors {
if detector.ratio() > 0.0 {
trace!("{} produces ratio: {}", detector.name(), detector.ratio());
/*if decoded_sequence.len() > 16 {
trace!(
"Chunk: {} ..... {}",
&decoded_sequence[..decoded_sequence
.char_indices()
.nth(16)
.map(|(i, _)| i)
.unwrap_or(decoded_sequence.chars().count())],
&decoded_sequence[decoded_sequence
.char_indices()
.nth(decoded_sequence.chars().count() - 16)
.map(|(i, _)| i)
.unwrap_or(decoded_sequence.chars().count())..],
);
}
}
trace!("===");

mean_mess_ratio
}

#[cached(
type = "UnboundCache<char, MessDetectorChar>",
create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }",
convert = r#"{ character }"#
)]
pub fn new_mess_detector_character(character: char) -> MessDetectorChar {
let mut flags = MessDetectorCharFlags::empty();

// PLEASE NOTE! When refactoring this into more idiomatic code,
// take performance into account. Sometimes a match could be used, but it
// would require calculating all conditions and can decrease performance
// in comparison to the usual if/else chain.
*/

// ascii probing
if character.is_ascii() {
flags.insert(MessDetectorCharFlags::ASCII);
if character.is_ascii_graphic() {
flags.insert(MessDetectorCharFlags::ASCII_GRAPHIC);
if character.is_ascii_alphabetic() {
flags.insert(MessDetectorCharFlags::ASCII_ALPHABETIC);
} else if character.is_ascii_digit() {
flags.insert(MessDetectorCharFlags::ASCII_DIGIT);
for detector in &detectors {
if detector.ratio() > 0.0 {
trace!("{} produces ratio: {}", detector.name(), detector.ratio());
}
}
trace!("===");
}

// unicode information
let name = Name::of(character);
let category = GeneralCategory::of(character).abbr_name();
let range = unicode_range(character);

// whitespace
if character.is_whitespace() {
flags.insert(MessDetectorCharFlags::WHITESPACE);
flags.insert(MessDetectorCharFlags::SEPARATOR);
} else {
// safe symbols (non-whitespace)
if COMMON_SAFE_ASCII_CHARACTERS.contains(character) {
flags.insert(MessDetectorCharFlags::COMMON_SAFE);
}
if "<>-=~|_".contains(character) {
flags.insert(MessDetectorCharFlags::WEIRD_SAFE);
}

// numeric
if flags.contains(MessDetectorCharFlags::ASCII_DIGIT) || character.is_numeric() {
flags.insert(MessDetectorCharFlags::NUMERIC);
} else if flags.contains(MessDetectorCharFlags::ASCII_ALPHABETIC)
|| character.is_alphabetic()
{
// alphabetic
flags.insert(MessDetectorCharFlags::ALPHABETIC);
if character.is_lowercase() {
flags.insert(MessDetectorCharFlags::LOWERCASE);
flags.insert(MessDetectorCharFlags::CASE_VARIABLE);
} else if character.is_uppercase() {
flags.insert(MessDetectorCharFlags::UPPERCASE);
flags.insert(MessDetectorCharFlags::CASE_VARIABLE);
}
} else if !flags.contains(MessDetectorCharFlags::ASCII_GRAPHIC)
&& !['\x1A', '\u{FEFF}'].contains(&character)
&& MessDetectorChar::in_category(category, range, &["Cc"], &[], &["Control character"])
{
flags.insert(MessDetectorCharFlags::UNPRINTABLE);
}

// emoticon
if MessDetectorChar::in_category(category, range, &[], &[], &["Emoticons"]) {
flags.insert(MessDetectorCharFlags::EMOTICON);
}

// separator
if ['|', '+', '<', '>'].contains(&character)
|| MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[])
{
flags.insert(MessDetectorCharFlags::SEPARATOR);
}
}

// punctuation
if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) {
flags.insert(MessDetectorCharFlags::PUNCTUATION);
}

// symbol
if MessDetectorChar::in_category(category, range, &[], &["N", "S"], &["Forms"]) {
flags.insert(MessDetectorCharFlags::SYMBOL);
}

// latin
if MessDetectorChar::in_description(name, &["LATIN"]) {
flags.insert(MessDetectorCharFlags::LATIN);
} else {
// cjk
if MessDetectorChar::in_description(name, &["CJK"]) {
flags.insert(MessDetectorCharFlags::CJK);
}
// hangul
if MessDetectorChar::in_description(name, &["HANGUL"]) {
flags.insert(MessDetectorCharFlags::HANGUL);
}
// katakana
if MessDetectorChar::in_description(name, &["KATAKANA"]) {
flags.insert(MessDetectorCharFlags::KATAKANA);
}
// hiragana
if MessDetectorChar::in_description(name, &["HIRAGANA"]) {
flags.insert(MessDetectorCharFlags::HIRAGANA);
}
// thai
if MessDetectorChar::in_description(name, &["THAI"]) {
flags.insert(MessDetectorCharFlags::THAI);
}
}

// accentuated
if MessDetectorChar::in_description(
name,
&[
"WITH GRAVE",
"WITH ACUTE",
"WITH CEDILLA",
"WITH DIAERESIS",
"WITH CIRCUMFLEX",
"WITH TILDE",
],
) {
flags.insert(MessDetectorCharFlags::ACCENTUATED);
}

// create new object
MessDetectorChar {
character,
flags,
unicode_range: range,
}
return_ratio
}
2 changes: 1 addition & 1 deletion src/md/plugins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{

// Base abstract trait used for mess detection plugins.
// All detectors MUST extend it and implement the given methods.
pub trait MessDetectorPlugin {
pub(super) trait MessDetectorPlugin {
// Name of plugin
fn name(&self) -> &str {
std::any::type_name::<Self>().split("::").last().unwrap()
Expand Down
Loading

0 comments on commit d0d32ae

Please sign in to comment.