Skip to content

Commit

Permalink
Integrated some operations into main loop
Browse files Browse the repository at this point in the history
  • Loading branch information
aafrecct committed Jul 18, 2024
1 parent 73219d8 commit 87abdd3
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "simple_unicode_normalization_forms"
version = "0.1.0"
version = "0.1.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
165 changes: 165 additions & 0 deletions src/emoji.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/// Lookup table pairing a Unicode emoji property name with its table of
/// `(first, last)` character ranges.
///
/// Only the three named property tables are listed here; `EXTRA_CHARS` is not
/// part of this table.
///
/// The explicit `'static` lifetimes were dropped: references in a `const` item
/// are `'static` by default, so the type is unchanged.
pub const BY_NAME: &[(&str, &[(char, char)])] = &[
    ("Emoji_Modifier", EMOJI_MODIFIER),
    ("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE),
    ("Emoji_Presentation", EMOJI_PRESENTATION),
];

/// Character ranges of the `Emoji_Modifier` table: a single `(first, last)`
/// pair spanning U+1F3FB through U+1F3FF.
///
/// The redundant `'static` lifetime was dropped; `const` references are
/// `'static` by default.
pub const EMOJI_MODIFIER: &[(char, char)] = &[('🏻', '🏿')];
/// Extra `(first, last)` character ranges kept alongside the emoji property
/// tables: U+FE0E..U+FE0F and U+20E2..U+20E4.
///
/// Unlike the property tables, these ranges are not listed in ascending order,
/// so a lookup that relies on sorted ranges (e.g. binary search) must not be
/// used on this table as-is.
///
/// The redundant `'static` lifetime was dropped; `const` references are
/// `'static` by default.
pub const EXTRA_CHARS: &[(char, char)] =
    &[('\u{FE0E}', '\u{FE0F}'), ('\u{20E2}', '\u{20E4}')];

/// Character ranges of the `Emoji_Modifier_Base` table.
///
/// Each entry is a `(first, last)` pair; single characters appear as a pair
/// whose two ends are equal. Entries are listed in ascending order and do not
/// overlap.
///
/// The redundant `'static` lifetime was dropped; `const` references are
/// `'static` by default.
pub const EMOJI_MODIFIER_BASE: &[(char, char)] = &[
    ('☝', '☝'),
    ('⛹', '⛹'),
    ('✊', '✍'),
    ('🎅', '🎅'),
    ('🏂', '🏄'),
    ('🏇', '🏇'),
    ('🏊', '🏌'),
    ('👂', '👃'),
    ('👆', '👐'),
    ('👦', '👸'),
    ('👼', '👼'),
    ('💁', '💃'),
    ('💅', '💇'),
    ('💏', '💏'),
    ('💑', '💑'),
    ('💪', '💪'),
    ('🕴', '🕵'),
    ('🕺', '🕺'),
    ('🖐', '🖐'),
    ('🖕', '🖖'),
    ('🙅', '🙇'),
    ('🙋', '🙏'),
    ('🚣', '🚣'),
    ('🚴', '🚶'),
    ('🛀', '🛀'),
    ('🛌', '🛌'),
    ('🤌', '🤌'),
    ('🤏', '🤏'),
    ('🤘', '🤟'),
    ('🤦', '🤦'),
    ('🤰', '🤹'),
    ('🤼', '🤾'),
    ('🥷', '🥷'),
    ('🦵', '🦶'),
    ('🦸', '🦹'),
    ('🦻', '🦻'),
    ('🧍', '🧏'),
    ('🧑', '🧝'),
    ('🫃', '🫅'),
    ('🫰', '🫸'),
];

/// Character ranges of the `Emoji_Presentation` table.
///
/// Each entry is a `(first, last)` pair; single characters appear as a pair
/// whose two ends are equal. Entries are listed in ascending order and do not
/// overlap.
///
/// The redundant `'static` lifetime was dropped; `const` references are
/// `'static` by default.
pub const EMOJI_PRESENTATION: &[(char, char)] = &[
    ('⌚', '⌛'),
    ('⏩', '⏬'),
    ('⏰', '⏰'),
    ('⏳', '⏳'),
    ('◽', '◾'),
    ('☔', '☕'),
    ('♈', '♓'),
    ('♿', '♿'),
    ('⚓', '⚓'),
    ('⚡', '⚡'),
    ('⚪', '⚫'),
    ('⚽', '⚾'),
    ('⛄', '⛅'),
    ('⛎', '⛎'),
    ('⛔', '⛔'),
    ('⛪', '⛪'),
    ('⛲', '⛳'),
    ('⛵', '⛵'),
    ('⛺', '⛺'),
    ('⛽', '⛽'),
    ('✅', '✅'),
    ('✊', '✋'),
    ('✨', '✨'),
    ('❌', '❌'),
    ('❎', '❎'),
    ('❓', '❕'),
    ('❗', '❗'),
    ('➕', '➗'),
    ('➰', '➰'),
    ('➿', '➿'),
    ('⬛', '⬜'),
    ('⭐', '⭐'),
    ('⭕', '⭕'),
    ('🀄', '🀄'),
    ('🃏', '🃏'),
    ('🆎', '🆎'),
    ('🆑', '🆚'),
    ('🇦', '🇿'),
    ('🈁', '🈁'),
    ('🈚', '🈚'),
    ('🈯', '🈯'),
    ('🈲', '🈶'),
    ('🈸', '🈺'),
    ('🉐', '🉑'),
    ('🌀', '🌠'),
    ('🌭', '🌵'),
    ('🌷', '🍼'),
    ('🍾', '🎓'),
    ('🎠', '🏊'),
    ('🏏', '🏓'),
    ('🏠', '🏰'),
    ('🏴', '🏴'),
    ('🏸', '🐾'),
    ('👀', '👀'),
    ('👂', '📼'),
    ('📿', '🔽'),
    ('🕋', '🕎'),
    ('🕐', '🕧'),
    ('🕺', '🕺'),
    ('🖕', '🖖'),
    ('🖤', '🖤'),
    ('🗻', '🙏'),
    ('🚀', '🛅'),
    ('🛌', '🛌'),
    ('🛐', '🛒'),
    ('🛕', '🛗'),
    ('🛜', '🛟'),
    ('🛫', '🛬'),
    ('🛴', '🛼'),
    ('🟠', '🟫'),
    ('🟰', '🟰'),
    ('🤌', '🤺'),
    ('🤼', '🥅'),
    ('🥇', '🧿'),
    ('🩰', '🩼'),
    ('🪀', '🪈'),
    ('🪐', '🪽'),
    ('🪿', '🫅'),
    ('🫎', '🫛'),
    ('🫠', '🫨'),
    ('🫰', '🫸'),
];

/// Extension trait answering whether a value counts as an emoji.
pub trait IsEmoji {
    /// Returns `true` when `self` is considered an emoji, `false` otherwise.
    fn is_emoji(&self) -> bool;
}
impl IsEmoji for char {
    /// Returns `true` when the character lies inside any inclusive
    /// `(first, last)` range of `EMOJI_PRESENTATION`, `EMOJI_MODIFIER`,
    /// `EMOJI_MODIFIER_BASE` or `EXTRA_CHARS`.
    ///
    /// The tables are scanned linearly, in that order, stopping at the first
    /// matching range.
    fn is_emoji(&self) -> bool {
        [
            EMOJI_PRESENTATION,
            EMOJI_MODIFIER,
            EMOJI_MODIFIER_BASE,
            EXTRA_CHARS,
        ]
        .iter()
        .flat_map(|table| table.iter())
        .any(|&(first, last)| (first..=last).contains(self))
    }
}
83 changes: 64 additions & 19 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,47 +1,78 @@
// Copyright (c) 2024 Future Internet Consulting and Development Solutions S.L.
mod emoji;

use emoji::IsEmoji;
use lazy_static::lazy_static;
use regex::Regex;
use pyo3::prelude::*;
use std::collections::HashSet;
use regex::Regex;
use unicode_normalization::char::decompose_compatible;
use unicode_normalization::UnicodeNormalization;

lazy_static! {
    // One or more consecutive whitespace characters.
    static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap();
    // A single character that is Emoji_Presentation, Emoji_Modifier, Emoji_Modifier_Base, a
    // control character (general category Cc), or one of U+FE0E, U+FE0F, U+20E2, U+20E3, U+20E4.
    //
    // The control-character class used to be spelled `\{Cc}`: inside a character class that is an
    // escaped `{` followed by the literals `C`, `c` and `}`, so the pattern matched those four
    // ordinary characters instead of control characters. It is now written `\p{Cc}`.
    static ref EMOJI_RE: Regex = Regex::new(r"[\p{Emoji_Presentation}\p{Emoji_Modifier}\p{Emoji_Modifier_Base}\p{Cc}\uFE0E\uFE0F\u20E2\u20E3\u20E4]").unwrap();
}

/// Gives the normalized form of a string skipping some characters.
///
/// Every character of `str` is handed to `custom_character_normalization`, which appends its
/// output to `result`; the accumulated text is then run through `.nfc()` and collected into the
/// returned `String`. `allow_chars`, `collapse_whitespace` and `remove_emojis` are forwarded to
/// that helper unchanged.
// NOTE(review): this span comes from a rendered diff. The `fn nfkc_normalization(...)` header and
// the `if allow_chars.contains(&c) {` / `result.push(c)` pair look like lines of the replaced
// implementation shown next to the new one — confirm against the repository before compiling.
fn nfkc_normalization(str: String, allow_chars: HashSet<char>) -> String {
fn custom_normalization(
str: String,
allow_chars: Vec<char>,
collapse_whitespace: bool,
remove_emojis: bool,
) -> String {
// Pre-size the output buffer to the input's byte length.
let mut result = String::with_capacity(str.len());
// Whether the character handled in the previous iteration was whitespace.
let mut previous_whitespace = false;
for c in str.chars() {
if allow_chars.contains(&c) {
result.push(c)
custom_character_normalization(
&mut result,
c,
&allow_chars,
collapse_whitespace,
previous_whitespace,
remove_emojis,
);
// Recorded from the input character itself, regardless of what the helper appended.
previous_whitespace = c.is_whitespace();
}
result.nfc().collect::<String>()
}

/// Appends the normalized form of the single character `c` to `str`.
///
/// Branches, checked in order:
/// * `c` is in `allow_chars`: it is pushed unchanged.
/// * `c` is whitespace: it is pushed as one ASCII space, or dropped when `collapse_whitespace`
///   is set and `previous_whitespace` is true.
/// * `remove_emojis` is set and `c.is_emoji()` holds: it is dropped.
/// * otherwise: `decompose_compatible` is called on `c` and each character it yields that is
///   not above U+FFFF is pushed.
// NOTE(review): this span comes from a rendered diff. The first `decompose_compatible` call
// (pushing into `result`) and the trailing `result.nfc()...` line use a `result` binding this
// function does not declare; they look like lines of the replaced implementation — confirm
// against the repository before compiling.
fn custom_character_normalization(
str: &mut String,
c: char,
allow_chars: &Vec<char>,
collapse_whitespace: bool,
previous_whitespace: bool,
remove_emojis: bool,
) {
if allow_chars.contains(&c) {
str.push(c)
} else if c.is_whitespace() {
if collapse_whitespace && previous_whitespace {
// Second or later whitespace character of a run: emit nothing.
return;
} else {
decompose_compatible(c, |r| {
// Keep only characters inside the Basic Multilingual Plane (<= U+FFFF); no disallow set is consulted here.
if r <= '\u{FFFF}' {
result.push(r)
}
})
// Any whitespace character is written out as a plain ASCII space.
str.push(' ')
}
} else if remove_emojis && c.is_emoji() {
return;
} else {
decompose_compatible(c, |r| {
// Keep only characters inside the Basic Multilingual Plane (<= U+FFFF); no disallow set is consulted here.
if r <= '\u{FFFF}' {
str.push(r)
}
})
}

result.nfc().collect::<String>()
}

// Normalizes `value` with 'º' and 'ª' passed as the allowed characters, then trims the result
// and returns it as an owned `String` wrapped in `Ok`.
// In the `custom_normalization` call both boolean flags (`collapse_whitespace`, `remove_emojis`)
// are `false`.
// NOTE(review): this span comes from a rendered diff. The `Ok(nfkc_normalization(...))` line
// looks like the replaced implementation shown next to the new `Ok(custom_normalization(...))`
// expression — confirm against the repository before compiling.
#[pyfunction]
fn basic_string_clean(value: String) -> PyResult<String> {
Ok(nfkc_normalization(value, HashSet::from(['º', 'ª'])).trim().to_string())
Ok(custom_normalization(value, vec!['º', 'ª'], false, false)
.trim()
.to_string())
}

// Normalizes `value` with 'º' and 'ª' passed as the allowed characters and with both
// `collapse_whitespace` and `remove_emojis` set to `true`, then trims the result and returns it
// as an owned `String` wrapped in `Ok`.
// NOTE(review): this span comes from a rendered diff. The three statements using
// `nfkc_normalization`, `WHITESPACE_RE` and `EMOJI_RE` look like the replaced implementation
// shown next to the new single `custom_normalization` call — confirm against the repository
// before compiling.
#[pyfunction]
fn remove_emojis(value: String) -> PyResult<String> {
let cleaned_value = nfkc_normalization(value, HashSet::from(['º', 'ª']));
let whitespace_cleaned_value = WHITESPACE_RE.replace_all(&cleaned_value, " ");
let result = EMOJI_RE.replace_all(&whitespace_cleaned_value, "");

let result = custom_normalization(value, vec!['º', 'ª'], true, true);
Ok(result.trim().to_string())
}

Expand All @@ -52,3 +83,17 @@ fn simple_unicode_normalization_forms(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(remove_emojis, m)?)?;
Ok(())
}

#[cfg(test)]
mod tests {
    use super::remove_emojis;
    use std::time::Instant;

    /// Smoke test: runs `remove_emojis` once on a sample mixing whitespace, symbols and
    /// emoji sequences, and prints how long the call took. The result is not asserted.
    #[test]
    fn timeit() {
        let started = Instant::now();
        // The returned value is deliberately discarded; only the duration is reported.
        let _ = remove_emojis(" a\t name with ❤️✳️0️⃣#️⃣ #©*1 ".to_string());
        let took = started.elapsed();
        println!("{:?}", took);
    }
}

0 comments on commit 87abdd3

Please sign in to comment.