Skip to content

Commit

Permalink
Progress
Browse files Browse the repository at this point in the history
  • Loading branch information
aarranz committed Jul 18, 2024
1 parent 75e1996 commit 73219d8
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 8 deletions.
52 changes: 52 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,15 @@ name = "simple_unicode_normalization_forms"
crate-type = ["cdylib"]

[dependencies]
pyo3 = "0.22.0"
lazy_static = "1.5.0"
regex = "1.10.5"
unicode-normalization = "0.1.23"

[dependencies.pyo3]
version = "0.22.0"
# "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8
features = ["abi3-py38"]

[target.aarch64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",
Expand Down
45 changes: 38 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,54 @@
// Copyright (c) 2024 Future Internet Consulting and Development Solutions S.L.

use lazy_static::lazy_static;
use regex::Regex;
use pyo3::prelude::*;
use std::collections::HashSet;
use unicode_normalization::char::decompose_compatible;
use unicode_normalization::UnicodeNormalization;

lazy_static! {
    /// Matches one or more consecutive whitespace characters.
    static ref WHITESPACE_RE: Regex =
        Regex::new(r"\s+").expect("hard-coded whitespace pattern is a valid regex");

    /// Matches a single character that belongs to any of:
    /// - the Unicode properties `Emoji_Presentation`, `Emoji_Modifier` or `Emoji_Modifier_Base`,
    /// - the general category `Cc` (control characters),
    /// - the variation selectors U+FE0E / U+FE0F,
    /// - the combining enclosing marks U+20E2, U+20E3 and U+20E4.
    ///
    /// The control-character class must be spelled `\p{Cc}`: without the `p`, `\{Cc}` is read
    /// as the four literal characters `{`, `C`, `c` and `}`, which would then be matched too.
    static ref EMOJI_RE: Regex = Regex::new(r"[\p{Emoji_Presentation}\p{Emoji_Modifier}\p{Emoji_Modifier_Base}\p{Cc}\uFE0E\uFE0F\u20E2\u20E3\u20E4]")
        .expect("hard-coded emoji pattern is a valid regex");
}

/// Gives the normalized form of a string skipping some characters.
///
/// Characters contained in `allow_chars` are copied to the output unchanged; every other
/// character is handed to `decompose_compatible` and the characters it yields are appended
/// to the output instead.
#[pyfunction]
fn nfkc_normalization(str: String, allow_chars: Vec<char>) -> PyResult<String> {
let mut result = String::with_capacity(str.len() * 2);
fn nfkc_normalization(str: String, allow_chars: HashSet<char>) -> String {
let mut result = String::with_capacity(str.len());
for c in str.chars() {
if allow_chars.contains(&c) {
result.push(c)
} else {
decompose_compatible(c, |r| result.push(r))
decompose_compatible(c, |r| {
// Keep only decomposed characters up to U+FFFF; anything above it
// (i.e. outside the Basic Multilingual Plane) is dropped.
if r <= '\u{FFFF}' {
result.push(r)
}
})
}
}
Ok(result)

// Recompose the collected characters with NFC and build the returned string.
result.nfc().collect::<String>()
}

/// Normalizes `value` and strips surrounding whitespace.
///
/// The input goes through `nfkc_normalization` with `º` and `ª` as the allowed characters,
/// and the normalized text is then trimmed at both ends.
#[pyfunction]
fn basic_string_clean(value: String) -> PyResult<String> {
    let keep_as_is = HashSet::from(['º', 'ª']);
    let normalized = nfkc_normalization(value, keep_as_is);
    let trimmed = normalized.trim();
    Ok(trimmed.to_owned())
}

/// Normalizes `value`, collapses whitespace and removes everything matched by `EMOJI_RE`.
///
/// Steps, in order:
/// 1. `nfkc_normalization` with `º` and `ª` as the allowed characters.
/// 2. Every match of `WHITESPACE_RE` is replaced by a single space.
/// 3. Every match of `EMOJI_RE` is deleted.
/// 4. The result is trimmed at both ends.
#[pyfunction]
fn remove_emojis(value: String) -> PyResult<String> {
    let keep_as_is = HashSet::from(['º', 'ª']);
    let normalized = nfkc_normalization(value, keep_as_is);
    let single_spaced = WHITESPACE_RE.replace_all(&normalized, " ");
    let without_emojis = EMOJI_RE.replace_all(&single_spaced, "");
    Ok(without_emojis.trim().to_owned())
}

/// A Python module implemented in Rust.
///
/// Module initializer: each function wrapped with `wrap_pyfunction!` below is added to the
/// module `m`; the first failing registration is propagated to the caller through `?`.
#[pymodule]
fn ficodes_string_normalization(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(nfkc_normalization, m)?)?;
fn simple_unicode_normalization_forms(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(basic_string_clean, m)?)?;
m.add_function(wrap_pyfunction!(remove_emojis, m)?)?;
Ok(())
}

0 comments on commit 73219d8

Please sign in to comment.