Merge pull request #30 from chris-ha458/lib
md fixes
nickspring authored Oct 5, 2023
2 parents cbe086f + 8090443 commit 684787e
Showing 4 changed files with 89 additions and 90 deletions.
17 changes: 17 additions & 0 deletions src/entity.rs
@@ -110,6 +110,20 @@ impl Debug for CharsetMatch {
}
}

impl Default for CharsetMatch {
fn default() -> Self {
CharsetMatch {
payload: vec![],
encoding: "utf-8".to_string(),
mean_mess_ratio: 0.0,
coherence_matches: vec![],
has_sig_or_bom: false,
submatch: vec![],
decoded_payload: None,
}
}
}

impl PartialEq<Self> for CharsetMatch {
fn eq(&self, other: &Self) -> bool {
self.encoding == other.encoding && self.decoded_payload == other.decoded_payload
@@ -299,6 +313,9 @@ impl CharsetMatches {
CharsetMatches::resort(&mut items);
CharsetMatches { items }
}
pub fn from_single(item: CharsetMatch) -> Self {
CharsetMatches { items: vec![item] }
}
// Insert a single match, placed so as to preserve the sort order.
// It can also be inserted as a submatch.
pub fn append(&mut self, item: CharsetMatch) {
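The new Default impl and the from_single constructor pair up: the empty-input path in src/lib.rs (below) collapses to a one-liner. A minimal usage sketch, assuming both types are reachable through a public entity module at the crate root (the exact paths are not shown in this diff):

use charset_normalizer_rs::entity::{CharsetMatch, CharsetMatches};

fn main() {
    // Default is a zero-payload UTF-8 match: no BOM, 0.0 mess ratio, no submatches.
    let fallback = CharsetMatch::default();
    // from_single wraps a single match directly, skipping the re-sort in CharsetMatches::new.
    let _results = CharsetMatches::from_single(fallback);
}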
111 changes: 46 additions & 65 deletions src/lib.rs
@@ -136,7 +136,7 @@ use crate::entity::{CharsetMatch, CharsetMatches, CoherenceMatches, NormalizerSe
use crate::md::mess_ratio;
use crate::utils::{
any_specified_encoding, decode, iana_name, identify_sig_or_bom, is_cp_similar,
is_multi_byte_encoding, should_strip_sig_or_bom,
is_invalid_chunk, is_multi_byte_encoding,
};
use encoding::DecoderTrap;
use log::{debug, trace};
@@ -203,14 +203,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
let bytes_length = bytes.len();
if bytes_length == 0 {
debug!("Encoding detection on empty bytes, assuming utf_8 intention.");
return CharsetMatches::new(Some(vec![CharsetMatch::new(
bytes,
"utf-8",
0.0,
false,
&vec![],
None,
)]));
return CharsetMatches::from_single(CharsetMatch::default());
}

// check min length
@@ -248,7 +241,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
}

// start to build prioritized encodings array
let mut prioritized_encodings: Vec<String> = vec![];
let mut prioritized_encodings: Vec<&str> = vec![];

// search for encoding in the content
let mut specified_encoding: String = String::new();
@@ -259,7 +252,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
&enc
);
specified_encoding = enc.to_string();
prioritized_encodings.push(enc);
prioritized_encodings.push(&specified_encoding);
}
}

Expand All @@ -271,16 +264,16 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
sig_pay.len(),
sig_enc,
);
prioritized_encodings.push(sig_enc.clone());
prioritized_encodings.push(sig_enc);
}

// add ascii & utf-8
prioritized_encodings.extend(["ascii".to_string(), "utf-8".to_string()]);
prioritized_encodings.extend(&["ascii", "utf-8"]);

// generate array of encodings for probing with prioritizing
let mut iana_encodings: VecDeque<&str> = VecDeque::from(IANA_SUPPORTED.clone());
for pe in prioritized_encodings.iter().rev() {
if let Some(index) = iana_encodings.iter().position(|x| *x == pe) {
if let Some(index) = iana_encodings.iter().position(|x| x == pe) {
let value = iana_encodings.remove(index).unwrap();
iana_encodings.push_front(value);
}
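Switching prioritized_encodings from Vec<String> to Vec<&str> avoids an allocation per candidate; the comparison then becomes x == pe because both sides are now &&str. A standalone sketch of the borrow-based pattern:

fn main() {
    let specified_encoding = String::from("windows-1251");
    let mut prioritized_encodings: Vec<&str> = vec![];
    // borrow the owned String instead of cloning it
    prioritized_encodings.push(&specified_encoding);
    prioritized_encodings.extend(&["ascii", "utf-8"]);

    let iana_encodings = ["utf-8", "windows-1251", "ascii", "latin-1"];
    for pe in prioritized_encodings.iter().rev() {
        // x and pe are both &&str here, so x == pe compares the strings
        if let Some(index) = iana_encodings.iter().position(|x| x == pe) {
            println!("{pe} found at index {index}");
        }
    }
}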
Expand All @@ -306,8 +299,8 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
{
continue;
}
let bom_or_sig_available: bool = sig_encoding == Some(encoding_iana.to_string());
let strip_sig_or_bom: bool = bom_or_sig_available && should_strip_sig_or_bom(encoding_iana);
let bom_or_sig_available: bool = sig_encoding.as_deref() == Some(encoding_iana);
// let strip_sig_or_bom = true; // unlike the Python version, this is always true in Rust
let is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana);

// utf-16le & utf-16be cannot be identified without BOM
Expand All @@ -320,34 +313,34 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
}

// fast pre-check
let mut decoded_payload: Option<&str> = None;
let decoded_payload_result = decode(
&bytes[if strip_sig_or_bom {
sig_payload.unwrap().len()
} else {
0
}..if is_too_large_sequence && !is_multi_byte_decoder {
*MAX_PROCESSED_BYTES
} else {
bytes_length
}],
let start_idx = if bom_or_sig_available {
sig_payload.unwrap().len()
} else {
0
};
let end_idx = if is_too_large_sequence && !is_multi_byte_decoder {
*MAX_PROCESSED_BYTES
} else {
bytes_length
};
let decoded_payload: Option<String> = match decode(
&bytes[start_idx..end_idx],
encoding_iana,
DecoderTrap::Strict,
is_too_large_sequence && !is_multi_byte_decoder,
false,
);
if let Ok(payload) = decoded_payload_result.as_ref() {
if !is_too_large_sequence || is_multi_byte_decoder {
decoded_payload = Some(payload);
) {
Ok(payload) if !is_too_large_sequence || is_multi_byte_decoder => Some(payload),
Ok(_) => None,
Err(_) => {
trace!(
"Code page {} does not fit given bytes sequence at ALL.",
encoding_iana,
);
tested_but_hard_failure.push(encoding_iana);
continue 'iana_encodings_loop;
}
} else {
trace!(
"Code page {} does not fit given bytes sequence at ALL.",
encoding_iana,
);
tested_but_hard_failure.push(encoding_iana);
continue 'iana_encodings_loop;
}
};
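The pre-check is now a single match with a guard: a full successful decode keeps the payload, a truncated probe decode keeps nothing, and a hard failure skips the encoding entirely. A reduced sketch of that control flow, using a stubbed decoder in place of the crate's decode:

fn decode_stub(ok: bool) -> Result<String, String> {
    if ok { Ok("text".into()) } else { Err("undecodable".into()) }
}

fn main() {
    'encodings: for &(name, ok, truncated) in
        &[("utf-8", true, false), ("utf-16", false, false), ("big5", true, true)]
    {
        let decoded_payload: Option<String> = match decode_stub(ok) {
            // guard: keep the payload only when the whole sequence was decoded
            Ok(payload) if !truncated => Some(payload),
            Ok(_) => None,
            Err(_) => {
                println!("{name} does not fit the byte sequence at all, skipping");
                continue 'encodings;
            }
        };
        println!("{name}: payload kept = {}", decoded_payload.is_some());
    }
}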

// soft-failed pre-check
// important: this shortcut can occasionally make detection fail
@@ -381,11 +374,11 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset

// main loop over chunks in our input
// we go over bytes or chars - it depends on previous code
let seq_len = match decoded_payload {
let seq_len = match &decoded_payload {
Some(payload) => payload.chars().count(),
None => bytes_length,
};
let starting_offset = match (bom_or_sig_available, decoded_payload) {
let starting_offset = match (bom_or_sig_available, &decoded_payload) {
(true, None) => sig_payload.as_ref().unwrap().len(),
_ => 0,
};
@@ -405,13 +398,9 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
// Bytes processing
None => {
let offset_end = (offset + settings.chunk_size).min(seq_len);
let cut_bytes_vec: Vec<u8> = if bom_or_sig_available && !strip_sig_or_bom {
[sig_payload.as_ref().unwrap(), &bytes[offset..offset_end]].concat()
} else {
bytes[offset..offset_end].to_vec()
};
let cut_bytes_vec: &[u8] = &bytes[offset..offset_end];
decode(
&cut_bytes_vec,
cut_bytes_vec,
encoding_iana,
DecoderTrap::Strict,
false,
@@ -420,11 +409,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
}
};

// ascii in encodings means the windows-1252 codepage, which supports diacritics;
// because of this we will additionally check it with the is_ascii method
if decoded_chunk_result.is_err()
|| (encoding_iana == "ascii" && !decoded_chunk_result.as_ref().unwrap().is_ascii())
{
if is_invalid_chunk(&decoded_chunk_result, encoding_iana) {
trace!(
"LazyStr Loading: After MD chunk decode, code page {} \
does not fit given bytes sequence at ALL. {}",
@@ -446,8 +431,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
if md_ratios.last().unwrap() >= &settings.threshold {
early_stop_count += 1;
}
if early_stop_count >= max_chunk_gave_up || (bom_or_sig_available && !strip_sig_or_bom)
{
if early_stop_count >= max_chunk_gave_up {
break 'chunks_loop;
}
}
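With the BOM special case gone, early termination of the chunk loop is a plain counter check against max_chunk_gave_up. A reduced sketch of that give-up logic (the threshold and ratios here are made-up values):

fn main() {
    let threshold = 0.2_f32;
    let max_chunk_gave_up = 2;
    let md_ratios = [0.05_f32, 0.3, 0.4, 0.1];

    let mut early_stop_count = 0;
    'chunks: for ratio in md_ratios {
        if ratio >= threshold {
            early_stop_count += 1;
        }
        if early_stop_count >= max_chunk_gave_up {
            println!("too many messy chunks, giving up on this encoding");
            break 'chunks;
        }
    }
}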
@@ -462,9 +446,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
false,
false,
);
if decoded_chunk_result.is_err()
|| (encoding_iana == "ascii" && !decoded_chunk_result.as_ref().unwrap().is_ascii())
{
if is_invalid_chunk(&decoded_chunk_result, encoding_iana) {
trace!(
"LazyStr Loading: After final lookup, code page {} does not fit \
given bytes sequence at ALL. {}",
@@ -494,15 +476,15 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
// Preparing those fallbacks in case we got nothing.
if settings.enable_fallback
&& !lazy_str_hard_failure
&& prioritized_encodings.contains(&encoding_iana.to_string())
&& prioritized_encodings.contains(&encoding_iana)
{
let fallback_entry = Some(CharsetMatch::new(
bytes,
encoding_iana,
f32::from(settings.threshold),
false,
&vec![],
decoded_payload,
decoded_payload.as_deref(),
));

match encoding_iana {
Expand Down Expand Up @@ -551,20 +533,19 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
mean_mess_ratio,
bom_or_sig_available,
&cd_ratios_merged,
decoded_payload,
decoded_payload.as_deref(),
));

if (mean_mess_ratio < 0.1 && prioritized_encodings.contains(&encoding_iana.to_string()))
if (mean_mess_ratio < 0.1 && prioritized_encodings.contains(&encoding_iana))
|| encoding_iana == sig_encoding.clone().unwrap_or_default()
{
debug!(
"Encoding detection: {} is most likely the one.",
encoding_iana
);
return CharsetMatches::new(Some(vec![results
.get_by_encoding(encoding_iana)
.unwrap()
.clone()]));
return CharsetMatches::from_single(
results.get_by_encoding(encoding_iana).unwrap().clone(),
);
}
}

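For orientation, here is how this entry point is typically driven. A sketch assuming the crate root exports from_bytes and that get_by_encoding (used later in this diff) is public; the Debug output relies on the impl Debug for CharsetMatch shown in entity.rs above:

use charset_normalizer_rs::from_bytes;

fn main() {
    let bytes = "Cette phrase contient des caractères accentués.".as_bytes();
    // None selects the default NormalizerSettings
    let results = from_bytes(bytes, None);
    if let Some(matched) = results.get_by_encoding("utf-8") {
        println!("utf-8 candidate: {:?}", matched);
    }
}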
36 changes: 16 additions & 20 deletions src/md.rs
@@ -51,9 +51,10 @@ impl MessDetectorPlugin for TooManySymbolOrPunctuationPlugin {
}
fn feed(&mut self, character: char) {
self.character_count += 1;
if (self.last_printable_char.is_none() || character != self.last_printable_char.unwrap())
&& !COMMON_SAFE_ASCII_CHARACTERS.contains(character)
{
let is_different_char = self
.last_printable_char
.map_or(true, |last_char| character != last_char);
if is_different_char && !COMMON_SAFE_ASCII_CHARACTERS.contains(character) {
if is_punctuation(character) {
self.punctuation_count += 1;
} else if !character.is_numeric() && is_symbol(character) && !is_emoticon(character) {
@@ -148,17 +149,15 @@ impl MessDetectorPlugin for SuspiciousDuplicateAccentPlugin {
}
fn feed(&mut self, character: char) {
self.character_count += 1;
if self.last_latin_character.is_some()
&& is_accentuated(character)
&& is_accentuated(self.last_latin_character.unwrap())
{
if character.is_uppercase() && self.last_latin_character.unwrap().is_uppercase() {
self.successive_count += 1;
}

// Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(self.last_latin_character.unwrap()) {
self.successive_count += 1;
if let Some(last_latin_char) = self.last_latin_character {
if is_accentuated(character) && is_accentuated(last_latin_char) {
if character.is_uppercase() && last_latin_char.is_uppercase() {
self.successive_count += 1;
}
// Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(last_latin_char) {
self.successive_count += 1;
}
}
}
self.last_latin_character = Some(character);
@@ -249,16 +248,13 @@ impl MessDetectorPlugin for SuperWeirdWordPlugin {
if is_accentuated(character) {
self.buffer_accent_count += 1;
}
if !self.foreign_long_watch
&& (!is_latin(character) || is_accentuated(character))
self.foreign_long_watch |= (!is_latin(character) || is_accentuated(character))
&& !is_cjk(character)
&& !is_hangul(character)
&& !is_katakana(character)
&& !is_hiragana(character)
&& !is_thai(character)
{
self.foreign_long_watch = true;
}
&& !is_thai(character);

return;
}
if self.buffer.is_empty() {
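All three md.rs changes are the same kind of cleanup: Option combinators or if let in place of is_some()/is_none() followed by unwrap(), and |= in place of a guarded flag assignment. A standalone sketch of the three idioms:

fn main() {
    let last: Option<char> = Some('e');
    let current = 'é';

    // 1. map_or replaces `last.is_none() || current != last.unwrap()`
    let is_different = last.map_or(true, |prev| current != prev);

    // 2. if let replaces `last.is_some() && ... last.unwrap() ...`
    if let Some(prev) = last {
        if prev.is_alphabetic() && current.is_alphabetic() {
            println!("two letters in a row: {prev} {current}");
        }
    }

    // 3. `flag |= cond` replaces `if !flag && cond { flag = true; }`
    let mut foreign_long_watch = false;
    foreign_long_watch |= !current.is_ascii();

    println!("different: {is_different}, watch: {foreign_long_watch}");
}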
15 changes: 10 additions & 5 deletions src/utils.rs
@@ -220,11 +220,6 @@ pub(crate) fn remove_accent(ch: char) -> char {
base_char.map_or(ch, |c| c)
}

pub(crate) fn should_strip_sig_or_bom(_iana_encoding: &str) -> bool {
// it looks like we always remove it in Rust (but not in the Python version)
true
}

// Verify whether a specific encoding is a multi-byte one, based on its IANA name
pub fn is_multi_byte_encoding(name: &str) -> bool {
[
@@ -512,6 +507,16 @@ pub(crate) fn get_language_data(language: &Language) -> Result<(&'static str, bo
Err(String::from("Language wasn't found"))
}

// ascii in encodings means the windows-1252 codepage, which supports diacritics;
// because of this we will additionally check it with the is_ascii method
pub(super) fn is_invalid_chunk(
decoded_chunk_result: &Result<String, String>,
encoding_iana: &str,
) -> bool {
decoded_chunk_result.is_err()
|| (encoding_iana == "ascii" && !decoded_chunk_result.as_ref().is_ok_and(|s| s.is_ascii()))
}
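A quick truth-table check of the new helper. Since pub(super) keeps the real function crate-internal, this sketch re-implements the same logic locally:

// local re-implementation of is_invalid_chunk, for illustration only
fn invalid(decoded: &Result<String, String>, encoding_iana: &str) -> bool {
    decoded.is_err()
        || (encoding_iana == "ascii" && !decoded.as_ref().is_ok_and(|s| s.is_ascii()))
}

fn main() {
    assert!(invalid(&Err("boom".into()), "utf-8")); // hard decode failure
    assert!(invalid(&Ok("café".into()), "ascii")); // claims ascii, text is not
    assert!(!invalid(&Ok("cafe".into()), "ascii")); // genuinely ascii
    assert!(!invalid(&Ok("café".into()), "windows-1252")); // diacritics fine elsewhere
    println!("is_invalid_chunk logic behaves as expected");
}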

// Get large datasets
fn collect_large_sets(dir: &Path) -> Vec<PathBuf> {
let mut files = Vec::new();
