Skip to content

Commit

Permalink
Merge pull request #24 from chris-ha458/fixes
Browse files Browse the repository at this point in the history
Fixes
  • Loading branch information
nickspring authored Oct 2, 2023
2 parents 94e7b0c + 12057b3 commit f6f3bd2
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 133 deletions.
49 changes: 21 additions & 28 deletions src/cd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,17 @@ pub(crate) fn encoding_unicode_range(iana_name: &str) -> Result<Vec<&str>, Strin
let byte_range = 0x40..0xFF; // utf8 range. range.len()==191
let mut result: HashMap<&str, u8> = HashMap::with_capacity(byte_range.len());

for i in byte_range {
encoder
byte_range.for_each(|i| {
if let Some(range) = encoder
.decode(&[i], DecoderTrap::Ignore)
.ok()
.and_then(|chunk| chunk.chars().next())
.and_then(|first_char| unicode_range(&first_char))
.filter(|&range| !is_unicode_range_secondary(range))
.map(|range| {
*result.entry(range).or_insert(0) += 1;
});
}
{
*result.entry(range).or_insert(0) += 1;
}
});
let character_count: u8 = result.values().sum();
let threshold = 0.15;
let mut result: Vec<&str> = result
Expand Down Expand Up @@ -94,7 +94,7 @@ pub(crate) fn alphabet_languages(
ignore_non_latin: bool,
) -> Vec<&'static Language> {
let mut languages: Vec<(&Language, f32)> = vec![];
let source_characters_set: HashSet<_> = characters.iter().cloned().copied().collect();
let source_characters_set: HashSet<char> = characters.iter().copied().copied().collect(); //take a look why copied/cloned is needed twice
let source_has_accents = source_characters_set.iter().any(is_accentuated);

for (language, language_characters, target_have_accents, target_pure_latin) in LANGUAGES.iter()
Expand Down Expand Up @@ -129,21 +129,16 @@ pub(crate) fn alpha_unicode_split(decoded_sequence: &str) -> Vec<String> {

for ch in decoded_sequence.chars().filter(|c| c.is_alphabetic()) {
if let Some(character_range) = unicode_range(&ch) {
let mut layer_target_range: Option<&str> = None;
for discovered_range in layers.keys() {
if !is_suspiciously_successive_range(Some(discovered_range), Some(character_range))
{
layer_target_range = Some(discovered_range);
break;
}
}
let layer = layers
.entry(layer_target_range.get_or_insert(character_range))
.or_default();
let layer_key: &str = layers
.keys()
.find(|key| !is_suspiciously_successive_range(Some(key), Some(character_range)))
.copied()
.unwrap_or(character_range);
let layer = layers.entry(layer_key).or_default();
layer.extend(ch.to_lowercase());
}
}
layers.values().cloned().collect()
layers.into_values().collect()
}

// Determine if an ordered characters list (by occurrence, from most frequent to rarest) matches a particular language.
Expand All @@ -164,7 +159,7 @@ pub(crate) fn characters_popularity_compare(
pub(crate) fn filter_alt_coherence_matches(results: &CoherenceMatches) -> CoherenceMatches {
let mut index: HashMap<&Language, f32> = HashMap::with_capacity(results.len());
for result in results {
let score = index.entry(result.language).or_insert(0.0);
let score = index.entry(result.language).or_default();
*score = result.score.max(*score);
}
index
Expand Down Expand Up @@ -204,9 +199,8 @@ pub(crate) fn coherence_ratio(
include_languages: Option<Vec<&'static Language>>,
) -> Result<CoherenceMatches, String> {
let threshold = f32::from(threshold.unwrap_or(OrderedFloat(0.1)));
let mut include_languages = include_languages.unwrap_or_default();
let ignore_non_latin =
include_languages.len() == 1 && include_languages.first() == Some(&&Language::Unknown);
let mut include_languages: Vec<&Language> = include_languages.unwrap_or_default();
let ignore_non_latin = include_languages == vec![&Language::Unknown];
if ignore_non_latin {
include_languages.clear();
}
Expand Down Expand Up @@ -235,11 +229,10 @@ pub(crate) fn coherence_ratio(
let ratio: f32 =
characters_popularity_compare(language, &popular_character_ordered_as_string)?;

if ratio < threshold {
continue;
}
if ratio >= 0.8 {
sufficient_match_count += 1;
match ratio {
r if r < threshold => continue,
r if r >= 0.8 => sufficient_match_count += 1,
_ => {}
}

results.push(CoherenceMatch {
Expand Down
2 changes: 1 addition & 1 deletion src/consts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ lazy_static! {
pub static ref TOO_BIG_SEQUENCE: usize = 1_000_000; // 1E6

pub(crate) static ref UTF8_MAXIMAL_ALLOCATION: usize = 128;
pub(crate) static ref UNICODE_RANGES_COMBINED: Vec<(&'static str, RangeInclusive<u32>)> = vec![
pub(crate) static ref UNICODE_RANGES_COMBINED: [(&'static str, RangeInclusive<u32>);279] = [
("Control character", 0..=31),
("Basic Latin", 32..=127),
("Latin-1 Supplement", 128..=255),
Expand Down
36 changes: 18 additions & 18 deletions src/entity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ impl CharsetMatch {
}

// Add submatch
pub fn add_submatch(&mut self, submatch: CharsetMatch) {
pub fn add_submatch(&mut self, submatch: &CharsetMatch) {
self.submatch.push(submatch.clone());
//self.decoded_payload = None;
}
Expand All @@ -182,22 +182,22 @@ impl CharsetMatch {
// Most probable language found in decoded sequence. If none were detected or inferred, the property will return
// Language::Unknown
pub fn most_probably_language(&self) -> &'static Language {
self.coherence_matches
.first()
.map(|lang| lang.language)
.unwrap_or_else(|| {
// Trying to infer the language based on the given encoding
// It's either English or we should not pronounce ourselves in certain cases.
self.coherence_matches.first().map_or_else(
// Default case: Trying to infer the language based on the given encoding
|| {
if self.suitable_encodings().contains(&String::from("ascii")) {
return &Language::English;
}
let languages = if is_multi_byte_encoding(&self.encoding) {
mb_encoding_languages(&self.encoding)
&Language::English
} else {
encoding_languages(self.encoding.clone())
};
languages.first().unwrap_or(&&Language::Unknown)
})
let languages = if is_multi_byte_encoding(&self.encoding) {
mb_encoding_languages(&self.encoding)
} else {
encoding_languages(self.encoding.clone())
};
languages.first().copied().unwrap_or(&Language::Unknown)
}
},
|lang| lang.language,
)
}
// Return the complete list of possible languages found in decoded sequence.
// Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
Expand Down Expand Up @@ -297,11 +297,11 @@ impl CharsetMatches {
// We should disable the submatch factoring when the input file is too heavy
// (conserve RAM usage)
if item.payload.len() <= *TOO_BIG_SEQUENCE {
for m in self.items.iter_mut() {
for m in &mut self.items {
if m.decoded_payload() == item.decoded_payload()
&& m.mean_mess_ratio == item.mean_mess_ratio
&& (m.mean_mess_ratio - item.mean_mess_ratio).abs() < f32::EPSILON
{
m.add_submatch(item.clone());
m.add_submatch(&item);
return;
}
}
Expand Down
60 changes: 29 additions & 31 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,14 @@ use crate::utils::{
};
use encoding::DecoderTrap;
use log::{debug, trace};
use std::collections::VecDeque;
use std::fs::{metadata, File};
use std::io::Read;
use std::path::Path;

pub mod assets;
// TODO: Revisit float conversions when we want to push for accuracy
#[allow(clippy::cast_lossless, clippy::cast_precision_loss)]
mod cd;
pub mod consts;
pub mod entity;
Expand Down Expand Up @@ -272,14 +275,14 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
}

// add ascii & utf-8
prioritized_encodings.extend(["ascii", "utf-8"].iter().map(|s| s.to_string()));
prioritized_encodings.extend(["ascii".to_string(), "utf-8".to_string()]);

// generate array of encodings for probing with prioritizing
let mut iana_encodings = IANA_SUPPORTED.clone();
let mut iana_encodings: VecDeque<&str> = VecDeque::from(IANA_SUPPORTED.clone());
for pe in prioritized_encodings.iter().rev() {
if let Some(index) = iana_encodings.iter().position(|x| *x == pe) {
let value = iana_encodings.remove(index);
iana_encodings.insert(0, value);
let value = iana_encodings.remove(index).unwrap();
iana_encodings.push_front(value);
}
}

Expand Down Expand Up @@ -504,12 +507,10 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
decoded_payload,
));

if encoding_iana == specified_encoding {
fallback_specified = fallback_entry;
} else if encoding_iana == "ascii" {
fallback_ascii = fallback_entry;
} else {
fallback_u8 = fallback_entry;
match encoding_iana {
e if e == specified_encoding => fallback_specified = fallback_entry,
"ascii" => fallback_ascii = fallback_entry,
_ => fallback_u8 = fallback_entry,
}
}
continue 'iana_encodings_loop;
Expand All @@ -525,15 +526,14 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
// Most of the time its not relevant to run "language-detection" on it.
let mut cd_ratios: Vec<CoherenceMatches> = vec![];
if encoding_iana != "ascii" {
for chunk in md_chunks {
if let Ok(chunk_coherence_matches) = coherence_ratio(
chunk,
cd_ratios.extend(md_chunks.iter().filter_map(|chunk| {
coherence_ratio(
chunk.clone(),
Some(settings.language_threshold),
Some(target_languages.clone()),
) {
cd_ratios.push(chunk_coherence_matches);
}
}
)
.ok()
}));
}

// process cd ratios
Expand Down Expand Up @@ -572,26 +572,24 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset

// fallbacks
if results.is_empty() {
let mut fb: Option<&CharsetMatch> = None;
if fallback_specified.is_some() {
fb = Some(fallback_specified.as_ref().unwrap());
} else if fallback_u8.is_some()
&& (fallback_ascii.is_none()
|| (fallback_ascii.is_some()
&& fallback_u8.as_ref().unwrap().decoded_payload()
!= fallback_ascii.as_ref().unwrap().decoded_payload()))
{
fb = Some(fallback_u8.as_ref().unwrap());
} else if fallback_ascii.is_some() {
fb = Some(fallback_ascii.as_ref().unwrap());
}
let fb = match (&fallback_specified, &fallback_u8, &fallback_ascii) {
(Some(specified), _, _) => Some(specified),
(None, Some(u8_fallback), None) => Some(u8_fallback),
(None, Some(u8_fallback), Some(ascii))
if u8_fallback.decoded_payload() != ascii.decoded_payload() =>
{
Some(u8_fallback)
}
(None, _, Some(ascii)) => Some(ascii),
_ => None,
};
if let Some(fb_to_pass) = fb {
debug!(
"Encoding detection: will be used as a fallback match {}",
fb_to_pass.encoding()
);
results.append(fb_to_pass.clone());
}
};
}

// final logger information
Expand Down
Loading

0 comments on commit f6f3bd2

Please sign in to comment.