From 7994b9eff3af22b86e53f03b61142d69368db0f8 Mon Sep 17 00:00:00 2001 From: Teo Date: Sun, 8 Jan 2023 17:14:07 +0100 Subject: [PATCH] allowing to change tokenize fct --- src/handler.rs | 8 ++++---- src/idiom/idiom.rs | 14 ++++++++++---- src/idiom/mod.rs | 3 ++- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/handler.rs b/src/handler.rs index d8fc26e..0ac711e 100644 --- a/src/handler.rs +++ b/src/handler.rs @@ -16,7 +16,7 @@ use serenity::{ prelude::*, utils::Color }; use wordcloud_rs::{Token, WordCloud, Colors}; -use crate::idiom::Idioms; +use crate::idiom::{Idioms, tokenize}; const READ_PAST: u64 = 10000; fn convert_color(color: Color) -> Rgb<u8> { @@ -39,7 +39,7 @@ impl Handler { } pub fn message(&self, guild_id: GuildId, channel_id: ChannelId, member_id: UserId, message: String) { - self.idioms.get_mut(&guild_id).unwrap().update(channel_id, member_id, message); + self.idioms.get_mut(&guild_id).unwrap().update(channel_id, member_id, tokenize(message)); } fn to_wc_tokens(&self, tokens: Vec<(String, f32)>) -> Vec<(Token, f32)> { @@ -115,7 +115,7 @@ impl Handler { ).await { for message in messages { idioms.get_mut(&guild.id).unwrap().update( - channel_id, message.author.id, message.content + channel_id, message.author.id, tokenize(message.content) ); } info!(target: "Wordy", "Read {} past messages in {}/{}", READ_PAST, guild.name, channel.name()) @@ -127,7 +127,7 @@ impl Handler { } pub async fn register_commands(&self, http: Arc<Http>, guild_id: GuildId) { - println!("Registering slash commands for Guild {}", guild_id); + trace!("Registering slash commands for Guild {}", guild_id); if let Err(why) = GuildId::set_application_commands(&guild_id, http, |commands| { commands diff --git a/src/idiom/idiom.rs b/src/idiom/idiom.rs index 3219a24..b91d6e3 100644 --- a/src/idiom/idiom.rs +++ b/src/idiom/idiom.rs @@ -15,9 +15,15 @@ lazy_static!
{ static ref RE_TOKEN: Regex = Regex::new(r"\w+").unwrap(); } -fn tokenize(text: String) -> Vec<(String, f32)> { +pub fn tokenize(text: String) -> Vec<String> { + RE_TOKEN.find_iter(&text) + .map(|token| token.as_str().to_string()) + .collect_vec() +} + +fn counts(tokens: Vec<String>) -> Vec<(String, f32)> { let mut counts: HashMap<String, usize> = HashMap::new(); - for token in RE_TOKEN.find_iter(&text) { + for token in tokens { *counts.entry(token.as_str().to_string()).or_default() += 1; } counts.into_iter().map(|(k, v)| (k, v as f32)).collect() @@ -40,10 +46,10 @@ impl Idioms { } } - pub fn update(&mut self, place: P, person: U, message: String) { + pub fn update(&mut self, place: P, person: U, tokens: Vec<String>) { let place_voc = self.places.entry(place).or_insert(TopFreqs::new()); let user_voc = self.people.entry(person).or_insert(TopFreqs::new()); - let tokens = tokenize(message); + let tokens = counts(tokens); for (token, value) in tokens { let idx = match self.tokens.get_by_left(&token) { Some(v) => *v, diff --git a/src/idiom/mod.rs b/src/idiom/mod.rs index 06d8af2..20ed129 100644 --- a/src/idiom/mod.rs +++ b/src/idiom/mod.rs @@ -1,3 +1,4 @@ mod idiom; mod top_freqs; -pub use idiom::Idioms; \ No newline at end of file +pub use idiom::Idioms; +pub use idiom::tokenize; \ No newline at end of file