From 923d40c94fbe1627b8e59954bfe5bd4c23c78479 Mon Sep 17 00:00:00 2001 From: Yoonchae Lee Date: Tue, 25 Jun 2024 15:31:43 +0900 Subject: [PATCH] dev: remove extra/tokenizers --- Cargo.lock | 124 ++++-------------------------- Cargo.toml | 1 - extra/tokenizers/.gitignore | 1 - extra/tokenizers/Cargo.toml | 19 ----- extra/tokenizers/README.md | 10 --- extra/tokenizers/src/lib.rs | 3 - extra/tokenizers/src/lindera.rs | 69 ----------------- extra/tokenizers/src/main.rs | 106 ------------------------- extra/tokenizers/src/types.rs | 20 ----- extra/tokenizers/src/vaporetto.rs | 45 ----------- 10 files changed, 14 insertions(+), 384 deletions(-) delete mode 100644 extra/tokenizers/.gitignore delete mode 100644 extra/tokenizers/Cargo.toml delete mode 100644 extra/tokenizers/README.md delete mode 100644 extra/tokenizers/src/lib.rs delete mode 100644 extra/tokenizers/src/lindera.rs delete mode 100644 extra/tokenizers/src/main.rs delete mode 100644 extra/tokenizers/src/types.rs delete mode 100644 extra/tokenizers/src/vaporetto.rs diff --git a/Cargo.lock b/Cargo.lock index 3b5f3136..8ff93e22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,18 +24,6 @@ dependencies = [ "cpufeatures", ] -[[package]] -name = "ahash" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" -dependencies = [ - "cfg-if", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.3" @@ -211,25 +199,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bincode" -version = "2.0.0-rc.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f11ea1a0346b94ef188834a65c068a03aec181c94896d481d7a0a40d85b0ce95" -dependencies = [ - "bincode_derive", - "serde", -] - -[[package]] -name = "bincode_derive" -version = "2.0.0-rc.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e30759b3b99a1b802a7a3aa21c85c3ded5c28e1c83170d82d70f08bbf7f3e4c" -dependencies = [ - "virtue", -] - [[package]] name = "bitflags" version = "1.3.2" @@ -536,12 +505,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "daachorse" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36" - [[package]] name = "dashmap" version = "5.5.3" @@ -549,7 +512,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" dependencies = [ "cfg-if", - "hashbrown 0.14.3", + "hashbrown", "lock_api", "once_cell", "parking_lot_core", @@ -833,15 +796,6 @@ dependencies = [ "scroll", ] -[[package]] -name = "hashbrown" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" -dependencies = [ - "ahash", -] - [[package]] name = "hashbrown" version = "0.14.3" @@ -948,7 +902,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown", ] [[package]] @@ -1031,7 +985,7 @@ version = "0.28.0" source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ "anyhow", - "bincode 1.3.3", + "bincode", "byteorder", "csv", "encoding", @@ -1049,7 +1003,7 @@ version = "0.28.0" 
source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ "anyhow", - "bincode 1.3.3", + "bincode", "byteorder", "encoding_rs", "log", @@ -1075,7 +1029,7 @@ version = "0.28.0" source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ "anyhow", - "bincode 1.3.3", + "bincode", "byteorder", "lindera-cc-cedict-builder", "lindera-core", @@ -1092,7 +1046,7 @@ version = "0.28.0" source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ "anyhow", - "bincode 1.3.3", + "bincode", "byteorder", "csv", "encoding_rs", @@ -1112,7 +1066,7 @@ version = "0.28.0" source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ "anyhow", - "bincode 1.3.3", + "bincode", "byteorder", "csv", "encoding_rs", @@ -1132,7 +1086,7 @@ version = "0.28.0" source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ "anyhow", - "bincode 1.3.3", + "bincode", "byteorder", "csv", "encoding", @@ -1149,7 +1103,7 @@ name = "lindera-tokenizer" version = "0.28.0" source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ - "bincode 1.3.3", + "bincode", "lindera-core", "lindera-dictionary", "once_cell", @@ -1163,7 +1117,7 @@ version = "0.28.0" source = "git+https://github.com/BlueGreenMagick/lindera.git?branch=yomikiri#aae9574ba437c5e0b3997444861673a8363ac9e7" dependencies = [ "anyhow", - "bincode 1.3.3", + "bincode", "byteorder", "csv", "encoding", @@ -1871,19 +1825,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tokenizers" -version = "0.1.0" -dependencies = [ - "anyhow", - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", - "log", - "vaporetto", - "yomikiri-unidic-types", -] - [[package]] name = "toml" version = "0.5.11" @@ -2054,7 +1995,7 @@ name = "uniffi_macros" version = "0.27.1" source = "git+https://github.com/BlueGreenMagick/uniffi-rs.git?branch=custom#e6512c150410854e1d50b4dadf1350bedcb25cfb" dependencies = [ - "bincode 1.3.3", + "bincode", "camino", "fs-err", "once_cell", @@ -2147,17 +2088,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" -[[package]] -name = "vaporetto" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e19c41effc32127fdfb9dcfa256627e5c4c750e1cb23f94ce8f685711d0fe17" -dependencies = [ - "bincode 2.0.0-rc.3", - "daachorse", - "hashbrown 0.13.2", -] - [[package]] name = "vec_map" version = "0.8.2" @@ -2170,12 +2100,6 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" -[[package]] -name = "virtue" -version = "0.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcc60c0624df774c82a0ef104151231d37da4962957d691c011c852b2473314" - [[package]] name = "walkdir" version = "2.5.0" @@ -2510,7 +2434,7 @@ checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" name = "yomikiri-dictionary" 
version = "0.1.0" dependencies = [ - "bincode 1.3.3", + "bincode", "byteorder", "cfg_aliases 0.1.1", "chrono", @@ -2535,7 +2459,7 @@ name = "yomikiri-jmdict" version = "0.1.0" dependencies = [ "RustyXML", - "bincode 1.3.3", + "bincode", "regex", "thiserror", ] @@ -2544,7 +2468,7 @@ dependencies = [ name = "yomikiri-rs" version = "0.1.0" dependencies = [ - "bincode 1.3.3", + "bincode", "cfg_aliases 0.2.0", "console_error_panic_hook", "flate2", @@ -2591,26 +2515,6 @@ dependencies = [ name = "yomikiri-unidic-types" version = "0.1.0" -[[package]] -name = "zerocopy" -version = "0.7.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.7.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.60", -] - [[package]] name = "zeroize" version = "1.7.0" diff --git a/Cargo.toml b/Cargo.toml index 03c52afd..05e186d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,6 @@ members = [ "unidic", "unidic-types", "extra/generate-license/rust", - "extra/tokenizers", ] resolver = "2" diff --git a/extra/tokenizers/.gitignore b/extra/tokenizers/.gitignore deleted file mode 100644 index 09bc9883..00000000 --- a/extra/tokenizers/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/resources \ No newline at end of file diff --git a/extra/tokenizers/Cargo.toml b/extra/tokenizers/Cargo.toml deleted file mode 100644 index 2278e819..00000000 --- a/extra/tokenizers/Cargo.toml +++ /dev/null @@ -1,19 +0,0 @@ -[package] -name = "tokenizers" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -anyhow = "1.0" -log = "0.4" -vaporetto = { version = "0.6" } -lindera-tokenizer = { git = "https://github.com/BlueGreenMagick/lindera.git", branch = "yomikiri" } -lindera-dictionary = { git = "https://github.com/BlueGreenMagick/lindera.git", branch = "yomikiri" } -lindera-core = { git = "https://github.com/BlueGreenMagick/lindera.git", branch = "yomikiri" } -yomikiri-unidic-types = { path = "../../unidic-types" } - - -[lints] -workspace = true diff --git a/extra/tokenizers/README.md b/extra/tokenizers/README.md deleted file mode 100644 index 0195931e..00000000 --- a/extra/tokenizers/README.md +++ /dev/null @@ -1,10 +0,0 @@ -Compare tokenizers with tokenizer accuracy, startup time, tokenization time, memory usage, file size. - -### Vaporetto - -[Vaporetto](https://github.com/daac-tools/vaporetto) -Using model from https://github.com/daac-tools/vaporetto-models/releases/tag/v0.5.0 - -Vaporetto seemed like a good fit for Yomikiri with its model size quite small at 58MB, and with better accuracy than mecab unidic v3.1. - -However, Vaporetto turned out to be not suitable for use in Yomikiri. It has a very slow startup time, at 37s (compared to Lindera's 0.03ms). The small model size was due to using bincode VarIntEncoding, and using FixedIntEncoding balloned the size up to 120MB. Memory allocation of Vaporetto Predictor is estimated to be about 250MB (estimated using jemalloc-ctl), so it cannot fit in ios extension's 80MB memory limit. It seems difficult for Vaporetto to use static memory like Lindera can do. 
diff --git a/extra/tokenizers/src/lib.rs b/extra/tokenizers/src/lib.rs deleted file mode 100644 index dbf4887c..00000000 --- a/extra/tokenizers/src/lib.rs +++ /dev/null @@ -1,3 +0,0 @@ -pub mod lindera; -pub mod types; -pub mod vaporetto; diff --git a/extra/tokenizers/src/lindera.rs b/extra/tokenizers/src/lindera.rs deleted file mode 100644 index 980148dd..00000000 --- a/extra/tokenizers/src/lindera.rs +++ /dev/null @@ -1,69 +0,0 @@ -use std::borrow::Cow; - -use anyhow::Result; -use lindera_core::mode::Mode; -use lindera_core::{ - character_definition::CharacterDefinitions, connection::ConnectionCostMatrix, - dictionary::Dictionary, prefix_dict::PrefixDict, unknown_dictionary::UnknownDictionary, -}; -use lindera_tokenizer::tokenizer::Tokenizer; -use yomikiri_unidic_types::UnidicPos; - -use crate::types::Token; - -pub struct Lindera { - pub tokenizer: Tokenizer, -} - -impl Lindera { - pub fn load( - unidic_data: Cow<'static, [u8]>, - unidic_vals: Cow<'static, [u8]>, - connection_data: Cow<'static, [u8]>, - char_definitions: Cow<'static, [u8]>, - unknown_data: Cow<'static, [u8]>, - words_idx: Cow<'static, [u8]>, - words_data: Cow<'static, [u8]>, - ) -> Result<Self> { - let dictionary = Dictionary { - dict: PrefixDict::from_static_slice(&unidic_data, &unidic_vals), - cost_matrix: match connection_data { - Cow::Owned(owned) => ConnectionCostMatrix::load(&owned), - Cow::Borrowed(borrowed) => ConnectionCostMatrix::load_static(&borrowed), - }, - char_definitions: CharacterDefinitions::load(&char_definitions)?, - unknown_dictionary: UnknownDictionary::load(&unknown_data)?, - words_idx_data: words_idx, - words_data: words_data, - }; - let tokenizer = Tokenizer::new(dictionary, None, Mode::Normal); - - Ok(Self { tokenizer }) - } - - pub fn tokenize(&self, text: &str) -> Result<Vec<Token>> { - let mut ltokens = self.tokenizer.tokenize(text)?; - let tokens: Vec<Token> = ltokens - .iter_mut() - .map(|t| { - let surface = t.text.to_string(); - let details = t.get_details().unwrap_or(vec![]); - let (pos, pos2) = details - .get(0) - .and_then(|p| p.as_bytes().get(0)) - .and_then(|short| UnidicPos::from_short(*short).ok()) - .map(|pos| pos.to_unidic()) - .unwrap_or(("UNK", "*")); - let pos = pos.to_string(); - let pos2 = pos2.to_string(); - Token { - surface, - pos, - reading: details.get(2).unwrap_or(&"").to_string(), - others: vec![pos2], - } - }) - .collect(); - Ok(tokens) - } -} diff --git a/extra/tokenizers/src/main.rs b/extra/tokenizers/src/main.rs deleted file mode 100644 index 80441357..00000000 --- a/extra/tokenizers/src/main.rs +++ /dev/null @@ -1,106 +0,0 @@ -use std::fs; -use std::path::Path; -use std::time::{SystemTime, UNIX_EPOCH}; - -use anyhow::{Context, Result}; -use tokenizers::lindera::Lindera; -use tokenizers::vaporetto::{create_tokenizer, Vaporetto}; - -macro_rules!
test { - ( - $($tokenizer:ident)+; - $($sentence:literal)+ - ) => { - let sentences = vec![$($sentence),+]; - for sentence in &sentences { - println!(""); - println!("Tokenizing: {}", sentence); - $( - let start = time_now(); - let tokens = $tokenizer.tokenize(&sentence)?; - let end = time_now(); - println!("{}: {:.2}ms", stringify!($tokenizer), end - start); - println!("{:?}", &tokens); - )+ - }; - } -} - -fn main() -> Result<()> { - println!("\nPreparing Lindera..."); - let lindera = prepare_lindera()?; - println!("\n"); - - test!( - lindera; - " " - ); - - Ok(()) -} - -fn prepare_vaporetto() -> Result { - let path = Path::new("./resources/bccwj-suw+unidic_pos+kana.model"); - let bytes = fs::read(&path).with_context(|| format!("Could not open file at {:?}", &path))?; - let start = time_now(); - let vaporetto = create_tokenizer(&bytes[..])?; - let end = time_now(); - println!("Time taken to initialize vaporetto: {:.2}ms", end - start); - Ok(vaporetto) -} - -fn prepare_lindera() -> Result { - let start = time_now(); - let lindera = load_lindera::load()?; - let end = time_now(); - println!("Time taken to initialize lindera: {:.2}ms", end - start); - Ok(lindera) -} - -// in ms. -fn time_now() -> f64 { - let micro = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_micros(); - (micro as f64) / 1000.0 -} - -mod load_lindera { - use anyhow::Result; - use std::borrow::Cow; - use tokenizers::lindera::Lindera; - - macro_rules! lindera_data { - ($name: ident, $filename: literal) => { - const $name: &'static [u8] = - include_bytes!(concat!("../resources/lindera/", $filename)); - }; - } - - macro_rules! cow { - ($name: ident) => { - Cow::Borrowed($name) - }; - } - - lindera_data!(CHAR_DEFINITION_DATA, "char_def.bin"); - lindera_data!(CONNECTION_DATA, "matrix.mtx"); - lindera_data!(UNIDIC_DATA, "dict.da"); - lindera_data!(UNIDIC_VALS, "dict.vals"); - lindera_data!(UNKNOWN_DATA, "unk.bin"); - lindera_data!(WORDS_IDX_DATA, "dict.wordsidx"); - lindera_data!(WORDS_DATA, "dict.words"); - - pub fn load() -> Result { - Lindera::load( - cow!(UNIDIC_DATA), - cow!(UNIDIC_VALS), - cow!(CONNECTION_DATA), - cow!(CHAR_DEFINITION_DATA), - cow!(UNKNOWN_DATA), - cow!(WORDS_IDX_DATA), - cow!(WORDS_DATA), - ) - } -} diff --git a/extra/tokenizers/src/types.rs b/extra/tokenizers/src/types.rs deleted file mode 100644 index 8e7b2bda..00000000 --- a/extra/tokenizers/src/types.rs +++ /dev/null @@ -1,20 +0,0 @@ -use std::fmt::{Debug, Write}; - -#[derive(Clone)] -pub struct Token { - pub surface: String, - pub pos: String, - pub reading: String, - pub others: Vec, -} - -impl Debug for Token { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(&self.surface)?; - //f.write_char('/')?; - // f.write_str(&self.pos.split('-').next().unwrap_or(""))?; - f.write_char(':')?; - f.write_str(&self.reading)?; - Ok(()) - } -} diff --git a/extra/tokenizers/src/vaporetto.rs b/extra/tokenizers/src/vaporetto.rs deleted file mode 100644 index a8332308..00000000 --- a/extra/tokenizers/src/vaporetto.rs +++ /dev/null @@ -1,45 +0,0 @@ -use crate::types::Token; -use anyhow::Result; -use std::borrow::Cow; -use std::io::Read; -use vaporetto::{Model, Predictor, Sentence}; - -pub struct Vaporetto { - predictor: Predictor, -} - -pub fn create_tokenizer(reader: R) -> Result { - let model = Model::read(reader)?; - let predictor = Predictor::new(model, true)?; - let tokenizer = Vaporetto::new(predictor); - Ok(tokenizer) -} - -impl Vaporetto { - pub fn new(predictor: Predictor) -> Self { - Vaporetto { predictor 
} - } - - pub fn tokenize(&self, text: &str) -> Result<Vec<Token>> { - let mut s = Sentence::from_raw(text)?; - self.predictor.predict(&mut s); - s.fill_tags(); - let tokens = s - .iter_tokens() - .map(|t| { - let tags = t.tags(); - Token { - surface: t.surface().into(), - pos: tags[0].as_ref().unwrap_or(&Cow::Borrowed("")).to_string(), - reading: tags[1].as_ref().unwrap_or(&Cow::Borrowed("")).to_string(), - others: tags - .iter() - .skip(2) - .map(|s| s.as_ref().unwrap_or(&Cow::Borrowed("")).to_string()) - .collect(), - } - }) - .collect(); - Ok(tokens) - } -}