Skip to content

Commit

Permalink
feat!: api adjustment (#48)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChieloNewctle authored Mar 27, 2024
1 parent 1b3241e commit f9b2712
Show file tree
Hide file tree
Showing 13 changed files with 326 additions and 307 deletions.
29 changes: 15 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ flowchart LR
## Examples

```rust
use general_sam::{GeneralSAM, BTreeTransTable};
use general_sam::{GeneralSam, BTreeTransTable};

let sam = GeneralSAM::<BTreeTransTable<_>>::from_bytes("abcbc");
let sam = GeneralSam::<BTreeTransTable<_>>::from_bytes("abcbc");

// "cbc" is a suffix of "abcbc"
assert!(sam.get_root_state().feed_bytes("cbc").is_accepting());
Expand All @@ -49,38 +49,38 @@ assert!(!sam.get_root_state().feed_bytes("bcb").is_accepting());
```

```rust
use general_sam::{GeneralSAM, BTreeTransTable};
use general_sam::{GeneralSam, BTreeTransTable};

let sam = GeneralSAM::<BTreeTransTable<_>>::from_chars("abcbc".chars());
let sam = GeneralSam::<BTreeTransTable<_>>::from_chars("abcbc");

let state = sam.get_root_state();
let mut state = sam.get_root_state();

// "b" is not a suffix but at least a substring of "abcbc"
let state = state.feed_chars("b");
state.feed_chars("b");
assert!(!state.is_accepting());

// "bc" is a suffix of "abcbc"
let state = state.feed_chars("c");
state.feed_chars("c");
assert!(state.is_accepting());

// "bcbc" is a suffix of "abcbc"
let state = state.feed_chars("bc");
state.feed_chars("bc");
assert!(state.is_accepting());

// "bcbcbc" is not a substring, much less a suffix of "abcbc"
let state = state.feed_chars("bc");
state.feed_chars("bc");
assert!(!state.is_accepting() && state.is_nil());
```

```rust
// NOTE: This example requires the `trie` feature.
use general_sam::{GeneralSAM, Trie, BTreeTransTable};
# #[cfg(feature = "trie")] {
use general_sam::{GeneralSam, Trie, BTreeTransTable};

let mut trie = Trie::<BTreeTransTable<_>>::default();
trie.insert_iter("hello".chars());
trie.insert_iter("Chielo".chars());
trie.insert("hello".chars());
trie.insert("Chielo".chars());

let sam = GeneralSAM::<BTreeTransTable<_>>::from_trie(trie.get_root_state());
let sam = GeneralSam::<BTreeTransTable<_>>::from_trie(trie.get_root_state());

assert!(sam.get_root_state().feed_chars("lo").is_accepting());
assert!(sam.get_root_state().feed_chars("ello").is_accepting());
Expand All @@ -91,6 +91,7 @@ assert!(!sam.get_root_state().feed_chars("el").is_nil());

assert!(!sam.get_root_state().feed_chars("bye").is_accepting());
assert!(sam.get_root_state().feed_chars("bye").is_nil());
# }
```

## References
Expand Down
8 changes: 4 additions & 4 deletions benches/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
use general_sam::{
table::{BoxBisectTable, HashTransTable, VecBisectTable},
tokenize::{trie::greedy_tokenize_with_trie, GreedyTokenizer},
BTreeTransTable, GeneralSAM, TransitionTable, Trie,
BTreeTransTable, GeneralSam, TransitionTable, Trie,
};
use rand::{
distributions::{Alphanumeric, DistString},
Expand Down Expand Up @@ -128,7 +128,7 @@ fn tokenize_with_hf(tokenizer: &HFTokenizer, seq: &str) -> Vec<u32> {
}

fn tokenize_with_sam<T: TransitionTable<KeyType = char>>(
tokenizer: &GreedyTokenizer<T, u32, &GeneralSAM<T>>,
tokenizer: &GreedyTokenizer<T, u32, &GeneralSam<T>>,
seq: &str,
) -> Vec<u32> {
tokenizer
Expand All @@ -153,7 +153,7 @@ fn build_trie<T: TransitionTable<KeyType = char>>(vocab: &Vocab) -> (Trie<T>, Ve
let mut trie = Trie::<BTreeTransTable<_>>::default();
let mut trie_id_and_token_id = Vec::new();
for (k, v) in vocab.iter() {
let node_id = trie.insert_iter(k.chars());
let node_id = trie.insert_chars(k);
trie_id_and_token_id.push((node_id, *v));
}
let mut trie_to_token = vec![0; trie.num_of_nodes()];
Expand All @@ -178,7 +178,7 @@ fn criterion_benchmark<TransTable: TransitionTable<KeyType = char>>(c: &mut Crit
println!("building trie...");
let (trie, trie_to_token) = build_trie::<TransTable>(&vocab);
println!("building sam...");
let sam = GeneralSAM::<BTreeTransTable<_>>::from_trie(trie.get_root_state())
let sam = GeneralSam::<BTreeTransTable<_>>::from_trie(trie.get_root_state())
.alter_trans_table_into::<TransTable>();
println!("building greedy tokenizer...");
let tokenizer =
Expand Down
34 changes: 20 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
//! # Examples
//!
//! ```rust
//! use general_sam::{GeneralSAM, BTreeTransTable};
//! use general_sam::{GeneralSam, BTreeTransTable};
//!
//! let sam = GeneralSAM::<BTreeTransTable<_>>::from_bytes("abcbc");
//! let sam = GeneralSam::<BTreeTransTable<_>>::from_bytes("abcbc");
//!
//! // "cbc" is a suffix of "abcbc"
//! assert!(sam.get_root_state().feed_bytes("cbc").is_accepting());
Expand All @@ -39,38 +39,38 @@
//! ```
//!
//! ```rust
//! use general_sam::{GeneralSAM, BTreeTransTable};
//! use general_sam::{GeneralSam, BTreeTransTable};
//!
//! let sam = GeneralSAM::<BTreeTransTable<_>>::from_chars("abcbc".chars());
//! let sam = GeneralSam::<BTreeTransTable<_>>::from_chars("abcbc");
//!
//! let state = sam.get_root_state();
//! let mut state = sam.get_root_state();
//!
//! // "b" is not a suffix but at least a substring of "abcbc"
//! let state = state.feed_chars("b");
//! state.feed_chars("b");
//! assert!(!state.is_accepting());
//!
//! // "bc" is a suffix of "abcbc"
//! let state = state.feed_chars("c");
//! state.feed_chars("c");
//! assert!(state.is_accepting());
//!
//! // "bcbc" is a suffix of "abcbc"
//! let state = state.feed_chars("bc");
//! state.feed_chars("bc");
//! assert!(state.is_accepting());
//!
//! // "bcbcbc" is not a substring, much less a suffix of "abcbc"
//! let state = state.feed_chars("bc");
//! state.feed_chars("bc");
//! assert!(!state.is_accepting() && state.is_nil());
//! ```
//!
//! ```rust
//! # #[cfg(feature = "trie")] {
//! use general_sam::{GeneralSAM, Trie, BTreeTransTable};
//! use general_sam::{GeneralSam, Trie, BTreeTransTable};
//!
//! let mut trie = Trie::<BTreeTransTable<_>>::default();
//! trie.insert_iter("hello".chars());
//! trie.insert_iter("Chielo".chars());
//! trie.insert("hello".chars());
//! trie.insert("Chielo".chars());
//!
//! let sam = GeneralSAM::<BTreeTransTable<_>>::from_trie(trie.get_root_state());
//! let sam = GeneralSam::<BTreeTransTable<_>>::from_trie(trie.get_root_state());
//!
//! assert!(sam.get_root_state().feed_chars("lo").is_accepting());
//! assert!(sam.get_root_state().feed_chars("ello").is_accepting());
Expand Down Expand Up @@ -101,7 +101,7 @@ pub mod trie_alike;

pub use {
sam::{
GeneralSAM, GeneralSAMNode, GeneralSAMNodeID, GeneralSAMState, SAM_NIL_NODE_ID,
GeneralSam, GeneralSamNode, GeneralSamNodeID, GeneralSamState, SAM_NIL_NODE_ID,
SAM_ROOT_NODE_ID,
},
table::{
Expand All @@ -127,3 +127,9 @@ pub use utils::{rope, suffixwise, tokenize, tokenize::GreedyTokenizer};

#[cfg(test)]
mod tests;

// Compiled only when `cfg(doctest)` is set, so this module adds nothing to
// regular builds of the crate.
#[cfg(doctest)]
mod _doctest_readme {
// Attach the contents of `../README.md` as the documentation of a private
// unit struct. The intent is for the fenced Rust examples in the README to be
// collected and run as doctests, keeping them in sync with the public API.
#[doc = include_str!("../README.md")]
struct ReadMe;
}
Loading

0 comments on commit f9b2712

Please sign in to comment.