From e0dd1b54e05b42aace5705aa3cb4d7bc6efcf8af Mon Sep 17 00:00:00 2001 From: Huanghe Date: Sat, 21 Oct 2023 13:26:53 -0500 Subject: [PATCH] make termination only need one exhausted path --- Cargo.lock | 2 +- README.md | 6 +++--- assets/grammar.bnf | 4 ++-- bnf_sampler/Cargo.toml | 2 +- bnf_sampler/src/sampler.rs | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7df33ba..a9776c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "bnf_sampler" -version = "0.3.2" +version = "0.3.3" dependencies = [ "anyhow", "bit-set", diff --git a/README.md b/README.md index f681c9a..09b41e1 100644 --- a/README.md +++ b/README.md @@ -79,10 +79,10 @@ In this project, a slightly modified version of BNF is used. The key differences The possible tokens listed are the tokens that can be accepted by the sampler in its current state. The following rule defines whether a token is listed in the return value of `Sampler::all_possible_tokens` with a given BNF: -- The sampler has not terminated or gets into an invalid state. In other words, there are still terms not consumed in the sampler, and the current input token can be accepted by the sampler. +- The sampler has not terminated or gotten into an invalid state. In other words, the current input token can be accepted by the sampler, and no path exists such that all the terminals and nonterminals are consumed in the path. - - e.g. With `::=, ::='cryscan', ::='hex', ::='wanicca'`,`` will create a sampler that terminates after `cryscan`,`hex`,`wanicca` are inputed in this exact sequence, and goes into an invalid state otherwise. - - e.g. `::=|` will create a sampler that never terminate as `` can always become ``. + - e.g. With `::=, ::='boy', ::='next', ::='door'`,`` will create a sampler that terminates after `boy`,`next`,`door` are inputted in this exact sequence, and goes into an invalid state otherwise. + - e.g. 
`::=|` will create a sampler that terminates after any input token because of the path where `` becomes ``. In other words, `` is the only nonterminal in the path and is consumed. - For a given terminal, only the longest possible token is listed. diff --git a/assets/grammar.bnf b/assets/grammar.bnf index 3256930..faaac56 100644 --- a/assets/grammar.bnf +++ b/assets/grammar.bnf @@ -1,4 +1,4 @@ -::= -::=' ''"' +::='*' +::=| ::=| ::='a'|'b'|'c'|'d'|'e'|'f'|'g'|'h'|'i'|'j'|'k'|'l'|'m'|'n'|'o'|'p'|'q'|'r'|'s'|'t'|'u'|'v'|'w'|'x'|'y'|'z'|'A'|'B'|'C'|'D'|'E'|'F'|'G'|'H'|'I'|'J'|'K'|'L'|'M'|'N'|'P'|'P'|'Q'|'R'|'S'|'T'|'U'|'V'|'W'|'X'|'Y'|'Z' \ No newline at end of file diff --git a/bnf_sampler/Cargo.toml b/bnf_sampler/Cargo.toml index 432cb47..50cbf84 100644 --- a/bnf_sampler/Cargo.toml +++ b/bnf_sampler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bnf_sampler" -version = "0.3.2" +version = "0.3.3" edition = "2021" license = "MIT OR Apache-2.0" description = "A crate that uses recursive descent algorithm to ensure tokens produced by a large language model follow a Backus Naur Form schema." diff --git a/bnf_sampler/src/sampler.rs b/bnf_sampler/src/sampler.rs index 36108f8..a606800 100644 --- a/bnf_sampler/src/sampler.rs +++ b/bnf_sampler/src/sampler.rs @@ -311,7 +311,7 @@ impl Sampler { self.stack_arena.clear(); // println!("failed: {:?}",failed_prefixs); } - println!("stack: {:?}, {:?}", stack, now.elapsed()); + // println!("stack: {:?}, {:?}", stack, now.elapsed()); // println!("{:?}",failed_prefixs); } entry.insert(self.token_ids.clone()); @@ -374,7 +374,7 @@ impl Sampler { self.stacks.swap_remove(i); } if accepted { - if self.stacks.is_empty() || self.stacks.iter().all(|x| x.is_empty()) { + if self.stacks.is_empty() || self.stacks.iter().any(|x| x.is_empty()) { return Ok(AcceptTokenResult::End); } Ok(AcceptTokenResult::Continue)