-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Refactor] Major API Refactor for Release (#71)
This PR refactors the architecture of XGrammar to prepare for release. It notably does these things 1. Reorganize APIs for simplicity. See python files under `python/` folder for the new APIs. 2. Reorganize file structures. Separate xgrammar.py and xgrammar.h into several files, one for each functionality. 3. Enhance multi-thread compiling of grammars 4. Rename. Notably `BNFGrammar` -> `Grammar`, `RulePositionTree` -> `PersistentStack`, `CatagorizedTokens` -> `AdaptiveTokenMask` 5. Provide a new kernel for applying mask with better performance. See #35 (comment) for benchmark results JS bindings could be broken and need to be fixed. cc @tqchen @MasterJH5574 @CharlieFRuan @DarkSharpness @merrymercy
- Loading branch information
Showing
60 changed files
with
3,827 additions
and
3,300 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
/*! | ||
* Copyright (c) 2024 by Contributors | ||
* \file xgrammar/compiled_grammar_data_structure.h | ||
* \brief The header for the data structures of the compiled grammar. | ||
*/ | ||
#ifndef XGRAMMAR_COMPILED_GRAMMAR_DATA_STRUCTURE_H_ | ||
#define XGRAMMAR_COMPILED_GRAMMAR_DATA_STRUCTURE_H_ | ||
|
||
#include <xgrammar/grammar.h> | ||
|
||
#include <cstdint> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <utility> | ||
#include <vector> | ||
|
||
// matcher_data_structure.h is included to use RulePosition | ||
#include "matcher_data_structure.h" | ||
#include "support/dynamic_bitset.h" | ||
#include "support/utils.h" | ||
|
||
namespace xgrammar { | ||
|
||
/******************* CompiledGrammar Datastructures *******************/ | ||
|
||
/*! | ||
* \brief Preprocessed information, for a given specific RulePosition, divides the token set | ||
* into three categories: accepted, rejected, and uncertain. | ||
* Accepted: tokens that can be determined by the current RulePosition to be acceptable | ||
* Rejected: tokens that can be determined by the current RulePosition to be unacceptable | ||
* Uncertain: tokens that need the state of the parent RulePositions to determine if acceptable | ||
* | ||
* \note uncertain indices are stored directly. Accepted / rejected indices have three ways to | ||
* store to reduce memory and computation usage. See StoreType. | ||
* \note These indices are the indices of sorted_decoded_vocab in the CompiledGrammar | ||
* object, instead of the token ids. That helps the matching process. | ||
*/ | ||
struct AdaptiveTokenMask { | ||
enum class StoreType { | ||
// Only store all accepted token indices. Then rejected indices = all_indices - accepted_indices | ||
// - uncertain_indices. This is useful when |accepted_indices| < |rejected_indices|. | ||
kAccepted = 0, | ||
// Only store all accepted token indices. Then accepted indices = all_indices - rejected_indices | ||
// - uncertain_indices. This is useful when |accepted_indices| > |rejected_indices|. | ||
kRejected = 1, | ||
// Store all accepted token indices in a bitset. This is useful when both |accepted_indices| and | ||
// |rejected_indices| are large. | ||
kAcceptedBitset = 2 | ||
}; | ||
StoreType store_type; | ||
|
||
static constexpr int USE_BITSET_THRESHOLD = 200; | ||
|
||
std::vector<int32_t> accepted_indices; | ||
std::vector<int32_t> rejected_indices; | ||
DynamicBitset accepted_bitset; | ||
|
||
std::vector<int32_t> uncertain_indices; | ||
|
||
AdaptiveTokenMask() = default; | ||
|
||
AdaptiveTokenMask( | ||
size_t vocab_size, | ||
const std::vector<std::pair<int32_t, std::string>>& sorted_decoded_vocab, | ||
const std::vector<int32_t>& accepted_indices, | ||
const std::vector<int32_t>& rejected_indices, | ||
const std::vector<int32_t>& uncertain_indices | ||
); | ||
}; | ||
|
||
/*! | ||
* \brief All information that we need to match tokens in the tokenizer to the specified grammar. | ||
* It is the result of preprocessing. | ||
* \sa xgrammar::GrammarMatcher | ||
*/ | ||
class CompiledGrammar::Impl { | ||
public: | ||
/*! \brief The grammar for the GrammarMatcher. */ | ||
Grammar grammar; | ||
/*! \brief The tokenizer information. */ | ||
TokenizerInfo tokenizer_info; | ||
|
||
Grammar GetGrammar() const { return grammar; } | ||
|
||
TokenizerInfo GetTokenizerInfo() const { return tokenizer_info; } | ||
|
||
/******************* The adaptive token mask cache *******************/ | ||
|
||
struct RulePositionEqual { | ||
std::size_t operator()(const RulePosition& lhs, const RulePosition& rhs) const noexcept { | ||
return lhs.sequence_id == rhs.sequence_id && lhs.element_id == rhs.element_id && | ||
lhs.left_utf8_bytes == rhs.left_utf8_bytes && | ||
lhs.element_in_string == rhs.element_in_string; | ||
} | ||
}; | ||
|
||
struct RulePositionHash { | ||
std::size_t operator()(const RulePosition& rule_position) const noexcept { | ||
return HashCombine( | ||
rule_position.sequence_id, | ||
rule_position.element_id, | ||
rule_position.left_utf8_bytes, | ||
rule_position.element_in_string | ||
); | ||
} | ||
}; | ||
|
||
/*! \brief Mapping from RulePositions to the adaptive token mask. */ | ||
std::unordered_map<RulePosition, AdaptiveTokenMask, RulePositionHash, RulePositionEqual> | ||
adaptive_token_mask_cache; | ||
}; | ||
|
||
} // namespace xgrammar | ||
|
||
#endif // XGRAMMAR_COMPILED_GRAMMAR_DATA_STRUCTURE_H_ |
Oops, something went wrong.