Merge pull request #16 from OpenSesame/fix/updates

current updates
rssdev10 · Dec 7, 2023 · d24144e
2 parents 160bb35 + 6d62006
commit d24144e
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 14 deletions.
diff --git a/Project.toml b/Project.toml
@@ -25,7 +25,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [compat]
 Aqua = "0.6"
-BytePairEncoding = "0.3"
+BytePairEncoding = "0.5"
 DebugDataWriter = "0.1"
 DocOpt = "0.5"
 ElasticsearchClient = "0.2"
@@ -36,7 +36,7 @@ Mocking = "0.7"
 Mustache = "1"
 OpenAI = "0.8"
 OpenAPI = "0.1"
-TextEncodeBase = "0.6"
+TextEncodeBase = "0.8"
 TimeZones = "1"
 URIs = "1"
 

diff --git a/src/auth/auth.jl b/src/auth/auth.jl
@@ -25,7 +25,7 @@ function validate_bearer_token(
  req.target in except_list && return true
 
  auth_header_index = findfirst(req.headers) do (title, _)
- isequal(title, "Authorization")
+ isequal(lowercase(title), "authorization")
  end
 
  isnothing(auth_header_index) && return false

diff --git a/src/services/chunks.jl b/src/services/chunks.jl
@@ -3,28 +3,33 @@ using .GptPluginServer: Document, DocumentChunk, DocumentChunkMetadata
 using UUIDs
 
 using BytePairEncoding: gpt2_codemap, GPT2Tokenization, Merge, BPE, BPETokenization
+using BytePairEncoding: tiktoken2bbpe, load_tiktoken
 using TextEncodeBase: TextEncodeBase, FlatTokenizer, CodeNormalizer, Sentence, getvalue, CodeUnMap
 using HuggingFaceApi
 
 # Global variables
-tokenizer = let
- url = HuggingFaceURL("gpt2", "merges.txt")
- file = HuggingFaceApi.cached_download(url)
- bpe = BPE(file)
- FlatTokenizer(CodeNormalizer(BPETokenization(GPT2Tokenization(), bpe), gpt2_codemap()))
-
- # tiktoken.get_encoding(
- # "cl100k_base"
- # ) # The encoding scheme to use for tokenization
-end
+# const tokenizer = let
+# url = HuggingFaceURL("gpt2", "merges.txt")
+# file = HuggingFaceApi.cached_download(url)
+# bpe = BPE(file)
+# FlatTokenizer(CodeNormalizer(BPETokenization(GPT2Tokenization(), bpe), gpt2_codemap()))
+# end
+# const unmap = CodeUnMap(tokenizer.tokenization.codemap)
+
+const codemap = gpt2_codemap()
+const tokenizer = tiktoken2bbpe(load_tiktoken("cl100k_base"), codemap)
+const unmap = CodeUnMap(codemap)
 
 encode(text::AbstractString) = tokenizer(Sentence(text))
 
 function decode(tokens::Vector{TextEncodeBase.TokenStage})::String
- unmap = CodeUnMap(tokenizer.tokenization.codemap)
  map(unmap ∘ getvalue, tokens) |> join
 end
 
+function decode(tokens::Vector{<:AbstractString})::String
+ map(unmap, tokens) |> join
+end
+
 # Constants
 const CHUNK_SIZE = 200 # The target size of each text chunk in tokens
 const MIN_CHUNK_SIZE_CHARS = 350 # The minimum size of each text chunk in characters