Merge pull request #13 from OpenSesame/fix/downloader_api

chunker: Downloads.jl replaced by HuggingFaceApi.jl
rssdev10 · Aug 8, 2023 · 86b9ff4 · 86b9ff4
2 parents ba0468b + 12da50e
commit 86b9ff4
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 4 deletions.
diff --git a/Project.toml b/Project.toml
@@ -9,9 +9,9 @@ BytePairEncoding = "a4280ba5-8788-555a-8ca8-4a8c3d966a71"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DebugDataWriter = "810e33c6-efd6-4462-86b1-f71ae88af720"
 DocOpt = "968ba79b-81e4-546f-ab3a-2eecfa62a9db"
-Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 ElasticsearchClient = "e586a49d-aa29-4ce5-8f91-fa4f824367bd"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+HuggingFaceApi = "3cc741c3-0c9d-4fbe-84fa-cdec264173de"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Mocking = "78c3b35d-d492-501b-9361-3d52fe80e533"
 Mustache = "ffc61752-8dc7-55ee-8c37-f3e9cdd09e70"
@@ -30,6 +30,7 @@ DebugDataWriter = "0.1"
 DocOpt = "0.5"
 ElasticsearchClient = "0.2"
 HTTP = "1"
+HuggingFaceApi = "0.1"
 JSON = "0.21"
 Mocking = "0.7"
 Mustache = "1"

diff --git a/src/services/chunks.jl b/src/services/chunks.jl
@@ -4,11 +4,13 @@ using UUIDs
 
 using BytePairEncoding: gpt2_codemap, GPT2Tokenization, Merge, BPE, BPETokenization
 using TextEncodeBase: TextEncodeBase, FlatTokenizer, CodeNormalizer, Sentence, getvalue, CodeUnMap
-using Downloads
+using HuggingFaceApi
 
 # Global variables
 tokenizer = let
- bpe = BPE(Downloads.download("https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt"))
+ url = HuggingFaceURL("gpt2", "merges.txt")
+ file = HuggingFaceApi.cached_download(url)
+ bpe = BPE(file)
  FlatTokenizer(CodeNormalizer(BPETokenization(GPT2Tokenization(), bpe), gpt2_codemap()))
 
  # tiktoken.get_encoding(
@@ -108,7 +110,7 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
  if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED ||
  any(c -> sizeof(c) > 2, chunk_text_to_append) # keep short chunks that contain 3-byte characters (e.g. CJK ideographs)
  # Append the chunk text to the list of chunks
- push!(chunks, chunk_text_to_append)
+ push!(chunks, filter(isvalid, chunk_text_to_append))
  end
 
  # Remove the tokens corresponding to the chunk text from the remaining tokens