Skip to content

Commit

Permalink
Merge pull request #13 from OpenSesame/fix/downloader_api
Browse files Browse the repository at this point in the history
chunker: Downloads.jl replaced by HuggingFaceApi.jl
  • Loading branch information
os-rss authored Aug 8, 2023
2 parents ba0468b + 12da50e commit 86b9ff4
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ BytePairEncoding = "a4280ba5-8788-555a-8ca8-4a8c3d966a71"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DebugDataWriter = "810e33c6-efd6-4462-86b1-f71ae88af720"
DocOpt = "968ba79b-81e4-546f-ab3a-2eecfa62a9db"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
ElasticsearchClient = "e586a49d-aa29-4ce5-8f91-fa4f824367bd"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
HuggingFaceApi = "3cc741c3-0c9d-4fbe-84fa-cdec264173de"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Mocking = "78c3b35d-d492-501b-9361-3d52fe80e533"
Mustache = "ffc61752-8dc7-55ee-8c37-f3e9cdd09e70"
Expand All @@ -30,6 +30,7 @@ DebugDataWriter = "0.1"
DocOpt = "0.5"
ElasticsearchClient = "0.2"
HTTP = "1"
HuggingFaceApi = "0.1"
JSON = "0.21"
Mocking = "0.7"
Mustache = "1"
Expand Down
8 changes: 5 additions & 3 deletions src/services/chunks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ using UUIDs

using BytePairEncoding: gpt2_codemap, GPT2Tokenization, Merge, BPE, BPETokenization
using TextEncodeBase: TextEncodeBase, FlatTokenizer, CodeNormalizer, Sentence, getvalue, CodeUnMap
using Downloads
using HuggingFaceApi

# Global variables
tokenizer = let
bpe = BPE(Downloads.download("https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt"))
url = HuggingFaceURL("gpt2", "merges.txt")
file = HuggingFaceApi.cached_download(url)
bpe = BPE(file)
FlatTokenizer(CodeNormalizer(BPETokenization(GPT2Tokenization(), bpe), gpt2_codemap()))

# tiktoken.get_encoding(
Expand Down Expand Up @@ -108,7 +110,7 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED ||
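any(c -> sizeof(c) > 2, chunk_text_to_append) # keep short chunks that contain 3-byte characters (e.g. CJK ideographs); NOTE(review): sizeof(::Char) is always 4, so ncodeunits(c) may be what is intended — confirm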
# Append the chunk text to the list of chunks
push!(chunks, chunk_text_to_append)
push!(chunks, filter(isvalid, chunk_text_to_append))
end

# Remove the tokens corresponding to the chunk text from the remaining tokens
Expand Down

0 comments on commit 86b9ff4

Please sign in to comment.