Skip to content

Commit

Permalink
chunking: fixed processing of hieroglyphs. GPT-Tokenizer generates co…
Browse files Browse the repository at this point in the history
…mponents of a hieroglyph, and these can be assembled incorrectly.
  • Loading branch information
os-rss committed Aug 8, 2023
1 parent d7c6947 commit c1313b6
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 8 deletions.
24 changes: 18 additions & 6 deletions src/services/chunks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,19 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
# Skip the chunk if it is empty or whitespace
if isempty(chunk_text)
# Remove the tokens corresponding to the chunk text from the remaining tokens
tokens = tokens[length(chunk) :end]
tokens = tokens[(length(chunk) + 1) :end]
# Continue to the next iteration of the loop
continue
end

if !isvalid(chunk_text[end]) # check for a broken (invalid) multibyte Unicode character at the end
chunk_text = chunk_text[1:prevind(chunk_text, end)]
end

# The space token's value depends on the Korean character that follows it.
# A trailing space should instead become the start of the next chunk.
chunk_text = strip(chunk_text)

# Find the last period or punctuation mark in the chunk
last_punctuation =
filter(!isnothing,
Expand All @@ -97,13 +105,14 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
# Remove any newline characters and strip any leading or trailing whitespace
chunk_text_to_append = replace(chunk_text, "\n" => " ") |> strip

if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED
if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED ||
any(c -> sizeof(c) > 2, chunk_text_to_append) # there are 3-byte characters (e.g. CJK/Hangul)
# Append the chunk text to the list of chunks
push!(chunks, chunk_text_to_append)
end

# Remove the tokens corresponding to the chunk text from the remaining tokens
tokens = tokens[length(encode(chunk_text)):end]
tokens = tokens[length(encode(chunk_text))+1:end]

# Increment the number of chunks
num_chunks += 1
Expand All @@ -113,7 +122,8 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
if !isempty(tokens)
remaining_text = decode(tokens) |> str -> replace(str, "\n" => " ") |> strip
if length(remaining_text) > MIN_CHUNK_LENGTH_TO_EMBED
push!(chunks, remaining_text)
# remove wrong unicode symbols from incomplete token sequences
push!(chunks, filter(isvalid, remaining_text))
end
end

Expand All @@ -137,11 +147,13 @@ function create_document_chunks(
# Generate a document id if not provided
doc_id = !isnothing(doc.id) && !isempty(doc.id) ? doc.id : string(UUIDs.uuid4())

text = doc.text

# Check if the document text is empty or whitespace
!isempty(doc.text) || return ([], doc_id)
!isempty(text) || return ([], doc_id)

# Split the document text into chunks
text_chunks = get_text_chunks(doc.text, chunk_token_size)
text_chunks = get_text_chunks(text, chunk_token_size)

metadata = DocumentChunkMetadata()
if !isnothing(doc.metadata)
Expand Down
25 changes: 23 additions & 2 deletions test/test_chunks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ chunk_size = 5

text_chunks = GptSearchPlugin.AppServer.get_text_chunks(text, chunk_size)

doc = GptSearchPlugin.AppServer.Document(text=text)
doc = GptSearchPlugin.AppServer.Document(text = text)
doc_chunks = GptSearchPlugin.AppServer.create_document_chunks(doc, chunk_size)

@test length(first(doc_chunks)) == length(text_chunks)
Expand All @@ -31,9 +31,30 @@ patch = @patch create_embeddings(api_key::String, text_vectors::AbstractVector)
)
)
arr_chunks = apply(patch) do
GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size)
GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size)
end

@test length(first(arr_chunks) |> values |> last) == length(text_chunks)

# @show first(arr_chunks)

texts = [
"А теперь проверим двухбайтовые символы.",
# test 3-byte Unicode chunking
"保留和晋升不应使任何团体或个人处于不利地位。",
"특히 직장에서 변화를 만나게 되면 부정하게 되기가 쉽습니다.",
]
for text in texts
arr_chunks = apply(patch) do
doc = GptSearchPlugin.AppServer.Document(text = text)
GptSearchPlugin.AppServer.get_document_chunks([doc], 10) #chunk_size * 3)
end

@test !isempty(arr_chunks)

recovered_str = map(x -> x.text, first(arr_chunks) |> values |> last) |> join
@show recovered_str

@test text[begin] == recovered_str[begin]
@test text[end] == recovered_str[end]
end

0 comments on commit c1313b6

Please sign in to comment.