From c1313b61970533b50507674d346fb4a7802e15a3 Mon Sep 17 00:00:00 2001 From: os-rss Date: Mon, 7 Aug 2023 18:03:23 -0700 Subject: [PATCH] chunking: fixed processing of hieroglyphs. GPT-Tokenizer generates components of a hieroglyph, and these can be assembled incorrectly. --- src/services/chunks.jl | 24 ++++++++++++++++++------ test/test_chunks.jl | 25 +++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/services/chunks.jl b/src/services/chunks.jl index 4142e7e..e52d1b3 100644 --- a/src/services/chunks.jl +++ b/src/services/chunks.jl @@ -68,11 +68,19 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr # Skip the chunk if it is empty or whitespace if isempty(chunk_text) # Remove the tokens corresponding to the chunk text from the remaining tokens - tokens = tokens[length(chunk) :end] + tokens = tokens[(length(chunk) + 1) :end] # Continue to the next iteration of the loop continue end + if !isvalid(chunk_text[end]) # check wrong multibyte unicode + chunk_text = chunk_text[1:prevind(chunk_text, end)] + end + + # Space-token value depends on a Korean character after it. + # Ending space should be the start of the next chunk.
+ chunk_text = strip(chunk_text) + # Find the last period or punctuation mark in the chunk last_punctuation = filter(!isnothing, @@ -97,13 +105,14 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr # Remove any newline characters and strip any leading or trailing whitespace chunk_text_to_append = replace(chunk_text, "\n" => " ") |> strip - if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED + if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED || + any(c -> sizeof(c) > 2, chunk_text_to_append) # there are 3-byte hieroglyphs # Append the chunk text to the list of chunks push!(chunks, chunk_text_to_append) end # Remove the tokens corresponding to the chunk text from the remaining tokens - tokens = tokens[length(encode(chunk_text)):end] + tokens = tokens[length(encode(chunk_text))+1:end] # Increment the number of chunks num_chunks += 1 @@ -113,7 +122,8 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr if !isempty(tokens) remaining_text = decode(tokens) |> str -> replace(str, "\n" => " ") |> strip if length(remaining_text) > MIN_CHUNK_LENGTH_TO_EMBED - push!(chunks, remaining_text) + # remove invalid unicode symbols from incomplete token sequences + push!(chunks, filter(isvalid, remaining_text)) end end @@ -137,11 +147,13 @@ function create_document_chunks( # Generate a document id if not provided doc_id = !isnothing(doc.id) && !isempty(doc.id) ?
doc.id : string(UUIDs.uuid4()) + text = doc.text + # Check if the document text is empty or whitespace - !isempty(doc.text) || return ([], doc_id) + !isempty(text) || return ([], doc_id) # Split the document text into chunks - text_chunks = get_text_chunks(doc.text, chunk_token_size) + text_chunks = get_text_chunks(text, chunk_token_size) metadata = DocumentChunkMetadata() if !isnothing(doc.metadata) diff --git a/test/test_chunks.jl b/test/test_chunks.jl index 5da4587..af72a9a 100644 --- a/test/test_chunks.jl +++ b/test/test_chunks.jl @@ -13,7 +13,7 @@ chunk_size = 5 text_chunks = GptSearchPlugin.AppServer.get_text_chunks(text, chunk_size) -doc = GptSearchPlugin.AppServer.Document(text=text) +doc = GptSearchPlugin.AppServer.Document(text = text) doc_chunks = GptSearchPlugin.AppServer.create_document_chunks(doc, chunk_size) @test length(first(doc_chunks)) == length(text_chunks) @@ -31,9 +31,30 @@ patch = @patch create_embeddings(api_key::String, text_vectors::AbstractVector) ) ) arr_chunks = apply(patch) do - GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size) + GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size) end @test length(first(arr_chunks) |> values |> last) == length(text_chunks) # @show first(arr_chunks) + +texts = [ + "А теперь проверим двухбайтовые символы.", + # test 3-byte unicode chunking + "保留和晋升不应使任何团体或个人处于不利地位。", + "특히 직장에서 변화를 만나게 되면 부정하게 되기가 쉽습니다.", +] +for text in texts + arr_chunks = apply(patch) do + doc = GptSearchPlugin.AppServer.Document(text = text) + GptSearchPlugin.AppServer.get_document_chunks([doc], 10) #chunk_size * 3) + end + + @test !isempty(arr_chunks) + + recovered_str = map(x -> x.text, first(arr_chunks) |> values |> last) |> join + @show recovered_str + + @test text[begin] == recovered_str[begin] + @test text[end] == recovered_str[end] +end