From c1313b61970533b50507674d346fb4a7802e15a3 Mon Sep 17 00:00:00 2001 From: os-rss Date: Mon, 7 Aug 2023 18:03:23 -0700 Subject: [PATCH] chunking: fixed processing of hieroglyphs. GPT-Tokenizer generates components of a hieroglyph, and these can be assembled incorrectly. --- src/services/chunks.jl | 24 ++++++++++++++++++------ test/test_chunks.jl | 25 +++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/services/chunks.jl b/src/services/chunks.jl index 4142e7e..e52d1b3 100644 --- a/src/services/chunks.jl +++ b/src/services/chunks.jl @@ -68,11 +68,19 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr # Skip the chunk if it is empty or whitespace if isempty(chunk_text) # Remove the tokens corresponding to the chunk text from the remaining tokens - tokens = tokens[length(chunk) :end] + tokens = tokens[(length(chunk) + 1) :end] # Continue to the next iteration of the loop continue end + if !isvalid(chunk_text[end]) # check wrong multibyte unicode + chunk_text = chunk_text[1:prevind(chunk_text, end)] + end + + # Space-token value depends on a Korean character after it. + # Ending space should be the start of the next chunk.
+ chunk_text = strip(chunk_text) + # Find the last period or punctuation mark in the chunk last_punctuation = filter(!isnothing, @@ -97,13 +105,14 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr # Remove any newline characters and strip any leading or trailing whitespace chunk_text_to_append = replace(chunk_text, "\n" => " ") |> strip - if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED + if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED || + any(c -> sizeof(c) > 2, chunk_text_to_append) # there are 3-byte hieroglyphs # Append the chunk text to the list of chunks push!(chunks, chunk_text_to_append) end # Remove the tokens corresponding to the chunk text from the remaining tokens - tokens = tokens[length(encode(chunk_text)):end] + tokens = tokens[length(encode(chunk_text))+1:end] # Increment the number of chunks num_chunks += 1 @@ -113,7 +122,8 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr if !isempty(tokens) remaining_text = decode(tokens) |> str -> replace(str, "\n" => " ") |> strip if length(remaining_text) > MIN_CHUNK_LENGTH_TO_EMBED - push!(chunks, remaining_text) + # remove invalid unicode symbols from incomplete token sequences + push!(chunks, filter(isvalid, remaining_text)) end end @@ -137,11 +147,13 @@ function create_document_chunks( # Generate a document id if not provided doc_id = !isnothing(doc.id) && !isempty(doc.id) ?
doc.id : string(UUIDs.uuid4()) + text = doc.text + # Check if the document text is empty or whitespace - !isempty(doc.text) || return ([], doc_id) + !isempty(text) || return ([], doc_id) # Split the document text into chunks - text_chunks = get_text_chunks(doc.text, chunk_token_size) + text_chunks = get_text_chunks(text, chunk_token_size) metadata = DocumentChunkMetadata() if !isnothing(doc.metadata) diff --git a/test/test_chunks.jl b/test/test_chunks.jl index 5da4587..af72a9a 100644 --- a/test/test_chunks.jl +++ b/test/test_chunks.jl @@ -13,7 +13,7 @@ chunk_size = 5 text_chunks = GptSearchPlugin.AppServer.get_text_chunks(text, chunk_size) -doc = GptSearchPlugin.AppServer.Document(text=text) +doc = GptSearchPlugin.AppServer.Document(text = text) doc_chunks = GptSearchPlugin.AppServer.create_document_chunks(doc, chunk_size) @test length(first(doc_chunks)) == length(text_chunks) @@ -31,9 +31,30 @@ patch = @patch create_embeddings(api_key::String, text_vectors::AbstractVector) ) ) arr_chunks = apply(patch) do - GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size) + GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size) end @test length(first(arr_chunks) |> values |> last) == length(text_chunks) # @show first(arr_chunks) + +texts = [ + "А теперь проверим двухбайтовые символы.", + # test 3-byte unicode chunking + "保留和晋升不应使任何团体或个人处于不利地位。", + "특히 직장에서 변화를 만나게 되면 부정하게 되기가 쉽습니다.", +] +for text in texts + arr_chunks = apply(patch) do + doc = GptSearchPlugin.AppServer.Document(text = text) + GptSearchPlugin.AppServer.get_document_chunks([doc], 10) #chunk_size * 3) + end + + @test !isempty(arr_chunks) + + recovered_str = map(x -> x.text, first(arr_chunks) |> values |> last) |> join + @show recovered_str + + @test text[begin] == recovered_str[begin] + @test text[end] == recovered_str[end] +end