Skip to content

Commit

Permalink
chunking: fixed processing of hieroglyphs. GPT-Tokenizer generates co…
Browse files Browse the repository at this point in the history
…mponents of a hieroglyph, and these can be assembled incorrectly.
  • Loading branch information
os-rss committed Aug 8, 2023
1 parent d7c6947 commit c1313b6
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 8 deletions.
24 changes: 18 additions & 6 deletions src/services/chunks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,19 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
# Skip the chunk if it is empty or whitespace
if isempty(chunk_text)
# Remove the tokens corresponding to the chunk text from the remaining tokens
tokens = tokens[length(chunk) :end]
tokens = tokens[(length(chunk) + 1) :end]
# Continue to the next iteration of the loop
continue
end

if !isvalid(chunk_text[end]) # check for a broken (invalid) multibyte Unicode character at the end
chunk_text = chunk_text[1:prevind(chunk_text, end)]
end

# The space token's value depends on the Korean character that follows it.
# A trailing space should instead become the start of the next chunk.
chunk_text = strip(chunk_text)

# Find the last period or punctuation mark in the chunk
last_punctuation =
filter(!isnothing,
Expand All @@ -97,13 +105,14 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
# Remove any newline characters and strip any leading or trailing whitespace
chunk_text_to_append = replace(chunk_text, "\n" => " ") |> strip

if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED
if length(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED ||
any(c -> sizeof(c) > 2, chunk_text_to_append) # there are 3-byte characters (e.g. CJK/Hangul)
# Append the chunk text to the list of chunks
push!(chunks, chunk_text_to_append)
end

# Remove the tokens corresponding to the chunk text from the remaining tokens
tokens = tokens[length(encode(chunk_text)):end]
tokens = tokens[length(encode(chunk_text))+1:end]

# Increment the number of chunks
num_chunks += 1
Expand All @@ -113,7 +122,8 @@ function get_text_chunks(text::String, chunk_token_size=0)::Vector{<:AbstractStr
if !isempty(tokens)
remaining_text = decode(tokens) |> str -> replace(str, "\n" => " ") |> strip
if length(remaining_text) > MIN_CHUNK_LENGTH_TO_EMBED
push!(chunks, remaining_text)
# remove wrong unicode symbols from incomplete token sequences
push!(chunks, filter(isvalid, remaining_text))
end
end

Expand All @@ -137,11 +147,13 @@ function create_document_chunks(
# Generate a document id if not provided
doc_id = !isnothing(doc.id) && !isempty(doc.id) ? doc.id : string(UUIDs.uuid4())

text = doc.text

# Check if the document text is empty or whitespace
!isempty(doc.text) || return ([], doc_id)
!isempty(text) || return ([], doc_id)

# Split the document text into chunks
text_chunks = get_text_chunks(doc.text, chunk_token_size)
text_chunks = get_text_chunks(text, chunk_token_size)

metadata = DocumentChunkMetadata()
if !isnothing(doc.metadata)
Expand Down
25 changes: 23 additions & 2 deletions test/test_chunks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ chunk_size = 5

text_chunks = GptSearchPlugin.AppServer.get_text_chunks(text, chunk_size)

doc = GptSearchPlugin.AppServer.Document(text=text)
doc = GptSearchPlugin.AppServer.Document(text = text)
doc_chunks = GptSearchPlugin.AppServer.create_document_chunks(doc, chunk_size)

@test length(first(doc_chunks)) == length(text_chunks)
Expand All @@ -31,9 +31,30 @@ patch = @patch create_embeddings(api_key::String, text_vectors::AbstractVector)
)
)
arr_chunks = apply(patch) do
GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size)
GptSearchPlugin.AppServer.get_document_chunks(repeat([doc], 5), chunk_size)
end

@test length(first(arr_chunks) |> values |> last) == length(text_chunks)

# @show first(arr_chunks)

texts = [
"А теперь проверим двухбайтовые символы.",
# test 3-byte Unicode chunking
"保留和晋升不应使任何团体或个人处于不利地位。",
"특히 직장에서 변화를 만나게 되면 부정하게 되기가 쉽습니다.",
]
for text in texts
arr_chunks = apply(patch) do
doc = GptSearchPlugin.AppServer.Document(text = text)
GptSearchPlugin.AppServer.get_document_chunks([doc], 10) #chunk_size * 3)
end

@test !isempty(arr_chunks)

recovered_str = map(x -> x.text, first(arr_chunks) |> values |> last) |> join
@show recovered_str

@test text[begin] == recovered_str[begin]
@test text[end] == recovered_str[end]
end

0 comments on commit c1313b6

Please sign in to comment.