From 2b226bef7810fc7fb0a567d2e49775feb558a25a Mon Sep 17 00:00:00 2001
From: hagen-danswer
Date: Fri, 22 Nov 2024 08:57:44 -0800
Subject: [PATCH 1/2] Each section is now cleaned before being chunked

---
 backend/danswer/indexing/chunker.py        | 49 +++++++++++--------
 .../search_nlp_models.py                   | 28 -----------
 backend/danswer/utils/text_processing.py   | 22 +++++++++
 3 files changed, 50 insertions(+), 49 deletions(-)

diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py
index 35dd919af6b..56ddb9dc993 100644
--- a/backend/danswer/indexing/chunker.py
+++ b/backend/danswer/indexing/chunker.py
@@ -14,6 +14,7 @@
 from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_text
 from danswer.utils.text_processing import shared_precompare_cleanup
 from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
@@ -221,8 +222,15 @@ def _create_chunk(
             )
 
         for section in document.sections:
-            section_text = section.text
+            section_text = clean_text(section.text)
             section_link_text = section.link or ""
+            if not section_text:
+                logger.warning(
+                    f"Skipping section {section.text} from document "
+                    f"{document.semantic_identifier} due to empty text after cleaning "
+                    f" with link {section_link_text}"
+                )
+                continue
 
             section_token_count = len(self.tokenizer.tokenize(section_text))
 
@@ -238,31 +246,26 @@ def _create_chunk(
                 split_texts = self.chunk_splitter.split_text(section_text)
 
                 for i, split_text in enumerate(split_texts):
-                    split_token_count = len(self.tokenizer.tokenize(split_text))
-
-                    if STRICT_CHUNK_TOKEN_LIMIT:
-                        split_token_count = len(self.tokenizer.tokenize(split_text))
-                        if split_token_count > content_token_limit:
-                            # Further split the oversized chunk
-                            smaller_chunks = self._split_oversized_chunk(
-                                split_text, content_token_limit
-                            )
-                            for i, small_chunk in enumerate(smaller_chunks):
-                                chunks.append(
-                                    _create_chunk(
-                                        text=small_chunk,
-                                        links={0: section_link_text},
-                                        is_continuation=(i != 0),
-                                    )
-                                )
-                        else:
+                    if (
+                        STRICT_CHUNK_TOKEN_LIMIT
+                        and
+                        # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
+                        len(self.tokenizer.tokenize(split_text)) > content_token_limit
+                    ):
+                        # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
+                        # the token count of each split text to ensure it is
+                        # not larger than the content_token_limit
+                        smaller_chunks = self._split_oversized_chunk(
+                            split_text, content_token_limit
+                        )
+                        for i, small_chunk in enumerate(smaller_chunks):
                             chunks.append(
                                 _create_chunk(
-                                    text=split_text,
+                                    text=small_chunk,
                                     links={0: section_link_text},
+                                    is_continuation=(i != 0),
                                 )
                             )
-
                     else:
                         chunks.append(
                             _create_chunk(
@@ -354,6 +357,10 @@ def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
         return normal_chunks
 
     def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
+        """
+        Takes in a list of documents and chunks them into smaller chunks for indexing
+        while persisting the document metadata.
+        """
         final_chunks: list[DocAwareChunk] = []
         for document in documents:
             if self.callback:
diff --git a/backend/danswer/natural_language_processing/search_nlp_models.py b/backend/danswer/natural_language_processing/search_nlp_models.py
index 9a3d575c0f9..ee80292de63 100644
--- a/backend/danswer/natural_language_processing/search_nlp_models.py
+++ b/backend/danswer/natural_language_processing/search_nlp_models.py
@@ -1,4 +1,3 @@
-import re
 import threading
 import time
 from collections.abc import Callable
@@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")
 
 
-_INITIAL_FILTER = re.compile(
-    "["
-    "\U0000FFF0-\U0000FFFF"  # Specials
-    "\U0001F000-\U0001F9FF"  # Emoticons
-    "\U00002000-\U0000206F"  # General Punctuation
-    "\U00002190-\U000021FF"  # Arrows
-    "\U00002700-\U000027BF"  # Dingbats
-    "]+",
-    flags=re.UNICODE,
-)
-
-
-def clean_openai_text(text: str) -> str:
-    # Remove specific Unicode ranges that might cause issues
-    cleaned = _INITIAL_FILTER.sub("", text)
-
-    # Remove any control characters except for newline and tab
-    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
-
-    return cleaned
-
-
 def build_model_server_url(
     model_server_host: str,
     model_server_port: int,
@@ -215,11 +192,6 @@ def encode(
                 for text in texts
             ]
 
-        if self.provider_type == EmbeddingProvider.OPENAI:
-            # If the provider is openai, we need to clean the text
-            # as a temporary workaround for the openai API
-            texts = [clean_openai_text(text) for text in texts]
-
         batch_size = (
             api_embedding_batch_size
            if self.provider_type
diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py
index da9776990ff..d26b5f357fb 100644
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str:
     return text
 
 
+_INITIAL_FILTER = re.compile(
+    "["
+    "\U0000FFF0-\U0000FFFF"  # Specials
+    "\U0001F000-\U0001F9FF"  # Emoticons
+    "\U00002000-\U0000206F"  # General Punctuation
+    "\U00002190-\U000021FF"  # Arrows
+    "\U00002700-\U000027BF"  # Dingbats
+    "]+",
+    flags=re.UNICODE,
+)
+
+
+def clean_text(text: str) -> str:
+    # Remove specific Unicode ranges that might cause issues
+    cleaned = _INITIAL_FILTER.sub("", text)
+
+    # Remove any control characters except for newline and tab
+    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
+
+    return cleaned
+
+
 def is_valid_email(text: str) -> bool:
     """Can use a library instead if more detailed checks are needed"""
     regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

From d9031ea97071ea27180f5e47d171f5ab178302d0 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Fri, 22 Nov 2024 10:00:37 -0800
Subject: [PATCH 2/2] k

---
 backend/danswer/indexing/chunker.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py
index 56ddb9dc993..287d3ba2d5e 100644
--- a/backend/danswer/indexing/chunker.py
+++ b/backend/danswer/indexing/chunker.py
@@ -221,10 +221,14 @@ def _create_chunk(
             mini_chunk_texts=self._get_mini_chunk_texts(text),
         )
 
-        for section in document.sections:
+        for section_idx, section in enumerate(document.sections):
             section_text = clean_text(section.text)
             section_link_text = section.link or ""
-            if not section_text:
+            # If there is no useful content, not even the title, just drop it
+            if not section_text and (not document.title or section_idx > 0):
+                # If a section is empty and the document has no title, we can just drop it. We return a list of
+                # DocAwareChunks where each one contains the necessary information needed down the line for indexing.
+                # There is no concern about dropping whole documents from this list, it should not cause any indexing failures.
                 logger.warning(
                     f"Skipping section {section.text} from document "
                     f"{document.semantic_identifier} due to empty text after cleaning "
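
Illustrative note, not part of the patches above: the sketch below mirrors the clean_text helper that PATCH 1/2 adds to backend/danswer/utils/text_processing.py, so its filtering behavior can be seen in isolation. The __main__ demo and the sample string are assumptions for demonstration only.

import re

# Mirrors the _INITIAL_FILTER pattern added by the patch: the Specials, Emoticons,
# General Punctuation, Arrows, and Dingbats blocks are stripped outright.
_INITIAL_FILTER = re.compile(
    "["
    "\U0000FFF0-\U0000FFFF"  # Specials
    "\U0001F000-\U0001F9FF"  # Emoticons
    "\U00002000-\U0000206F"  # General Punctuation
    "\U00002190-\U000021FF"  # Arrows
    "\U00002700-\U000027BF"  # Dingbats
    "]+",
    flags=re.UNICODE,
)


def clean_text(text: str) -> str:
    # Strip the Unicode ranges above, then keep newline and tab but drop any
    # other character below U+0020.
    cleaned = _INITIAL_FILTER.sub("", text)
    return "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")


if __name__ == "__main__":
    # Hypothetical input: the NUL byte and the emoji are removed, tab and newline survive.
    sample = "Title\x00 \U0001F600 body\ttext\n"
    print(repr(clean_text(sample)))  # 'Title  body\ttext\n'

This is the same cleanup that the first patch previously applied only to OpenAI embedding requests (clean_openai_text); the two patches move it into text_processing.py and apply it to every section before chunking.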