diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py
index 35dd919af6b..287d3ba2d5e 100644
--- a/backend/danswer/indexing/chunker.py
+++ b/backend/danswer/indexing/chunker.py
@@ -14,6 +14,7 @@
 from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_text
 from danswer.utils.text_processing import shared_precompare_cleanup
 from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
@@ -220,9 +221,20 @@ def _create_chunk(
                 mini_chunk_texts=self._get_mini_chunk_texts(text),
             )
 
-        for section in document.sections:
-            section_text = section.text
+        for section_idx, section in enumerate(document.sections):
+            section_text = clean_text(section.text)
             section_link_text = section.link or ""
+            # If there is no useful content, not even the title, just drop it
+            if not section_text and (not document.title or section_idx > 0):
+                # If a section is empty and the document has no title, we can just drop it. We return a list of
+                # DocAwareChunks where each one contains the necessary information needed down the line for indexing.
+                # There is no concern about dropping whole documents from this list; it should not cause any indexing failures.
+                logger.warning(
+                    f"Skipping section {section.text} from document "
+                    f"{document.semantic_identifier} due to empty text after cleaning "
+                    f"with link {section_link_text}"
+                )
+                continue
 
             section_token_count = len(self.tokenizer.tokenize(section_text))
 
@@ -238,31 +250,26 @@ def _create_chunk(
                 split_texts = self.chunk_splitter.split_text(section_text)
 
                 for i, split_text in enumerate(split_texts):
-                    split_token_count = len(self.tokenizer.tokenize(split_text))
-
-                    if STRICT_CHUNK_TOKEN_LIMIT:
-                        split_token_count = len(self.tokenizer.tokenize(split_text))
-                        if split_token_count > content_token_limit:
-                            # Further split the oversized chunk
-                            smaller_chunks = self._split_oversized_chunk(
-                                split_text, content_token_limit
-                            )
-                            for i, small_chunk in enumerate(smaller_chunks):
-                                chunks.append(
-                                    _create_chunk(
-                                        text=small_chunk,
-                                        links={0: section_link_text},
-                                        is_continuation=(i != 0),
-                                    )
-                                )
-                        else:
+                    if (
+                        STRICT_CHUNK_TOKEN_LIMIT
+                        and
+                        # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
+                        len(self.tokenizer.tokenize(split_text)) > content_token_limit
+                    ):
+                        # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
+                        # the token count of each split text to ensure it is
+                        # not larger than the content_token_limit
+                        smaller_chunks = self._split_oversized_chunk(
+                            split_text, content_token_limit
+                        )
+                        for i, small_chunk in enumerate(smaller_chunks):
                             chunks.append(
                                 _create_chunk(
-                                    text=split_text,
+                                    text=small_chunk,
                                     links={0: section_link_text},
+                                    is_continuation=(i != 0),
                                 )
                             )
-
                     else:
                         chunks.append(
                             _create_chunk(
@@ -354,6 +361,10 @@ def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
         return normal_chunks
 
     def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
+        """
+        Takes in a list of documents and splits them into smaller chunks for indexing
+        while persisting the document metadata.
+ """ final_chunks: list[DocAwareChunk] = [] for document in documents: if self.callback: diff --git a/backend/danswer/natural_language_processing/search_nlp_models.py b/backend/danswer/natural_language_processing/search_nlp_models.py index 9a3d575c0f9..ee80292de63 100644 --- a/backend/danswer/natural_language_processing/search_nlp_models.py +++ b/backend/danswer/natural_language_processing/search_nlp_models.py @@ -1,4 +1,3 @@ -import re import threading import time from collections.abc import Callable @@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str: return model_str.replace("/", "_").replace("-", "_").replace(".", "_") -_INITIAL_FILTER = re.compile( - "[" - "\U0000FFF0-\U0000FFFF" # Specials - "\U0001F000-\U0001F9FF" # Emoticons - "\U00002000-\U0000206F" # General Punctuation - "\U00002190-\U000021FF" # Arrows - "\U00002700-\U000027BF" # Dingbats - "]+", - flags=re.UNICODE, -) - - -def clean_openai_text(text: str) -> str: - # Remove specific Unicode ranges that might cause issues - cleaned = _INITIAL_FILTER.sub("", text) - - # Remove any control characters except for newline and tab - cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t") - - return cleaned - - def build_model_server_url( model_server_host: str, model_server_port: int, @@ -215,11 +192,6 @@ def encode( for text in texts ] - if self.provider_type == EmbeddingProvider.OPENAI: - # If the provider is openai, we need to clean the text - # as a temporary workaround for the openai API - texts = [clean_openai_text(text) for text in texts] - batch_size = ( api_embedding_batch_size if self.provider_type diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py index da9776990ff..d26b5f357fb 100644 --- a/backend/danswer/utils/text_processing.py +++ b/backend/danswer/utils/text_processing.py @@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str: return text +_INITIAL_FILTER = re.compile( + "[" + "\U0000FFF0-\U0000FFFF" # Specials + "\U0001F000-\U0001F9FF" # Emoticons + "\U00002000-\U0000206F" # General Punctuation + "\U00002190-\U000021FF" # Arrows + "\U00002700-\U000027BF" # Dingbats + "]+", + flags=re.UNICODE, +) + + +def clean_text(text: str) -> str: + # Remove specific Unicode ranges that might cause issues + cleaned = _INITIAL_FILTER.sub("", text) + + # Remove any control characters except for newline and tab + cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t") + + return cleaned + + def is_valid_email(text: str) -> bool: """Can use a library instead if more detailed checks are needed""" regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"