Each section is now cleaned before being chunked #3210

Merged: 2 commits, Nov 22, 2024
55 changes: 33 additions & 22 deletions backend/danswer/indexing/chunker.py
@@ -14,6 +14,7 @@
from danswer.indexing.models import DocAwareChunk
from danswer.natural_language_processing.utils import BaseTokenizer
from danswer.utils.logger import setup_logger
from danswer.utils.text_processing import clean_text
from danswer.utils.text_processing import shared_precompare_cleanup
from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT

@@ -220,9 +221,20 @@ def _create_chunk(
mini_chunk_texts=self._get_mini_chunk_texts(text),
)

for section in document.sections:
section_text = section.text
for section_idx, section in enumerate(document.sections):
section_text = clean_text(section.text)
section_link_text = section.link or ""
# If there is no useful content, not even the title, just drop it
if not section_text and (not document.title or section_idx > 0):
# If a section is empty and the document has no title, we can just drop it. We return a list of
# DocAwareChunks where each one contains the necessary information needed down the line for indexing.
# There is no concern about dropping whole documents from this list; it should not cause any indexing failures.
logger.warning(
f"Skipping section {section.text} from document "
f"{document.semantic_identifier} due to empty text after cleaning "
f"with link {section_link_text}"
)
continue

section_token_count = len(self.tokenizer.tokenize(section_text))

@@ -238,31 +250,26 @@ def _create_chunk(
split_texts = self.chunk_splitter.split_text(section_text)

for i, split_text in enumerate(split_texts):
split_token_count = len(self.tokenizer.tokenize(split_text))

if STRICT_CHUNK_TOKEN_LIMIT:
split_token_count = len(self.tokenizer.tokenize(split_text))
if split_token_count > content_token_limit:
# Further split the oversized chunk
smaller_chunks = self._split_oversized_chunk(
split_text, content_token_limit
)
for i, small_chunk in enumerate(smaller_chunks):
chunks.append(
_create_chunk(
text=small_chunk,
links={0: section_link_text},
is_continuation=(i != 0),
)
)
else:
if (
STRICT_CHUNK_TOKEN_LIMIT
and
# Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
len(self.tokenizer.tokenize(split_text)) > content_token_limit
):
# If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
# the token count of each split text to ensure it is
# not larger than the content_token_limit
smaller_chunks = self._split_oversized_chunk(
split_text, content_token_limit
)
for i, small_chunk in enumerate(smaller_chunks):
chunks.append(
_create_chunk(
text=split_text,
text=small_chunk,
links={0: section_link_text},
is_continuation=(i != 0),
)
)

else:
chunks.append(
_create_chunk(
@@ -354,6 +361,10 @@ def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
return normal_chunks

def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
"""
Takes in a list of documents and chunks them into smaller chunks for indexing
while persisting the document metadata.
"""
final_chunks: list[DocAwareChunk] = []
for document in documents:
if self.callback:
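To summarize the chunker change in runnable form: each section's text now goes through clean_text before token counting, and a section that is empty after cleaning is skipped unless it is the first section of a document whose title still needs to be indexed. The sketch below is illustrative only; Section, Document, and usable_sections are simplified stand-ins rather than danswer's real models, and the clean_text import assumes the danswer package is importable.

# Illustrative sketch only: Section, Document, and usable_sections are
# simplified stand-ins, not danswer's actual models or chunker.
from dataclasses import dataclass, field

from danswer.utils.text_processing import clean_text  # added by this PR


@dataclass
class Section:
    text: str
    link: str | None = None


@dataclass
class Document:
    title: str | None
    sections: list[Section] = field(default_factory=list)


def usable_sections(document: Document) -> list[tuple[int, str]]:
    """Return (index, cleaned_text) pairs for sections worth chunking."""
    kept: list[tuple[int, str]] = []
    for section_idx, section in enumerate(document.sections):
        section_text = clean_text(section.text)
        # Skip a section only when it is empty after cleaning AND it cannot
        # contribute the title (i.e. it is not the first section of a titled doc).
        if not section_text and (not document.title or section_idx > 0):
            continue
        kept.append((section_idx, section_text))
    return kept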
28 changes: 0 additions & 28 deletions backend/danswer/natural_language_processing/search_nlp_models.py
@@ -1,4 +1,3 @@
import re
import threading
import time
from collections.abc import Callable
@@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str:
return model_str.replace("/", "_").replace("-", "_").replace(".", "_")


_INITIAL_FILTER = re.compile(
"["
"\U0000FFF0-\U0000FFFF" # Specials
"\U0001F000-\U0001F9FF" # Emoticons
"\U00002000-\U0000206F" # General Punctuation
"\U00002190-\U000021FF" # Arrows
"\U00002700-\U000027BF" # Dingbats
"]+",
flags=re.UNICODE,
)


def clean_openai_text(text: str) -> str:
# Remove specific Unicode ranges that might cause issues
cleaned = _INITIAL_FILTER.sub("", text)

# Remove any control characters except for newline and tab
cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")

return cleaned


def build_model_server_url(
model_server_host: str,
model_server_port: int,
@@ -215,11 +192,6 @@ def encode(
for text in texts
]

if self.provider_type == EmbeddingProvider.OPENAI:
# If the provider is openai, we need to clean the text
# as a temporary workaround for the openai API
texts = [clean_openai_text(text) for text in texts]

batch_size = (
api_embedding_batch_size
if self.provider_type
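The deletions in this file remove clean_openai_text and the OpenAI-only cleanup inside encode; the equivalent filtering now lives in danswer/utils/text_processing.py as clean_text and is applied to every section at chunking time, independent of embedding provider. A rough sketch of that flow, where texts_for_embedding is a hypothetical helper (not part of the codebase) and the import assumes the danswer package is installed:

# Hypothetical helper illustrating the post-PR flow: text is cleaned once,
# at chunking time, so the embedding path needs no provider-specific fixups.
from danswer.utils.text_processing import clean_text


def texts_for_embedding(section_texts: list[str]) -> list[str]:
    # clean_text is applied up front; empty results are dropped here for
    # brevity (the real chunker also keeps title-bearing first sections).
    return [cleaned for text in section_texts if (cleaned := clean_text(text))]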
22 changes: 22 additions & 0 deletions backend/danswer/utils/text_processing.py
@@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str:
return text


_INITIAL_FILTER = re.compile(
"["
"\U0000FFF0-\U0000FFFF" # Specials
"\U0001F000-\U0001F9FF" # Emoticons
"\U00002000-\U0000206F" # General Punctuation
"\U00002190-\U000021FF" # Arrows
"\U00002700-\U000027BF" # Dingbats
"]+",
flags=re.UNICODE,
)


def clean_text(text: str) -> str:
# Remove specific Unicode ranges that might cause issues
cleaned = _INITIAL_FILTER.sub("", text)

# Remove any control characters except for newline and tab
cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")

return cleaned


def is_valid_email(text: str) -> bool:
"""Can use a library instead if more detailed checks are needed"""
regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
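A quick usage sketch of the new clean_text, assuming the danswer package is on the path: characters in the filtered Unicode ranges (Specials, Emoticons, General Punctuation, Arrows, Dingbats) are stripped, control characters other than newline and tab are dropped, and ordinary text and whitespace pass through.

# Usage sketch for clean_text; the input string is made up for illustration.
from danswer.utils.text_processing import clean_text

raw = "status\u2705 update\x00\t42\n"  # check mark (U+2705), NUL, tab, newline
print(repr(clean_text(raw)))
# U+2705 falls in the Dingbats range and is removed, the NUL control character
# is dropped, and the tab and newline are preserved:
# 'status update\t42\n'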