From b873e6349cb4c51f2c1fcaf02fa67787e26216d7 Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:14:27 +0800 Subject: [PATCH] add child chunk preview number limit (#12309) --- api/configs/feature/__init__.py | 5 +++++ .../processor/parent_child_index_processor.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py index e11993ddc7a1fe..1550de0fd24d54 100644 --- a/api/configs/feature/__init__.py +++ b/api/configs/feature/__init__.py @@ -667,6 +667,11 @@ class IndexingConfig(BaseSettings): default=4000, ) + CHILD_CHUNKS_PREVIEW_NUMBER: PositiveInt = Field( + description="Maximum number of child chunks to preview", + default=50, + ) + class MultiModalTransferConfig(BaseSettings): MULTIMODAL_SEND_FORMAT: Literal["base64", "url"] = Field( diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index e8423e2b777b15..31401220818d20 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -3,6 +3,7 @@ import uuid from typing import Optional +from configs import dify_config from core.model_manager import ModelInstance from core.rag.cleaner.clean_processor import CleanProcessor from core.rag.datasource.retrieval_service import RetrievalService @@ -80,6 +81,10 @@ def transform(self, documents: list[Document], **kwargs) -> list[Document]: child_nodes = self._split_child_nodes( document, rules, process_rule.get("mode"), kwargs.get("embedding_model_instance") ) + if kwargs.get("preview"): + if len(child_nodes) > dify_config.CHILD_CHUNKS_PREVIEW_NUMBER: + child_nodes = child_nodes[: dify_config.CHILD_CHUNKS_PREVIEW_NUMBER] + document.children = child_nodes doc_id = str(uuid.uuid4()) hash = helper.generate_text_hash(document.page_content)