Skip to content

Commit

Permalink
feat: support kb document manage
Browse files Browse the repository at this point in the history
  • Loading branch information
Mini256 committed Nov 27, 2024
1 parent 8d1087e commit b0d751b
Show file tree
Hide file tree
Showing 9 changed files with 462 additions and 118 deletions.
4 changes: 1 addition & 3 deletions backend/app/api/admin_routes/document/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
from fastapi import APIRouter, Depends, Query
from fastapi_pagination import Params, Page

from app.api.admin_routes.document.models import DocumentItem
from app.api.admin_routes.knowledge_base.document.models import DocumentFilters, DocumentItem
from app.api.deps import SessionDep, CurrentSuperuserDep
from app.repositories import document_repo

from app.repositories.document import DocumentFilters

# Router for the endpoints registered in this module.
router = APIRouter()


Expand Down
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,10 @@ class DocumentItem(BaseModel):
knowledge_base: KnowledgeBaseDescriptor | None
last_modified_at: datetime
created_at: datetime
updated_at: datetime
updated_at: datetime


class KBDocumentUpload(BaseModel):
    # Request payload accepted by the knowledge-base document upload endpoint.
    # Documented with `#` comments rather than a docstring so the generated
    # schema description of the model stays exactly as it was.

    # Integer identifier of the file being referenced.
    file_id: int
    # Name of that file.
    file_name: str

153 changes: 153 additions & 0 deletions backend/app/api/admin_routes/knowledge_base/document/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import logging
from typing import Annotated

from fastapi import APIRouter, Depends, Query
from fastapi_pagination import Params, Page

from app.api.admin_routes.knowledge_base.data_sources.models import KBDataSource
from app.api.admin_routes.knowledge_base.document.models import KBDocumentUpload, DocumentFilters, DocumentItem
from app.api.admin_routes.knowledge_base.models import ChunkItem
from app.api.admin_routes.knowledge_base.routes import logger
from app.api.deps import SessionDep, CurrentSuperuserDep
from app.exceptions import (
InternalServerError,
KBDataSourceNotFoundError,
KnowledgeBaseNotFoundError
)
from app.models import DataSource
from app.models.chunk import get_kb_chunk_model
from app.repositories import knowledge_base_repo, document_repo
from app.repositories.chunk import ChunkRepo
from app.tasks import build_index_for_document, build_kg_index_for_chunk
from app.tasks.knowledge_base import (
import_documents_from_kb_datasource,
purge_kb_datasource_related_resources
)


# Router that collects the knowledge-base document admin endpoints declared below.
router = APIRouter()
# Module-level logger named after this module. This assignment rebinds the
# `logger` name imported above from app.api.admin_routes.knowledge_base.routes.
logger = logging.getLogger(__name__)


@router.post("/admin/knowledge_bases/{kb_id}/documents/upload")
def upload_kb_document(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    upload: KBDocumentUpload
) -> KBDataSource:
    """Attach an uploaded file to a knowledge base and queue its import.

    A new ``DataSource`` is built from the upload payload, linked to the
    knowledge base via ``knowledge_base_repo.add_kb_datasource`` and then
    handed to the ``import_documents_from_kb_datasource`` background task.

    Args:
        session: Database session dependency.
        user: Superuser dependency; not read by the body.
        kb_id: Id of the knowledge base receiving the document.
        upload: Payload carrying ``file_id`` and ``file_name``.

    Returns:
        The data source object returned by ``add_kb_datasource``.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        kb = knowledge_base_repo.must_get(session, kb_id)

        # KBDocumentUpload only declares `file_id` and `file_name`; the previous
        # code read `upload.name`, `upload.data_source_type` and `upload.config`,
        # which do not exist on that model, so every request failed with an
        # AttributeError that was reported as an internal server error.
        new_data_source = DataSource(
            name=upload.file_name,
            description="",
            # NOTE(review): assumes "file" is the data source type value used
            # for uploaded files -- confirm against the DataSource type enum.
            data_source_type="file",
            # NOTE(review): assumes file data sources store a list of
            # {"file_id", "file_name"} entries in `config` -- confirm against
            # the file data source loader.
            config=[{"file_id": upload.file_id, "file_name": upload.file_name}],
        )
        new_data_source = knowledge_base_repo.add_kb_datasource(session, kb, new_data_source)

        # Import runs asynchronously; the response does not wait for it.
        import_documents_from_kb_datasource.delay(kb_id, new_data_source.id)

        return new_data_source
    except KnowledgeBaseNotFoundError:
        raise
    except Exception as e:
        logger.error(f"Failed to create data source for knowledge base #{kb_id}: {e}", exc_info=e)
        raise InternalServerError()


@router.get("/admin/knowledge_bases/{kb_id}/documents")
def list_kb_documents(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    filters: Annotated[DocumentFilters, Query()],
    params: Params = Depends(),
) -> Page[DocumentItem]:
    """Return one page of documents restricted to the given knowledge base.

    The knowledge base is looked up first; its id then overwrites
    ``filters.knowledge_base_id`` before the filters are passed to
    ``document_repo.paginate``.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        knowledge_base = knowledge_base_repo.must_get(session, kb_id)
        # Pin the filter to this knowledge base regardless of what the query supplied.
        filters.knowledge_base_id = knowledge_base.id
        page = document_repo.paginate(
            session=session,
            filters=filters,
            params=params,
        )
    except KnowledgeBaseNotFoundError:
        raise
    except Exception as err:
        logger.exception(err)
        raise InternalServerError()
    return page


@router.get("/admin/knowledge_bases/{kb_id}/documents/{doc_id}/chunks")
def list_kb_chunks(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    doc_id: int,
) -> list[ChunkItem]:
    """Return the chunks of document ``doc_id`` in knowledge base ``kb_id``.

    The chunk model is obtained from ``get_kb_chunk_model`` for the resolved
    knowledge base and wrapped in a ``ChunkRepo`` to run the query.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        knowledge_base = knowledge_base_repo.must_get(session, kb_id)
        chunk_model = get_kb_chunk_model(knowledge_base)
        chunks = ChunkRepo(chunk_model).get_document_chunks(session, doc_id)
    except KnowledgeBaseNotFoundError:
        raise
    except Exception as err:
        logger.exception(err)
        raise InternalServerError()
    return chunks


@router.post("/admin/knowledge_bases/{kb_id}/documents/reindex")
def batch_reindex_kb_documents(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    document_ids: list[int]
) -> dict:
    """Queue re-indexing tasks for the listed documents of a knowledge base.

    For every document id one ``build_index_for_document`` task is queued,
    followed by one ``build_kg_index_for_chunk`` task per chunk currently
    returned for that document.

    Returns:
        A dict with a ``detail`` message that reports how many document ids
        were submitted.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        knowledge_base = knowledge_base_repo.must_get(session, kb_id)
        repo = ChunkRepo(get_kb_chunk_model(knowledge_base))

        for doc_id in document_ids:
            # Document-level index task is queued before the chunk lookup.
            build_index_for_document.delay(knowledge_base.id, doc_id)

            for chunk in repo.get_document_chunks(session, doc_id):
                build_kg_index_for_chunk.delay(knowledge_base.id, chunk.id)

        message = f"Triggered {len(document_ids)} documents to reindex knowledge base #{kb_id} successfully"
        return {"detail": message}
    except KnowledgeBaseNotFoundError:
        raise
    except Exception as err:
        logger.exception(err)
        raise InternalServerError()


@router.delete("/admin/knowledge_bases/{kb_id}/documents/{document_id}")
def remove_kb_document(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    data_source_id: int,
):
    """Flag a data source of a knowledge base for removal and queue its purge.

    NOTE(review): the route declares a ``{document_id}`` path placeholder but
    the function has no parameter of that name; ``data_source_id`` is not part
    of the path (FastAPI would presumably read it as a query parameter) and
    the body operates on a data source rather than a document. This looks
    carried over from the data source removal endpoint -- confirm the
    intended contract before relying on it.

    Returns:
        ``{"detail": "success"}`` once the purge task has been queued.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        KBDataSourceNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        knowledge_base = knowledge_base_repo.must_get(session, kb_id)
        target = knowledge_base.must_get_data_source_by_id(data_source_id)

        # Only mark the entry here; the queued background task below performs
        # the complete deletion.
        knowledge_base_repo.remove_kb_document(session, knowledge_base, target)

        purge_kb_datasource_related_resources.delay(kb_id, data_source_id)

        return {"detail": "success"}
    except (KnowledgeBaseNotFoundError, KBDataSourceNotFoundError):
        raise
    except Exception as e:
        logger.error(f"Failed to remove data source #{data_source_id} from knowledge base #{kb_id}: {e}", exc_info=e)
        raise InternalServerError()
123 changes: 20 additions & 103 deletions backend/app/api/admin_routes/knowledge_base/routes.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,22 @@
import logging
from typing import Annotated

from fastapi import APIRouter, Depends, logger, Query
from fastapi import APIRouter, Depends
from fastapi_pagination import Params, Page

from app.models.chunk import get_kb_chunk_model
from app.rag.knowledge_base.index_store import init_kb_tidb_vector_store, init_kb_tidb_graph_store
from app.repositories.chunk import ChunkRepo
from app.repositories.embedding_model import embedding_model_repo
from app.repositories.llm import get_default_db_llm

from app.repositories.embedding_model import embed_model_repo
from app.repositories.llm import llm_repo

from .models import (
KnowledgeBaseDetail,
KnowledgeBaseItem,
KnowledgeBaseCreate, ChunkItem, KnowledgeBaseUpdate, VectorIndexError, KGIndexError
KnowledgeBaseCreate, KnowledgeBaseUpdate, VectorIndexError, KGIndexError
)
from app.api.deps import SessionDep, CurrentSuperuserDep
from app.exceptions import (
InternalServerError,
KnowledgeBaseNotFoundError,
KBNoVectorIndexConfiguredError,
KBNoLLMConfiguredError,
KBNoEmbedModelConfiguredError
KBNoVectorIndexConfiguredError
)
from app.models import (
KnowledgeBase,
Expand All @@ -32,10 +26,12 @@
build_kg_index_for_chunk,
build_index_for_document,
)
from app.repositories import knowledge_base_repo, data_source_repo, document_repo
from app.tasks.knowledge_base import import_documents_for_knowledge_base, purge_knowledge_base_related_resources, \
stats_for_knowledge_base
from ..document.models import DocumentItem, DocumentFilters
from app.repositories import knowledge_base_repo, data_source_repo
from app.tasks.knowledge_base import (
import_documents_for_knowledge_base,
stats_for_knowledge_base,
purge_knowledge_base_related_resources
)

# Router for the endpoints registered in this module.
router = APIRouter()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
Expand All @@ -52,34 +48,24 @@ def create_knowledge_base(
data_source_repo.create(session, DataSource(
name=data_source.name,
description='',
user_id=user.id,
data_source_type=data_source.data_source_type,
config=data_source.config,
)) for data_source in create.data_sources
]

db_llm_id = create.llm_id
if not db_llm_id:
default_llm = get_default_db_llm(session)
if default_llm:
db_llm_id = default_llm.id
else:
raise KBNoLLMConfiguredError()

db_embed_model_id = create.embedding_model_id
if not db_embed_model_id:
default_embed_model = embedding_model_repo.get_default_model(session)
if default_embed_model:
db_embed_model_id = default_embed_model.id
else:
raise KBNoEmbedModelConfiguredError()
if not create.llm_id:
create.llm_id = llm_repo.must_get_default_llm(session).id

if not create.embedding_model_id:
create.embedding_model_id = embed_model_repo.must_get_default_model(session).id

knowledge_base = KnowledgeBase(
name=create.name,
description=create.description,
index_methods=create.index_methods,
llm_id=db_llm_id,
embedding_model_id=db_embed_model_id,
llm_id=create.llm_id,
embedding_model_id=create.embedding_model_id,
data_sources=data_sources,
created_by=user.id,
updated_by=user.id,
Expand All @@ -97,7 +83,7 @@ def create_knowledge_base(
except KBNoVectorIndexConfiguredError as e:
raise e
except Exception as e:
logging.exception(e)
logger.exception(e)
raise InternalServerError()


Expand Down Expand Up @@ -141,7 +127,7 @@ def update_knowledge_base_setting(
except KBNoVectorIndexConfiguredError as e:
raise e
except Exception as e:
logging.exception(e)
logger.exception(e)
raise InternalServerError()


Expand Down Expand Up @@ -190,75 +176,6 @@ def get_knowledge_base_index_overview(
raise InternalServerError()


@router.get("/admin/knowledge_bases/{kb_id}/documents")
def list_knowledge_base_documents(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    filters: Annotated[DocumentFilters, Query()],
    params: Params = Depends(),
) -> Page[DocumentItem]:
    """Return one page of documents restricted to the given knowledge base.

    The id of the resolved knowledge base overwrites
    ``filters.knowledge_base_id`` before ``document_repo.paginate`` is called.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        knowledge_base = knowledge_base_repo.must_get(session, kb_id)
        # Scope the listing to this knowledge base.
        filters.knowledge_base_id = knowledge_base.id
        page = document_repo.paginate(
            session=session,
            filters=filters,
            params=params,
        )
    except KnowledgeBaseNotFoundError:
        raise
    except Exception as err:
        logger.exception(err)
        raise InternalServerError()
    return page


@router.get("/admin/knowledge_bases/{kb_id}/documents/{doc_id}/chunks")
def list_knowledge_base_chunks(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    doc_id: int,
) -> list[ChunkItem]:
    """Return the chunks of document ``doc_id`` in knowledge base ``kb_id``.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        knowledge_base = knowledge_base_repo.must_get(session, kb_id)
        chunk_model = get_kb_chunk_model(knowledge_base)
        chunks = ChunkRepo(chunk_model).get_document_chunks(session, doc_id)
    except KnowledgeBaseNotFoundError:
        raise
    except Exception as err:
        logger.exception(err)
        raise InternalServerError()
    return chunks


@router.post("/admin/knowledge_bases/{kb_id}/documents/reindex")
def batch_reindex_knowledge_base_documents(
    session: SessionDep,
    user: CurrentSuperuserDep,
    kb_id: int,
    document_ids: list[int]
) -> dict:
    """Queue re-indexing tasks for the listed documents of a knowledge base.

    One ``build_index_for_document`` task is queued per document id, followed
    by one ``build_kg_index_for_chunk`` task per chunk returned for it.

    Returns:
        A dict with a ``detail`` message that reports how many document ids
        were submitted.

    Raises:
        KnowledgeBaseNotFoundError: Re-raised unchanged when raised in the body.
        InternalServerError: For any other exception (logged first).
    """
    try:
        knowledge_base = knowledge_base_repo.must_get(session, kb_id)
        repo = ChunkRepo(get_kb_chunk_model(knowledge_base))

        for doc_id in document_ids:
            build_index_for_document.delay(knowledge_base.id, doc_id)

            for chunk in repo.get_document_chunks(session, doc_id):
                build_kg_index_for_chunk.delay(knowledge_base.id, chunk.id)

        message = f"Triggered {len(document_ids)} documents to reindex knowledge base #{kb_id} successfully"
        return {"detail": message}
    except KnowledgeBaseNotFoundError:
        raise
    except Exception as err:
        logger.exception(err)
        raise InternalServerError()


@router.get("/admin/knowledge_bases/{kb_id}/vector-index-errors")
def list_kb_vector_index_errors(
session: SessionDep,
Expand Down
13 changes: 13 additions & 0 deletions backend/app/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@ def __init__(self, knowledge_base_id: int):
self.detail = f"llm #{knowledge_base_id} is not found"


class DBRerankerNotFoundError(HTTPException):
    """HTTP 404 error naming a reranker id that could not be found."""

    # Declared on the class because the constructor below only sets `detail`
    # and does not call HTTPException.__init__.
    status_code = 404

    def __init__(self, reranker_id: int):
        """Store a detail message that embeds ``reranker_id``."""
        self.detail = f"reranker #{reranker_id} is not found"

class DefaultRerankerNotFoundError(HTTPException):
    """HTTP 404 error stating that the default reranker is not found."""

    # Declared on the class because the constructor below only sets `detail`
    # and does not call HTTPException.__init__.
    status_code = 404

    def __init__(self):
        """Store the fixed detail message."""
        self.detail = "default reranker is not found"


class KnowledgeBaseNotFoundError(HTTPException):
status_code = 404

Expand Down
Loading

0 comments on commit b0d751b

Please sign in to comment.