fix: isolate doc ingestion with a llama http server (#90)
kyteinsky authored Nov 6, 2024
2 parents 3b8d459 + 248af0c commit 236201d
Showing 25 changed files with 514 additions and 470 deletions.
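The gist of the change, as reflected in the config and workflow diffs below: the embedding model no longer runs inside the main backend process; ingestion talks to a separate llama.cpp-based HTTP server described by the new `embedding:` section (protocol, host, port, workers, offload_after_mins, request_timeout) in config.cpu.yaml and config.gpu.yaml. A minimal client sketch follows; the endpoint path and payload/response shapes are assumptions (an OpenAI-compatible embeddings route, as llama.cpp servers commonly expose), not something this commit pins down.

import httpx  # assumed available; the backend already configures httpx (see httpx_verify_ssl)

# Values mirror the new `embedding:` section of config.cpu.yaml / config.gpu.yaml.
EMBEDDING_CONF = {
    'protocol': 'http',
    'host': 'localhost',
    'port': 5000,
    'request_timeout': 1800,  # seconds
}

def embed_texts(texts: list[str]) -> list[list[float]]:
    '''Hypothetical call to the isolated embedding server.'''
    base_url = f"{EMBEDDING_CONF['protocol']}://{EMBEDDING_CONF['host']}:{EMBEDDING_CONF['port']}"
    # '/v1/embeddings' is an assumption; the real route used by the backend may differ.
    resp = httpx.post(
        f'{base_url}/v1/embeddings',
        json={'input': texts},
        timeout=EMBEDDING_CONF['request_timeout'],
    )
    resp.raise_for_status()
    return [item['embedding'] for item in resp.json()['data']]

# Example: embed two document chunks before adding them to the vector DB.
# vectors = embed_texts(['first chunk', 'second chunk'])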
22 changes: 19 additions & 3 deletions .github/workflows/integration-test.yml
run: |
cd data/admin/files
mv documentation/admin_manual .
cp -R documentation/developer_manual .
cd developer_manual
find . -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.md"' {} \;
cd ..
cp -R documentation/developer_manual ./developer_manual2
cd developer_manual2
find . -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.txt"' {} \;
cd ..
rm -rf documentation
- name: Setup python 3.11
@@ -152,17 +160,22 @@
run: |
cd context_chat_backend
pip install --upgrade pip setuptools wheel
pip install --no-deps -r requirements.txt
pip install -r requirements.txt
cp example.env .env
echo "NEXTCLOUD_URL=http://localhost:8080" >> .env
./main.py > backend_logs 2>&1 &
python3 -u ./main.py > backend_logs 2>&1 &
echo $! > ../pid.txt # Save the process ID (PID)
- name: Register backend
run: |
./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080
./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish
- name: Scan files, baseline
run: |
./occ files:scan admin
./occ context_chat:scan admin -m text/plain
- name: Check python memory usage
run: |
ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem
- name: Scan files
run: |
./occ files:scan admin
./occ context_chat:scan admin
./occ context_chat:scan admin -m text/markdown &
./occ context_chat:scan admin -m text/x-rst
- name: Check python memory usage
run: |
@@ -222,4 +236,6 @@ jobs:
tail data/nextcloud.log
echo '--------------------------------------------------'
[ -f context_chat_backend/backend_logs ] && cat context_chat_backend/backend_logs || echo "No backend logs"
echo '--------------------------------------------------'
[ -f context_chat_backend/embedding_model.log ] && cat context_chat_backend/embedding_model.log || echo "No embedding backend logs"
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,5 +1,6 @@
.venv/
.venv*/
__pycache__/
.env
persistent_storage/*
.vscode/
embedding_model.log
2 changes: 1 addition & 1 deletion Dockerfile
@@ -35,7 +35,7 @@ COPY requirements.txt .
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
RUN python3 -m pip install --no-cache-dir https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.84-cu122/llama_cpp_python-0.2.84-cp311-cp311-linux_x86_64.whl
RUN sed -i '/llama_cpp_python/d' requirements.txt
RUN python3 -m pip install --no-cache-dir --no-deps -r requirements.txt
RUN python3 -m pip install --no-cache-dir -r requirements.txt && python3 -m pip cache purge

# Copy application files
COPY context_chat_backend context_chat_backend
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ Install the given apps for Context Chat to work as desired **in the given order**
1. `python -m venv .venv`
2. `. .venv/bin/activate`
3. `pip install --upgrade pip setuptools wheel`
4. Install requirements `pip install --no-deps -r requirements.txt`
4. Install requirements `pip install -r requirements.txt`
5. Copy example.env to .env and fill in the variables
6. Ensure the config file at `persistent_storage/config.yaml` points to the correct config file (cpu vs gpu). If you're unsure, delete it. It will be recreated upon launching the application. The default is to point to the gpu config.
7. Configure `persistent_storage/config.yaml` for the model name, model type and its parameters (which also includes model file's path and model id as per requirements, see example config)
21 changes: 12 additions & 9 deletions config.cpu.yaml
@@ -1,14 +1,10 @@
debug: true
disable_aaa: false
httpx_verify_ssl: true
model_offload_timeout: 15 # 15 minutes
use_colors: true
uvicorn_workers: 1
embedding_chunk_size: 1000

# model files download configuration
disable_custom_model_download: false
model_download_uri: https://download.nextcloud.com/server/apps/context_chat_backend
embedding_chunk_size: 2000
doc_parser_worker_limit: 10


vectordb:
@@ -27,10 +23,17 @@ vectordb:
# url: http://localhost:8080

embedding:
protocol: http
host: localhost
port: 5000
workers: 1
offload_after_mins: 15 # in minutes
request_timeout: 1800 # in seconds
llama:
model_path: multilingual-e5-large-instruct-q6_k.gguf
device: cpu
n_batch: 512
# 'model_alias' is reserved
# 'embedding' is always set to True
model: multilingual-e5-large-instruct-q6_k.gguf
n_batch: 16
n_ctx: 8192

llm:
25 changes: 12 additions & 13 deletions config.gpu.yaml
@@ -1,14 +1,10 @@
debug: true
disable_aaa: false
httpx_verify_ssl: true
model_offload_timeout: 15 # 15 minutes
use_colors: true
uvicorn_workers: 1
embedding_chunk_size: 1000

# model files download configuration
disable_custom_model_download: false
model_download_uri: https://download.nextcloud.com/server/apps/context_chat_backend
embedding_chunk_size: 2000
doc_parser_worker_limit: 10


vectordb:
@@ -27,17 +23,20 @@ vectordb:
# url: http://localhost:8080

embedding:
protocol: http
host: localhost
port: 5000
workers: 1
offload_after_mins: 15 # in minutes
request_timeout: 1800 # in seconds
llama:
model_path: multilingual-e5-large-instruct-q6_k.gguf
n_batch: 512
# 'model_alias' is reserved
# 'embedding' is always set to True
model: multilingual-e5-large-instruct-q6_k.gguf
n_batch: 16
n_ctx: 8192
n_gpu_layers: -1

hugging_face:
model_name: nextcloud-ai/multilingual-e5-large-instruct
model_kwargs:
device: cuda

llm:
nc_texttotext:

4 changes: 0 additions & 4 deletions context_chat_backend/__init__.py
@@ -1,4 +0,0 @@
from .controller import app
from .utils import to_int

__all__ = ['app', 'to_int']
17 changes: 17 additions & 0 deletions context_chat_backend/chain/ingest/delete.py
@@ -0,0 +1,17 @@
from ...dyn_loader import VectorDBLoader
from ...vectordb.base import BaseVectorDB


def delete_by_source(vectordb_loader: VectorDBLoader, user_id: str, source_names: list[str]) -> bool:
db: BaseVectorDB = vectordb_loader.load()
return db.delete(user_id, 'source', source_names)


def delete_by_provider(vectordb_loader: VectorDBLoader, user_id: str, providerKey: str) -> bool:
db: BaseVectorDB = vectordb_loader.load()
return db.delete(user_id, 'provider', [providerKey])


def delete_for_all_users(vectordb_loader: VectorDBLoader, providerKey: str) -> bool:
db: BaseVectorDB = vectordb_loader.load()
return db.delete_for_all_users('provider', [providerKey])
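A hedged usage sketch for the new helpers above: the caller shape and the source-name strings are illustrative placeholders, not taken from this diff.

# Hypothetical caller; how the VectorDBLoader is constructed is not shown in this commit.
from context_chat_backend.chain.ingest.delete import delete_by_provider, delete_by_source

def forget_user_files(vectordb_loader, user_id: str) -> None:
    # Remove two specific documents for one user (source names are illustrative).
    ok = delete_by_source(vectordb_loader, user_id, ['files__default: 101', 'files__default: 102'])
    # Remove everything a given provider ingested for that user.
    ok = ok and delete_by_provider(vectordb_loader, user_id, 'files__default')
    if not ok:
        raise RuntimeError('vector DB deletion reported failure')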
33 changes: 24 additions & 9 deletions context_chat_backend/chain/ingest/injest.py
@@ -1,18 +1,17 @@
import re
import threading
from logging import error as log_error

from fastapi.datastructures import UploadFile
from langchain.schema import Document

from ...config_parser import TConfig
from ...dyn_loader import VectorDBLoader
from ...utils import not_none, to_int
from ...vectordb import BaseVectorDB
from .doc_loader import decode_source
from .doc_splitter import get_splitter_for
from .mimetype_list import SUPPORTED_MIMETYPES

embed_lock = threading.Lock()

def _allowed_file(file: UploadFile) -> bool:
return file.headers.get('type', default='') in SUPPORTED_MIMETYPES
@@ -72,6 +71,7 @@ def _sources_to_documents(sources: list[UploadFile]) -> dict[str, list[Document]
documents = {}

for source in sources:
print('processing source:', source.filename, flush=True)
user_id = source.headers.get('userId')
if user_id is None:
log_error(f'userId not found in headers for source: {source.filename}')
@@ -82,6 +82,8 @@ def _sources_to_documents(sources: list[UploadFile]) -> dict[str, list[Document]
if content is None or content == '':
continue

print('decoded non empty source:', source.filename, flush=True)

metadata = {
'source': source.filename,
'title': source.headers.get('title'),
@@ -114,17 +116,26 @@ def _bucket_by_type(documents: list[Document]) -> dict[str, list[Document]]:
return bucketed_documents


def _process_sources(vectordb: BaseVectorDB, config: TConfig, sources: list[UploadFile]) -> bool:
def _process_sources(
vectordb: BaseVectorDB,
config: TConfig,
sources: list[UploadFile],
) -> bool:
filtered_sources = _filter_sources(sources[0].headers['userId'], vectordb, sources)

if len(filtered_sources) == 0:
# no new sources to embed
print('Filtered all sources, nothing to embed', flush=True)
return True

print('Filtered sources:', [source.filename for source in filtered_sources], flush=True)
ddocuments: dict[str, list[Document]] = _sources_to_documents(filtered_sources)

print('Converted sources to documents')

if len(ddocuments.keys()) == 0:
# document(s) were empty, not an error
print('All documents were found empty after being processed', flush=True)
return True

success = True
@@ -135,7 +146,7 @@ def _process_sources(vectordb: BaseVectorDB, config: TConfig, sources: list[Uplo
type_bucketed_docs = _bucket_by_type(documents)

for _type, _docs in type_bucketed_docs.items():
text_splitter = get_splitter_for(config['embedding_chunk_size'], _type)
text_splitter = get_splitter_for(config.embedding_chunk_size, _type)
split_docs = text_splitter.split_documents(_docs)
split_documents.extend(split_docs)

@@ -150,21 +161,23 @@ def _process_sources(vectordb: BaseVectorDB, config: TConfig, sources: list[Uplo
# filter out empty documents
split_documents = list(filter(lambda doc: doc.page_content != '', split_documents))

print('split documents count:', len(split_documents), flush=True)

if len(split_documents) == 0:
continue

with embed_lock:
user_client = vectordb.get_user_client(user_id)
doc_ids = user_client.add_documents(split_documents)
user_client = vectordb.get_user_client(user_id)
doc_ids = user_client.add_documents(split_documents)

print('Added documents to vectordb', flush=True)
# does not do per document error checking
success &= len(split_documents) == len(doc_ids)

return success


def embed_sources(
vectordb: BaseVectorDB,
vectordb_loader: VectorDBLoader,
config: TConfig,
sources: list[UploadFile],
) -> bool:
@@ -177,7 +190,9 @@

print(
'Embedding sources:\n' +
'\n'.join([f'{source.filename} ({source.headers.get("title", "")})' for source in sources_filtered]),
'\n'.join([f'{source.filename} ({source.headers["title"]})' for source in sources_filtered]),
flush=True,
)

vectordb = vectordb_loader.load()
return _process_sources(vectordb, config, sources_filtered)
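Note how embed_sources now receives a VectorDBLoader and only calls load() once sources survive filtering, and the module-level embed_lock around add_documents is gone. dyn_loader itself is not part of this view, so the following is only a hypothetical sketch of the load-on-demand pattern the call sites imply.

from threading import Lock

class LazyLoader:
    '''Hypothetical stand-in for dyn_loader.VectorDBLoader: build the client on first use, then reuse it.'''
    def __init__(self, factory):
        self._factory = factory
        self._instance = None
        self._lock = Lock()

    def load(self):
        with self._lock:
            if self._instance is None:
                self._instance = self._factory()
            return self._instance

# Usage mirroring embed_sources(): nothing is constructed until load() is called.
# vectordb = vectordb_loader.load()
# success = _process_sources(vectordb, config, sources_filtered)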
12 changes: 7 additions & 5 deletions context_chat_backend/chain/one_shot.py
@@ -1,8 +1,9 @@

from langchain.llms.base import LLM
from typing_extensions import TypedDict

from ..config_parser import TConfig
from ..vectordb import BaseVectorDB
from ..dyn_loader import VectorDBLoader
from .context import ContextException, ScopeType, get_context_chunks, get_context_docs
from .query_proc import get_pruned_query

@@ -24,7 +25,7 @@ def process_query(
query: str,
no_ctx_template: str | None = None,
end_separator: str = '',
) -> LLMOutput:
):
"""
Raises
------
@@ -42,7 +43,7 @@ def process_query(

def process_context_query(
user_id: str,
vectordb: BaseVectorDB,
vectordb_loader: VectorDBLoader,
llm: LLM,
app_config: TConfig,
query: str,
@@ -51,14 +52,15 @@ def process_context_query(
scope_list: list[str] | None = None,
template: str | None = None,
end_separator: str = '',
) -> LLMOutput:
):
"""
Raises
------
ValueError
If the context length is too small to fit the query
"""
context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list)
db = vectordb_loader.load()
context_docs = get_context_docs(user_id, query, db, ctx_limit, scope_type, scope_list)
if len(context_docs) == 0:
raise ContextException('No documents retrieved, please index a few documents first to use context-aware mode')

2 changes: 1 addition & 1 deletion context_chat_backend/chain/query_proc.py
@@ -13,7 +13,7 @@ def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_
ValueError
If the context length is too small to fit the query
'''
llm_config = config['llm'][1]
llm_config = config.llm[1]
# fav
n_ctx = llm_config.get('n_ctx') \
or llm_config.get('config', {}).get('context_length') \
