fix: isolate doc ingestion with a llama http server (#90)
kyteinsky authored Nov 6, 2024
2 parents 3b8d459 + 248af0c commit 236201d
Showing 25 changed files with 514 additions and 470 deletions.
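The gist of the change, as reflected in the config and workflow diffs below: the embedding model no longer runs inside the main backend process; ingestion talks to a separate llama.cpp-based HTTP server described by the new `embedding:` section (protocol, host, port, workers, offload_after_mins, request_timeout) in config.cpu.yaml and config.gpu.yaml. A minimal client sketch follows; the endpoint path and payload/response shapes are assumptions (an OpenAI-compatible embeddings route, as llama.cpp servers commonly expose), not something this commit pins down.

import httpx  # assumed available; the backend already configures httpx (see httpx_verify_ssl)

# Values mirror the new `embedding:` section of config.cpu.yaml / config.gpu.yaml.
EMBEDDING_CONF = {
    'protocol': 'http',
    'host': 'localhost',
    'port': 5000,
    'request_timeout': 1800,  # seconds
}

def embed_texts(texts: list[str]) -> list[list[float]]:
    '''Hypothetical call to the isolated embedding server.'''
    base_url = f"{EMBEDDING_CONF['protocol']}://{EMBEDDING_CONF['host']}:{EMBEDDING_CONF['port']}"
    # '/v1/embeddings' is an assumption; the real route used by the backend may differ.
    resp = httpx.post(
        f'{base_url}/v1/embeddings',
        json={'input': texts},
        timeout=EMBEDDING_CONF['request_timeout'],
    )
    resp.raise_for_status()
    return [item['embedding'] for item in resp.json()['data']]

# Example: embed two document chunks before adding them to the vector DB.
# vectors = embed_texts(['first chunk', 'second chunk'])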
22 changes: 19 additions & 3 deletions .github/workflows/integration-test.yml
run: |
cd data/admin/files
mv documentation/admin_manual .
cp -R documentation/developer_manual .
cd developer_manual
find . -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.md"' {} \;
cd ..
cp -R documentation/developer_manual ./developer_manual2
cd developer_manual2
find . -type f -name "*.rst" -exec bash -c 'mv "$0" "${0%.rst}.txt"' {} \;
cd ..
rm -rf documentation
- name: Setup python 3.11
@@ -152,17 +160,22 @@
run: |
cd context_chat_backend
pip install --upgrade pip setuptools wheel
pip install --no-deps -r requirements.txt
pip install -r requirements.txt
cp example.env .env
echo "NEXTCLOUD_URL=http://localhost:8080" >> .env
./main.py > backend_logs 2>&1 &
python3 -u ./main.py > backend_logs 2>&1 &
echo $! > ../pid.txt # Save the process ID (PID)
- name: Register backend
run: |
./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080
./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish
- name: Scan files, baseline
run: |
./occ files:scan admin
./occ context_chat:scan admin -m text/plain
- name: Check python memory usage
run: |
ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem
- name: Scan files
run: |
./occ files:scan admin
./occ context_chat:scan admin
./occ context_chat:scan admin -m text/markdown &
./occ context_chat:scan admin -m text/x-rst
- name: Check python memory usage
run: |
@@ -222,4 +236,6 @@ jobs:
tail data/nextcloud.log
echo '--------------------------------------------------'
[ -f context_chat_backend/backend_logs ] && cat context_chat_backend/backend_logs || echo "No backend logs"
echo '--------------------------------------------------'
[ -f context_chat_backend/embedding_model.log ] && cat context_chat_backend/embedding_model.log || echo "No embedding backend logs"
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,5 +1,6 @@
.venv/
.venv*/
__pycache__/
.env
persistent_storage/*
.vscode/
embedding_model.log
2 changes: 1 addition & 1 deletion Dockerfile
@@ -35,7 +35,7 @@ COPY requirements.txt .
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
RUN python3 -m pip install --no-cache-dir https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.84-cu122/llama_cpp_python-0.2.84-cp311-cp311-linux_x86_64.whl
RUN sed -i '/llama_cpp_python/d' requirements.txt
RUN python3 -m pip install --no-cache-dir --no-deps -r requirements.txt
RUN python3 -m pip install --no-cache-dir -r requirements.txt && python3 -m pip cache purge

# Copy application files
COPY context_chat_backend context_chat_backend
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ Install the given apps for Context Chat to work as desired **in the given order**
1. `python -m venv .venv`
2. `. .venv/bin/activate`
3. `pip install --upgrade pip setuptools wheel`
4. Install requirements `pip install --no-deps -r requirements.txt`
4. Install requirements `pip install -r requirements.txt`
5. Copy example.env to .env and fill in the variables
6. Ensure the config file at `persistent_storage/config.yaml` points to the correct config file (cpu vs gpu). If you're unsure, delete it. It will be recreated upon launching the application. The default is to point to the gpu config.
7. Configure `persistent_storage/config.yaml` for the model name, model type and its parameters (which also includes model file's path and model id as per requirements, see example config)
21 changes: 12 additions & 9 deletions config.cpu.yaml
@@ -1,14 +1,10 @@
debug: true
disable_aaa: false
httpx_verify_ssl: true
model_offload_timeout: 15 # 15 minutes
use_colors: true
uvicorn_workers: 1
embedding_chunk_size: 1000

# model files download configuration
disable_custom_model_download: false
model_download_uri: https://download.nextcloud.com/server/apps/context_chat_backend
embedding_chunk_size: 2000
doc_parser_worker_limit: 10


vectordb:
@@ -27,10 +23,17 @@ vectordb:
# url: http://localhost:8080

embedding:
protocol: http
host: localhost
port: 5000
workers: 1
offload_after_mins: 15 # in minutes
request_timeout: 1800 # in seconds
llama:
model_path: multilingual-e5-large-instruct-q6_k.gguf
device: cpu
n_batch: 512
# 'model_alias' is reserved
# 'embedding' is always set to True
model: multilingual-e5-large-instruct-q6_k.gguf
n_batch: 16
n_ctx: 8192

llm:
25 changes: 12 additions & 13 deletions config.gpu.yaml
@@ -1,14 +1,10 @@
debug: true
disable_aaa: false
httpx_verify_ssl: true
model_offload_timeout: 15 # 15 minutes
use_colors: true
uvicorn_workers: 1
embedding_chunk_size: 1000

# model files download configuration
disable_custom_model_download: false
model_download_uri: https://download.nextcloud.com/server/apps/context_chat_backend
embedding_chunk_size: 2000
doc_parser_worker_limit: 10


vectordb:
@@ -27,17 +23,20 @@ vectordb:
# url: http://localhost:8080

embedding:
protocol: http
host: localhost
port: 5000
workers: 1
offload_after_mins: 15 # in minutes
request_timeout: 1800 # in seconds
llama:
model_path: multilingual-e5-large-instruct-q6_k.gguf
n_batch: 512
# 'model_alias' is reserved
# 'embedding' is always set to True
model: multilingual-e5-large-instruct-q6_k.gguf
n_batch: 16
n_ctx: 8192
n_gpu_layers: -1

hugging_face:
model_name: nextcloud-ai/multilingual-e5-large-instruct
model_kwargs:
device: cuda

llm:
nc_texttotext:

4 changes: 0 additions & 4 deletions context_chat_backend/__init__.py
@@ -1,4 +0,0 @@
from .controller import app
from .utils import to_int

__all__ = ['app', 'to_int']
17 changes: 17 additions & 0 deletions context_chat_backend/chain/ingest/delete.py
@@ -0,0 +1,17 @@
from ...dyn_loader import VectorDBLoader
from ...vectordb.base import BaseVectorDB


def delete_by_source(vectordb_loader: VectorDBLoader, user_id: str, source_names: list[str]) -> bool:
db: BaseVectorDB = vectordb_loader.load()
return db.delete(user_id, 'source', source_names)


def delete_by_provider(vectordb_loader: VectorDBLoader, user_id: str, providerKey: str) -> bool:
db: BaseVectorDB = vectordb_loader.load()
return db.delete(user_id, 'provider', [providerKey])


def delete_for_all_users(vectordb_loader: VectorDBLoader, providerKey: str) -> bool:
db: BaseVectorDB = vectordb_loader.load()
return db.delete_for_all_users('provider', [providerKey])
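A hedged usage sketch for the new helpers above: the caller shape and the source-name strings are illustrative placeholders, not taken from this diff.

# Hypothetical caller; how the VectorDBLoader is constructed is not shown in this commit.
from context_chat_backend.chain.ingest.delete import delete_by_provider, delete_by_source

def forget_user_files(vectordb_loader, user_id: str) -> None:
    # Remove two specific documents for one user (source names are illustrative).
    ok = delete_by_source(vectordb_loader, user_id, ['files__default: 101', 'files__default: 102'])
    # Remove everything a given provider ingested for that user.
    ok = ok and delete_by_provider(vectordb_loader, user_id, 'files__default')
    if not ok:
        raise RuntimeError('vector DB deletion reported failure')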
33 changes: 24 additions & 9 deletions context_chat_backend/chain/ingest/injest.py
@@ -1,18 +1,17 @@
import re
import threading
from logging import error as log_error

from fastapi.datastructures import UploadFile
from langchain.schema import Document

from ...config_parser import TConfig
from ...dyn_loader import VectorDBLoader
from ...utils import not_none, to_int
from ...vectordb import BaseVectorDB
from .doc_loader import decode_source
from .doc_splitter import get_splitter_for
from .mimetype_list import SUPPORTED_MIMETYPES

embed_lock = threading.Lock()

def _allowed_file(file: UploadFile) -> bool:
return file.headers.get('type', default='') in SUPPORTED_MIMETYPES
@@ -72,6 +71,7 @@ def _sources_to_documents(sources: list[UploadFile]) -> dict[str, list[Document]
documents = {}

for source in sources:
print('processing source:', source.filename, flush=True)
user_id = source.headers.get('userId')
if user_id is None:
log_error(f'userId not found in headers for source: {source.filename}')
@@ -82,6 +82,8 @@ def _sources_to_documents(sources: list[UploadFile]) -> dict[str, list[Document]
if content is None or content == '':
continue

print('decoded non empty source:', source.filename, flush=True)

metadata = {
'source': source.filename,
'title': source.headers.get('title'),
@@ -114,17 +116,26 @@ def _bucket_by_type(documents: list[Document]) -> dict[str, list[Document]]:
return bucketed_documents


def _process_sources(vectordb: BaseVectorDB, config: TConfig, sources: list[UploadFile]) -> bool:
def _process_sources(
vectordb: BaseVectorDB,
config: TConfig,
sources: list[UploadFile],
) -> bool:
filtered_sources = _filter_sources(sources[0].headers['userId'], vectordb, sources)

if len(filtered_sources) == 0:
# no new sources to embed
print('Filtered all sources, nothing to embed', flush=True)
return True

print('Filtered sources:', [source.filename for source in filtered_sources], flush=True)
ddocuments: dict[str, list[Document]] = _sources_to_documents(filtered_sources)

print('Converted sources to documents')

if len(ddocuments.keys()) == 0:
# document(s) were empty, not an error
print('All documents were found empty after being processed', flush=True)
return True

success = True
@@ -135,7 +146,7 @@ def _process_sources(vectordb: BaseVectorDB, config: TConfig, sources: list[Uplo
type_bucketed_docs = _bucket_by_type(documents)

for _type, _docs in type_bucketed_docs.items():
text_splitter = get_splitter_for(config['embedding_chunk_size'], _type)
text_splitter = get_splitter_for(config.embedding_chunk_size, _type)
split_docs = text_splitter.split_documents(_docs)
split_documents.extend(split_docs)

@@ -150,21 +161,23 @@ def _process_sources(vectordb: BaseVectorDB, config: TConfig, sources: list[Uplo
# filter out empty documents
split_documents = list(filter(lambda doc: doc.page_content != '', split_documents))

print('split documents count:', len(split_documents), flush=True)

if len(split_documents) == 0:
continue

with embed_lock:
user_client = vectordb.get_user_client(user_id)
doc_ids = user_client.add_documents(split_documents)
user_client = vectordb.get_user_client(user_id)
doc_ids = user_client.add_documents(split_documents)

print('Added documents to vectordb', flush=True)
# does not do per document error checking
success &= len(split_documents) == len(doc_ids)

return success


def embed_sources(
vectordb: BaseVectorDB,
vectordb_loader: VectorDBLoader,
config: TConfig,
sources: list[UploadFile],
) -> bool:
@@ -177,7 +190,9 @@

print(
'Embedding sources:\n' +
'\n'.join([f'{source.filename} ({source.headers.get("title", "")})' for source in sources_filtered]),
'\n'.join([f'{source.filename} ({source.headers["title"]})' for source in sources_filtered]),
flush=True,
)

vectordb = vectordb_loader.load()
return _process_sources(vectordb, config, sources_filtered)
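Note how embed_sources now receives a VectorDBLoader and only calls load() once sources survive filtering, and the module-level embed_lock around add_documents is gone. dyn_loader itself is not part of this view, so the following is only a hypothetical sketch of the load-on-demand pattern the call sites imply.

from threading import Lock

class LazyLoader:
    '''Hypothetical stand-in for dyn_loader.VectorDBLoader: build the client on first use, then reuse it.'''
    def __init__(self, factory):
        self._factory = factory
        self._instance = None
        self._lock = Lock()

    def load(self):
        with self._lock:
            if self._instance is None:
                self._instance = self._factory()
            return self._instance

# Usage mirroring embed_sources(): nothing is constructed until load() is called.
# vectordb = vectordb_loader.load()
# success = _process_sources(vectordb, config, sources_filtered)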
12 changes: 7 additions & 5 deletions context_chat_backend/chain/one_shot.py
@@ -1,8 +1,9 @@

from langchain.llms.base import LLM
from typing_extensions import TypedDict

from ..config_parser import TConfig
from ..vectordb import BaseVectorDB
from ..dyn_loader import VectorDBLoader
from .context import ContextException, ScopeType, get_context_chunks, get_context_docs
from .query_proc import get_pruned_query

@@ -24,7 +25,7 @@ def process_query(
query: str,
no_ctx_template: str | None = None,
end_separator: str = '',
) -> LLMOutput:
):
"""
Raises
------
@@ -42,7 +43,7 @@ def process_query(

def process_context_query(
user_id: str,
vectordb: BaseVectorDB,
vectordb_loader: VectorDBLoader,
llm: LLM,
app_config: TConfig,
query: str,
@@ -51,14 +52,15 @@ def process_context_query(
scope_list: list[str] | None = None,
template: str | None = None,
end_separator: str = '',
) -> LLMOutput:
):
"""
Raises
------
ValueError
If the context length is too small to fit the query
"""
context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list)
db = vectordb_loader.load()
context_docs = get_context_docs(user_id, query, db, ctx_limit, scope_type, scope_list)
if len(context_docs) == 0:
raise ContextException('No documents retrieved, please index a few documents first to use context-aware mode')

2 changes: 1 addition & 1 deletion context_chat_backend/chain/query_proc.py
@@ -13,7 +13,7 @@ def get_pruned_query(llm: LLM, config: TConfig, query: str, template: str, text_
ValueError
If the context length is too small to fit the query
'''
llm_config = config['llm'][1]
llm_config = config.llm[1]
# fav
n_ctx = llm_config.get('n_ctx') \
or llm_config.get('config', {}).get('context_length') \
