v.2.2.2
Significantly increased the speed of, and reduced the resources required for, both creating the vector database and searching it.

Removed the creation of relevant_contexts.txt, which was originally added for debugging purposes.
BBC-Esq authored Oct 10, 2023
1 parent 1de59f6 commit 93944b2
Showing 3 changed files with 13 additions and 11 deletions.
18 changes: 12 additions & 6 deletions document_processor.py
@@ -6,7 +6,7 @@
 from langchain.docstore.document import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import (
-    PDFMinerLoader,
+    PyMuPDFLoader,
     Docx2txtLoader,
     TextLoader,
     JSONLoader,
@@ -21,7 +21,7 @@
 INGEST_THREADS = os.cpu_count() or 8
 
 DOCUMENT_MAP = {
-    ".pdf": PDFMinerLoader,
+    ".pdf": PyMuPDFLoader,
     ".docx": Docx2txtLoader,
     ".txt": TextLoader,
     ".json": JSONLoader,
@@ -43,7 +43,14 @@ def load_single_document(file_path: str) -> Document:
         loader = loader_class(file_path)
     else:
         raise ValueError("Document type is undefined")
-    return loader.load()[0]
+
+    document = loader.load()[0]
+
+    # file_path = os.path.join(ROOT_DIRECTORY, 'test.txt')
+    # with open(file_path, 'a', encoding='utf-8') as file:
+    #     file.write(document.page_content + '\n')
+
+    return document
 
 def load_document_batch(filepaths):
     with ThreadPoolExecutor(len(filepaths)) as exe:
@@ -71,10 +78,9 @@ def load_documents(source_dir: str) -> list[Document]:
 
 def split_documents(documents):
     logging.info("Splitting documents...")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
     texts = text_splitter.split_documents(documents)
-    print(f"Number of chunks created: {len(texts)}")
-    logging.info(f"Split into {len(texts)} chunks of text.")
+    print("Splitting chunks completed.")
     return texts
 
 if __name__ == "__main__":
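For reference, here is a minimal sketch of the revised load-and-split path after this commit. It assumes the versions pinned in requirements.txt below (langchain==0.0.267, PyMuPDF==1.23.4), and the file name "sample.pdf" is a placeholder rather than anything from the repository:

    from langchain.document_loaders import PyMuPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # PyMuPDFLoader replaces PDFMinerLoader; .load() returns a list of Documents.
    docs = PyMuPDFLoader("sample.pdf").load()

    # Smaller chunks: 500 characters with 100 overlap (down from 1000/250),
    # which produces more, finer-grained chunks for the vector database.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = splitter.split_documents(docs)
    print(f"Number of chunks created: {len(texts)}")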
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
 openai==0.28.0
 langchain==0.0.267
 chromadb==0.3.29
-pdfminer.six==20221105
+PyMuPDF==1.23.4
 InstructorEmbedding==1.0.1
 sentence-transformers==2.2.2
 pandas==2.0.3
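A quick way to confirm the swapped dependency resolved correctly, as a sketch assuming PyMuPDF's conventional import name fitz:

    import fitz  # PyMuPDF is imported under the name "fitz"

    print(fitz.version)  # binding and MuPDF versions, expected to start with '1.23.4'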
4 changes: 0 additions & 4 deletions server_connector.py
@@ -66,10 +66,6 @@ def ask_local_chatgpt(query, persist_directory=PERSIST_DIRECTORY, client_setting
     prepend_string = "Only base your answer to the following question on the provided context."
     augmented_query = "\n\n---\n\n".join(contexts) + "\n\n-----\n\n" + query
     response_json = connect_to_local_chatgpt(augmented_query)
 
-    with open("relevant_contexts.txt", "w", encoding="utf-8") as f:
-        for content in contexts:
-            f.write(content + "\n\n---\n\n")
-
     return {"answer": response_json, "sources": relevant_contexts}

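For clarity, a standalone sketch of the prompt assembly visible in the hunk above; contexts and query are hypothetical placeholders here, since their real values come from parts of server_connector.py not shown in this diff:

    # Placeholder inputs; in server_connector.py these come from the
    # vector-database search and the caller's question, respectively.
    contexts = ["first retrieved chunk", "second retrieved chunk"]
    query = "What does the document say?"

    # Same assembly as in ask_local_chatgpt: contexts joined by '---',
    # then a '-----' separator, then the user's query.
    augmented_query = "\n\n---\n\n".join(contexts) + "\n\n-----\n\n" + query
    print(augmented_query)

After this commit, the retrieved contexts are no longer written to relevant_contexts.txt; they are only returned to the caller in the "sources" field.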