v.2.2.2
Significantly increased the speed of, and reduced the resources required for, both creating the vector database and searching it.

Removed the creation of relevant_contexts.txt, which was originally added for debugging purposes.
BBC-Esq authored Oct 10, 2023
1 parent 1de59f6 commit 93944b2
Showing 3 changed files with 13 additions and 11 deletions.
18 changes: 12 additions & 6 deletions document_processor.py
@@ -6,7 +6,7 @@
 from langchain.docstore.document import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import (
-    PDFMinerLoader,
+    PyMuPDFLoader,
     Docx2txtLoader,
     TextLoader,
     JSONLoader,
@@ -21,7 +21,7 @@
 INGEST_THREADS = os.cpu_count() or 8
 
 DOCUMENT_MAP = {
-    ".pdf": PDFMinerLoader,
+    ".pdf": PyMuPDFLoader,
     ".docx": Docx2txtLoader,
     ".txt": TextLoader,
     ".json": JSONLoader,
@@ -43,7 +43,14 @@ def load_single_document(file_path: str) -> Document:
         loader = loader_class(file_path)
     else:
         raise ValueError("Document type is undefined")
-    return loader.load()[0]
+
+    document = loader.load()[0]
+
+    # file_path = os.path.join(ROOT_DIRECTORY, 'test.txt')
+    # with open(file_path, 'a', encoding='utf-8') as file:
+    #     file.write(document.page_content + '\n')
+
+    return document
 
 def load_document_batch(filepaths):
     with ThreadPoolExecutor(len(filepaths)) as exe:
@@ -71,10 +78,9 @@ def load_documents(source_dir: str) -> list[Document]:
 
 def split_documents(documents):
     logging.info("Splitting documents...")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
     texts = text_splitter.split_documents(documents)
-    print(f"Number of chunks created: {len(texts)}")
-    logging.info(f"Split into {len(texts)} chunks of text.")
+    print("Splitting chunks completed.")
     return texts
 
 if __name__ == "__main__":
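For reference, here is a minimal sketch of the revised load-and-split path after this commit. It assumes the versions pinned in requirements.txt below (langchain==0.0.267, PyMuPDF==1.23.4), and the file name "sample.pdf" is a placeholder rather than anything from the repository:

    from langchain.document_loaders import PyMuPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # PyMuPDFLoader replaces PDFMinerLoader; .load() returns a list of Documents.
    docs = PyMuPDFLoader("sample.pdf").load()

    # Smaller chunks: 500 characters with 100 overlap (down from 1000/250),
    # which produces more, finer-grained chunks for the vector database.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = splitter.split_documents(docs)
    print(f"Number of chunks created: {len(texts)}")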
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
 openai==0.28.0
 langchain==0.0.267
 chromadb==0.3.29
-pdfminer.six==20221105
+PyMuPDF==1.23.4
 InstructorEmbedding==1.0.1
 sentence-transformers==2.2.2
 pandas==2.0.3
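A quick way to confirm the swapped dependency resolved correctly, as a sketch assuming PyMuPDF's conventional import name fitz:

    import fitz  # PyMuPDF is imported under the name "fitz"

    print(fitz.version)  # binding and MuPDF versions, expected to start with '1.23.4'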
4 changes: 0 additions & 4 deletions server_connector.py
@@ -66,10 +66,6 @@ def ask_local_chatgpt(query, persist_directory=PERSIST_DIRECTORY, client_setting
     prepend_string = "Only base your answer to the following question on the provided context."
     augmented_query = "\n\n---\n\n".join(contexts) + "\n\n-----\n\n" + query
     response_json = connect_to_local_chatgpt(augmented_query)
 
-    with open("relevant_contexts.txt", "w", encoding="utf-8") as f:
-        for content in contexts:
-            f.write(content + "\n\n---\n\n")
-
     return {"answer": response_json, "sources": relevant_contexts}

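For clarity, a standalone sketch of the prompt assembly visible in the hunk above; contexts and query are hypothetical placeholders here, since their real values come from parts of server_connector.py not shown in this diff:

    # Placeholder inputs; in server_connector.py these come from the
    # vector-database search and the caller's question, respectively.
    contexts = ["first retrieved chunk", "second retrieved chunk"]
    query = "What does the document say?"

    # Same assembly as in ask_local_chatgpt: contexts joined by '---',
    # then a '-----' separator, then the user's query.
    augmented_query = "\n\n---\n\n".join(contexts) + "\n\n-----\n\n" + query
    print(augmented_query)

After this commit, the retrieved contexts are no longer written to relevant_contexts.txt; they are only returned to the caller in the "sources" field.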