Merge branch 'openai/version-update' into zsb/dev

amadolid · Nov 28, 2023 · bf8c2cf · bf8c2cf
2 parents 3f440e6 + 995c4d9
commit bf8c2cf
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 44 deletions.
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py
@@ -4,9 +4,9 @@
 from requests import get
 from uuid import uuid4
 
-from .utils import extract_text_from_file, get_embeddings, generate_chunks
+from .utils import extract_text_from_file, get_embeddings, generate_chunks, extraction
 from jaseci.jsorc.live_actions import jaseci_action
-from elasticsearch import Elasticsearch, NotFoundError
+from elasticsearch import Elasticsearch
 
 OAI_CLIENT = None
 ES_CLIENT = None
@@ -215,6 +215,22 @@ def reapply_index_template():
  )
 
 
+@jaseci_action(act_group=["es_ret"], allow_remote=True)
+def file_is_readable(id: str, meta: dict = {}):
+    """Report whether text can be extracted from the file registered as ``id``.
+
+    The file handler is obtained from ``meta["h"].get_file_handler(id)``,
+    opened with ``mode="rb", detached=True`` and handed to ``extraction``.
+    The handler lookup itself is not guarded, so errors raised there propagate.
+
+    Returns:
+        True when ``extraction`` produces a truthy (non-empty) result;
+        False when it produces nothing or when opening/extracting raises
+        any ``Exception``.
+    """
+    from jaseci.utils.file_handler import FileHandler
+
+    handler: FileHandler = meta["h"].get_file_handler(id)
+    try:
+        with handler.open(mode="rb", detached=True) as stream:
+            return bool(extraction(stream))
+    except Exception:
+        # Best-effort probe: any failure just means "not readable".
+        return False
+
+
 def openai() -> OpenAI:
  global CONFIG, OAI_CLIENT
  if not OAI_CLIENT:

diff --git a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/utils.py b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/utils.py
@@ -18,50 +18,54 @@ def get_embeddings(texts: list, oai_client: OpenAI, config: dict = {}):
  ]
 
 
+def extraction(buff) -> str:
+    """Extract text from an open binary buffer based on its detected MIME type.
+
+    The whole buffer is read once so ``from_buffer`` can detect the MIME type,
+    then it is rewound to offset 0 before the type-specific reader consumes it.
+
+    Args:
+        buff: binary file-like object; ``read`` and ``seek`` are called on it,
+            and it is iterated line by line in the CSV case.
+
+    Returns:
+        The text produced by the reader selected for the detected MIME type.
+
+    Raises:
+        ValueError: if the detected MIME type is not one of the handled types.
+    """
+    docx_mime = (
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    )
+    pptx_mime = (
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+    )
+
+    detected = from_buffer(buff.read(), mime=True)
+    buff.seek(0)
+
+    if detected == "application/pdf":
+        # Extract text from pdf using PyPDF2; pages are joined with one space.
+        pdf = PdfReader(buff)
+        return " ".join(page.extract_text() for page in pdf.pages)
+
+    if detected in ("text/plain", "text/markdown"):
+        # Plain text and markdown are decoded as UTF-8 verbatim.
+        return buff.read().decode("utf-8")
+
+    if detected == docx_mime:
+        # Extract text from docx using docx2txt
+        return docx2txt.process(buff)
+
+    if detected == "text/csv":
+        # Each CSV row becomes one line: cells joined by a space, then "\n".
+        rows = csv.reader(raw.decode("utf-8") for raw in buff)
+        return "".join(" ".join(row) + "\n" for row in rows)
+
+    if detected == pptx_mime:
+        # Extract text from pptx using python-pptx: every run is followed by a
+        # space, and every slide is terminated by "\n".
+        pieces = []
+        for slide in pptx.Presentation(buff).slides:
+            for shape in slide.shapes:
+                if not shape.has_text_frame:
+                    continue
+                for paragraph in shape.text_frame.paragraphs:
+                    pieces.extend(run.text + " " for run in paragraph.runs)
+            pieces.append("\n")
+        return "".join(pieces)
+
+    # Unsupported file type
+    raise ValueError("Unsupported file type: {}".format(detected))
+
+
 def extract_text_from_file(file) -> str:
+    """Return the text of the file at path ``file`` with whitespace normalized.
+
+    The file is opened in binary mode and passed to ``extraction``; the result
+    is split on whitespace and re-joined with single spaces, so runs of
+    spaces/newlines collapse and leading/trailing whitespace is dropped.
+    Exceptions raised by ``open`` or ``extraction`` are not caught here.
+    """
  with open(file, "rb") as buff:
- mimetype = from_buffer(buff.read(), mime=True)
- buff.seek(0)
-
- if mimetype == "application/pdf":
- # Extract text from pdf using PyPDF2
- reader = PdfReader(buff)
- extracted_text = " ".join([page.extract_text() for page in reader.pages])
- elif mimetype == "text/plain" or mimetype == "text/markdown":
- # Read text from plain text buff
- extracted_text = buff.read().decode("utf-8")
- elif (
- mimetype
- == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
- ):
- # Extract text from docx using docx2txt
- extracted_text = docx2txt.process(buff)
- elif mimetype == "text/csv":
- # Extract text from csv using csv module
- extracted_text = ""
- decoded_buffer = (line.decode("utf-8") for line in buff)
- reader = csv.reader(decoded_buffer)
- for row in reader:
- extracted_text += " ".join(row) + "\n"
- elif (
- mimetype
- == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
- ):
- # Extract text from pptx using python-pptx
- extracted_text = ""
- presentation = pptx.Presentation(buff)
- for slide in presentation.slides:
- for shape in slide.shapes:
- if shape.has_text_frame:
- for paragraph in shape.text_frame.paragraphs:
- for run in paragraph.runs:
- extracted_text += run.text + " "
- extracted_text += "\n"
- else:
- # Unsupported file type
- raise ValueError("Unsupported file type: {}".format(mimetype))
-
- return extracted_text
+        # str.split() with no arguments splits on any whitespace run.
+ return " ".join(extraction(buff).split())
 
 
 def generate_chunks(doc: dict, config: dict) -> list: