Skip to content

Commit

Permalink
Merge branch 'openai/version-update' into zsb/dev
Browse files Browse the repository at this point in the history
  • Loading branch information
amadolid committed Nov 28, 2023
2 parents 3f440e6 + 995c4d9 commit bf8c2cf
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from requests import get
from uuid import uuid4

from .utils import extract_text_from_file, get_embeddings, generate_chunks
from .utils import extract_text_from_file, get_embeddings, generate_chunks, extraction
from jaseci.jsorc.live_actions import jaseci_action
from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch import Elasticsearch

OAI_CLIENT = None
ES_CLIENT = None
Expand Down Expand Up @@ -215,6 +215,22 @@ def reapply_index_template():
)


@jaseci_action(act_group=["es_ret"], allow_remote=True)
def file_is_readable(id: str, meta: dict = {}):
    """Report whether text can be extracted from the file identified by ``id``.

    The file handler is obtained via ``meta["h"].get_file_handler(id)``, opened
    in binary mode with ``detached=True``, and passed to ``extraction``.

    Returns:
        ``True`` when ``extraction`` yields a truthy result, ``False`` when the
        result is falsy or when opening/extracting raises any ``Exception``.

    Note:
        The handler lookup happens outside the ``try`` block, so errors raised
        there (for example a missing ``"h"`` key) are not swallowed.
    """
    from jaseci.utils.file_handler import FileHandler

    handler: FileHandler = meta["h"].get_file_handler(id)

    readable = False
    try:
        with handler.open(mode="rb", detached=True) as stream:
            readable = bool(extraction(stream))
    except Exception:
        # Best-effort probe: any failure simply means "not readable".
        readable = False

    return readable


def openai() -> OpenAI:
global CONFIG, OAI_CLIENT
if not OAI_CLIENT:
Expand Down
88 changes: 46 additions & 42 deletions jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,50 +18,54 @@ def get_embeddings(texts: list, oai_client: OpenAI, config: dict = {}):
]


def extraction(buff) -> str:
    """Extract the text content of an open binary buffer.

    The MIME type is detected from the full buffer contents, the buffer is
    rewound, and the matching extractor is applied. Handled types are PDF,
    plain text / markdown, DOCX, CSV and PPTX.

    Args:
        buff: A seekable, readable binary file-like object.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the detected MIME type is not one of the handled types.
    """
    mimetype = from_buffer(buff.read(), mime=True)
    # Detection consumed the stream; rewind so the extractor sees everything.
    buff.seek(0)

    if mimetype == "application/pdf":
        # PyPDF2: join the text of every page with single spaces.
        pages = PdfReader(buff).pages
        return " ".join(page.extract_text() for page in pages)

    if mimetype in ("text/plain", "text/markdown"):
        # Raw text: decode the whole buffer as UTF-8.
        return buff.read().decode("utf-8")

    if (
        mimetype
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        # DOCX via docx2txt.
        return docx2txt.process(buff)

    if mimetype == "text/csv":
        # CSV: one output line per row, cells separated by single spaces.
        rows = csv.reader(line.decode("utf-8") for line in buff)
        return "".join(" ".join(row) + "\n" for row in rows)

    if (
        mimetype
        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ):
        # PPTX via python-pptx: every run is followed by a space and every
        # slide is terminated by a newline.
        pieces = []
        for slide in pptx.Presentation(buff).slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    pieces.extend(run.text + " " for run in paragraph.runs)
            pieces.append("\n")
        return "".join(pieces)

    # Anything else is unsupported.
    raise ValueError("Unsupported file type: {}".format(mimetype))


def extract_text_from_file(file) -> str:
# Opens the path `file` in binary mode and returns the text extracted from it.
# NOTE(review): this span is a rendered diff with indentation stripped; it
# appears to interleave the pre-change inline implementation with the
# post-change one-line body that delegates to extraction(). Confirm against
# the actual utils.py before relying on any single line below.
with open(file, "rb") as buff:
# Detect the MIME type from the full contents, then rewind for the extractor.
mimetype = from_buffer(buff.read(), mime=True)
buff.seek(0)

if mimetype == "application/pdf":
# Extract text from pdf using PyPDF2
reader = PdfReader(buff)
extracted_text = " ".join([page.extract_text() for page in reader.pages])
elif mimetype == "text/plain" or mimetype == "text/markdown":
# Read text from plain text buff
extracted_text = buff.read().decode("utf-8")
elif (
mimetype
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
# Extract text from docx using docx2txt
extracted_text = docx2txt.process(buff)
elif mimetype == "text/csv":
# Extract text from csv using csv module
extracted_text = ""
# The buffer yields bytes lines; decode each one before handing it to csv.reader.
decoded_buffer = (line.decode("utf-8") for line in buff)
reader = csv.reader(decoded_buffer)
for row in reader:
extracted_text += " ".join(row) + "\n"
elif (
mimetype
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
# Extract text from pptx using python-pptx
extracted_text = ""
presentation = pptx.Presentation(buff)
for slide in presentation.slides:
for shape in slide.shapes:
if shape.has_text_frame:
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
extracted_text += run.text + " "
# One newline is appended after each slide's text.
extracted_text += "\n"
else:
# Unsupported file type
raise ValueError("Unsupported file type: {}".format(mimetype))

# NOTE(review): the two return lines below look like the removed/added pair of
# the diff. The first returns the inline result unchanged; the second calls
# extraction(buff) and collapses every whitespace run to a single space.
return extracted_text
return " ".join(extraction(buff).split())


def generate_chunks(doc: dict, config: dict) -> list:
Expand Down

0 comments on commit bf8c2cf

Please sign in to comment.