diff --git a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py index 2f654ab5f7..53d48e80af 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py +++ b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py @@ -4,9 +4,9 @@ from requests import get from uuid import uuid4 -from .utils import extract_text_from_file, get_embeddings, generate_chunks +from .utils import extract_text_from_file, get_embeddings, generate_chunks, extraction from jaseci.jsorc.live_actions import jaseci_action -from elasticsearch import Elasticsearch, NotFoundError +from elasticsearch import Elasticsearch OAI_CLIENT = None ES_CLIENT = None @@ -215,6 +215,22 @@ def reapply_index_template(): ) +@jaseci_action(act_group=["es_ret"], allow_remote=True) +def file_is_readable(id: str, meta: dict = {}): + """temp""" + from jaseci.utils.file_handler import FileHandler + + file_handler: FileHandler = meta["h"].get_file_handler(id) + try: + with file_handler.open(mode="rb", detached=True) as buff: + if extraction(buff): + return True + except Exception: + pass + + return False + + def openai() -> OpenAI: global CONFIG, OAI_CLIENT if not OAI_CLIENT: diff --git a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/utils.py b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/utils.py index 219d0e7928..3fff989b31 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/utils.py +++ b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/utils.py @@ -18,50 +18,54 @@ def get_embeddings(texts: list, oai_client: OpenAI, config: dict = {}): ] +def extraction(buff) -> str: + mimetype = from_buffer(buff.read(), mime=True) + buff.seek(0) + + if mimetype == "application/pdf": + # Extract text from pdf using PyPDF2 + reader = PdfReader(buff) + return " ".join([page.extract_text() for page in reader.pages]) + elif mimetype == "text/plain" or mimetype == "text/markdown": + # Read text from plain text buff + return buff.read().decode("utf-8") + elif ( + mimetype + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ): + # Extract text from docx using docx2txt + return docx2txt.process(buff) + elif mimetype == "text/csv": + # Extract text from csv using csv module + extracted_text = "" + decoded_buffer = (line.decode("utf-8") for line in buff) + reader = csv.reader(decoded_buffer) + for row in reader: + extracted_text += " ".join(row) + "\n" + return extracted_text + elif ( + mimetype + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ): + # Extract text from pptx using python-pptx + extracted_text = "" + presentation = pptx.Presentation(buff) + for slide in presentation.slides: + for shape in slide.shapes: + if shape.has_text_frame: + for paragraph in shape.text_frame.paragraphs: + for run in paragraph.runs: + extracted_text += run.text + " " + extracted_text += "\n" + return extracted_text + else: + # Unsupported file type + raise ValueError("Unsupported file type: {}".format(mimetype)) + + def extract_text_from_file(file) -> str: with open(file, "rb") as buff: - mimetype = from_buffer(buff.read(), mime=True) - buff.seek(0) - - if mimetype == "application/pdf": - # Extract text from pdf using PyPDF2 - reader = PdfReader(buff) - extracted_text = " ".join([page.extract_text() for page in reader.pages]) - elif mimetype == "text/plain" or mimetype == "text/markdown": - # Read text from plain text buff - extracted_text = buff.read().decode("utf-8") - elif ( - mimetype - == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ): - # Extract text from docx using docx2txt - extracted_text = docx2txt.process(buff) - elif mimetype == "text/csv": - # Extract text from csv using csv module - extracted_text = "" - decoded_buffer = (line.decode("utf-8") for line in buff) - reader = csv.reader(decoded_buffer) - for row in reader: - extracted_text += " ".join(row) + "\n" - elif ( - mimetype - == "application/vnd.openxmlformats-officedocument.presentationml.presentation" - ): - # Extract text from pptx using python-pptx - extracted_text = "" - presentation = pptx.Presentation(buff) - for slide in presentation.slides: - for shape in slide.shapes: - if shape.has_text_frame: - for paragraph in shape.text_frame.paragraphs: - for run in paragraph.runs: - extracted_text += run.text + " " - extracted_text += "\n" - else: - # Unsupported file type - raise ValueError("Unsupported file type: {}".format(mimetype)) - - return extracted_text + return " ".join(extraction(buff).split()) def generate_chunks(doc: dict, config: dict) -> list: