@jaseci_action(act_group=["es_ret"], allow_remote=True)
def file_is_readable(id: str, meta: dict = {}):
    """Report whether non-blank text can be extracted from a stored file.

    The file is obtained through ``meta["h"].get_file_handler(id)``, its MIME
    type is sniffed from the content with ``magic.from_buffer``, and the
    matching extractor (PDF, plain text / markdown, DOCX, CSV or PPTX) is run.

    Args:
        id: Identifier passed to ``get_file_handler``.
        meta: Action metadata; ``meta["h"]`` must expose ``get_file_handler``.

    Returns:
        ``True`` when extraction succeeds and yields at least one
        non-whitespace character. ``False`` for unsupported MIME types, for
        any extraction failure, and for files whose extracted text is empty
        or whitespace-only.
    """
    import logging

    from jaseci.utils.file_handler import FileHandler

    file_handler: FileHandler = meta["h"].get_file_handler(id)
    try:
        with file_handler.open(mode="rb", detached=True) as buff:
            mimetype = from_buffer(buff.read(), mime=True)
            # Sniffing consumed the stream; rewind so extractors start at 0.
            buff.seek(0)

            if mimetype == "application/pdf":
                # Extract text from pdf using pypdf
                reader = PdfReader(buff)
                extracted_text = " ".join(
                    [page.extract_text() for page in reader.pages]
                )
            elif mimetype in ("text/plain", "text/markdown"):
                # Read text from plain text buff
                extracted_text = buff.read().decode("utf-8")
            elif (
                mimetype
                == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ):
                # Extract text from docx using docx2txt
                extracted_text = docx2txt.process(buff)
            elif mimetype == "text/csv":
                # Extract text from csv using csv module
                decoded_lines = (line.decode("utf-8") for line in buff)
                extracted_text = "\n".join(
                    " ".join(row) for row in csv.reader(decoded_lines)
                )
            elif (
                mimetype
                == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
            ):
                # Extract text from pptx using python-pptx
                presentation = pptx.Presentation(buff)
                extracted_text = " ".join(
                    run.text
                    for slide in presentation.slides
                    for shape in slide.shapes
                    if shape.has_text_frame
                    for paragraph in shape.text_frame.paragraphs
                    for run in paragraph.runs
                )
            else:
                # Unsupported file type
                return False

            # Separators added between empty pages/rows/shapes are not
            # content: only non-whitespace text makes the file readable.
            return bool(extracted_text.strip())
    except Exception:
        # Best-effort validation: any parser failure means "not readable",
        # but keep a trace instead of discarding the error silently.
        logging.getLogger(__name__).debug(
            "file_is_readable: text extraction failed for file %s",
            id,
            exc_info=True,
        )

    return False