From 7ac750cb80b02dd33e063ad1815bfa8dd51e3838 Mon Sep 17 00:00:00 2001
From: "Alexie (Boyong) Madolid" <maex@tuta.io>
Date: Tue, 28 Nov 2023 12:01:42 +0800
Subject: [PATCH] [ELASTIC]: file validation

---
 .../elastic_retrieval/elastic_retrieval.py    | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py
index 2f654ab5f7..f6ce918ae8 100644
--- a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py
+++ b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py
@@ -1,8 +1,13 @@
+import docx2txt
+import csv
+import pptx
 from openai import OpenAI
 from os import environ, unlink
 from datetime import datetime
 from requests import get
 from uuid import uuid4
+from pypdf import PdfReader
+from magic import from_buffer
 
 from .utils import extract_text_from_file, get_embeddings, generate_chunks
 from jaseci.jsorc.live_actions import jaseci_action
@@ -215,6 +220,65 @@ def reapply_index_template():
     )
 
 
+@jaseci_action(act_group=["es_ret"], allow_remote=True)
+def file_is_readable(id: str, meta: dict = {}):
+    """Return True if non-empty text can be extracted from file `id`, else False."""
+    from jaseci.utils.file_handler import FileHandler
+
+    file_handler: FileHandler = meta["h"].get_file_handler(id)
+    try:
+        with file_handler.open(mode="rb", detached=True) as buff:
+            mimetype = from_buffer(buff.read(), mime=True)
+            buff.seek(0)  # rewind: the MIME sniff above read the whole stream
+
+            if mimetype == "application/pdf":
+                # Extract text from pdf using pypdf (PdfReader), pages joined by spaces
+                reader = PdfReader(buff)
+                extracted_text = " ".join(
+                    [page.extract_text() for page in reader.pages]
+                )
+            elif mimetype == "text/plain" or mimetype == "text/markdown":
+                # Read plain text / markdown by decoding the whole buffer as UTF-8
+                extracted_text = buff.read().decode("utf-8")
+            elif (
+                mimetype
+                == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+            ):
+                # Extract text from docx using docx2txt
+                extracted_text = docx2txt.process(buff)
+            elif mimetype == "text/csv":
+                # Extract text from csv: cells joined by spaces, one line per row
+                extracted_text = ""
+                decoded_buffer = (line.decode("utf-8") for line in buff)
+                reader = csv.reader(decoded_buffer)
+                for row in reader:
+                    extracted_text += " ".join(row) + "\n"
+            elif (
+                mimetype
+                == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+            ):
+                # Extract text from pptx using python-pptx: every run of every text frame
+                extracted_text = ""
+                presentation = pptx.Presentation(buff)
+                for slide in presentation.slides:
+                    for shape in slide.shapes:
+                        if shape.has_text_frame:
+                            for paragraph in shape.text_frame.paragraphs:
+                                for run in paragraph.runs:
+                                    extracted_text += run.text + " "
+                            extracted_text += "\n"
+            else:
+                # Unsupported file type: the error is caught below, so the result is False
+                raise ValueError("Unsupported file type: {}".format(mimetype))
+
+            if extracted_text:
+                return True
+    except Exception:
+        pass  # best-effort probe: any failure is reported as "not readable"
+
+    return False
+
+
 def openai() -> OpenAI:
     global CONFIG, OAI_CLIENT
     if not OAI_CLIENT: