Skip to content

Commit

Permalink
[ELASTIC]: file validation
Browse files Browse the repository at this point in the history
  • Loading branch information
amadolid committed Nov 28, 2023
1 parent cf00deb commit 1a360f1
Showing 1 changed file with 64 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import docx2txt
import csv
import pptx
from openai import OpenAI
from os import environ, unlink
from datetime import datetime
from requests import get
from uuid import uuid4
from pypdf import PdfReader
from magic import from_buffer

from .utils import extract_text_from_file, get_embeddings, generate_chunks
from jaseci.jsorc.live_actions import jaseci_action
Expand Down Expand Up @@ -215,6 +220,65 @@ def reapply_index_template():
)


@jaseci_action(act_group=["es_ret"], allow_remote=True)
def is_valid(id: str, meta: dict = {}):
    """Report whether the stored file ``id`` yields any non-blank text.

    The file's MIME type is sniffed from its content and the matching
    extractor is run (PDF, plain text / markdown, DOCX, CSV or PPTX).
    This is a best-effort check: an unsupported MIME type or any error
    raised while opening or parsing the file results in ``False`` rather
    than an exception.

    Args:
        id: Identifier passed to ``meta["h"].get_file_handler``.
        meta: Mapping whose ``"h"`` entry exposes ``get_file_handler``
            (presumably injected by the jaseci action runtime -- confirm
            against the caller). The default is never mutated here and is
            kept as-is for interface compatibility.

    Returns:
        ``True`` when the extracted text contains at least one
        non-whitespace character, otherwise ``False``.

    Raises:
        KeyError: If ``meta`` has no ``"h"`` entry; the handler lookup is
            deliberately outside the best-effort ``try``.
    """
    import logging

    from jaseci.utils.file_handler import FileHandler

    file_handler: FileHandler = meta["h"].get_file_handler(id)
    try:
        with file_handler.open(mode="rb", detached=True) as buff:
            # Sniff the type from the content, then rewind so the chosen
            # extractor reads the stream from the beginning.
            mimetype = from_buffer(buff.read(), mime=True)
            buff.seek(0)

            if mimetype == "application/pdf":
                # Extract text from pdf using pypdf
                reader = PdfReader(buff)
                extracted_text = " ".join(
                    [page.extract_text() for page in reader.pages]
                )
            elif mimetype in ("text/plain", "text/markdown"):
                # Read text from plain text buff
                extracted_text = buff.read().decode("utf-8")
            elif (
                mimetype
                == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ):
                # Extract text from docx using docx2txt
                extracted_text = docx2txt.process(buff)
            elif mimetype == "text/csv":
                # Extract text from csv using csv module; the stream is
                # binary, so each line is decoded before parsing.
                rows = csv.reader(line.decode("utf-8") for line in buff)
                extracted_text = "".join(" ".join(row) + "\n" for row in rows)
            elif (
                mimetype
                == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
            ):
                # Extract text from pptx using python-pptx
                presentation = pptx.Presentation(buff)
                parts = []
                for slide in presentation.slides:
                    for shape in slide.shapes:
                        if not shape.has_text_frame:
                            continue
                        for paragraph in shape.text_frame.paragraphs:
                            parts.extend(run.text + " " for run in paragraph.runs)
                        parts.append("\n")
                extracted_text = "".join(parts)
            else:
                # Unsupported file type
                raise ValueError("Unsupported file type: {}".format(mimetype))

            # Separators added above (" " / "\n") make the raw string truthy
            # even when no real text was found, so test the stripped text.
            return bool(extracted_text and extracted_text.strip())
    except Exception:
        # Best-effort validation: any failure means "not valid", but keep a
        # trace so the reason can be diagnosed.
        logging.getLogger(__name__).debug(
            "File validation failed for %s", id, exc_info=True
        )

    return False


def openai() -> OpenAI:
global CONFIG, OAI_CLIENT
if not OAI_CLIENT:
Expand Down

0 comments on commit 1a360f1

Please sign in to comment.