feat(simple file processor): added simple file processor service using ocrmypdf. No GRPC implemented
EchoSkorJjj · Feb 25, 2024 · 7e3d823 · 7e3d823
1 parent 50004fb
commit 7e3d823
Show file tree

Hide file tree

Showing 8 changed files with 231 additions and 0 deletions.
diff --git a/backend/simple/fileprocessor/Dockerfile b/backend/simple/fileprocessor/Dockerfile
@@ -0,0 +1,37 @@
+# Image for the file processor service: OCRmyPDF/Tesseract from Debian packages
+# plus the Python dependencies listed in requirements.txt.
+# Start with a Python slim base image
+FROM python:3.9-slim
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set timezone (required for some configurations)
+ENV TZ=Asia/Singapore
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+# Install OCRmyPDF, Tesseract OCR (English language data only), and other dependencies.
+# The apt cache and package lists are removed in the same RUN so they are not kept in the layer.
+RUN apt-get update && \
+    apt-get install -y \
+    ocrmypdf \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    poppler-utils \
+    ghostscript \
+    qpdf \
+    && apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements file and install Python dependencies.
+# This happens before the application code is copied so the pip layer can be
+# reused from the build cache when only the source changes.
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application
+COPY . /app
+
+# Set the default command to run the app: executes file_processor/service.py as a
+# module, whose __main__ guard runs main() against "example.pdf" in the working directory.
+CMD ["python3", "-m", "file_processor.service"]
diff --git a/backend/simple/fileprocessor/Makefile b/backend/simple/fileprocessor/Makefile
@@ -0,0 +1,30 @@
+# Developer helper targets for the file processor service.
+PROJECT_NAME := ESD
+ENVIRONMENT ?= dev
+NPM_SUBDIRS := client
+PYTHON_VERSION := 3.11.2
+VENV_NAME := venv
+VENV_DEACTIVATE := deactivate
+# Pick the interpreter name, the venv activation command and the delete command per platform.
+ifeq ($(OS),Windows_NT)
+	PYTHON := python
+	VENV_ACTIVATE := $(VENV_NAME)\Scripts\activate.bat
+	RM := del /s /q
+else
+	PYTHON := python3
+	VENV_ACTIVATE := . $(VENV_NAME)/bin/activate
+	RM := rm -rf
+endif
+
+# These targets never produce a file of the same name. `venv` is deliberately not
+# listed: it creates the venv directory, so make can skip it once that exists.
+.PHONY: init requirements deactivate-venv
+
+# Bootstrap a local environment. Only prerequisites that have a rule in this
+# Makefile are listed; a prerequisite without a rule makes `make init` fail.
+init: venv requirements
+
+venv:
+	@echo "Creating python virtual environment in '$(VENV_NAME)' folder..."
+	$(PYTHON) -m venv $(VENV_NAME)
+
+requirements:
+	@echo "Installing Python requirements..."
+	@$(VENV_ACTIVATE) && pip install -r requirements.txt
+
+# Despite its name, this target deletes the virtual environment and compiled bytecode.
+deactivate-venv:
+	@echo "Removing virtual environment '$(VENV_NAME)'..."
+	$(RM) $(VENV_NAME)
+	@find . -name "*.pyc" -delete
+	@exit 0
diff --git a/backend/simple/fileprocessor/docker-compose.yml b/backend/simple/fileprocessor/docker-compose.yml
@@ -0,0 +1,7 @@
+# Compose definition for running the file processor locally.
+version: '3'
+services:
+  # Built from the Dockerfile in this directory.
+  pdf-reader:
+    build:
+      context: .
+    volumes:
+      # Bind-mount the sample PDF at /app/example.pdf; service.py's main() opens
+      # "example.pdf" relative to the image's working directory (/app).
+      - ./example.pdf:/app/example.pdf
diff --git a/backend/simple/fileprocessor/example.pdf b/backend/simple/fileprocessor/example.pdf
diff --git a/backend/simple/fileprocessor/file_processor/__init__.py b/backend/simple/fileprocessor/file_processor/__init__.py
@@ -0,0 +1 @@
+
diff --git a/backend/simple/fileprocessor/file_processor/service.py b/backend/simple/fileprocessor/file_processor/service.py
@@ -0,0 +1,114 @@
+import subprocess
+import os
+import logging
+from PyPDF2 import PdfReader
+import asyncio
+import json
+from langdetect import detect
+from io import BytesIO
+import uuid
+import shutil
+import tempfile
+
+# Configure root logging: INFO level, "timestamp - level - message" format
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def secure_path(input_pdf):
+    """
+    Check that `input_pdf` names an existing regular file with a .pdf extension.
+
+    The extension check is case-insensitive. Returns the absolute form of the path,
+    or raises ValueError("Invalid PDF file.") when either check fails.
+    """
+    exists_as_file = os.path.isfile(input_pdf)
+    if exists_as_file and input_pdf.lower().endswith('.pdf'):
+        return os.path.abspath(input_pdf)
+    raise ValueError("Invalid PDF file.")
+
+async def ocr_pdf_and_extract_text(input_pdf, lang='eng', input_bytes=None):
+    """
+    Run OCR on a PDF with the `ocrmypdf` CLI and return the text of its pages.
+
+    The OCR result is written to a temporary PDF that is read back and then deleted,
+    so no output file is left behind.
+
+    Args:
+        input_pdf: Path of the PDF to process; validated with secure_path().
+        lang: Language code passed to ocrmypdf via `-l`.
+        input_bytes: Currently unused; reserved for the PDF bytes of a future gRPC request.
+
+    Returns:
+        A `(texts, error)` tuple: `(list_of_page_texts, None)` on success, or
+        `(None, error_message)` when validation fails, ocrmypdf exits with a non-zero
+        status, or an unexpected exception occurs.
+    """
+    try:
+        input_pdf = secure_path(input_pdf)
+    except ValueError as e:
+        logging.error(f"Security check failed: {e}")
+        return None, str(e)
+
+    # TODO: once gRPC is implemented, write `input_bytes` to a
+    # NamedTemporaryFile(suffix=".pdf", delete=False), hand that path to ocrmypdf
+    # instead of `input_pdf`, and drop the secure_path() check above.
+
+    # Reserve a unique output path and close our handle straight away: ocrmypdf writes
+    # to the path from another process, and a handle that is still open prevents that
+    # (and the later re-open and delete) on Windows.
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
+        output_path = tmp_output.name
+
+    ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", input_pdf, output_path]
+
+    try:
+        process = await asyncio.create_subprocess_exec(
+            *ocrmypdf_cmd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        _, stderr = await process.communicate()
+        if process.returncode != 0:
+            # errors="replace": tool output that is not valid UTF-8 must not hide the real failure.
+            error_message = f"Error processing {input_pdf}: {stderr.decode(errors='replace')}"
+            logging.error(error_message)
+            return None, error_message
+
+        logging.info(f"Successfully processed {input_pdf}.")
+        # Read the OCR output back from the temporary file
+        with open(output_path, 'rb') as ocr_output:
+            texts = extract_text_from_stream(ocr_output)
+        return texts, None
+    except Exception as e:
+        logging.error(f"An unexpected error occurred: {e}")
+        return None, str(e)
+    finally:
+        # Always delete the temporary file; a failed delete is logged rather than raised
+        # so it cannot replace the result being returned.
+        try:
+            os.remove(output_path)
+        except OSError as e:
+            logging.warning(f"Could not delete temporary file {output_path}: {e}")
+
+def extract_text_from_stream(pdf_stream):
+    """
+    Extract the text of each page of a PDF stream.
+
+    Args:
+        pdf_stream: Binary file-like object containing the PDF.
+
+    Returns:
+        A list with the text of every page that yielded any text, in page order.
+        Pages without text are left out, so list positions do not necessarily match
+        page numbers. Returns [] (after logging the error) if the PDF cannot be read.
+    """
+    try:
+        reader = PdfReader(pdf_stream)
+        # Extract each page exactly once; text extraction is the expensive step and
+        # was previously run twice per page (once to filter, once for the value).
+        page_texts = (page.extract_text() for page in reader.pages)
+        return [text for text in page_texts if text]
+    except Exception as e:
+        logging.error(f"Failed to extract text from stream: {e}")
+        return []
+
+def generate_json_response(file_id, file_path, texts):
+    """
+    Build the JSON response for a processed file.
+
+    Args:
+        file_id: Identifier returned unchanged as "fileId".
+        file_path: Path of the source PDF; supplies "title" (its base name) and "filesize".
+        texts: List of page texts; entry i becomes the page with "pageId" i + 1.
+
+    Returns:
+        A JSON string (4-space indent, non-ASCII characters kept as-is) with the keys
+        "fileId", "metadata" and "pages". "metadata" is {} when the file details cannot
+        be read; "locale" is "unknown" when there is no text or detection fails.
+    """
+    # Language detection is best-effort and handled on its own, so that a detection
+    # failure no longer throws away the title, page count and file size as well.
+    try:
+        locale = detect(' '.join(texts)) if texts else "unknown"
+    except Exception as e:
+        logging.warning(f"Language detection failed for {file_path}: {e}")
+        locale = "unknown"
+
+    try:
+        metadata = {
+            "title": os.path.basename(file_path),
+            "pageCount": len(texts),
+            "filesize": os.path.getsize(file_path),
+            "locale": locale,
+        }
+    except Exception as e:
+        logging.error(f"Error generating metadata for {file_path}: {e}")
+        metadata = {}
+
+    response = {
+        "fileId": file_id,
+        "metadata": metadata,
+        "pages": [{"pageId": idx + 1, "content": text or "Error extracting text"} for idx, text in enumerate(texts)]
+    }
+
+    return json.dumps(response, indent=4, ensure_ascii=False)
+
+async def main():
+    """
+    Demo entry point: OCR `example.pdf` and print the JSON response to stdout.
+
+    Failures are logged instead of raised. A run that succeeds but finds no text is
+    reported as such rather than as "OCR processing failed: None".
+    """
+    input_pdf = "example.pdf"
+    # TODO: once gRPC is implemented, pass the uploaded PDF bytes via `input_bytes=`.
+
+    lang = "eng"
+    file_id = str(uuid.uuid4())
+
+    texts, error = await ocr_pdf_and_extract_text(input_pdf, lang)
+    if error is not None:
+        logging.error(f"OCR processing failed: {error}")
+        return
+    if not texts:
+        # OCR itself succeeded, but no page yielded any text.
+        logging.warning(f"No text could be extracted from {input_pdf}.")
+        return
+
+    json_response = generate_json_response(file_id, input_pdf, texts)
+    print(json_response)
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/backend/simple/fileprocessor/protos/file_processor.proto b/backend/simple/fileprocessor/protos/file_processor.proto
@@ -0,0 +1,37 @@
+syntax = "proto3";
+
+package fileprocessor;
+
+// Service definition
+service FileProcessor {
+  // Unary call: takes one uploaded file and returns its metadata and extracted pages.
+  rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
+}
+
+// Message for file upload request
+message FileUploadRequest {
+  string userId = 1;
+  string fileId = 2;
+  string filename = 3;
+  bytes file = 4; // PDF file content
+}
+
+// Metadata associated with the file.
+// Field names match the "metadata" object built by generate_json_response()
+// in file_processor/service.py.
+message FileMetadata {
+  string title = 1;
+  int64 pageCount = 2;
+  int64 filesize = 3;
+  string locale = 4;
+}
+
+// Response message including processed file information
+message FileProcessResponse {
+  string fileId = 1;
+  FileMetadata metadata = 2;
+  repeated Page pages = 3; // Processed pages (this service extracts text with OCRmyPDF, not AWS Textract)
+}
+
+// Processed page content from the file
+message Page {
+  int64 pageId = 1;
+  string content = 2; // Extracted text content of the page
+}
diff --git a/backend/simple/fileprocessor/requirements.txt b/backend/simple/fileprocessor/requirements.txt
@@ -0,0 +1,5 @@
+# gRPC runtime and code generator; not yet imported by file_processor/service.py
+grpcio
+grpcio-tools
+# NOTE(review): tika is not imported by file_processor/service.py - confirm it is still needed
+tika
+# Language detection for the "locale" metadata field
+langdetect
+# PDF text extraction (imported as PyPDF2)
+pypdf2