Skip to content

Commit

Permalink
feat(simple file processor): added simple file processor service usin…
Browse files Browse the repository at this point in the history
…g ocrmypdf. No GRPC implemented
  • Loading branch information
neilscallywag committed Feb 25, 2024
1 parent 50004fb commit 7e3d823
Show file tree
Hide file tree
Showing 8 changed files with 231 additions and 0 deletions.
37 changes: 37 additions & 0 deletions backend/simple/fileprocessor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Base image: slim Python 3.9.
FROM python:3.9-slim

# DEBIAN_FRONTEND=noninteractive keeps apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive

# Container timezone; the RUN below links the zone file and records its name.
ENV TZ=Asia/Singapore
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
    && echo $TZ > /etc/timezone

# System packages: the ocrmypdf CLI (invoked by file_processor/service.py),
# Tesseract with English data, and PDF/imaging helpers. The apt cache and
# package lists are removed in the same layer so they do not stay in the image.
RUN apt-get update \
    && apt-get install -y \
        ghostscript \
        libsm6 \
        libxext6 \
        libxrender-dev \
        ocrmypdf \
        poppler-utils \
        qpdf \
        tesseract-ocr \
        tesseract-ocr-eng \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# All following relative paths resolve against /app.
WORKDIR /app

# Install Python dependencies before copying the sources so this layer is
# reused when only application code changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Application sources.
COPY . .

# Run the service module as the container's default command.
CMD ["python3", "-m", "file_processor.service"]
30 changes: 30 additions & 0 deletions backend/simple/fileprocessor/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Project-wide settings. ENVIRONMENT uses ?= so it can be supplied by the
# caller; everything else is fixed with simple (:=) assignment.
# NOTE(review): PROJECT_NAME, ENVIRONMENT, NPM_SUBDIRS, PYTHON_VERSION and
# VENV_DEACTIVATE are not referenced by any rule in this Makefile.
PROJECT_NAME := ESD
ENVIRONMENT ?= dev
NPM_SUBDIRS := client
PYTHON_VERSION := 3.11.2
VENV_NAME := venv
VENV_DEACTIVATE := deactivate

# Platform-specific commands. The activation command is derived from
# $(VENV_NAME) so overriding that variable keeps the two in agreement.
ifeq ($(OS),Windows_NT)
PYTHON := python
VENV_ACTIVATE := $(VENV_NAME)\Scripts\activate.bat
# RM is applied to the virtual environment directory. `del /s /q` only deletes
# files and leaves the directory tree behind; `rmdir /s /q` removes the tree.
RM := rmdir /s /q
else
PYTHON := python3
VENV_ACTIVATE := . $(VENV_NAME)/bin/activate
RM := rm -rf
endif
# Bootstrap the local development environment for this service.
# Only targets defined in this Makefile are listed as prerequisites: the
# previous check-terraform, check-aws-cli and npm-install prerequisites have
# no rule here, so `make init` stopped with "No rule to make target".
.PHONY: init
init: venv requirements

# Create the Python virtual environment in $(VENV_NAME).
# Deliberately not .PHONY: with the default VENV_NAME the target name is the
# directory the recipe creates, so Make skips the recipe once it exists.
# Recipe lines must begin with a hard TAB.
venv:
	@echo "Creating python virtual environment in '$(VENV_NAME)' folder..."
	$(PYTHON) -m venv $(VENV_NAME)

# Install the packages listed in requirements.txt into the virtual environment.
# `venv` is an order-only prerequisite: the environment must exist before it
# is activated, but its timestamp never forces a reinstall. Activation and
# pip share one recipe line because each recipe line runs in its own shell.
.PHONY: requirements
requirements: | venv
	@echo "Installing Python requirements..."
	@$(VENV_ACTIVATE) && pip install -r requirements.txt

# Remove the virtual environment directory and any compiled *.pyc files.
# Despite the name and the echoed message, this deletes $(VENV_NAME); nothing
# here runs a `deactivate` command.
# The first recipe line refuses to run the recursive delete if VENV_NAME has
# been overridden to an empty value.
# NOTE(review): `find ... -delete` assumes a POSIX find; confirm behaviour on
# Windows_NT hosts.
.PHONY: deactivate-venv
deactivate-venv:
	$(if $(strip $(VENV_NAME)),,$(error VENV_NAME is empty; refusing to run $(RM)))
	@echo "Deactivating virtual environment..."
	$(RM) $(VENV_NAME)
	@find . -name "*.pyc" -delete
7 changes: 7 additions & 0 deletions backend/simple/fileprocessor/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Compose definition for running the file processor locally.
version: '3'
services:
  # Single service built from the Dockerfile in this directory.
  pdf-reader:
    build:
      context: .
    volumes:
      # Bind-mount the sample PDF at the path the service opens by default
      # ("example.pdf" relative to the /app working directory).
      - ./example.pdf:/app/example.pdf
Binary file added backend/simple/fileprocessor/example.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions backend/simple/fileprocessor/file_processor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

114 changes: 114 additions & 0 deletions backend/simple/fileprocessor/file_processor/service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import subprocess
import os
import logging
from PyPDF2 import PdfReader
import asyncio
import json
from langdetect import detect
from io import BytesIO
import uuid
import shutil
import tempfile

# Configure the root logger: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def secure_path(input_pdf):
    """
    Validate a PDF path and return it in absolute form.

    Args:
        input_pdf: Path (str or os.PathLike yielding str) of the PDF to check.

    Returns:
        The absolute path of ``input_pdf``.

    Raises:
        ValueError: If the value is not a string path, does not end in
            ``.pdf`` (case-insensitive), or does not name an existing
            regular file.
    """
    # Non-path values (None, ints, ...) are reported as ValueError so callers
    # that only catch ValueError are not surprised by a TypeError.
    try:
        candidate = os.fspath(input_pdf)
    except TypeError:
        raise ValueError("Invalid PDF file.") from None
    if not isinstance(candidate, str):
        raise ValueError("Invalid PDF file.")

    if not candidate.lower().endswith('.pdf') or not os.path.isfile(candidate):
        raise ValueError("Invalid PDF file.")
    return os.path.abspath(candidate)

async def ocr_pdf_and_extract_text(input_pdf, lang='eng', input_bytes=None):
    """
    Run ocrmypdf on a PDF and return the text of the OCR'd result.

    The OCR output is written to a temporary file that is removed before this
    coroutine returns.

    Args:
        input_pdf: Path of the PDF to process; validated with secure_path().
        lang: Language code passed to ocrmypdf's ``-l`` option.
        input_bytes: Currently unused; reserved for the planned gRPC flow.

    Returns:
        A ``(texts, error)`` tuple. On success ``texts`` is the list returned
        by extract_text_from_stream() and ``error`` is None. On failure
        ``texts`` is None and ``error`` is a message string.
    """
    try:
        input_pdf = secure_path(input_pdf)
    except ValueError as e:
        logging.error(f"Security check failed: {e}")
        return None, str(e)

    # TODO(grpc): once gRPC is implemented, write `input_bytes` to a temporary
    # ".pdf" file and pass that path to ocrmypdf instead of `input_pdf`; the
    # path validation above can then be dropped.

    # Reserve a unique output path, then close the handle so ocrmypdf is the
    # only writer while it runs.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
        output_path = tmp_output.name

    ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", input_pdf, output_path]

    try:
        process = await asyncio.create_subprocess_exec(*ocrmypdf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _stdout, stderr = await process.communicate()
        if process.returncode != 0:
            # errors='replace' keeps undecodable bytes in the tool's stderr
            # from hiding the real failure behind a UnicodeDecodeError.
            error_message = f"Error processing {input_pdf}: {stderr.decode(errors='replace')}"
            logging.error(error_message)
            return None, error_message

        logging.info(f"Successfully processed {input_pdf}.")
        with open(output_path, 'rb') as ocr_output:
            texts = extract_text_from_stream(ocr_output)
        return texts, None
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return None, str(e)
    finally:
        # Best-effort cleanup: a failed delete must not replace the result
        # (or the original exception) with an OSError.
        try:
            os.remove(output_path)
        except OSError as cleanup_error:
            logging.warning(f"Could not remove temporary file {output_path}: {cleanup_error}")

def extract_text_from_stream(pdf_stream):
    """
    Extract the text of each page from a binary PDF stream.

    Args:
        pdf_stream: Binary file-like object handed to PdfReader.

    Returns:
        A list with one string per page that yielded non-empty text, in page
        order. Pages with no text are omitted, so list positions do not
        necessarily match PDF page numbers. Returns an empty list if the
        stream cannot be read.
    """
    try:
        reader = PdfReader(pdf_stream)
        texts = []
        for page in reader.pages:
            # Extract once per page; the result is used for both the
            # emptiness check and the collected value.
            page_text = page.extract_text()
            if page_text:
                texts.append(page_text)
        return texts
    except Exception as e:
        logging.error(f"Failed to extract text from stream: {e}")
        return []

def generate_json_response(file_id, file_path, texts):
    """
    Build the JSON document describing a processed file.

    Args:
        file_id: Identifier echoed back as ``fileId``.
        file_path: Path of the source file; used for the title and file size.
        texts: List of page text strings.

    Returns:
        A JSON string (indent 4, non-ASCII kept as-is) with ``fileId``,
        ``metadata`` and ``pages``. ``metadata`` is an empty object if the
        file metadata cannot be read. ``locale`` is "unknown" when there is
        no text or language detection fails.
    """
    # Detect the language separately so a detection failure only affects
    # "locale" instead of discarding title, page count and file size too.
    locale = "unknown"
    if texts:
        try:
            locale = detect(' '.join(texts))
        except Exception as e:
            logging.warning(f"Language detection failed for {file_path}: {e}")

    try:
        metadata = {
            "title": os.path.basename(file_path),
            "pageCount": len(texts),
            "filesize": os.path.getsize(file_path),
            "locale": locale,
        }
    except Exception as e:
        logging.error(f"Error generating metadata for {file_path}: {e}")
        metadata = {}

    pages = [
        {"pageId": idx + 1, "content": text or "Error extracting text"}
        for idx, text in enumerate(texts)
    ]
    response = {
        "fileId": file_id,
        "metadata": metadata,
        "pages": pages,
    }

    return json.dumps(response, indent=4, ensure_ascii=False)

async def main():
    """
    Demo entry point: OCR the bundled example PDF and print the JSON response.

    Logs an error instead of printing when OCR fails or yields no text.
    """
    input_pdf = "example.pdf"
    # TODO(grpc): the PDF bytes will arrive in the request once gRPC is
    # implemented; until then the file is read from `input_pdf`.

    lang = "eng"
    file_id = str(uuid.uuid4())

    texts, error = await ocr_pdf_and_extract_text(input_pdf, lang)
    if texts:
        json_response = generate_json_response(file_id, input_pdf, texts)
        print(json_response)
    elif error is None:
        # OCR itself succeeded but no page produced text; say so rather than
        # logging the literal "None" as the failure reason.
        logging.error(f"OCR processing failed: no text could be extracted from {input_pdf}")
    else:
        logging.error(f"OCR processing failed: {error}")

if __name__ == "__main__":
    asyncio.run(main())
37 changes: 37 additions & 0 deletions backend/simple/fileprocessor/protos/file_processor.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
syntax = "proto3";

package fileprocessor;

// FileProcessor offers one request/response RPC for processing a file.
service FileProcessor {
  // Takes a FileUploadRequest and answers with a FileProcessResponse.
  rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
}

// Request payload for ProcessFile: identifiers, a file name and the raw file.
message FileUploadRequest {
  string userId = 1;
  string fileId = 2;
  string filename = 3;
  // Content of the PDF file.
  bytes file = 4;
}

// Descriptive information about a file, carried in FileProcessResponse.
message FileMetadata {
  string title = 1;
  int64 pageCount = 2;
  int64 filesize = 3;
  string locale = 4;
}

// Result of ProcessFile: the file's id, its metadata and its processed pages.
message FileProcessResponse {
  string fileId = 1;
  FileMetadata metadata = 2;
  // Processed pages.
  // NOTE(review): the earlier comment credited AWS Textract, while
  // file_processor/service.py in this commit runs ocrmypdf - confirm the
  // intended backend.
  repeated Page pages = 3;
}

// One processed page of a file.
message Page {
  int64 pageId = 1;
  // Text content extracted from the page.
  string content = 2;
}
5 changes: 5 additions & 0 deletions backend/simple/fileprocessor/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Python dependencies, installed with `pip install -r requirements.txt`.
# NOTE(review): no versions are pinned, so builds pick up whatever release is
# current at install time - consider pinning.
grpcio
grpcio-tools
# NOTE(review): tika is not imported by file_processor/service.py in this
# commit - confirm it is still needed.
tika
langdetect
pypdf2

0 comments on commit 7e3d823

Please sign in to comment.