-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(simple file processor): added simple file processor service using ocrmypdf. No GRPC implemented
- Loading branch information
1 parent
50004fb
commit 7e3d823
Showing
8 changed files
with
231 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Base image: slim Python 3.9
FROM python:3.9-slim

# Stop apt from asking interactive questions during the build
ENV DEBIAN_FRONTEND=noninteractive

# Pin the container timezone and point /etc/localtime and /etc/timezone at it
ENV TZ=Asia/Singapore
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# System packages: the ocrmypdf CLI, Tesseract with English data, and the
# PDF/imaging tools installed alongside them. The apt cache is removed in the
# same layer so it does not end up in the image.
RUN apt-get update \
    && apt-get install -y \
        ghostscript \
        libsm6 \
        libxext6 \
        libxrender-dev \
        ocrmypdf \
        poppler-utils \
        qpdf \
        tesseract-ocr \
        tesseract-ocr-eng \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# All following paths are relative to /app
WORKDIR /app

# Install Python dependencies first so this layer is reused while only the
# application sources change
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Application sources
COPY . .

# Default command: run the file processor service module
CMD ["python3", "-m", "file_processor.service"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
PROJECT_NAME := ESD
ENVIRONMENT ?= dev
NPM_SUBDIRS := client
PYTHON_VERSION := 3.11.2
VENV_NAME := venv
VENV_DEACTIVATE := deactivate

# Select the interpreter, the venv activation command and the delete command
# for the current platform.
ifeq ($(OS),Windows_NT)
    PYTHON := python
    VENV_ACTIVATE := venv\Scripts\activate.bat
    RM := del /s /q
else
    PYTHON := python3
    VENV_ACTIVATE := . venv/bin/activate
    RM := rm -rf
endif

# These targets do not produce a file of the same name, so always run them.
# `venv` is deliberately not phony: once the directory exists it is up to date.
.PHONY: init requirements deactivate-venv

# One-shot project setup. The former prerequisites check-terraform,
# check-aws-cli and npm-install have no rule in this Makefile, which made
# `make init` fail immediately; add them back once such rules exist.
init: venv requirements

# Create the virtual environment directory.
venv:
	@echo "Creating python virtual environment in '$(VENV_NAME)' folder..."
	$(PYTHON) -m venv $(VENV_NAME)

# Install Python dependencies into the virtual environment (created on demand).
requirements: venv
	@echo "Installing Python requirements..."
	@$(VENV_ACTIVATE) && pip install -r requirements.txt

# Remove the virtual environment directory and any compiled *.pyc files.
deactivate-venv:
	@echo "Deactivating virtual environment..."
	$(RM) $(VENV_NAME)
	@find . -name "*.pyc" -delete
	@exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Compose definition for the PDF reader container.
version: '3'
services:
  pdf-reader:
    # Build the image using the current directory as the build context.
    build: .
    volumes:
      # Make the local sample PDF available inside the container.
      - ./example.pdf:/app/example.pdf
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
import subprocess | ||
import os | ||
import logging | ||
from PyPDF2 import PdfReader | ||
import asyncio | ||
import json | ||
from langdetect import detect | ||
from io import BytesIO | ||
import uuid | ||
import shutil | ||
import tempfile | ||
|
||
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
|
||
def secure_path(input_pdf):
    """
    Validate a PDF path and return it in absolute form.

    Args:
        input_pdf: Path to check, given as ``str``, ``bytes`` or any
            ``os.PathLike`` object.

    Returns:
        The absolute path as a ``str``.

    Raises:
        ValueError: If the argument is not a path-like value, does not end
            with ``.pdf`` (case-insensitive), or is not an existing regular file.
    """
    try:
        # Normalise str / bytes / PathLike to str. Anything else (None, int, ...)
        # must surface as ValueError because that is the only error callers catch.
        candidate = os.fsdecode(input_pdf)
    except TypeError:
        raise ValueError("Invalid PDF file.") from None

    if not candidate.lower().endswith('.pdf') or not os.path.isfile(candidate):
        raise ValueError("Invalid PDF file.")
    return os.path.abspath(candidate)
|
||
async def ocr_pdf_and_extract_text(input_pdf, lang='eng', input_bytes=None):
    """
    Run ``ocrmypdf`` on a PDF and return the text of the OCR'd result.

    The OCR output is written to a temporary file that is removed before the
    function returns, so no output file is left behind.

    Args:
        input_pdf: Path of the PDF to process; validated with ``secure_path``.
        lang: Language code passed to ``ocrmypdf -l``.
        input_bytes: Currently unused; reserved for the planned GRPC input
            (see the TODO below).

    Returns:
        A ``(texts, error)`` tuple. On success ``texts`` is the list produced by
        ``extract_text_from_stream`` and ``error`` is ``None``. On any failure
        ``texts`` is ``None`` and ``error`` is a message string.
    """
    try:
        input_pdf = secure_path(input_pdf)
    except ValueError as e:
        logging.error(f"Security check failed: {e}")
        return None, str(e)

    # TODO: drop the path validation above once GRPC is implemented and write
    # the received bytes to a temporary input file instead:
    #
    # with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_input:
    #     tmp_input.write(input_bytes)
    #     tmp_input_path = tmp_input.name

    # Reserve a unique output path and close our handle right away, so the file
    # is not held open by this process while ocrmypdf writes to it.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
        tmp_output_path = tmp_output.name

    ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", input_pdf, tmp_output_path]
    # With GRPC input, pass tmp_input_path instead of input_pdf.

    try:
        process = await asyncio.create_subprocess_exec(
            *ocrmypdf_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await process.communicate()

        if process.returncode != 0:
            # errors="replace": undecodable bytes in stderr must not hide the real failure.
            error_message = f"Error processing {input_pdf}: {stderr.decode(errors='replace')}"
            logging.error(error_message)
            return None, error_message

        logging.info(f"Successfully processed {input_pdf}.")
        # Read the OCR output back from the temporary file
        with open(tmp_output_path, 'rb') as ocr_output:
            texts = extract_text_from_stream(ocr_output)
        return texts, None
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return None, str(e)
    finally:
        # Always remove the temporary output; a cleanup failure is logged
        # rather than allowed to replace the result being returned.
        try:
            os.remove(tmp_output_path)
        except OSError as cleanup_error:
            logging.warning(f"Could not remove temporary file {tmp_output_path}: {cleanup_error}")
|
||
def extract_text_from_stream(pdf_stream):
    """
    Extract the text of every page of a PDF that yields any text.

    Args:
        pdf_stream: Object handed to ``PdfReader`` (the caller in this module
            passes a file opened in binary mode).

    Returns:
        A list with one string per page that produced non-empty text, in page
        order. Pages without text are skipped. An empty list is returned if the
        PDF cannot be read or extraction fails.
    """
    try:
        reader = PdfReader(pdf_stream)
        texts = []
        for page in reader.pages:
            # Extract once per page and reuse the result for both the
            # emptiness check and the output.
            page_text = page.extract_text()
            if page_text:
                texts.append(page_text)
        return texts
    except Exception as e:
        logging.error(f"Failed to extract text from stream: {e}")
        return []
|
||
def generate_json_response(file_id, file_path, texts):
    """
    Build the JSON response for a processed file.

    Args:
        file_id: Identifier echoed back as ``fileId``.
        file_path: Path of the source file; used for the title (base name) and
            the file size.
        texts: List of page texts; each entry becomes one page in the response.

    Returns:
        A JSON string (4-space indent, non-ASCII kept as-is) with ``fileId``,
        ``metadata`` (``title``, ``pageCount``, ``filesize``, ``locale``) and
        ``pages``. ``metadata`` is ``{}`` if the file metadata cannot be
        gathered, and ``locale`` is ``"unknown"`` if there is no text or
        language detection fails.
    """
    # Detect the language separately so a detection failure only costs the
    # locale instead of wiping the whole metadata block.
    locale = "unknown"
    if texts:
        try:
            locale = detect(' '.join(texts))
        except Exception as e:
            logging.warning(f"Could not detect locale for {file_path}: {e}")

    try:
        metadata = {
            "title": os.path.basename(file_path),
            "pageCount": len(texts),
            "filesize": os.path.getsize(file_path),
            "locale": locale
        }
    except Exception as e:
        logging.error(f"Error generating metadata for {file_path}: {e}")
        metadata = {}

    response = {
        "fileId": file_id,
        "metadata": metadata,
        # Page ids are 1-based positions in `texts`; empty entries get a placeholder.
        "pages": [{"pageId": idx + 1, "content": text or "Error extracting text"} for idx, text in enumerate(texts)]
    }

    return json.dumps(response, indent=4, ensure_ascii=False)
|
||
async def main():
    """
    Demo entry point: OCR ``example.pdf`` and print the JSON response.

    Logs an error instead of printing when OCR returns no text.
    """
    input_pdf = "example.pdf"
    input_pdf_bytes = b''  # Placeholder: should be the actual bytes of the PDF file

    lang = "eng"
    file_id = str(uuid.uuid4())

    texts, error = await ocr_pdf_and_extract_text(input_pdf, lang)
    if not texts:
        logging.error(f"OCR processing failed: {error}")
        return

    print(generate_json_response(file_id, input_pdf, texts))


if __name__ == "__main__":
    asyncio.run(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
syntax = "proto3";

package fileprocessor;

// File processing service: takes an uploaded file and returns its extracted
// content.
service FileProcessor {
  rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
}

// Request carrying the file to process.
message FileUploadRequest {
  string userId = 1;
  string fileId = 2;
  string filename = 3;
  bytes file = 4;  // Raw PDF file content
}

// Text extracted from a single page.
message Page {
  int64 pageId = 1;
  string content = 2;  // Extracted text content of the page
}

// Descriptive information about the processed file.
message FileMetadata {
  string title = 1;
  int64 pageCount = 2;
  int64 filesize = 3;
  string locale = 4;
}

// Result of processing a file: its id, metadata and per-page content.
message FileProcessResponse {
  string fileId = 1;
  FileMetadata metadata = 2;
  repeated Page pages = 3;  // Processed pages of the file
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
grpcio | ||
grpcio-tools | ||
tika | ||
langdetect | ||
pypdf2 |