Skip to content

Commit

Permalink
Merge this shizz (#6)
Browse files Browse the repository at this point in the history
* feat(simple file processor): added simple file processor service using ocrmypdf. No GRPC implemented

* feat(fileprocessor): changed port and added proto files

* fix(fileprocessor): did further cleanup and fixed minor error handling bugs

* chore(fileprocessor): added extensive logging and error handling

---------

Co-authored-by: Thaddeaus Low <thaddeausl.2022@scis.smu.edu.sg>
  • Loading branch information
neilscallywag and thaddeauslow authored Feb 28, 2024
1 parent 68427a2 commit 30628bf
Show file tree
Hide file tree
Showing 18 changed files with 245 additions and 181 deletions.
Binary file added backend/simple/fileprocessor.zip
Binary file not shown.
16 changes: 12 additions & 4 deletions backend/simple/fileprocessor/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Singapore
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install OCRmyPDF, Tesseract OCR, and other dependencies
# Install OCRmyPDF, Tesseract OCR, Protocol Buffers Compiler, and other dependencies
RUN apt-get update && \
apt-get install -y \
ocrmypdf \
Expand All @@ -20,18 +20,26 @@ RUN apt-get update && \
poppler-utils \
ghostscript \
qpdf \
protobuf-compiler \
&& apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install gRPC tools for Python
RUN pip install grpcio grpcio-tools protobuf


# Set the working directory in the container
WORKDIR /app

# Copy the requirements file and install Python dependencies
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . /app
# Copy the proto files and application files into the container
COPY ./src /app

# Compile the proto files to generate Python code
RUN python -m grpc_tools.protoc -I/app --python_out=/app --grpc_python_out=/app /app/file_processor.proto

# Set the default command to run the app
CMD ["python3", "/app/file_processor/service.py"]
CMD ["python3", "-m", "service"]
2 changes: 1 addition & 1 deletion backend/simple/fileprocessor/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ services:
ports:
- "50051:50051"
volumes:
- ./example.pdf:/app/example.pdf
- ./src/example.pdf:/app/example.pdf
Empty file.
147 changes: 0 additions & 147 deletions backend/simple/fileprocessor/file_processor/service.py

This file was deleted.

26 changes: 0 additions & 26 deletions backend/simple/fileprocessor/file_processor/service_client.py

This file was deleted.

1 change: 1 addition & 0 deletions backend/simple/fileprocessor/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file not shown.
Binary file not shown.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,26 @@ syntax = "proto3";

package fileprocessor;

import "google/protobuf/timestamp.proto";
import "google/protobuf/any.proto";

// Metadata associated with each response
message ResponseMetadata {
string request_id = 1;
google.protobuf.Timestamp timestamp = 2;
}

// Generic wrapper for service responses
message ServiceResponseWrapper {
ResponseMetadata metadata = 1;
google.protobuf.Any payload = 2;
}

// Existing definitions below, with modifications to response messages

// Service definition
service FileProcessor {
rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
rpc ProcessFile(FileUploadRequest) returns (ServiceResponseWrapper);
}

// Message for file upload request
Expand All @@ -23,7 +40,7 @@ message FileMetadata {
string locale = 4;
}

// Response message including processed file information
// Response message carried in the payload field of ServiceResponseWrapper
message FileProcessResponse {
string fileId = 1;
FileMetadata metadata = 2;
Expand All @@ -34,4 +51,4 @@ message FileProcessResponse {
message Page {
int64 pageId = 1;
string content = 2; // Extracted text content of the page
}
}
53 changes: 53 additions & 0 deletions backend/simple/fileprocessor/src/file_processor_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# file_processor_service.py
import grpc
import logging
import file_processor_pb2_grpc
import file_processor_pb2
from ocr_processing import process_pdf_file
from datetime import datetime
import uuid
import os

class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
    """gRPC servicer that OCRs an uploaded PDF and returns its pages and metadata."""

    def ProcessFile(self, request, context):
        """Run OCR on the uploaded file and return a wrapped response.

        Args:
            request: Incoming upload request; ``fileId``, ``filename`` and
                ``file`` (the PDF bytes) are read from it.
            context: gRPC servicer context, used to abort the call on error.

        Returns:
            A ``ServiceResponseWrapper`` whose payload is a packed
            ``FileProcessResponse`` and whose metadata carries a request id
            and the current UTC timestamp.

        The call is aborted with ``INVALID_ARGUMENT`` when running in
        production mode without a ``kong-request-id``, and with ``INTERNAL``
        when processing fails.
        """
        file_id = request.fileId
        filename = request.filename
        input_pdf_bytes = request.file
        # Default to development if ENVIRONMENT_MODE is not set.
        environment_mode = os.getenv('ENVIRONMENT_MODE', 'development')

        # The kong-request-id is only mandatory in production mode.
        request_metadata = None
        if environment_mode.lower() == 'production':
            # NOTE(review): this reads `metadata` from the request message; the
            # FileUploadRequest definition is not visible here. If the id is
            # sent as gRPC call metadata instead, it has to be read from
            # context.invocation_metadata() -- confirm against the proto.
            if 'kong-request-id' not in request.metadata or not request.metadata['kong-request-id']:
                context.abort(
                    code=grpc.StatusCode.INVALID_ARGUMENT,
                    details="Missing required 'kong-request-id' in metadata for production mode.",
                )
            request_metadata = request.metadata

        try:
            texts, metadata = process_pdf_file(input_pdf_bytes, filename)
            pages = [
                file_processor_pb2.Page(pageId=p["pageId"], content=p["content"])
                for p in texts
            ]
            file_metadata = file_processor_pb2.FileMetadata(
                title=metadata["title"],
                pageCount=metadata["pageCount"],
                filesize=metadata["filesize"],
                locale=metadata["locale"],
            )
            response_payload = file_processor_pb2.FileProcessResponse(
                fileId=file_id, metadata=file_metadata, pages=pages
            )

            # Wrap the response payload in ServiceResponseWrapper.
            response_wrapper = file_processor_pb2.ServiceResponseWrapper()
            if request_metadata:
                kong_request_id = request.metadata.get('kong-request-id')
            else:
                # No upstream id available: generate one for traceability.
                kong_request_id = str(uuid.uuid4())
            response_wrapper.metadata.request_id = kong_request_id
            # GetCurrentTime() stores the current UTC time. Passing a naive
            # local datetime to FromDatetime() would be interpreted as UTC and
            # shift the timestamp by the container's timezone offset.
            response_wrapper.metadata.timestamp.GetCurrentTime()
            response_wrapper.payload.Pack(response_payload)

            return response_wrapper
        except Exception as e:
            logging.error(f"Error processing file {file_id}: {str(e)}", exc_info=True)

            # ServicerContext.abort() accepts only (code, details); extra
            # details travel as trailing metadata set before aborting.
            # Metadata values must be printable ASCII, and the exception text
            # may contain newlines or non-ASCII characters (e.g. OCR stderr),
            # so sanitise it and keep it short.
            safe_detail = "".join(
                ch if " " <= ch <= "~" else " " for ch in str(e)
            )[:1024]
            context.set_trailing_metadata((('error-details', safe_detail),))
            context.abort(
                code=grpc.StatusCode.INTERNAL,
                details="Internal server error occurred.",
            )
53 changes: 53 additions & 0 deletions backend/simple/fileprocessor/src/ocr_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# ocr_processing.py
import logging
from PyPDF2 import PdfReader
from io import BytesIO
import tempfile
import subprocess
import os
from utilities import generate_json_response, detect_locale

def process_pdf_file(input_pdf_bytes, filename):
    """OCR a PDF supplied as bytes and return its page texts and metadata.

    Args:
        input_pdf_bytes: Raw bytes of the input PDF.
        filename: Name of the uploaded file; used for the metadata title and
            in log messages.

    Returns:
        A ``(texts, metadata)`` tuple as produced by ``ocr_pdf`` and
        ``generate_metadata``.

    Raises:
        Exception: Any error from OCR or metadata generation is logged and
            re-raised unchanged.
    """
    temp_pdf_path = None
    try:
        input_stream = BytesIO(input_pdf_bytes)
        texts, temp_pdf_path = ocr_pdf(input_stream)
        metadata = generate_metadata(filename, temp_pdf_path, texts)
        return texts, metadata
    except Exception as e:
        logging.error(f"Error in OCR processing for file {filename}: {str(e)}", exc_info=True)
        raise
    finally:
        # Remove the temporary OCR output on every path, including when
        # metadata generation fails, so temp files do not accumulate.
        if temp_pdf_path:
            try:
                os.remove(temp_pdf_path)
            except FileNotFoundError:
                pass

def ocr_pdf(input_stream):
    """Run ``ocrmypdf`` on a PDF stream and extract the text of each page.

    Args:
        input_stream: Binary file-like object containing the input PDF; it is
            read fully and piped to ``ocrmypdf`` on stdin.

    Returns:
        A ``(texts, output_path)`` tuple: the per-page text list from
        ``extract_text_from_pdf`` and the path of the temporary OCR output
        PDF. On success the caller owns that file and must delete it.

    Raises:
        RuntimeError: If ``ocrmypdf`` exits with a non-zero status.
    """
    # Only a unique path is needed; close the handle before the subprocess
    # writes to it. delete=False keeps the file for the caller.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
        output_path = tmp_output.name

    try:
        ocrmypdf_cmd = ["ocrmypdf", "-l", "eng", "--force-ocr", "--output-type", "pdf", "-", output_path]
        process = subprocess.run(
            ocrmypdf_cmd,
            input=input_stream.read(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        if process.returncode != 0:
            # errors="replace": a decoding problem must not hide the OCR error.
            stderr_text = process.stderr.decode(errors="replace")
            logging.error(f"OCR command failed: {stderr_text}")
            raise RuntimeError(f"OCR failed for file {output_path}: {stderr_text}")

        texts = extract_text_from_pdf(output_path)
    except Exception:
        # On failure the caller never receives the path, so clean up here.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            pass
        raise

    return texts, output_path

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Each entry is a dict with a 1-based ``pageId`` and the page ``content``.
    When a page yields no text, the content is the placeholder string
    ``"Error extracting text"``.
    """
    with open(pdf_path, 'rb') as handle:
        document = PdfReader(handle)
        # Build the list while the file is still open: pages are read here.
        return [
            {
                "pageId": number,
                "content": pdf_page.extract_text() or "Error extracting text",
            }
            for number, pdf_page in enumerate(document.pages, start=1)
        ]

def generate_metadata(filename, pdf_path, texts):
    """Build the metadata dict for a processed PDF.

    Args:
        filename: Name of the file; its basename becomes the title.
        pdf_path: Path of the PDF whose size on disk is reported.
        texts: List of page dicts, each with a ``content`` entry.

    Returns:
        A dict with the keys ``title``, ``pageCount``, ``filesize`` and
        ``locale``.
    """
    # The locale is detected from all page contents joined by single spaces.
    page_contents = [entry["content"] for entry in texts]
    detected_locale = detect_locale(' '.join(page_contents))
    return {
        "title": os.path.basename(filename),
        "pageCount": len(texts),
        "filesize": os.path.getsize(pdf_path),
        "locale": detected_locale,
    }
Loading

0 comments on commit 30628bf

Please sign in to comment.