Merge this shizz (#6)

* feat(simple file processor): added simple file processor service using ocrmypdf. No GRPC implemented * feat(fileprocessor): changed port and added proto files * fix(fileprocessor): did further cleanup and fixed minor error handling bugs * chore(fileprocessor): added extensive logging and error handling --------- Co-authored-by: Thaddeaus Low <thaddeausl.2022@scis.smu.edu.sg>
EchoSkorJjj · Feb 28, 2024 · 30628bf · 30628bf
1 parent 68427a2
commit 30628bf
Show file tree

Hide file tree

Showing 18 changed files with 245 additions and 181 deletions.
diff --git a/backend/simple/fileprocessor.zip b/backend/simple/fileprocessor.zip
diff --git a/backend/simple/fileprocessor/Dockerfile b/backend/simple/fileprocessor/Dockerfile
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 ENV TZ=Asia/Singapore
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
-# Install OCRmyPDF, Tesseract OCR, and other dependencies
+# Install OCRmyPDF, Tesseract OCR, Protocol Buffers Compiler, and other dependencies
 RUN apt-get update && \
     apt-get install -y \
     ocrmypdf \
@@ -20,18 +20,26 @@ RUN apt-get update && \
     poppler-utils \
     ghostscript \
     qpdf \
+    protobuf-compiler \
     && apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+# Install gRPC tools for Python; --no-cache-dir keeps pip's download cache
+# out of the image layer, matching the requirements.txt install step.
+RUN pip install --no-cache-dir grpcio grpcio-tools protobuf
+
+
 # Set the working directory in the container
 WORKDIR /app
 
 # Copy the requirements file and install Python dependencies
 COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy the rest of the application
-COPY . /app
+# Copy the proto files and application files into the container
+COPY ./src /app
+
+# Compile the proto files to generate Python code
+RUN python -m grpc_tools.protoc -I/app --python_out=/app --grpc_python_out=/app /app/file_processor.proto
 
 # Set the default command to run the app
-CMD ["python3", "/app/file_processor/service.py"]
+CMD ["python3", "-m", "service"]
diff --git a/backend/simple/fileprocessor/docker-compose.yml b/backend/simple/fileprocessor/docker-compose.yml
@@ -6,4 +6,4 @@ services:
     ports:
       - "50051:50051"
     volumes:
-      - ./example.pdf:/app/example.pdf
+      - ./src/example.pdf:/app/example.pdf
diff --git a/backend/simple/fileprocessor/file_processor/__init__.py b/backend/simple/fileprocessor/file_processor/__init__.py
diff --git a/backend/simple/fileprocessor/file_processor/service.py b/backend/simple/fileprocessor/file_processor/service.py
diff --git a/backend/simple/fileprocessor/file_processor/service_client.py b/backend/simple/fileprocessor/file_processor/service_client.py
diff --git a/backend/simple/fileprocessor/src/__init__.py b/backend/simple/fileprocessor/src/__init__.py
@@ -0,0 +1 @@
+
diff --git a/...cache__/file_processor_pb2.cpython-39.pyc → ...cache__/file_processor_pb2.cpython-39.pyc b/...cache__/file_processor_pb2.cpython-39.pyc → ...cache__/file_processor_pb2.cpython-39.pyc
diff --git a/...__/file_processor_pb2_grpc.cpython-39.pyc → ...__/file_processor_pb2_grpc.cpython-39.pyc b/...__/file_processor_pb2_grpc.cpython-39.pyc → ...__/file_processor_pb2_grpc.cpython-39.pyc
diff --git a/backend/simple/fileprocessor/example.pdf → backend/simple/fileprocessor/src/example.pdf b/backend/simple/fileprocessor/example.pdf → backend/simple/fileprocessor/src/example.pdf
diff --git a/...essor/file_processor/file_processor.proto → ...le/fileprocessor/src/file_processor.proto b/...essor/file_processor/file_processor.proto → ...le/fileprocessor/src/file_processor.proto
@@ -2,9 +2,26 @@ syntax = "proto3";
 
 package fileprocessor;
 
+import "google/protobuf/timestamp.proto";
+import "google/protobuf/any.proto";
+
+// Metadata associated with each response
+message ResponseMetadata {
+    // Request identifier; the Python servicer fills this with the caller's
+    // 'kong-request-id' in production mode, or a generated UUID otherwise.
+    string request_id = 1;
+    // Time at which the servicer built the response.
+    google.protobuf.Timestamp timestamp = 2;
+}
+
+// Generic wrapper for service responses
+message ServiceResponseWrapper {
+    // Request id and timestamp describing this response.
+    ResponseMetadata metadata = 1;
+    // Service-specific body; ProcessFile packs a FileProcessResponse here,
+    // so clients must unpack the Any to read the result.
+    google.protobuf.Any payload = 2;
+}
+
+// Existing definitions below, with modifications to response messages
+
 // Service definition
 service FileProcessor {
-  rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
+  // The returned wrapper's payload holds a packed FileProcessResponse.
+  rpc ProcessFile(FileUploadRequest) returns (ServiceResponseWrapper);
 }
 
 // Message for file upload request
@@ -23,7 +40,7 @@ message FileMetadata {
   string locale = 4;
 }
 
-// Response message including processed file information
+// Response message that is packed into the payload field of ServiceResponseWrapper
 message FileProcessResponse {
   string fileId = 1;
   FileMetadata metadata = 2;
@@ -34,4 +51,4 @@ message FileProcessResponse {
 message Page {
   int64 pageId = 1;
   string content = 2; // Extracted text content of the page
-}
+}
diff --git a/...ssor/file_processor/file_processor_pb2.py → ...e/fileprocessor/src/file_processor_pb2.py b/...ssor/file_processor/file_processor_pb2.py → ...e/fileprocessor/src/file_processor_pb2.py
diff --git a/...file_processor/file_processor_pb2_grpc.py → ...eprocessor/src/file_processor_pb2_grpc.py b/...file_processor/file_processor_pb2_grpc.py → ...eprocessor/src/file_processor_pb2_grpc.py
diff --git a/backend/simple/fileprocessor/src/file_processor_service.py b/backend/simple/fileprocessor/src/file_processor_service.py
@@ -0,0 +1,53 @@
+# file_processor_service.py
+import grpc
+import logging
+import file_processor_pb2_grpc
+import file_processor_pb2
+from ocr_processing import process_pdf_file
+from datetime import datetime
+import uuid
+import os
+
+class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
+    """gRPC servicer that OCRs an uploaded PDF and returns its text and metadata."""
+
+    @staticmethod
+    def _to_header_value(text):
+        """Return *text* reduced to single-line printable ASCII.
+
+        Non-binary gRPC metadata values may only contain characters in the
+        range 0x20-0x7E, while exception messages (for example captured OCR
+        stderr) are frequently multi-line; anything else becomes a space.
+        """
+        return ''.join(ch if ' ' <= ch <= '~' else ' ' for ch in text)
+
+    def ProcessFile(self, request, context):
+        """Run OCR on the uploaded file and wrap the result.
+
+        Args:
+            request: FileUploadRequest; ``fileId``, ``filename``, ``file`` and
+                ``metadata`` are read.
+            context: gRPC servicer context, used to abort with a status code.
+
+        Returns:
+            A ServiceResponseWrapper whose payload is a packed
+            FileProcessResponse and whose metadata carries the request id and
+            the current UTC timestamp.
+
+        Aborts with INVALID_ARGUMENT when ENVIRONMENT_MODE is "production" and
+        the request carries no 'kong-request-id'; aborts with INTERNAL when
+        processing fails.
+        """
+        file_id = request.fileId
+        filename = request.filename
+        input_pdf_bytes = request.file
+        environment_mode = os.getenv('ENVIRONMENT_MODE', 'development')  # Default to development if not set
+
+        # Only production requests must carry a kong-request-id; every other
+        # mode gets a locally generated id further down.
+        kong_request_id = None
+        if environment_mode.lower() == 'production':
+            kong_request_id = request.metadata.get('kong-request-id')
+            if not kong_request_id:
+                # abort() raises, so nothing below runs for invalid requests.
+                context.abort(
+                    code=grpc.StatusCode.INVALID_ARGUMENT,
+                    details="Missing required 'kong-request-id' in metadata for production mode.",
+                )
+
+        try:
+            texts, metadata = process_pdf_file(input_pdf_bytes, filename)
+            pages = [file_processor_pb2.Page(pageId=p["pageId"], content=p["content"]) for p in texts]
+            file_metadata = file_processor_pb2.FileMetadata(title=metadata["title"],
+                                                            pageCount=metadata["pageCount"],
+                                                            filesize=metadata["filesize"],
+                                                            locale=metadata["locale"])
+            response_payload = file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=file_metadata, pages=pages)
+
+            # Wrap the response payload in ServiceResponseWrapper
+            response_wrapper = file_processor_pb2.ServiceResponseWrapper()
+            response_wrapper.metadata.request_id = kong_request_id or str(uuid.uuid4())
+            # GetCurrentTime() records the current UTC time. FromDatetime()
+            # with a naive datetime.now() would store local wall-clock time as
+            # if it were UTC, skewing the timestamp by the zone offset.
+            response_wrapper.metadata.timestamp.GetCurrentTime()
+            response_wrapper.payload.Pack(response_payload)
+
+            return response_wrapper
+        except Exception as e:
+            logging.error(f"Error processing file {file_id}: {str(e)}", exc_info=True)
+
+            # ServicerContext.abort() accepts only (code, details); extra
+            # error information has to travel as trailing metadata instead.
+            context.set_trailing_metadata(
+                (('error-details', self._to_header_value(str(e))),)
+            )
+            context.abort(
+                code=grpc.StatusCode.INTERNAL,
+                details="Internal server error occurred.",
+            )
diff --git a/backend/simple/fileprocessor/src/ocr_processing.py b/backend/simple/fileprocessor/src/ocr_processing.py
@@ -0,0 +1,53 @@
+# ocr_processing.py
+import logging
+from PyPDF2 import PdfReader
+from io import BytesIO
+import tempfile
+import subprocess
+import os
+from utilities import generate_json_response, detect_locale
+
+def process_pdf_file(input_pdf_bytes, filename):
+    """OCR a PDF supplied as bytes and describe the result.
+
+    Args:
+        input_pdf_bytes: Raw bytes of the PDF to process.
+        filename: Original file name; used for the metadata title and in logs.
+
+    Returns:
+        A ``(texts, metadata)`` tuple: the per-page text entries produced by
+        ``ocr_pdf`` and the dict built by ``generate_metadata``.
+
+    Raises:
+        Exception: Any failure from OCR or metadata generation is logged and
+            re-raised unchanged.
+    """
+    temp_pdf_path = None
+    try:
+        input_stream = BytesIO(input_pdf_bytes)
+        texts, temp_pdf_path = ocr_pdf(input_stream)
+        metadata = generate_metadata(filename, temp_pdf_path, texts)
+        return texts, metadata
+    except Exception as e:
+        logging.error(f"Error in OCR processing for file {filename}: {str(e)}", exc_info=True)
+        raise
+    finally:
+        # Remove the OCR output on every path; previously it was only deleted
+        # on success, so a metadata failure left the temp file behind.
+        if temp_pdf_path and os.path.exists(temp_pdf_path):
+            try:
+                os.remove(temp_pdf_path)
+            except OSError:
+                # Cleanup is best-effort and must not mask the real outcome.
+                logging.warning(f"Could not remove temporary file {temp_pdf_path}", exc_info=True)
+
+def ocr_pdf(input_stream):
+    """Run ``ocrmypdf`` over a PDF stream and extract the text of each page.
+
+    Args:
+        input_stream: Binary file-like object holding the input PDF; its
+            contents are piped to ``ocrmypdf`` on stdin.
+
+    Returns:
+        A ``(texts, output_path)`` tuple. ``output_path`` is the temporary
+        OCR'd PDF; on success the caller owns it and must delete it.
+
+    Raises:
+        Exception: If ``ocrmypdf`` exits with a non-zero status, or if text
+            extraction fails. The temporary file is removed before raising.
+    """
+    # Reserve a path only; the handle is closed so ocrmypdf alone writes to it.
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
+        output_path = tmp_output.name
+
+    try:
+        ocrmypdf_cmd = ["ocrmypdf", "-l", "eng", "--force-ocr", "--output-type", "pdf", "-", output_path]
+        process = subprocess.run(ocrmypdf_cmd, input=input_stream.read(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        if process.returncode != 0:
+            # Tool output is not guaranteed to be valid UTF-8.
+            stderr_text = process.stderr.decode(errors="replace")
+            logging.error(f"OCR command failed: {stderr_text}")
+            raise Exception(f"OCR failed for file {output_path}: {stderr_text}")
+
+        texts = extract_text_from_pdf(output_path)
+        return texts, output_path
+    except BaseException:
+        # The caller never learns the path when this function fails, so the
+        # temp file has to be cleaned up here or it leaks.
+        try:
+            os.remove(output_path)
+        except OSError:
+            pass  # Best-effort cleanup; the original error matters more.
+        raise
+
+def extract_text_from_pdf(pdf_path):
+    """Read the PDF at *pdf_path* and return one entry per page.
+
+    Each entry is a dict with a 1-based ``pageId`` and the page's extracted
+    ``content``; pages that yield no text get a placeholder string instead.
+    """
+    with open(pdf_path, 'rb') as handle:
+        return [
+            {"pageId": number, "content": page.extract_text() or "Error extracting text"}
+            for number, page in enumerate(PdfReader(handle).pages, start=1)
+        ]
+
+def generate_metadata(filename, pdf_path, texts):
+    """Build the metadata dict for a processed PDF.
+
+    The locale is detected from the space-joined content of all pages, the
+    title is the base name of *filename*, the page count is the number of
+    entries in *texts*, and the file size is read from *pdf_path* on disk.
+    """
+    combined_content = ' '.join(entry["content"] for entry in texts)
+    detected = detect_locale(combined_content)
+    return {
+        "title": os.path.basename(filename),
+        "pageCount": len(texts),
+        "filesize": os.path.getsize(pdf_path),
+        "locale": detected,
+    }