
Commit

fix(fileprocessor): issue with grpc where files are not found + minor cleanup
neilscallywag committed Feb 28, 2024
2 parents 7e3d823 + 68427a2 commit 548627f
Showing 28 changed files with 1,038 additions and 187 deletions.
Binary file added backend/simple/fileprocessor.zip
15 changes: 11 additions & 4 deletions backend/simple/fileprocessor/Dockerfile
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Singapore
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install OCRmyPDF, Tesseract OCR, and other dependencies
# Install OCRmyPDF, Tesseract OCR, Protocol Buffers Compiler, and other dependencies
RUN apt-get update && \
    apt-get install -y \
    ocrmypdf \
@@ -20,18 +20,25 @@ RUN apt-get update && \
    poppler-utils \
    ghostscript \
    qpdf \
    protobuf-compiler \
    && apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install gRPC tools for Python
RUN pip install grpcio grpcio-tools

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file and install Python dependencies
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . /app
# Copy the proto files and application files into the container
COPY ./src /app

# Compile the proto files to generate Python code
RUN python -m grpc_tools.protoc -I/app --python_out=/app --grpc_python_out=/app /app/file_processor.proto

# Set the default command to run the app
CMD ["python3", "-m", "file_processor.service"]
CMD ["python3", "-m", "service"]
6 changes: 4 additions & 2 deletions backend/simple/fileprocessor/docker-compose.yml
@@ -1,7 +1,9 @@
version: '3'
version: "3"
services:
  pdf-reader:
    build:
      context: .
    ports:
      - "50051:50051"
    volumes:
      - ./example.pdf:/app/example.pdf
      - ./src/example.pdf:/app/example.pdf
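With the compose file mapping port 50051 to the host, a quick readiness probe can confirm the container accepts connections before any work is sent. This sketch is not part of the commit; the localhost target and five-second timeout are assumptions, and it uses the synchronous channel API purely for the probe:

import grpc

def wait_for_fileprocessor(target="localhost:50051", timeout_s=5.0):
    channel = grpc.insecure_channel(target)
    try:
        # Blocks until the channel reaches READY or raises grpc.FutureTimeoutError.
        grpc.channel_ready_future(channel).result(timeout=timeout_s)
    finally:
        channel.close()

wait_for_fileprocessor()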
114 changes: 0 additions & 114 deletions backend/simple/fileprocessor/file_processor/service.py

This file was deleted.

File renamed without changes.
@@ -34,4 +34,4 @@ message FileProcessResponse {
message Page {
  int64 pageId = 1;
  string content = 2; // Extracted text content of the page
}
}
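The fragment above is the tail of file_processor.proto (only the end-of-file newline changed in this commit). As a hedged illustration of how the generated classes for Page and FileProcessResponse are constructed; field names follow this fragment and the service code further down, and the literal values are placeholders:

import file_processor_pb2

page = file_processor_pb2.Page(pageId=1, content="Extracted text of page 1")
response = file_processor_pb2.FileProcessResponse(
    fileId="00000000-0000-0000-0000-000000000000",  # placeholder UUID
    pages=[page],
)
print(response.pages[0].content)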
138 changes: 138 additions & 0 deletions backend/simple/fileprocessor/src/service.py
@@ -0,0 +1,138 @@
import subprocess
import os
import logging
from PyPDF2 import PdfReader
import asyncio
import json
from langdetect import detect
from io import BytesIO
import uuid
import shutil
import tempfile

import grpc
from concurrent import futures
import file_processor_pb2
import file_processor_pb2_grpc

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
    async def ProcessFile(self, request, context):
        input_pdf_bytes = request.file
        file_id = request.fileId
        filename = request.filename
        texts, error = await ocr_pdf_and_extract_text(lang='eng', input_bytes=input_pdf_bytes)
        if texts:
            json_response = generate_json_response(file_id, filename, texts)
            response = json.loads(json_response)
            pages = [file_processor_pb2.Page(pageId=p["pageId"], content=p["content"]) for p in response["pages"]]
            metadata = file_processor_pb2.FileMetadata(title=response["metadata"]["title"],
                                                       pageCount=response["metadata"]["pageCount"],
                                                       filesize=response["metadata"]["filesize"],
                                                       locale=response["metadata"]["locale"])
            return file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=metadata, pages=pages)
        else:
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(error)
            return file_processor_pb2.FileProcessResponse()


# Validate and sanitize input PDF path
def secure_path(input_pdf):
    if not os.path.isfile(input_pdf) or not input_pdf.lower().endswith('.pdf'):
        raise ValueError("Invalid PDF file.")
    return os.path.abspath(input_pdf)

async def ocr_pdf_and_extract_text(lang='eng', input_bytes=None):
    """
    Perform OCR on a PDF supplied as bytes and extract its text, without producing a persistent output file.
    """
    if input_bytes is None:
        return None, "No input bytes provided."

    try:
        # Use BytesIO to handle the input PDF bytes
        input_stream = BytesIO(input_bytes)

        # Use a temporary file to handle OCR output securely
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
            ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", "-", tmp_output.name]

            process = await asyncio.create_subprocess_exec(*ocrmypdf_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = await process.communicate(input=input_stream.read())

            if process.returncode == 0:
                logging.info("Successfully processed PDF from bytes.")
                with open(tmp_output.name, 'rb') as ocr_output:
                    texts = extract_text_from_stream(ocr_output)
                return texts, None
            else:
                error_message = f"Error processing PDF from bytes: {stderr.decode()}"
                logging.error(error_message)
                return None, error_message
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return None, str(e)
    finally:
        input_stream.close()  # Ensure the BytesIO stream is closed

def extract_text_from_stream(pdf_stream):
    """
    Securely extract text from a PDF stream.
    """
    try:
        reader = PdfReader(pdf_stream)
        texts = [page.extract_text() for page in reader.pages if page.extract_text()]
        return texts
    except Exception as e:
        logging.error(f"Failed to extract text from stream: {e}")
        return []

def generate_json_response(file_id, file_path, texts):
    """
    Generate a secure JSON response without needing to reference an output file path.
    """
    try:
        metadata = {
            "title": os.path.basename(file_path),
            "pageCount": len(texts),
            "filesize": os.path.getsize(file_path),
            "locale": detect(' '.join(texts)) if texts else "unknown"
        }
    except Exception as e:
        logging.error(f"Error generating metadata for {file_path}: {e}")
        metadata = {}

    response = {
        "fileId": file_id,
        "metadata": metadata,
        "pages": [{"pageId": idx + 1, "content": text or "Error extracting text"} for idx, text in enumerate(texts)]
    }

    return json.dumps(response, indent=4, ensure_ascii=False)

async def main():
    # Standalone entry point for local testing; the container runs serve() below instead.
    input_pdf = "example.pdf"
    with open(input_pdf, "rb") as pdf_file:
        input_pdf_bytes = pdf_file.read()

    lang = "eng"
    file_id = str(uuid.uuid4())

    texts, error = await ocr_pdf_and_extract_text(lang=lang, input_bytes=input_pdf_bytes)
    if texts:
        json_response = generate_json_response(file_id, input_pdf, texts)
        print(json_response)
    else:
        logging.error(f"OCR processing failed: {error}")

async def serve():
    server = grpc.aio.server()
    file_processor_pb2_grpc.add_FileProcessorServicer_to_server(FileProcessorServicer(), server)
    server.add_insecure_port('[::]:50051')
    await server.start()
    await server.wait_for_termination()

if __name__ == '__main__':
    asyncio.run(serve())
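Because ProcessFile is an ordinary coroutine, it can be smoke-tested in-process without starting the gRPC server. A hedged sketch (not part of the commit) that feeds a local PDF through the servicer with a mocked context; it assumes it runs next to service.py, the FileUploadRequest field names follow the client shown below, and example.pdf is assumed to exist:

import asyncio
import uuid
from unittest import mock

import file_processor_pb2
from service import FileProcessorServicer  # assumes the src/ layout above

async def smoke_test(pdf_path="example.pdf"):
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
    request = file_processor_pb2.FileUploadRequest(
        userId=str(uuid.uuid4()),
        file=pdf_bytes,
        filename=pdf_path,
        fileId=str(uuid.uuid4()),
    )
    servicer = FileProcessorServicer()
    # The context is only touched on the error path, so a MagicMock is enough here.
    response = await servicer.ProcessFile(request, mock.MagicMock())
    print(response.metadata)

asyncio.run(smoke_test())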
33 changes: 33 additions & 0 deletions backend/simple/fileprocessor/src/service_client.py
@@ -0,0 +1,33 @@
import asyncio
import uuid
import grpc
import file_processor_pb2
import file_processor_pb2_grpc

async def process_file_stub(user_id, file, filename, file_id):
    async with grpc.aio.insecure_channel('localhost:50051') as channel:
        stub = file_processor_pb2_grpc.FileProcessorStub(channel)
        request = file_processor_pb2.FileUploadRequest(
            userId=user_id,
            file=file,  # Field name must match the file_processor.proto definition
            filename=filename,
            fileId=str(file_id)  # Ensure file_id is serialised as a string
        )
        response = await stub.ProcessFile(request)
        return response

async def main():
    # Read the PDF file as bytes
    with open("example.pdf", "rb") as pdf_file:
        input_pdf_bytes = pdf_file.read()
    user_id = str(uuid.uuid4())
    filename = "example.pdf"
    file_id = uuid.uuid4()  # Converted to a string inside process_file_stub
    print(type(input_pdf_bytes))

    # Call the stub with keyword arguments
    response = await process_file_stub(user_id=user_id, file=input_pdf_bytes, filename=filename, file_id=file_id)
    print(response)

if __name__ == "__main__":
asyncio.run(main())
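When OCR fails, ProcessFile sets StatusCode.INTERNAL with the error text in the details, which reaches the client as grpc.aio.AioRpcError. A hedged wrapper around the call above that reports that case instead of letting the exception propagate (not part of the commit):

async def process_file_safely(user_id, file, filename, file_id):
    try:
        return await process_file_stub(user_id, file, filename, file_id)
    except grpc.aio.AioRpcError as err:
        # err.details() carries the OCR error message set by the server.
        print(f"ProcessFile failed: {err.code().name}: {err.details()}")
        return None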
17 changes: 17 additions & 0 deletions client/package-lock.json


1 change: 1 addition & 0 deletions client/package.json
@@ -31,6 +31,7 @@
    "zustand": "^4.4.7"
  },
  "devDependencies": {
    "@faker-js/faker": "^8.4.1",
    "@types/react": "^18.2.55",
    "@types/react-dom": "^18.2.19",
    "@typescript-eslint/eslint-plugin": "^6.21.0",
