feat(frontend): add marketplace (#5)

EchoSkorJjj · Feb 27, 2024 · 68427a2 · 68427a2
1 parent c106982
commit 68427a2
Show file tree

Hide file tree

Showing 30 changed files with 1,245 additions and 66 deletions.
diff --git a/backend/simple/fileprocessor/Dockerfile b/backend/simple/fileprocessor/Dockerfile
@@ -0,0 +1,37 @@
+# Image for the file-processor gRPC service: a Python runtime plus the
+# OCRmyPDF/Tesseract toolchain that service.py runs as an external command.
+# Start with a Python slim base image
+FROM python:3.9-slim
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set timezone (required for some configurations)
+ENV TZ=Asia/Singapore
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+# Install OCRmyPDF, Tesseract OCR (with English language data), and other dependencies.
+# The apt cache and package lists are removed in the same layer so they do not
+# end up in the final image.
+RUN apt-get update && \
+    apt-get install -y \
+    ocrmypdf \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    poppler-utils \
+    ghostscript \
+    qpdf \
+    && apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements file and install Python dependencies.
+# This happens before the sources are copied so the pip layer can be reused
+# from cache when only application code changes.
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application
+COPY . /app
+
+# Set the default command to run the app.
+# service.py starts the gRPC server, which listens on port 50051.
+CMD ["python3", "/app/file_processor/service.py"]
diff --git a/backend/simple/fileprocessor/Makefile b/backend/simple/fileprocessor/Makefile
@@ -0,0 +1,30 @@
+# Developer helper targets for the file-processor service.
+PROJECT_NAME := ESD
+ENVIRONMENT ?= dev
+NPM_SUBDIRS := client
+PYTHON_VERSION := 3.11.2
+VENV_NAME := venv
+VENV_DEACTIVATE := deactivate
+# Pick interpreter name, venv activation command and delete command per platform.
+ifeq ($(OS),Windows_NT)
+	PYTHON := python
+	VENV_ACTIVATE := venv\Scripts\activate.bat
+	RM := del /s /q
+else
+	PYTHON := python3
+	VENV_ACTIVATE := . venv/bin/activate
+	RM := rm -rf
+endif
+
+# These targets do not produce a file named after themselves, so they must
+# always run. `venv` is deliberately not phony: it creates the `venv` folder
+# and is skipped once that folder exists.
+.PHONY: init requirements deactivate-venv
+
+# Bootstrap a local development environment.
+# Only prerequisites that are defined in this Makefile are listed; the
+# previous check-terraform / check-aws-cli / npm-install prerequisites have
+# no rule here, which made `make init` fail immediately.
+init: venv requirements
+
+# Create the Python virtual environment.
+venv:
+	@echo "Creating python virtual environment in '$(VENV_NAME)' folder..."
+	$(PYTHON) -m venv $(VENV_NAME)
+
+# Install the Python dependencies into the virtual environment.
+requirements:
+	@echo "Installing Python requirements..."
+	@$(VENV_ACTIVATE) && pip install -r requirements.txt
+
+# Remove the virtual environment folder and any compiled *.pyc files.
+deactivate-venv:
+	@echo "Deactivating virtual environment..."
+	$(RM) $(VENV_NAME)
+	@find . -name "*.pyc" -delete
+	@exit 0
diff --git a/backend/simple/fileprocessor/docker-compose.yml b/backend/simple/fileprocessor/docker-compose.yml
@@ -0,0 +1,9 @@
+# Compose definition for running the file-processor gRPC service locally.
+version: "3"
+services:
+  pdf-reader:
+    # Build the image from the Dockerfile in this directory.
+    build:
+      context: .
+    # Publish the gRPC port that service.py listens on.
+    ports:
+      - "50051:50051"
+    # Bind-mount the sample PDF into the container's working directory.
+    volumes:
+      - ./example.pdf:/app/example.pdf
diff --git a/backend/simple/fileprocessor/example.pdf b/backend/simple/fileprocessor/example.pdf
diff --git a/backend/simple/fileprocessor/file_processor/__init__.py b/backend/simple/fileprocessor/file_processor/__init__.py
diff --git a/backend/simple/fileprocessor/file_processor/__pycache__/file_processor_pb2.cpython-39.pyc b/backend/simple/fileprocessor/file_processor/__pycache__/file_processor_pb2.cpython-39.pyc
diff --git a/...nd/simple/fileprocessor/file_processor/__pycache__/file_processor_pb2_grpc.cpython-39.pyc b/...nd/simple/fileprocessor/file_processor/__pycache__/file_processor_pb2_grpc.cpython-39.pyc
diff --git a/backend/simple/fileprocessor/file_processor/file_processor.proto b/backend/simple/fileprocessor/file_processor/file_processor.proto
@@ -0,0 +1,37 @@
+syntax = "proto3";
+
+package fileprocessor;
+
+// Service definition: a single unary RPC that takes an uploaded PDF and
+// returns its metadata plus the text extracted from each page.
+service FileProcessor {
+  rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
+}
+
+// Message for file upload request
+message FileUploadRequest {
+  string userId = 1;   // Identifier of the uploading user
+  string fileId = 2;   // Caller-chosen identifier, echoed back in the response
+  string filename = 3; // Name of the uploaded file
+  bytes file = 4; // PDF file content
+}
+
+// Metadata associated with the file
+message FileMetadata {
+  string title = 1;    // Display title of the file
+  int64 pageCount = 2; // Number of pages returned in the response
+  int64 filesize = 3;  // File size in bytes
+  string locale = 4;   // Detected language of the extracted text
+}
+
+// Response message including processed file information
+message FileProcessResponse {
+  string fileId = 1;
+  FileMetadata metadata = 2;
+  repeated Page pages = 3; // Pages with the text extracted by OCR
+}
+
+// Processed page content from the file
+message Page {
+  int64 pageId = 1; // Page number
+  string content = 2; // Extracted text content of the page
+}
diff --git a/backend/simple/fileprocessor/file_processor/file_processor_pb2.py b/backend/simple/fileprocessor/file_processor/file_processor_pb2.py
diff --git a/backend/simple/fileprocessor/file_processor/file_processor_pb2_grpc.py b/backend/simple/fileprocessor/file_processor/file_processor_pb2_grpc.py
@@ -0,0 +1,69 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import file_processor_pb2 as file__processor__pb2
+
+
+class FileProcessorStub(object):
+    """Client-side stub for the ``fileprocessor.FileProcessor`` service.
+
+    Exposes one attribute, ``ProcessFile``, a unary-unary callable created
+    on the supplied channel.
+    """
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        # Bind the RPC by its fully-qualified method path; requests are
+        # serialized as FileUploadRequest and replies parsed as
+        # FileProcessResponse.
+        self.ProcessFile = channel.unary_unary(
+                '/fileprocessor.FileProcessor/ProcessFile',
+                request_serializer=file__processor__pb2.FileUploadRequest.SerializeToString,
+                response_deserializer=file__processor__pb2.FileProcessResponse.FromString,
+                )
+
+
+class FileProcessorServicer(object):
+    """Base class for server-side implementations of ``FileProcessor``.
+
+    Subclasses are expected to override ``ProcessFile``.
+    """
+
+    def ProcessFile(self, request, context):
+        """Default handler for the ProcessFile RPC.
+
+        Marks the call as UNIMPLEMENTED on ``context`` and raises
+        ``NotImplementedError``.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_FileProcessorServicer_to_server(servicer, server):
+    """Register ``servicer`` on ``server`` as the FileProcessor implementation.
+
+    Args:
+        servicer: Object providing a ``ProcessFile`` method.
+        server: gRPC server to which the generic handler is added.
+    """
+    # Map each RPC name to a handler that deserializes the request, calls the
+    # servicer method, and serializes the reply.
+    rpc_method_handlers = {
+            'ProcessFile': grpc.unary_unary_rpc_method_handler(
+                    servicer.ProcessFile,
+                    request_deserializer=file__processor__pb2.FileUploadRequest.FromString,
+                    response_serializer=file__processor__pb2.FileProcessResponse.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'fileprocessor.FileProcessor', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+ # This class is part of an EXPERIMENTAL API.
+class FileProcessor(object):
+    """Convenience wrapper for calling ``FileProcessor`` RPCs without a stub.
+
+    Each call goes through ``grpc.experimental.unary_unary`` with the target
+    address passed per call.
+    """
+
+    @staticmethod
+    def ProcessFile(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        """Invoke the ProcessFile RPC on ``target`` and return its result.
+
+        ``request`` is serialized as FileUploadRequest and the reply is parsed
+        as FileProcessResponse; the remaining arguments are forwarded
+        unchanged to ``grpc.experimental.unary_unary``.
+        """
+        return grpc.experimental.unary_unary(request, target, '/fileprocessor.FileProcessor/ProcessFile',
+            file__processor__pb2.FileUploadRequest.SerializeToString,
+            file__processor__pb2.FileProcessResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
diff --git a/backend/simple/fileprocessor/file_processor/service.py b/backend/simple/fileprocessor/file_processor/service.py
@@ -0,0 +1,147 @@
+import subprocess
+import os
+import logging
+from PyPDF2 import PdfReader
+import asyncio
+import json
+from langdetect import detect
+from io import BytesIO
+import uuid
+import shutil
+import tempfile
+
+import grpc
+from concurrent import futures
+import file_processor_pb2
+import file_processor_pb2_grpc
+
+# Securely configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
+    async def ProcessFile(self, request, context):
+        input_pdf_bytes = request.file
+        file_id = request.fileId
+        filename = request.filename
+        texts, error = await ocr_pdf_and_extract_text(filename, 'eng', input_bytes=input_pdf_bytes)
+        if texts:
+            json_response = generate_json_response(file_id, filename, texts)
+            response = json.loads(json_response)
+            pages = [file_processor_pb2.Page(pageId=p["pageId"], content=p["content"]) for p in response["pages"]]
+            metadata = file_processor_pb2.FileMetadata(title=response["metadata"]["title"],
+                                                        pageCount=response["metadata"]["pageCount"],
+                                                        filesize=response["metadata"]["filesize"],
+                                                        locale=response["metadata"]["locale"])
+            return file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=metadata, pages=pages)
+        else:
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(error)
+            return file_processor_pb2.FileProcessResponse()
+
+
+# Validate and sanitize input PDF path
+def secure_path(input_pdf):
+    if not os.path.isfile(input_pdf) or not input_pdf.lower().endswith('.pdf'):
+        raise ValueError("Invalid PDF file.")
+    return os.path.abspath(input_pdf)
+
+async def ocr_pdf_and_extract_text(input_pdf, lang='eng',input_bytes=None):
+    """
+    Perform OCR on a PDF file and extract text securely, without generating an output file.
+    """
+    try:
+        input_pdf = secure_path(input_pdf)
+    except ValueError as e:
+        logging.error(f"Security check failed: {e}")
+        return None, str(e)
+
+    # comment the above line once grpc implemented
+
+    # # Generate temporary file from the GRPC bytes
+    # with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_input:
+    #     tmp_input.write(input_bytes)
+    #     tmp_input_path = tmp_input.name
+
+    # # Use a temporary file to handle OCR output securely
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
+        ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", input_pdf, tmp_output.name]
+        # ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", tmp_input_path, tmp_output.name] Uncomment when GRPC implemented
+
+        try:
+            process = await asyncio.create_subprocess_exec(*ocrmypdf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout, stderr = await process.communicate()
+            if process.returncode == 0:
+                logging.info(f"Successfully processed {input_pdf}.")
+                # Securely read the OCR output from the temporary file
+                with open(tmp_output.name, 'rb') as ocr_output:
+                    texts = extract_text_from_stream(ocr_output)
+                return texts, None
+            else:
+                error_message = f"Error processing {input_pdf}: {stderr.decode()}"
+                logging.error(error_message)
+                return None, error_message
+        except Exception as e:
+            logging.error(f"An unexpected error occurred: {e}")
+            return None, str(e)
+        finally:
+            # Ensure temporary file is securely deleted
+            os.remove(tmp_output.name)
+
+def extract_text_from_stream(pdf_stream):
+    """
+    Securely extract text from a PDF stream.
+    """
+    try:
+        reader = PdfReader(pdf_stream)
+        texts = [page.extract_text() for page in reader.pages if page.extract_text()]
+        return texts
+    except Exception as e:
+        logging.error(f"Failed to extract text from stream: {e}")
+        return []
+
+def generate_json_response(file_id, file_path, texts):
+    """
+    Generate a secure JSON response without needing to reference an output file path.
+    """
+    try:
+        metadata = {
+            "title": os.path.basename(file_path),
+            "pageCount": len(texts),
+            "filesize": os.path.getsize(file_path),
+            "locale": detect(' '.join(texts)) if texts else "unknown"
+        }
+    except Exception as e:
+        logging.error(f"Error generating metadata for {file_path}: {e}")
+        metadata = {}
+
+    response = {
+        "fileId": file_id,
+        "metadata": metadata,
+        "pages": [{"pageId": idx + 1, "content": text or "Error extracting text"} for idx, text in enumerate(texts)]
+    }
+
+    return json.dumps(response, indent=4, ensure_ascii=False)
+
+async def main():
+    input_pdf = "example.pdf"
+    input_pdf_bytes = b''  # This should be the actual bytes of the PDF file
+
+    lang = "eng"
+    file_id = str(uuid.uuid4())
+
+    texts, error = await ocr_pdf_and_extract_text(input_pdf, lang)
+    if texts:
+        json_response = generate_json_response(file_id, input_pdf, texts)
+        print(json_response)
+    else:
+        logging.error(f"OCR processing failed: {error}")
+
+async def serve():
+    server = grpc.aio.server()
+    file_processor_pb2_grpc.add_FileProcessorServicer_to_server(FileProcessorServicer(), server)
+    server.add_insecure_port('[::]:50051')
+    await server.start()
+    await server.wait_for_termination()
+
+if __name__ == '__main__':
+    asyncio.run(serve())
diff --git a/backend/simple/fileprocessor/file_processor/service_client.py b/backend/simple/fileprocessor/file_processor/service_client.py
@@ -0,0 +1,26 @@
+import asyncio
+import grpc
+import file_processor_pb2
+import file_processor_pb2_grpc
+
+async def process_file_stub(pdf_bytes, filename, file_id):
+    async with grpc.aio.insecure_channel('localhost:50051') as channel:
+        stub = file_processor_pb2_grpc.FileProcessorStub(channel)
+        request = file_processor_pb2.FileUploadRequest(
+            file=pdf_bytes,
+            filename=filename,
+            fileId=file_id
+        )
+        response = await stub.ProcessFile(request)
+        return response
+
+async def main():
+    input_pdf_bytes = b'<PDF_BYTES_HERE>'  # Replace <PDF_BYTES_HERE> with actual PDF bytes
+    filename = "example.pdf"
+    file_id = "12345678-abcd-1234-abcd-1234567890ab"
+
+    response = await process_file_stub(input_pdf_bytes, filename, file_id)
+    print(response)
+
+if __name__ == "__main__":
+    asyncio.run(main())