
Commit

feat(frontend): add marketplace (#5)
ztdevelops authored Feb 27, 2024
1 parent c106982 commit 68427a2
Showing 30 changed files with 1,245 additions and 66 deletions.
37 changes: 37 additions & 0 deletions backend/simple/fileprocessor/Dockerfile
@@ -0,0 +1,37 @@
# Start with a Python slim base image
FROM python:3.9-slim

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Set timezone (required for some configurations)
ENV TZ=Asia/Singapore
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install OCRmyPDF, Tesseract OCR, and other dependencies
RUN apt-get update && \
    apt-get install -y \
        ocrmypdf \
        tesseract-ocr \
        tesseract-ocr-eng \
        libsm6 \
        libxext6 \
        libxrender-dev \
        poppler-utils \
        ghostscript \
        qpdf \
    && apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file and install Python dependencies
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . /app

# Set the default command to run the app
CMD ["python3", "/app/file_processor/service.py"]
30 changes: 30 additions & 0 deletions backend/simple/fileprocessor/Makefile
@@ -0,0 +1,30 @@
PROJECT_NAME := ESD
ENVIRONMENT ?= dev
NPM_SUBDIRS := client
PYTHON_VERSION := 3.11.2
VENV_NAME := venv
VENV_DEACTIVATE := deactivate
ifeq ($(OS),Windows_NT)
PYTHON := python
VENV_ACTIVATE := venv\Scripts\activate.bat
RM := del /s /q
else
PYTHON := python3
VENV_ACTIVATE := . venv/bin/activate
RM := rm -rf
endif
init: check-terraform check-aws-cli venv requirements npm-install

venv:
	@echo "Creating python virtual environment in '$(VENV_NAME)' folder..."
	$(PYTHON) -m venv $(VENV_NAME)

requirements:
	@echo "Installing Python requirements..."
	@$(VENV_ACTIVATE) && pip install -r requirements.txt

deactivate-venv:
	@echo "Deactivating virtual environment..."
	$(RM) $(VENV_NAME)
	@find . -name "*.pyc" -delete
	@exit 0
9 changes: 9 additions & 0 deletions backend/simple/fileprocessor/docker-compose.yml
@@ -0,0 +1,9 @@
version: "3"
services:
  pdf-reader:
    build:
      context: .
    ports:
      - "50051:50051"
    volumes:
      - ./example.pdf:/app/example.pdf
Binary file added backend/simple/fileprocessor/example.pdf
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
37 changes: 37 additions & 0 deletions backend/simple/fileprocessor/file_processor/file_processor.proto
@@ -0,0 +1,37 @@
syntax = "proto3";

package fileprocessor;

// Service definition
service FileProcessor {
  rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
}

// Message for file upload request
message FileUploadRequest {
  string userId = 1;
  string fileId = 2;
  string filename = 3;
  bytes file = 4; // PDF file content
}

// Metadata associated with the file
message FileMetadata {
  string title = 1;
  int64 pageCount = 2;
  int64 filesize = 3;
  string locale = 4;
}

// Response message including processed file information
message FileProcessResponse {
  string fileId = 1;
  FileMetadata metadata = 2;
  repeated Page pages = 3; // Processed pages from AWS Textract
}

// Processed page content from the file
message Page {
  int64 pageId = 1;
  string content = 2; // Extracted text content of the page
}
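The generated modules in the next files would normally be produced by compiling this .proto with grpcio-tools rather than written by hand. A minimal sketch of that step, assuming the grpcio-tools package is installed and the code is run from the file_processor/ directory:

# Regenerates file_processor_pb2.py (messages) and file_processor_pb2_grpc.py (stub/servicer).
from grpc_tools import protoc

protoc.main([
    "grpc_tools.protoc",
    "-I.",                  # search the current directory for .proto imports
    "--python_out=.",       # message classes -> file_processor_pb2.py
    "--grpc_python_out=.",  # stub and servicer -> file_processor_pb2_grpc.py
    "file_processor.proto",
])

The command-line form (python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. file_processor.proto) produces the same pair of files.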
34 changes: 34 additions & 0 deletions backend/simple/fileprocessor/file_processor/file_processor_pb2.py

Some generated files are not rendered by default.

69 changes: 69 additions & 0 deletions backend/simple/fileprocessor/file_processor/file_processor_pb2_grpc.py
@@ -0,0 +1,69 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import file_processor_pb2 as file__processor__pb2


class FileProcessorStub(object):
    """Service definition
    """

    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.ProcessFile = channel.unary_unary(
            '/fileprocessor.FileProcessor/ProcessFile',
            request_serializer=file__processor__pb2.FileUploadRequest.SerializeToString,
            response_deserializer=file__processor__pb2.FileProcessResponse.FromString,
        )


class FileProcessorServicer(object):
    """Service definition
    """

    def ProcessFile(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_FileProcessorServicer_to_server(servicer, server):
    rpc_method_handlers = {
        'ProcessFile': grpc.unary_unary_rpc_method_handler(
            servicer.ProcessFile,
            request_deserializer=file__processor__pb2.FileUploadRequest.FromString,
            response_serializer=file__processor__pb2.FileProcessResponse.SerializeToString,
        ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
        'fileprocessor.FileProcessor', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class FileProcessor(object):
    """Service definition
    """

    @staticmethod
    def ProcessFile(request,
                    target,
                    options=(),
                    channel_credentials=None,
                    call_credentials=None,
                    insecure=False,
                    compression=None,
                    wait_for_ready=None,
                    timeout=None,
                    metadata=None):
        return grpc.experimental.unary_unary(request, target, '/fileprocessor.FileProcessor/ProcessFile',
                                             file__processor__pb2.FileUploadRequest.SerializeToString,
                                             file__processor__pb2.FileProcessResponse.FromString,
                                             options, channel_credentials,
                                             insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
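The generated stub can also be driven over a blocking channel. A small sketch, assuming the server from service.py is listening on localhost:50051 (this is the synchronous counterpart of the asyncio client further below; the 120-second timeout is an assumption):

import grpc
import file_processor_pb2
import file_processor_pb2_grpc

def process_file_sync(pdf_bytes, filename, file_id, target="localhost:50051"):
    # Blocking call through the generated FileProcessorStub; the repository's own
    # client (service_client.py below) uses a grpc.aio channel instead.
    with grpc.insecure_channel(target) as channel:
        stub = file_processor_pb2_grpc.FileProcessorStub(channel)
        request = file_processor_pb2.FileUploadRequest(
            fileId=file_id,
            filename=filename,
            file=pdf_bytes,
        )
        return stub.ProcessFile(request, timeout=120)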
147 changes: 147 additions & 0 deletions backend/simple/fileprocessor/file_processor/service.py
@@ -0,0 +1,147 @@
import subprocess
import os
import logging
from PyPDF2 import PdfReader
import asyncio
import json
from langdetect import detect
from io import BytesIO
import uuid
import shutil
import tempfile

import grpc
from concurrent import futures
import file_processor_pb2
import file_processor_pb2_grpc

# Securely configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
    async def ProcessFile(self, request, context):
        input_pdf_bytes = request.file
        file_id = request.fileId
        filename = request.filename
        texts, error = await ocr_pdf_and_extract_text(filename, 'eng', input_bytes=input_pdf_bytes)
        if texts:
            json_response = generate_json_response(file_id, filename, texts)
            response = json.loads(json_response)
            pages = [file_processor_pb2.Page(pageId=p["pageId"], content=p["content"]) for p in response["pages"]]
            metadata = file_processor_pb2.FileMetadata(title=response["metadata"]["title"],
                                                       pageCount=response["metadata"]["pageCount"],
                                                       filesize=response["metadata"]["filesize"],
                                                       locale=response["metadata"]["locale"])
            return file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=metadata, pages=pages)
        else:
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(error)
            return file_processor_pb2.FileProcessResponse()


# Validate and sanitize input PDF path
def secure_path(input_pdf):
    if not os.path.isfile(input_pdf) or not input_pdf.lower().endswith('.pdf'):
        raise ValueError("Invalid PDF file.")
    return os.path.abspath(input_pdf)

async def ocr_pdf_and_extract_text(input_pdf, lang='eng', input_bytes=None):
    """
    Perform OCR on a PDF file and extract text securely, without generating an output file.
    """
    try:
        input_pdf = secure_path(input_pdf)
    except ValueError as e:
        logging.error(f"Security check failed: {e}")
        return None, str(e)

    # Comment out the secure_path() check above once the gRPC bytes path is implemented.

    # # Generate a temporary file from the gRPC bytes
    # with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_input:
    #     tmp_input.write(input_bytes)
    #     tmp_input_path = tmp_input.name

    # Use a temporary file to handle OCR output securely
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
        ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", input_pdf, tmp_output.name]
        # ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", tmp_input_path, tmp_output.name]  # Uncomment when the gRPC bytes path is implemented

        try:
            process = await asyncio.create_subprocess_exec(*ocrmypdf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = await process.communicate()
            if process.returncode == 0:
                logging.info(f"Successfully processed {input_pdf}.")
                # Securely read the OCR output from the temporary file
                with open(tmp_output.name, 'rb') as ocr_output:
                    texts = extract_text_from_stream(ocr_output)
                return texts, None
            else:
                error_message = f"Error processing {input_pdf}: {stderr.decode()}"
                logging.error(error_message)
                return None, error_message
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            return None, str(e)
        finally:
            # Ensure temporary file is securely deleted
            os.remove(tmp_output.name)

def extract_text_from_stream(pdf_stream):
    """
    Securely extract text from a PDF stream.
    """
    try:
        reader = PdfReader(pdf_stream)
        texts = [page.extract_text() for page in reader.pages if page.extract_text()]
        return texts
    except Exception as e:
        logging.error(f"Failed to extract text from stream: {e}")
        return []

def generate_json_response(file_id, file_path, texts):
    """
    Generate a secure JSON response without needing to reference an output file path.
    """
    try:
        metadata = {
            "title": os.path.basename(file_path),
            "pageCount": len(texts),
            "filesize": os.path.getsize(file_path),
            "locale": detect(' '.join(texts)) if texts else "unknown"
        }
    except Exception as e:
        logging.error(f"Error generating metadata for {file_path}: {e}")
        metadata = {}

    response = {
        "fileId": file_id,
        "metadata": metadata,
        "pages": [{"pageId": idx + 1, "content": text or "Error extracting text"} for idx, text in enumerate(texts)]
    }

    return json.dumps(response, indent=4, ensure_ascii=False)

async def main():
    input_pdf = "example.pdf"
    input_pdf_bytes = b''  # This should be the actual bytes of the PDF file

    lang = "eng"
    file_id = str(uuid.uuid4())

    texts, error = await ocr_pdf_and_extract_text(input_pdf, lang)
    if texts:
        json_response = generate_json_response(file_id, input_pdf, texts)
        print(json_response)
    else:
        logging.error(f"OCR processing failed: {error}")

async def serve():
    server = grpc.aio.server()
    file_processor_pb2_grpc.add_FileProcessorServicer_to_server(FileProcessorServicer(), server)
    server.add_insecure_port('[::]:50051')
    await server.start()
    await server.wait_for_termination()

if __name__ == '__main__':
    asyncio.run(serve())
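The commented-out lines in ocr_pdf_and_extract_text hint at how the raw bytes carried in FileUploadRequest could replace the path-based input once the gRPC path is fully wired up. A rough sketch of that variant, assuming it lives in service.py next to extract_text_from_stream (the helper name ocr_pdf_bytes is hypothetical and not part of this commit):

import asyncio
import os
import tempfile

async def ocr_pdf_bytes(input_bytes, lang="eng"):
    # Hypothetical bytes-based variant mirroring the commented-out code in service.py:
    # write the gRPC payload to a temp file, run ocrmypdf on it, then extract text.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_input:
        tmp_input.write(input_bytes)
        tmp_input_path = tmp_input.name
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
        tmp_output_path = tmp_output.name
    cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf",
           tmp_input_path, tmp_output_path]
    try:
        process = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
        _, stderr = await process.communicate()
        if process.returncode != 0:
            return None, stderr.decode()
        with open(tmp_output_path, "rb") as ocr_output:
            return extract_text_from_stream(ocr_output), None  # helper defined in service.py
    finally:
        # Clean up both temporary files regardless of the outcome.
        os.remove(tmp_input_path)
        os.remove(tmp_output_path)

With something like this in place, ProcessFile could hand request.file straight to the OCR step instead of routing the filename through secure_path.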
26 changes: 26 additions & 0 deletions backend/simple/fileprocessor/file_processor/service_client.py
@@ -0,0 +1,26 @@
import asyncio
import grpc
import file_processor_pb2
import file_processor_pb2_grpc

async def process_file_stub(pdf_bytes, filename, file_id):
    async with grpc.aio.insecure_channel('localhost:50051') as channel:
        stub = file_processor_pb2_grpc.FileProcessorStub(channel)
        request = file_processor_pb2.FileUploadRequest(
            file=pdf_bytes,
            filename=filename,
            fileId=file_id
        )
        response = await stub.ProcessFile(request)
        return response

async def main():
    input_pdf_bytes = b'<PDF_BYTES_HERE>'  # Replace <PDF_BYTES_HERE> with actual PDF bytes
    filename = "example.pdf"
    file_id = "12345678-abcd-1234-abcd-1234567890ab"

    response = await process_file_stub(input_pdf_bytes, filename, file_id)
    print(response)

if __name__ == "__main__":
    asyncio.run(main())
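To exercise the service end to end, the placeholder bytes in main() have to be replaced with the contents of an actual PDF. A small sketch, assuming the container from docker-compose.yml is running and an example.pdf sits next to the client (run_example is a hypothetical driver that reuses process_file_stub above):

import asyncio
import uuid

async def run_example(path="example.pdf"):
    # Read a real PDF from disk and send it through process_file_stub.
    with open(path, "rb") as f:
        pdf_bytes = f.read()
    response = await process_file_stub(pdf_bytes, path, str(uuid.uuid4()))
    print(f"{response.metadata.pageCount} pages extracted from {response.metadata.title}")

if __name__ == "__main__":
    asyncio.run(run_example())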