-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(frontend): add marketplace (#5)
- Loading branch information
1 parent
c106982
commit 68427a2
Showing
30 changed files
with
1,245 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Base image: slim variant of the official Python 3.9 image
FROM python:3.9-slim

# Keep apt non-interactive and fix the container timezone
# (the timezone is required for some configurations)
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Asia/Singapore
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# System packages: OCRmyPDF, Tesseract (with English data), and the PDF/image
# tooling they rely on. The apt cache and package lists are removed in the
# same layer to keep the image small.
RUN apt-get update && \
    apt-get install -y \
        ghostscript \
        libsm6 \
        libxext6 \
        libxrender-dev \
        ocrmypdf \
        poppler-utils \
        qpdf \
        tesseract-ocr \
        tesseract-ocr-eng \
    && apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# All following paths are relative to /app
WORKDIR /app

# Install Python dependencies first so this layer is cached until
# requirements.txt changes
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application into /app
COPY . .

# Default command: start the file-processor service
CMD ["python3", "/app/file_processor/service.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Project-wide settings.
PROJECT_NAME := ESD
ENVIRONMENT ?= dev
NPM_SUBDIRS := client
PYTHON_VERSION := 3.11.2
VENV_NAME := venv
VENV_DEACTIVATE := deactivate

# Platform-specific interpreter name, venv activation command and delete command.
# The activation paths are derived from VENV_NAME so renaming the venv folder
# only requires changing one variable.
ifeq ($(OS),Windows_NT)
PYTHON := python
VENV_ACTIVATE := $(VENV_NAME)\Scripts\activate.bat
RM := del /s /q
else
PYTHON := python3
VENV_ACTIVATE := . $(VENV_NAME)/bin/activate
RM := rm -rf
endif

# These targets do not produce a file with the same name. `venv` is left out
# on purpose: it creates the $(VENV_NAME) directory, so make can skip it when
# the directory already exists.
.PHONY: init requirements deactivate-venv

# One-shot setup. Only targets defined in this Makefile are listed; the former
# prerequisites check-terraform, check-aws-cli and npm-install had no rules
# here, which made `make init` fail with "No rule to make target".
init: venv requirements

# Create the Python virtual environment.
venv:
	@echo "Creating python virtual environment in '$(VENV_NAME)' folder..."
	$(PYTHON) -m venv $(VENV_NAME)

# Install Python dependencies inside the virtual environment.
requirements:
	@echo "Installing Python requirements..."
	@$(VENV_ACTIVATE) && pip install -r requirements.txt

# Delete the virtual environment folder and compiled bytecode files.
# NOTE(review): the `find ... -delete` line assumes a POSIX find.
deactivate-venv:
	@echo "Removing virtual environment '$(VENV_NAME)'..."
	$(RM) $(VENV_NAME)
	@find . -name "*.pyc" -delete
	@exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Compose definition for the PDF file-processor gRPC service.
version: "3"
services:
  pdf-reader:
    # Build the image from the Dockerfile in this directory.
    build: .
    # Publish the gRPC port on the host.
    ports:
      - "50051:50051"
    # Mount a sample PDF into the container's working directory.
    volumes:
      - ./example.pdf:/app/example.pdf
Binary file not shown.
Empty file.
Binary file added
BIN
+1.51 KB
backend/simple/fileprocessor/file_processor/__pycache__/file_processor_pb2.cpython-39.pyc
Binary file not shown.
Binary file added
BIN
+2.47 KB
...nd/simple/fileprocessor/file_processor/__pycache__/file_processor_pb2_grpc.cpython-39.pyc
Binary file not shown.
37 changes: 37 additions & 0 deletions
37
backend/simple/fileprocessor/file_processor/file_processor.proto
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
syntax = "proto3"; | ||
|
||
package fileprocessor; | ||
|
||
// Service that accepts an uploaded PDF file and returns its extracted text,
// page by page, together with basic file metadata.
service FileProcessor {
  // Processes one uploaded file in a single unary call and returns the
  // extracted pages and metadata for it.
  rpc ProcessFile(FileUploadRequest) returns (FileProcessResponse);
}
|
||
// Request carrying one PDF file to be processed.
message FileUploadRequest {
  // Identifier of the user the file belongs to.
  // NOTE(review): the Python servicer added alongside this schema does not
  // read this field; confirm its intended use.
  string userId = 1;
  // Caller-chosen identifier of the file; returned unchanged in
  // FileProcessResponse.fileId.
  string fileId = 2;
  // Name of the uploaded file; its base name is reported as the metadata title.
  string filename = 3;
  // Raw PDF file content.
  bytes file = 4;
}
|
||
// Metadata associated with the processed file.
message FileMetadata {
  // Title of the file (the servicer uses the base name of the file name).
  string title = 1;
  // Number of pages for which text was extracted.
  int64 pageCount = 2;
  // Size of the file in bytes.
  int64 filesize = 3;
  // Language code detected from the extracted text, or "unknown" when no
  // text is available.
  string locale = 4;
}
|
||
// Response describing the processed file: its id, metadata and page texts.
message FileProcessResponse {
  // Identifier copied from FileUploadRequest.fileId.
  string fileId = 1;
  // Metadata derived from the file and its extracted text.
  FileMetadata metadata = 2;
  // Extracted pages, in document order.
  // NOTE(review): this comment previously said "AWS Textract", but the Python
  // servicer in this change runs OCRmyPDF; confirm which backend is intended.
  repeated Page pages = 3;
}
|
||
// Text content of one processed page of the file.
message Page {
  // 1-based position of the page among the returned pages.
  int64 pageId = 1;
  // Extracted text content of the page.
  string content = 2;
}
34 changes: 34 additions & 0 deletions
34
backend/simple/fileprocessor/file_processor/file_processor_pb2.py
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
69 changes: 69 additions & 0 deletions
69
backend/simple/fileprocessor/file_processor/file_processor_pb2_grpc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! | ||
"""Client and server classes corresponding to protobuf-defined services.""" | ||
import grpc | ||
|
||
import file_processor_pb2 as file__processor__pb2 | ||
|
||
|
||
class FileProcessorStub(object):
    """Client-side stub for the fileprocessor.FileProcessor service.

    Generated code: regenerate from the .proto file rather than editing.
    """

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        # Unary-unary callable bound to the ProcessFile RPC; requests are
        # serialized as FileUploadRequest and replies parsed as
        # FileProcessResponse.
        self.ProcessFile = channel.unary_unary(
                '/fileprocessor.FileProcessor/ProcessFile',
                request_serializer=file__processor__pb2.FileUploadRequest.SerializeToString,
                response_deserializer=file__processor__pb2.FileProcessResponse.FromString,
                )
|
||
|
||
class FileProcessorServicer(object):
    """Server-side base class for the fileprocessor.FileProcessor service.

    Generated code: subclass it and override ProcessFile to implement the RPC.
    """

    def ProcessFile(self, request, context):
        """Default handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
|
||
|
||
def add_FileProcessorServicer_to_server(servicer, server):
    """Register `servicer` on `server` as the fileprocessor.FileProcessor handler."""
    # Map each RPC name to a handler that deserializes the request, calls the
    # servicer method and serializes the reply.
    rpc_method_handlers = {
            'ProcessFile': grpc.unary_unary_rpc_method_handler(
                    servicer.ProcessFile,
                    request_deserializer=file__processor__pb2.FileUploadRequest.FromString,
                    response_serializer=file__processor__pb2.FileProcessResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'fileprocessor.FileProcessor', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
|
||
|
||
 # This class is part of an EXPERIMENTAL API.
class FileProcessor(object):
    """Convenience wrapper for calling FileProcessor RPCs without a stub.

    Generated code: regenerate from the .proto file rather than editing.
    """

    @staticmethod
    def ProcessFile(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke the ProcessFile RPC on `target` via grpc.experimental.unary_unary."""
        return grpc.experimental.unary_unary(request, target, '/fileprocessor.FileProcessor/ProcessFile',
            file__processor__pb2.FileUploadRequest.SerializeToString,
            file__processor__pb2.FileProcessResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import subprocess | ||
import os | ||
import logging | ||
from PyPDF2 import PdfReader | ||
import asyncio | ||
import json | ||
from langdetect import detect | ||
from io import BytesIO | ||
import uuid | ||
import shutil | ||
import tempfile | ||
|
||
import grpc | ||
from concurrent import futures | ||
import file_processor_pb2 | ||
import file_processor_pb2_grpc | ||
|
||
# Configure logging: INFO level, with timestamp, level name and message | ||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | ||
|
||
class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
    """Async implementation of the FileProcessor gRPC service."""

    async def ProcessFile(self, request, context):
        """Run OCR on the uploaded PDF and return its pages and metadata.

        Args:
            request: FileUploadRequest; `file`, `fileId` and `filename` are read.
            context: gRPC servicer context, used to report failures.

        Returns:
            A populated FileProcessResponse on success. On failure the status
            code is set to INTERNAL with a description, and an empty
            FileProcessResponse is returned.
        """
        input_pdf_bytes = request.file
        file_id = request.fileId
        filename = request.filename

        texts, error = await ocr_pdf_and_extract_text(filename, 'eng', input_bytes=input_pdf_bytes)
        if not texts:
            context.set_code(grpc.StatusCode.INTERNAL)
            # `error` is None when OCR succeeded but produced no text, so
            # always hand gRPC a real string.
            context.set_details(error or "No text could be extracted from the PDF.")
            return file_processor_pb2.FileProcessResponse()

        response = json.loads(generate_json_response(file_id, filename, texts))

        # The metadata dict may be empty or partial when metadata generation
        # failed; fall back to values known here instead of raising KeyError.
        meta = response.get("metadata") or {}
        metadata = file_processor_pb2.FileMetadata(
            title=meta.get("title", os.path.basename(filename)),
            pageCount=meta.get("pageCount", len(texts)),
            filesize=meta.get("filesize", len(input_pdf_bytes)),
            locale=meta.get("locale", "unknown"),
        )
        pages = [
            file_processor_pb2.Page(pageId=p["pageId"], content=p["content"])
            for p in response["pages"]
        ]
        return file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=metadata, pages=pages)
|
||
|
||
def secure_path(input_pdf):
    """Check that `input_pdf` is an existing file with a .pdf extension.

    Returns:
        The absolute form of the path.

    Raises:
        ValueError: if the path is not an existing file or does not end in
            ".pdf" (compared case-insensitively).
    """
    # The existence check runs first; the extension is only inspected for
    # paths that are real files.
    if os.path.isfile(input_pdf) and input_pdf.lower().endswith('.pdf'):
        return os.path.abspath(input_pdf)
    raise ValueError("Invalid PDF file.")
|
||
async def ocr_pdf_and_extract_text(input_pdf, lang='eng', input_bytes=None):
    """Run OCR on a PDF with the `ocrmypdf` CLI and extract its text per page.

    Args:
        input_pdf: File name or path of the PDF. It is OCR'd directly only when
            `input_bytes` is empty or None, and must then pass `secure_path`.
            Otherwise it is used only as a label in log and error messages.
        lang: Tesseract language code passed to `ocrmypdf -l`.
        input_bytes: Raw PDF content (e.g. from a gRPC upload). When non-empty
            it is written to a temporary file and that file is OCR'd, so
            nothing needs to exist on disk under `input_pdf`.

    Returns:
        `(texts, None)` on success, where `texts` is the list of extracted
        page texts, or `(None, error_message)` on failure.
    """
    # All scratch files live in one temporary directory that is removed when
    # the block exits, whichever path is taken below.
    with tempfile.TemporaryDirectory() as work_dir:
        if input_bytes:
            source_path = os.path.join(work_dir, "input.pdf")
            with open(source_path, "wb") as tmp_input:
                tmp_input.write(input_bytes)
            label = input_pdf
        else:
            try:
                source_path = secure_path(input_pdf)
            except ValueError as e:
                logging.error(f"Security check failed: {e}")
                return None, str(e)
            label = source_path

        output_path = os.path.join(work_dir, "output.pdf")
        ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", source_path, output_path]

        try:
            process = await asyncio.create_subprocess_exec(*ocrmypdf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = await process.communicate()
            if process.returncode == 0:
                logging.info(f"Successfully processed {label}.")
                with open(output_path, 'rb') as ocr_output:
                    texts = extract_text_from_stream(ocr_output)
                return texts, None

            # Tolerate undecodable bytes so the real OCR error is still reported.
            error_message = f"Error processing {label}: {stderr.decode(errors='replace')}"
            logging.error(error_message)
            return None, error_message
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            return None, str(e)
|
||
def extract_text_from_stream(pdf_stream):
    """Extract the text of every page of a PDF read from a binary stream.

    Args:
        pdf_stream: Binary file-like object containing a PDF.

    Returns:
        A list with the text of each page that yielded non-empty text, in page
        order. Pages without text are skipped. An empty list is returned if
        the stream cannot be read as a PDF.
    """
    try:
        reader = PdfReader(pdf_stream)
        texts = []
        for page in reader.pages:
            # Extract once per page; the text is needed for both the emptiness
            # check and the result.
            page_text = page.extract_text()
            if page_text:
                texts.append(page_text)
        return texts
    except Exception as e:
        logging.error(f"Failed to extract text from stream: {e}")
        return []
|
||
def generate_json_response(file_id, file_path, texts):
    """Build the JSON document describing a processed file.

    Each metadata field is computed independently, so a failure in one of them
    (for example the file not existing on disk, or language detection raising)
    no longer discards the others.

    Args:
        file_id: Identifier to report as "fileId".
        file_path: Path or name of the file; its base name becomes the title
            and its on-disk size, when available, the file size.
        texts: List of extracted page texts.

    Returns:
        A JSON string with the keys "fileId", "metadata" (title, pageCount,
        filesize, locale) and "pages" (1-based pageId plus content).
    """
    metadata = {
        "title": os.path.basename(file_path) if file_path else "",
        "pageCount": len(texts),
    }

    try:
        metadata["filesize"] = os.path.getsize(file_path)
    except (OSError, TypeError, ValueError) as e:
        # The file may not exist on disk, e.g. when the content arrived as bytes.
        logging.error(f"Error generating metadata for {file_path}: {e}")
        metadata["filesize"] = 0

    try:
        metadata["locale"] = detect(' '.join(texts)) if texts else "unknown"
    except Exception as e:
        logging.error(f"Error generating metadata for {file_path}: {e}")
        metadata["locale"] = "unknown"

    response = {
        "fileId": file_id,
        "metadata": metadata,
        "pages": [{"pageId": idx + 1, "content": text or "Error extracting text"} for idx, text in enumerate(texts)]
    }

    return json.dumps(response, indent=4, ensure_ascii=False)
|
||
async def main():
    """Local demo: OCR `example.pdf` from disk and print the JSON result."""
    input_pdf = "example.pdf"
    input_pdf_bytes = b''  # This should be the actual bytes of the PDF file
    lang = "eng"
    file_id = str(uuid.uuid4())

    texts, error = await ocr_pdf_and_extract_text(input_pdf, lang)
    if not texts:
        # Covers both an OCR failure and an OCR run that produced no text.
        logging.error(f"OCR processing failed: {error}")
        return

    print(generate_json_response(file_id, input_pdf, texts))
|
||
async def serve():
    """Start the asyncio gRPC server on port 50051 and wait until it terminates."""
    grpc_server = grpc.aio.server()
    servicer = FileProcessorServicer()
    file_processor_pb2_grpc.add_FileProcessorServicer_to_server(servicer, grpc_server)
    # Plain-text (no TLS) listener on all interfaces.
    grpc_server.add_insecure_port('[::]:50051')
    await grpc_server.start()
    await grpc_server.wait_for_termination()
|
||
# Start the gRPC server only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(serve())
26 changes: 26 additions & 0 deletions
26
backend/simple/fileprocessor/file_processor/service_client.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import asyncio | ||
import grpc | ||
import file_processor_pb2 | ||
import file_processor_pb2_grpc | ||
|
||
async def process_file_stub(pdf_bytes, filename, file_id):
    """Send one PDF to the FileProcessor service on localhost:50051.

    Args:
        pdf_bytes: Raw PDF content to upload.
        filename: File name sent with the upload.
        file_id: Identifier sent with the upload.

    Returns:
        The FileProcessResponse returned by the ProcessFile RPC.
    """
    upload = file_processor_pb2.FileUploadRequest(
        file=pdf_bytes,
        filename=filename,
        fileId=file_id,
    )
    # The channel is closed automatically when the block exits.
    async with grpc.aio.insecure_channel('localhost:50051') as channel:
        client = file_processor_pb2_grpc.FileProcessorStub(channel)
        return await client.ProcessFile(upload)
|
||
async def main():
    """Demo client: upload placeholder data and print the service response."""
    file_id = "12345678-abcd-1234-abcd-1234567890ab"
    filename = "example.pdf"
    input_pdf_bytes = b'<PDF_BYTES_HERE>'  # Replace <PDF_BYTES_HERE> with actual PDF bytes

    print(await process_file_stub(input_pdf_bytes, filename, file_id))
|
||
# Run the demo client only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
Oops, something went wrong.