
Commit

fix(fileprocessor): issue with grpc where files are not found + minor cleanup
neilscallywag committed Feb 28, 2024
2 parents 7e3d823 + 68427a2 commit 548627f
Showing 28 changed files with 1,038 additions and 187 deletions.
Binary file added backend/simple/fileprocessor.zip
15 changes: 11 additions & 4 deletions backend/simple/fileprocessor/Dockerfile
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Singapore
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install OCRmyPDF, Tesseract OCR, and other dependencies
# Install OCRmyPDF, Tesseract OCR, Protocol Buffers Compiler, and other dependencies
RUN apt-get update && \
    apt-get install -y \
    ocrmypdf \
@@ -20,18 +20,25 @@ RUN apt-get update && \
    poppler-utils \
    ghostscript \
    qpdf \
    protobuf-compiler \
    && apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install gRPC tools for Python
RUN pip install grpcio grpcio-tools

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file and install Python dependencies
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . /app
# Copy the proto files and application files into the container
COPY ./src /app

# Compile the proto files to generate Python code
RUN python -m grpc_tools.protoc -I/app --python_out=/app --grpc_python_out=/app /app/file_processor.proto

# Set the default command to run the app
CMD ["python3", "-m", "file_processor.service"]
CMD ["python3", "-m", "service"]
6 changes: 4 additions & 2 deletions backend/simple/fileprocessor/docker-compose.yml
@@ -1,7 +1,9 @@
version: '3'
version: "3"
services:
  pdf-reader:
    build:
      context: .
    ports:
      - "50051:50051"
    volumes:
      - ./example.pdf:/app/example.pdf
      - ./src/example.pdf:/app/example.pdf
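With the compose file mapping port 50051 to the host, a quick readiness probe can confirm the container accepts connections before any work is sent. This sketch is not part of the commit; the localhost target and five-second timeout are assumptions, and it uses the synchronous channel API purely for the probe:

import grpc

def wait_for_fileprocessor(target="localhost:50051", timeout_s=5.0):
    channel = grpc.insecure_channel(target)
    try:
        # Blocks until the channel reaches READY or raises grpc.FutureTimeoutError.
        grpc.channel_ready_future(channel).result(timeout=timeout_s)
    finally:
        channel.close()

wait_for_fileprocessor()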
114 changes: 0 additions & 114 deletions backend/simple/fileprocessor/file_processor/service.py

This file was deleted.

File renamed without changes.
@@ -34,4 +34,4 @@ message FileProcessResponse {
message Page {
  int64 pageId = 1;
  string content = 2; // Extracted text content of the page
}
}
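The fragment above is the tail of file_processor.proto (only the end-of-file newline changed in this commit). As a hedged illustration of how the generated classes for Page and FileProcessResponse are constructed; field names follow this fragment and the service code further down, and the literal values are placeholders:

import file_processor_pb2

page = file_processor_pb2.Page(pageId=1, content="Extracted text of page 1")
response = file_processor_pb2.FileProcessResponse(
    fileId="00000000-0000-0000-0000-000000000000",  # placeholder UUID
    pages=[page],
)
print(response.pages[0].content)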
138 changes: 138 additions & 0 deletions backend/simple/fileprocessor/src/service.py
@@ -0,0 +1,138 @@
import subprocess
import os
import logging
from PyPDF2 import PdfReader
import asyncio
import json
from langdetect import detect
from io import BytesIO
import uuid
import shutil
import tempfile

import grpc
from concurrent import futures
import file_processor_pb2
import file_processor_pb2_grpc

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
    async def ProcessFile(self, request, context):
        input_pdf_bytes = request.file
        file_id = request.fileId
        filename = request.filename
        texts, error = await ocr_pdf_and_extract_text(lang='eng', input_bytes=input_pdf_bytes)
        if texts:
            json_response = generate_json_response(file_id, filename, texts)
            response = json.loads(json_response)
            pages = [file_processor_pb2.Page(pageId=p["pageId"], content=p["content"]) for p in response["pages"]]
            metadata = file_processor_pb2.FileMetadata(title=response["metadata"]["title"],
                                                       pageCount=response["metadata"]["pageCount"],
                                                       filesize=response["metadata"]["filesize"],
                                                       locale=response["metadata"]["locale"])
            return file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=metadata, pages=pages)
        else:
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(error)
            return file_processor_pb2.FileProcessResponse()


# Validate and sanitize input PDF path
def secure_path(input_pdf):
    if not os.path.isfile(input_pdf) or not input_pdf.lower().endswith('.pdf'):
        raise ValueError("Invalid PDF file.")
    return os.path.abspath(input_pdf)

async def ocr_pdf_and_extract_text(lang='eng', input_bytes=None):
    """
    Perform OCR on a PDF supplied as bytes and extract its text, without producing a persistent output file.
    """
    if input_bytes is None:
        return None, "No input bytes provided."

    try:
        # Use BytesIO to handle the input PDF bytes
        input_stream = BytesIO(input_bytes)

        # Use a temporary file to handle OCR output securely
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
            ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", "-", tmp_output.name]

            process = await asyncio.create_subprocess_exec(*ocrmypdf_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = await process.communicate(input=input_stream.read())

            if process.returncode == 0:
                logging.info("Successfully processed PDF from bytes.")
                with open(tmp_output.name, 'rb') as ocr_output:
                    texts = extract_text_from_stream(ocr_output)
                return texts, None
            else:
                error_message = f"Error processing PDF from bytes: {stderr.decode()}"
                logging.error(error_message)
                return None, error_message
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return None, str(e)
    finally:
        input_stream.close()  # Ensure the BytesIO stream is closed

def extract_text_from_stream(pdf_stream):
    """
    Securely extract text from a PDF stream.
    """
    try:
        reader = PdfReader(pdf_stream)
        texts = [page.extract_text() for page in reader.pages if page.extract_text()]
        return texts
    except Exception as e:
        logging.error(f"Failed to extract text from stream: {e}")
        return []

def generate_json_response(file_id, file_path, texts):
    """
    Generate a secure JSON response without needing to reference an output file path.
    """
    try:
        metadata = {
            "title": os.path.basename(file_path),
            "pageCount": len(texts),
            "filesize": os.path.getsize(file_path),
            "locale": detect(' '.join(texts)) if texts else "unknown"
        }
    except Exception as e:
        logging.error(f"Error generating metadata for {file_path}: {e}")
        metadata = {}

    response = {
        "fileId": file_id,
        "metadata": metadata,
        "pages": [{"pageId": idx + 1, "content": text or "Error extracting text"} for idx, text in enumerate(texts)]
    }

    return json.dumps(response, indent=4, ensure_ascii=False)

async def main():
    # Standalone entry point for local testing; the container runs serve() below instead.
    input_pdf = "example.pdf"
    with open(input_pdf, "rb") as pdf_file:
        input_pdf_bytes = pdf_file.read()

    lang = "eng"
    file_id = str(uuid.uuid4())

    texts, error = await ocr_pdf_and_extract_text(lang=lang, input_bytes=input_pdf_bytes)
    if texts:
        json_response = generate_json_response(file_id, input_pdf, texts)
        print(json_response)
    else:
        logging.error(f"OCR processing failed: {error}")

async def serve():
    server = grpc.aio.server()
    file_processor_pb2_grpc.add_FileProcessorServicer_to_server(FileProcessorServicer(), server)
    server.add_insecure_port('[::]:50051')
    await server.start()
    await server.wait_for_termination()

if __name__ == '__main__':
    asyncio.run(serve())
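Because ProcessFile is an ordinary coroutine, it can be smoke-tested in-process without starting the gRPC server. A hedged sketch (not part of the commit) that feeds a local PDF through the servicer with a mocked context; it assumes it runs next to service.py, the FileUploadRequest field names follow the client shown below, and example.pdf is assumed to exist:

import asyncio
import uuid
from unittest import mock

import file_processor_pb2
from service import FileProcessorServicer  # assumes the src/ layout above

async def smoke_test(pdf_path="example.pdf"):
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
    request = file_processor_pb2.FileUploadRequest(
        userId=str(uuid.uuid4()),
        file=pdf_bytes,
        filename=pdf_path,
        fileId=str(uuid.uuid4()),
    )
    servicer = FileProcessorServicer()
    # The context is only touched on the error path, so a MagicMock is enough here.
    response = await servicer.ProcessFile(request, mock.MagicMock())
    print(response.metadata)

asyncio.run(smoke_test())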
33 changes: 33 additions & 0 deletions backend/simple/fileprocessor/src/service_client.py
@@ -0,0 +1,33 @@
import asyncio
import uuid
import grpc
import file_processor_pb2
import file_processor_pb2_grpc

async def process_file_stub(user_id, file, filename, file_id):
    async with grpc.aio.insecure_channel('localhost:50051') as channel:
        stub = file_processor_pb2_grpc.FileProcessorStub(channel)
        request = file_processor_pb2.FileUploadRequest(
            userId=user_id,
            file=file,  # Field name must match the file_processor.proto definition
            filename=filename,
            fileId=str(file_id)  # Ensure file_id is serialised as a string
        )
        response = await stub.ProcessFile(request)
        return response

async def main():
    # Read the PDF file as bytes
    with open("example.pdf", "rb") as pdf_file:
        input_pdf_bytes = pdf_file.read()
    user_id = str(uuid.uuid4())
    filename = "example.pdf"
    file_id = uuid.uuid4()  # Converted to a string inside process_file_stub
    print(type(input_pdf_bytes))

    # Call the stub with keyword arguments
    response = await process_file_stub(user_id=user_id, file=input_pdf_bytes, filename=filename, file_id=file_id)
    print(response)

if __name__ == "__main__":
asyncio.run(main())
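When OCR fails, ProcessFile sets StatusCode.INTERNAL with the error text in the details, which reaches the client as grpc.aio.AioRpcError. A hedged wrapper around the call above that reports that case instead of letting the exception propagate (not part of the commit):

async def process_file_safely(user_id, file, filename, file_id):
    try:
        return await process_file_stub(user_id, file, filename, file_id)
    except grpc.aio.AioRpcError as err:
        # err.details() carries the OCR error message set by the server.
        print(f"ProcessFile failed: {err.code().name}: {err.details()}")
        return None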
17 changes: 17 additions & 0 deletions client/package-lock.json


1 change: 1 addition & 0 deletions client/package.json
@@ -31,6 +31,7 @@
    "zustand": "^4.4.7"
  },
  "devDependencies": {
    "@faker-js/faker": "^8.4.1",
    "@types/react": "^18.2.55",
    "@types/react-dom": "^18.2.19",
    "@typescript-eslint/eslint-plugin": "^6.21.0",
