-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(fileprocessor): issue with grpc where files are not found + minor…
… cleanup
- Loading branch information
Showing
28 changed files
with
1,038 additions
and
187 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
version: '3' | ||
version: "3" | ||
services: | ||
pdf-reader: | ||
build: | ||
context: . | ||
ports: | ||
- "50051:50051" | ||
volumes: | ||
- ./example.pdf:/app/example.pdf | ||
- ./src/example.pdf:/app/example.pdf |
This file was deleted.
Oops, something went wrong.
File renamed without changes.
Binary file added
BIN
+1.51 KB
backend/simple/fileprocessor/src/__pycache__/file_processor_pb2.cpython-39.pyc
Binary file not shown.
Binary file added
BIN
+2.47 KB
backend/simple/fileprocessor/src/__pycache__/file_processor_pb2_grpc.cpython-39.pyc
Binary file not shown.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import subprocess | ||
import os | ||
import logging | ||
from PyPDF2 import PdfReader | ||
import asyncio | ||
import json | ||
from langdetect import detect | ||
from io import BytesIO | ||
import uuid | ||
import shutil | ||
import tempfile | ||
|
||
import grpc | ||
from concurrent import futures | ||
import file_processor_pb2 | ||
import file_processor_pb2_grpc | ||
|
||
# Configure the root logger once at import time: INFO level and above,
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||
class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
    """gRPC servicer that OCRs an uploaded PDF and returns its text page by page."""

    async def ProcessFile(self, request, context):
        """Run OCR on the uploaded PDF bytes and build a ``FileProcessResponse``.

        Reads ``request.file`` (PDF bytes), ``request.fileId`` and
        ``request.filename``. On success the response carries the file id,
        file metadata and one ``Page`` per extracted text. When OCR fails or
        yields no text, the call is marked ``INTERNAL`` and an empty
        response is returned.
        """
        input_pdf_bytes = request.file
        file_id = request.fileId
        filename = request.filename

        texts, error = await ocr_pdf_and_extract_text(lang='eng', input_bytes=input_pdf_bytes)
        if not texts:
            # ``error`` is None when OCR succeeded but produced no text, so
            # always hand the context a real message.
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(error or "No text could be extracted from the PDF.")
            return file_processor_pb2.FileProcessResponse()

        response = json.loads(generate_json_response(file_id, filename, texts))

        # The upload only exists as bytes in the request; ``filename`` is not
        # a path on this machine, so path-based metadata may be missing or
        # empty. Fall back to values derived from the request itself instead
        # of failing with a KeyError.
        meta = response.get("metadata") or {}
        metadata = file_processor_pb2.FileMetadata(
            title=meta.get("title") or os.path.basename(filename),
            pageCount=meta.get("pageCount") or len(texts),
            filesize=meta.get("filesize") or len(input_pdf_bytes),
            locale=meta.get("locale") or "unknown",
        )
        pages = [
            file_processor_pb2.Page(pageId=p["pageId"], content=p["content"])
            for p in response["pages"]
        ]
        return file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=metadata, pages=pages)
|
||
|
||
# Validate and sanitize input PDF path | ||
def secure_path(input_pdf):
    """Return the absolute path of ``input_pdf`` if it is an existing ``.pdf`` file.

    The extension check is case-insensitive.

    Raises:
        ValueError: if the path is not an existing regular file or does not
            end in ``.pdf``.
    """
    if os.path.isfile(input_pdf) and input_pdf.lower().endswith('.pdf'):
        return os.path.abspath(input_pdf)
    raise ValueError("Invalid PDF file.")
|
||
async def ocr_pdf_and_extract_text(lang='eng', input_bytes=None):
    """OCR a PDF supplied as bytes and return the extracted page texts.

    The bytes are piped to the ``ocrmypdf`` command on stdin; its output is
    written to a temporary PDF, read back with ``extract_text_from_stream``
    and then deleted.

    Args:
        lang: Language code passed to ``ocrmypdf`` via ``-l``.
        input_bytes: Raw PDF content (bytes-like).

    Returns:
        A ``(texts, error)`` tuple. On success ``texts`` is the list of page
        texts and ``error`` is ``None``; on failure ``texts`` is ``None`` and
        ``error`` is a message string. This function does not raise for
        processing errors.
    """
    if input_bytes is None:
        return None, "No input bytes provided."
    if not isinstance(input_bytes, (bytes, bytearray, memoryview)):
        # Catches swapped positional arguments (e.g. a path string) early,
        # with a clear message instead of a failure deep in the pipeline.
        return None, f"Input must be bytes-like, got {type(input_bytes).__name__}."

    tmp_path = None
    try:
        # Only a unique file name is needed; ocrmypdf writes the file itself.
        # Close our handle right away so the subprocess can freely replace it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
            tmp_path = tmp_output.name

        # "-" as the input argument makes ocrmypdf read the PDF from stdin.
        ocrmypdf_cmd = ["ocrmypdf", "-l", lang, "--force-ocr", "--output-type", "pdf", "-", tmp_path]
        process = await asyncio.create_subprocess_exec(
            *ocrmypdf_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        _, stderr = await process.communicate(input=bytes(input_bytes))

        if process.returncode != 0:
            # Tool output is not guaranteed to be valid UTF-8.
            error_message = f"Error processing PDF from bytes: {stderr.decode(errors='replace')}"
            logging.error(error_message)
            return None, error_message

        logging.info("Successfully processed PDF from bytes.")
        with open(tmp_path, 'rb') as ocr_output:
            texts = extract_text_from_stream(ocr_output)
        return texts, None
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return None, str(e)
    finally:
        # The temp file was created with delete=False, so remove it here on
        # every exit path to avoid leaking one file per request.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove temporary file {tmp_path}: {cleanup_error}")
|
||
def extract_text_from_stream(pdf_stream):
    """Extract the text of every page of a PDF read from a binary stream.

    Args:
        pdf_stream: Binary file-like object containing the PDF.

    Returns:
        A list with the text of each page that yielded any text; pages with
        no extractable text are skipped. Returns an empty list if the PDF
        cannot be read (the error is logged, not raised).
    """
    try:
        reader = PdfReader(pdf_stream)
        # Extract each page exactly once; extraction is the expensive step.
        extracted = (page.extract_text() for page in reader.pages)
        return [text for text in extracted if text]
    except Exception as e:
        logging.error(f"Failed to extract text from stream: {e}")
        return []
|
||
def generate_json_response(file_id, file_path, texts):
    """Build the JSON document describing a processed file.

    Args:
        file_id: Identifier echoed back as ``fileId``.
        file_path: Path or bare file name of the source PDF. Its base name
            becomes the title; its on-disk size is used when the file exists.
        texts: Extracted page texts, in page order.

    Returns:
        A JSON string with ``fileId``, ``metadata`` (``title``, ``pageCount``,
        ``filesize``, ``locale``) and ``pages`` (1-based ``pageId`` plus
        ``content``).

    Each metadata field is computed independently, so a single failure (for
    example the file not existing on this machine when only its bytes were
    uploaded) no longer wipes out the whole metadata object. ``filesize``
    falls back to 0 and ``locale`` to "unknown".
    """
    try:
        title = os.path.basename(file_path)
    except TypeError as e:
        logging.error(f"Error generating metadata for {file_path}: {e}")
        title = ""

    try:
        filesize = os.path.getsize(file_path)
    except (OSError, TypeError, ValueError) as e:
        # Expected when the PDF arrived as bytes and was never written to disk.
        logging.warning(f"Could not determine file size for {file_path}: {e}")
        filesize = 0

    locale = "unknown"
    if texts:
        try:
            locale = detect(' '.join(text for text in texts if text))
        except Exception as e:
            # Language detection is best-effort; it must not drop the other fields.
            logging.error(f"Error detecting language for {file_path}: {e}")

    metadata = {
        "title": title,
        "pageCount": len(texts),
        "filesize": filesize,
        "locale": locale,
    }

    response = {
        "fileId": file_id,
        "metadata": metadata,
        "pages": [
            {"pageId": idx + 1, "content": text or "Error extracting text"}
            for idx, text in enumerate(texts)
        ],
    }

    return json.dumps(response, indent=4, ensure_ascii=False)
|
||
async def main():
    """Manual smoke test: OCR the local ``example.pdf`` and print the JSON result.

    Reads the file into memory, runs it through ``ocr_pdf_and_extract_text``
    and prints the output of ``generate_json_response``. Failures are logged
    rather than raised.
    """
    input_pdf = "example.pdf"
    lang = "eng"
    file_id = str(uuid.uuid4())

    try:
        with open(input_pdf, "rb") as pdf_file:
            input_pdf_bytes = pdf_file.read()
    except OSError as e:
        logging.error(f"OCR processing failed: {e}")
        return

    # Pass by keyword: the signature is (lang, input_bytes), so positional
    # (path, lang) arguments would be silently misinterpreted.
    texts, error = await ocr_pdf_and_extract_text(lang=lang, input_bytes=input_pdf_bytes)
    if texts:
        json_response = generate_json_response(file_id, input_pdf, texts)
        print(json_response)
    else:
        logging.error(f"OCR processing failed: {error}")
|
||
async def serve():
    """Start the asyncio gRPC server on port 50051 and run until it terminates.

    Registers a ``FileProcessorServicer`` and listens without TLS on all
    interfaces (``[::]``).
    """
    grpc_server = grpc.aio.server()
    servicer = FileProcessorServicer()
    file_processor_pb2_grpc.add_FileProcessorServicer_to_server(servicer, grpc_server)
    grpc_server.add_insecure_port('[::]:50051')
    await grpc_server.start()
    await grpc_server.wait_for_termination()
|
||
# Script entry point: run the gRPC server (not the local main() smoke test).
if __name__ == '__main__':
    asyncio.run(serve())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import asyncio | ||
import uuid | ||
import grpc | ||
import file_processor_pb2 | ||
import file_processor_pb2_grpc | ||
|
||
async def process_file_stub(user_id, file, filename, file_id):
    """Send one file to the FileProcessor service on localhost:50051.

    Args:
        user_id: Value for the request's ``userId`` field.
        file: File content for the request's ``file`` field.
        filename: Value for the request's ``filename`` field.
        file_id: Identifier for the file; converted with ``str()`` before
            being placed in ``fileId``.

    Returns:
        The response returned by the ``ProcessFile`` RPC.
    """
    async with grpc.aio.insecure_channel('localhost:50051') as grpc_channel:
        client = file_processor_pb2_grpc.FileProcessorStub(grpc_channel)
        upload_request = file_processor_pb2.FileUploadRequest(
            userId=user_id,
            file=file,
            filename=filename,
            fileId=str(file_id),
        )
        return await client.ProcessFile(upload_request)
|
||
async def main():
    """Upload the local ``example.pdf`` to the service and print the response.

    Fresh UUIDs are generated for the user id and the file id; the type of
    the file content is printed before the request is sent.
    """
    with open("example.pdf", "rb") as source:
        pdf_payload = source.read()

    request_fields = {
        "user_id": str(uuid.uuid4()),
        "file": pdf_payload,
        "filename": "example.pdf",
        # Left as a UUID object; process_file_stub stringifies it.
        "file_id": uuid.uuid4(),
    }
    print(type(pdf_payload))

    result = await process_file_stub(**request_fields)
    print(result)
|
||
# Script entry point: run the client once against the local server.
if __name__ == "__main__":
    asyncio.run(main())
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.