-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat(simple file processor): added simple file processor service using ocrmypdf. No GRPC implemented * feat(fileprocessor): changed port and added proto files * fix(fileprocessor): did further cleanup and fixed minor error handling bugs * chore(fileprocessor): added extensive logging and error handling --------- Co-authored-by: Thaddeaus Low <thaddeausl.2022@scis.smu.edu.sg>
- Loading branch information
1 parent
68427a2
commit 30628bf
Showing
18 changed files
with
245 additions
and
181 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,4 @@ services: | |
ports: | ||
- "50051:50051" | ||
volumes: | ||
- ./example.pdf:/app/example.pdf | ||
- ./src/example.pdf:/app/example.pdf |
Empty file.
This file was deleted.
Oops, something went wrong.
26 changes: 0 additions & 26 deletions
26
backend/simple/fileprocessor/file_processor/service_client.py
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
Binary file renamed
BIN
+1.51 KB
...cache__/file_processor_pb2.cpython-39.pyc → ...cache__/file_processor_pb2.cpython-39.pyc
Binary file not shown.
Binary file renamed
BIN
+2.47 KB
...__/file_processor_pb2_grpc.cpython-39.pyc → ...__/file_processor_pb2_grpc.cpython-39.pyc
Binary file not shown.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
53 changes: 53 additions & 0 deletions
53
backend/simple/fileprocessor/src/file_processor_service.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# file_processor_service.py | ||
import grpc | ||
import logging | ||
import file_processor_pb2_grpc | ||
import file_processor_pb2 | ||
from ocr_processing import process_pdf_file | ||
from datetime import datetime | ||
import uuid | ||
import os | ||
|
||
class FileProcessorServicer(file_processor_pb2_grpc.FileProcessorServicer):
    """gRPC servicer that OCR-processes an uploaded PDF and returns its pages."""

    def ProcessFile(self, request, context):
        """Run OCR on the PDF carried by ``request`` and return the wrapped result.

        Reads ``fileId``, ``filename``, ``file`` and ``metadata`` from the
        request.  When the ``ENVIRONMENT_MODE`` environment variable is
        ``production`` (case-insensitive), a non-empty ``kong-request-id``
        entry in ``request.metadata`` is required and is echoed back as the
        response request id; otherwise a random UUID is used.

        Returns:
            A ``ServiceResponseWrapper`` whose payload is a packed
            ``FileProcessResponse`` and whose metadata holds the request id
            and the current timestamp.

        Aborts the RPC with:
            INVALID_ARGUMENT: production mode without a ``kong-request-id``.
            INTERNAL: any exception raised while processing; the exception
                message is sent as the ``error-details`` trailing metadata.
        """
        file_id = request.fileId
        filename = request.filename
        input_pdf_bytes = request.file
        # Default to development if not set
        environment_mode = os.getenv('ENVIRONMENT_MODE', 'development')

        # Check for kong-request-id in metadata if the mode is production
        request_metadata = None
        if environment_mode.lower() == 'production':
            if 'kong-request-id' not in request.metadata or not request.metadata['kong-request-id']:
                # abort() raises, so nothing below runs for an invalid request.
                context.abort(
                    code=grpc.StatusCode.INVALID_ARGUMENT,
                    details="Missing required 'kong-request-id' in metadata for production mode.",
                )
            request_metadata = request.metadata

        try:
            texts, metadata = process_pdf_file(input_pdf_bytes, filename)
            pages = [file_processor_pb2.Page(pageId=p["pageId"], content=p["content"]) for p in texts]
            file_metadata = file_processor_pb2.FileMetadata(title=metadata["title"],
                                                            pageCount=metadata["pageCount"],
                                                            filesize=metadata["filesize"],
                                                            locale=metadata["locale"])
            response_payload = file_processor_pb2.FileProcessResponse(fileId=file_id, metadata=file_metadata, pages=pages)

            # Wrap the response payload in ServiceResponseWrapper
            response_wrapper = file_processor_pb2.ServiceResponseWrapper()
            kong_request_id = request.metadata.get('kong-request-id') if request_metadata else str(uuid.uuid4())
            response_wrapper.metadata.request_id = kong_request_id
            # GetCurrentTime() stamps the current UTC time.  FromDatetime() with
            # a naive datetime.now() would be interpreted as UTC and therefore
            # be off by the server's local UTC offset.
            response_wrapper.metadata.timestamp.GetCurrentTime()
            response_wrapper.payload.Pack(response_payload)

            return response_wrapper
        except Exception as e:
            logging.error(f"Error processing file {file_id}: {str(e)}", exc_info=True)

            # Non-binary gRPC metadata values must be single-line ASCII, so
            # collapse whitespace/newlines and replace anything non-ASCII.
            error_details = " ".join(str(e).split()).encode("ascii", "replace").decode("ascii")

            # ServicerContext.abort() accepts only (code, details); extra
            # information has to be attached as trailing metadata beforehand.
            context.set_trailing_metadata((('error-details', error_details),))
            context.abort(
                code=grpc.StatusCode.INTERNAL,
                details="Internal server error occurred.",
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# ocr_processing.py | ||
import logging | ||
from PyPDF2 import PdfReader | ||
from io import BytesIO | ||
import tempfile | ||
import subprocess | ||
import os | ||
from utilities import generate_json_response, detect_locale | ||
|
||
def process_pdf_file(input_pdf_bytes, filename):
    """OCR a PDF supplied as bytes and return its page texts and metadata.

    Args:
        input_pdf_bytes: Raw bytes of the PDF to process.
        filename: Name used for the metadata title and in log messages.

    Returns:
        A ``(texts, metadata)`` tuple as produced by ``ocr_pdf`` and
        ``generate_metadata``.

    Raises:
        Exception: Any error from OCR or metadata generation is logged and
            re-raised unchanged.
    """
    temp_pdf_path = None
    try:
        input_stream = BytesIO(input_pdf_bytes)
        texts, temp_pdf_path = ocr_pdf(input_stream)
        metadata = generate_metadata(filename, temp_pdf_path, texts)
        return texts, metadata
    except Exception as e:
        logging.error(f"Error in OCR processing for file {filename}: {str(e)}", exc_info=True)
        raise
    finally:
        # Remove the OCR output on success AND on failure; previously it was
        # left behind whenever metadata generation raised.
        if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
            try:
                os.remove(temp_pdf_path)
            except OSError as cleanup_error:
                # Best-effort cleanup: never mask the real outcome of the call.
                logging.warning(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")
|
||
def ocr_pdf(input_stream):
    """Run ``ocrmypdf`` on a PDF stream and extract the text of the result.

    The stream's contents are piped to ``ocrmypdf`` on stdin and the OCR'd
    PDF is written to a new temporary ``.pdf`` file.

    Args:
        input_stream: Readable binary stream containing the input PDF.

    Returns:
        A ``(texts, output_path)`` tuple: the per-page texts returned by
        ``extract_text_from_pdf`` and the path of the temporary OCR output.
        The caller is responsible for deleting ``output_path``.

    Raises:
        Exception: If ``ocrmypdf`` exits with a non-zero status.  Errors from
            starting the command or extracting text propagate unchanged.  In
            every failure case the temporary file is removed first.
    """
    # delete=False because the path is handed back to the caller.  Close our
    # handle right away so the external tool is the only writer of the file.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_output:
        output_path = tmp_output.name

    try:
        ocrmypdf_cmd = ["ocrmypdf", "-l", "eng", "--force-ocr", "--output-type", "pdf", "-", output_path]
        process = subprocess.run(ocrmypdf_cmd, input=input_stream.read(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if process.returncode != 0:
            # errors="replace" so undecodable tool output cannot hide the failure.
            stderr_text = process.stderr.decode(errors="replace")
            logging.error(f"OCR command failed: {stderr_text}")
            raise Exception(f"OCR failed for file {output_path}: {stderr_text}")

        texts = extract_text_from_pdf(output_path)
    except BaseException:
        # Nobody else knows this path when we fail, so clean it up here.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

    return texts, output_path
|
||
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at ``pdf_path``.

    Each entry is a dict with ``pageId`` (1-based page number) and
    ``content``.  When a page yields no text (``extract_text()`` returns a
    falsy value) its content is the placeholder "Error extracting text".
    """
    with open(pdf_path, 'rb') as handle:
        document = PdfReader(handle)
        return [
            {"pageId": number, "content": page.extract_text() or "Error extracting text"}
            for number, page in enumerate(document.pages, start=1)
        ]
|
||
def generate_metadata(filename, pdf_path, texts):
    """Build the metadata dict for a processed PDF.

    Args:
        filename: Original file name; only its base name is used as the title.
        pdf_path: Path of the PDF file whose on-disk size is reported.
        texts: List of page dicts, each with a ``content`` string.

    Returns:
        A dict with ``title``, ``pageCount``, ``filesize`` (bytes of the file
        at ``pdf_path``) and ``locale`` (result of ``detect_locale`` on all
        page contents joined by single spaces).
    """
    combined_text = ' '.join(page["content"] for page in texts)
    detected = detect_locale(combined_text)
    return {
        "title": os.path.basename(filename),
        "pageCount": len(texts),
        "filesize": os.path.getsize(pdf_path),
        "locale": detected,
    }
Oops, something went wrong.