From f6bdf5b6d06ec37cb1a802e0abc44265e1653797 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Fri, 2 Feb 2024 08:39:07 +0000 Subject: [PATCH] Resolve issue pdf-alto failed due to some chars Spaces + ampersand in url were seen to cause issues. PDF saved with safe name and source saved in txt file alongside for troubleshooting --- app/pdf_processor.py | 12 ++++++++---- compose/localstack/Dockerfile | 2 +- compose/localstack/seed-resources.sh | 14 +++++++------- docker-compose.local.yml | 3 +-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/app/pdf_processor.py b/app/pdf_processor.py index fb8ba56..aa92bdb 100644 --- a/app/pdf_processor.py +++ b/app/pdf_processor.py @@ -51,12 +51,13 @@ def extract_alto(self) -> bool: :return: boolean indicating success """ - logger.info(f"Processing {self.pdf_location} with id {self.pdf_identifier}") + logger.info(f"Processing '{self.pdf_location}' with id {self.pdf_identifier}") work_folder = self._create_work_folder() try: - target_file = Path(work_folder, self.pdf_location.split("/")[-1]) - downloaded_pdf = self._download_pdf(target_file) + target_file = Path(work_folder, "download.pdf") + marker_file = Path(work_folder, "details.txt") + downloaded_pdf = self._download_pdf(target_file, marker_file) if not downloaded_pdf: logger.error("Unable to download PDF") @@ -79,7 +80,7 @@ def _create_work_folder(self): work_folder.mkdir(parents=True, exist_ok=False) return work_folder - def _download_pdf(self, target_file: Path): + def _download_pdf(self, target_file: Path, marker_file: Path): try: download_request = requests.get(self.pdf_location, stream=True) download_request.raise_for_status() @@ -87,6 +88,9 @@ def _download_pdf(self, target_file: Path): with open(target_file, "wb") as file: for chunk in download_request.iter_content(DOWNLOAD_CHUNK_SIZE): file.write(chunk) + + with open(marker_file, 'w') as marker: + marker.write(f"id:{self.pdf_identifier}, src:{self.pdf_location}") return True except Exception as download_exception: logger.exception( diff --git a/compose/localstack/Dockerfile b/compose/localstack/Dockerfile index 4be457c..78a2a80 100644 --- a/compose/localstack/Dockerfile +++ b/compose/localstack/Dockerfile @@ -1,2 +1,2 @@ -FROM localstack/localstack:2.2.0 +FROM localstack/localstack:2.3.0 COPY seed-resources.sh /etc/localstack/init/ready.d/ diff --git a/compose/localstack/seed-resources.sh b/compose/localstack/seed-resources.sh index 6636cb5..ef27d03 100644 --- a/compose/localstack/seed-resources.sh +++ b/compose/localstack/seed-resources.sh @@ -1,15 +1,15 @@ #! /bin/bash # create bucket -awslocal s3 mb s3://pdf-to-alto +awslocal s3 mb s3://pdf-to-alto --region eu-west-1 # create incoming and complete queue -awslocal sqs create-queue --queue-name incoming -awslocal sqs create-queue --queue-name completed +awslocal sqs create-queue --queue-name incoming --region eu-west-1 +awslocal sqs create-queue --queue-name completed --region eu-west-1 # create incoming and complete bucket -awslocal sns create-topic --name incoming-topic -awslocal sns create-topic --name completed-topic +awslocal sns create-topic --name incoming-topic --region eu-west-1 +awslocal sns create-topic --name completed-topic --region eu-west-1 # create subscribe sqs queue to respective topic -awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:incoming-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:incoming -awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:completed-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:completed +awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:incoming-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:incoming --region eu-west-1 +awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:completed-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:completed --region eu-west-1 diff --git a/docker-compose.local.yml b/docker-compose.local.yml index 5f49158..dd898bb 100644 --- a/docker-compose.local.yml +++ b/docker-compose.local.yml @@ -12,7 +12,6 @@ services: - "4566:4566" environment: - SERVICES=s3,sqs,sns - - HOSTNAME_EXTERNAL=localstack - - DEFAULT_REGION=eu-west-1 + - LOCALSTACK_HOST=localstack volumes: - "localstack_tmp:/tmp/localstack"