Skip to content

Commit

Permalink
Merge pull request #7 from dlcs/feature/handle_url_chars
Browse files Browse the repository at this point in the history
Prevent pdfalto subprocess call failing due to input filename
  • Loading branch information
donaldgray authored Feb 2, 2024
2 parents 3a3ab75 + f6bdf5b commit cbf52ac
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 14 deletions.
12 changes: 8 additions & 4 deletions app/pdf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ def extract_alto(self) -> bool:
:return: boolean indicating success
"""

logger.info(f"Processing {self.pdf_location} with id {self.pdf_identifier}")
logger.info(f"Processing '{self.pdf_location}' with id {self.pdf_identifier}")
work_folder = self._create_work_folder()

try:
target_file = Path(work_folder, self.pdf_location.split("/")[-1])
downloaded_pdf = self._download_pdf(target_file)
target_file = Path(work_folder, "download.pdf")
marker_file = Path(work_folder, "details.txt")
downloaded_pdf = self._download_pdf(target_file, marker_file)

if not downloaded_pdf:
logger.error("Unable to download PDF")
Expand All @@ -79,14 +80,17 @@ def _create_work_folder(self):
work_folder.mkdir(parents=True, exist_ok=False)
return work_folder

def _download_pdf(self, target_file: Path):
def _download_pdf(self, target_file: Path, marker_file: Path):
try:
download_request = requests.get(self.pdf_location, stream=True)
download_request.raise_for_status()

with open(target_file, "wb") as file:
for chunk in download_request.iter_content(DOWNLOAD_CHUNK_SIZE):
file.write(chunk)

with open(marker_file, 'w') as marker:
marker.write(f"id:{self.pdf_identifier}, src:{self.pdf_location}")
return True
except Exception as download_exception:
logger.exception(
Expand Down
2 changes: 1 addition & 1 deletion compose/localstack/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
FROM localstack/localstack:2.2.0
FROM localstack/localstack:2.3.0
COPY seed-resources.sh /etc/localstack/init/ready.d/
14 changes: 7 additions & 7 deletions compose/localstack/seed-resources.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#! /bin/bash
# create bucket
awslocal s3 mb s3://pdf-to-alto
awslocal s3 mb s3://pdf-to-alto --region eu-west-1

# create incoming and completed queues
awslocal sqs create-queue --queue-name incoming
awslocal sqs create-queue --queue-name completed
awslocal sqs create-queue --queue-name incoming --region eu-west-1
awslocal sqs create-queue --queue-name completed --region eu-west-1

# create incoming and completed topics
awslocal sns create-topic --name incoming-topic
awslocal sns create-topic --name completed-topic
awslocal sns create-topic --name incoming-topic --region eu-west-1
awslocal sns create-topic --name completed-topic --region eu-west-1

# subscribe each sqs queue to its respective topic
awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:incoming-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:incoming
awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:completed-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:completed
awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:incoming-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:incoming --region eu-west-1
awslocal sns subscribe --topic-arn arn:aws:sns:eu-west-1:000000000000:completed-topic --protocol sqs --notification-endpoint arn:aws:sqs:eu-west-1:000000000000:completed --region eu-west-1
3 changes: 1 addition & 2 deletions docker-compose.local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ services:
- "4566:4566"
environment:
- SERVICES=s3,sqs,sns
- HOSTNAME_EXTERNAL=localstack
- DEFAULT_REGION=eu-west-1
- LOCALSTACK_HOST=localstack
volumes:
- "localstack_tmp:/tmp/localstack"

0 comments on commit cbf52ac

Please sign in to comment.