Skip to content

Commit

Permalink
Better handling of fetching page dimensions
Browse files Browse the repository at this point in the history
Not all PDFs have image masks, which led to some pages being skipped if
resizing was requested.
  • Loading branch information
donaldgray committed Nov 21, 2023
1 parent e9efbbd commit 2a0c270
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
8 changes: 7 additions & 1 deletion app/pdf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,16 @@ def _get_pdf_page_attributes(pdf: str) -> dict:

pdf_addrs = {}
for i in range(len(doc)):
found_dimensions = False
for img in doc.get_page_images(i):
xref = img[0]
pix = fitz.Pixmap(doc, xref)
pdf_addrs[i] = [pix.w, pix.h]
found_dimensions = True

if not found_dimensions:
page = doc[i]
pdf_addrs[i] = [int(page.rect.width), int(page.rect.height)]

return pdf_addrs

Expand Down Expand Up @@ -236,6 +242,6 @@ def generate_guid():

if __name__ == "__main__":
args = sys.argv[1:]
- processor = PDFProcessor(args[0], args[1])
+ processor = PDFProcessor(args[0], args[1], args[2])
processor.extract_alto()
print(processor.generated_alto)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jmespath==0.10.0
logzero==1.7.0
lxml==4.9.3
pycryptodome==3.12.0
- PyMuPDF==1.22.5
+ PyMuPDF==1.23.6
python-dateutil==2.8.2
requests==2.27.1
s3transfer==0.7.0
Expand Down

0 comments on commit 2a0c270

Please sign in to comment.