Skip to content

Commit

Permalink
Convert PDFs to PNGs before scanning (#113)
Browse files: browse the repository at this point in the history
  • Loading branch information
rw-access authored Dec 20, 2024
1 parent 3af1871 commit 65ff68d
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 7 deletions.
5 changes: 4 additions & 1 deletion build/configs/scanners.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -408,13 +408,16 @@ scanners:
- 'jpeg_file'
- 'image/png'
- 'png_file'
- 'image/tiff'
- 'image/tif'
- 'type_is_tiff'
- 'image/x-ms-bmp'
- 'image/bmp'
- 'bmp_file'
- 'image/webp'
- 'pdf_file'
priority: 5
options:
pdf_to_png: True
'ScanRar':
- positive:
flavors:
Expand Down
3 changes: 3 additions & 0 deletions configs/python/backend/backend.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -416,7 +416,10 @@ scanners:
- 'type_is_tiff'
- 'image/x-ms-bmp'
- 'bmp_file'
- 'pdf_file'
priority: 5
options:
pdf_to_png: True
'ScanRar':
- positive:
flavors:
Expand Down
5 changes: 3 additions & 2 deletions src/python/strelka/scanners/scan_ocr.py
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
import fitz
import os
import subprocess
import tempfile

import fitz
from strelka import strelka


Expand All @@ -16,6 +16,7 @@ class ScanOcr(strelka.Scanner):
tmp_directory: Location where tempfile writes temporary files.
Defaults to '/tmp/'.
"""

def scan(self, data, file, options, expire_at):
extract_text = options.get('extract_text', False)
tmp_directory = options.get('tmp_directory', '/tmp/')
Expand All @@ -34,7 +35,7 @@ def scan(self, data, file, options, expire_at):
tess_return = subprocess.call(
['tesseract', tmp_data.name, tmp_tess.name],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL
stderr=subprocess.DEVNULL,
)
tess_txt_name = f'{tmp_tess.name}.txt'
if tess_return == 0:
Expand Down
4 changes: 3 additions & 1 deletion src/python/strelka/scanners/scan_pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -101,8 +101,10 @@ def scan(self, data, file, options, expire_at):
for link in links:
if "uri" in link:
self.event["annotated_uris"].append(link["uri"])
if extract_text:
if extract_text and hasattr(page, "getText"):
extracted_text += page.getText()
if extract_text and hasattr(page, "get_text"):
extracted_text += page.get_text()

# PDF Text Extraction
# Caution: Will increase time and object storage size
Expand Down
16 changes: 13 additions & 3 deletions src/python/strelka/scanners/scan_qr.py
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
from pyzbar.pyzbar import decode
from PIL import Image
import io
import re

import fitz
from PIL import Image
from pyzbar.pyzbar import decode, ZBarSymbol
from strelka import strelka

# Regex to match URL
Expand All @@ -14,9 +15,18 @@ class ScanQr(strelka.Scanner):
"""
Collects QR code metadata from image files.
"""

def scan(self, data, file, options, expire_at):
pdf_to_png = options.get('pdf_to_png', False)

try:
barcodes = decode(Image.open(io.BytesIO(data)))
if pdf_to_png and 'application/pdf' in file.flavors.get('mime', []):
# TODO: Use fitz builtin OCR support which also wraps tesseract
doc = fitz.open(stream=data, filetype='pdf')
data = doc.get_page_pixmap(0, dpi=150).tobytes()

img = Image.open(io.BytesIO(data))
barcodes = decode(img, symbols=[ZBarSymbol.QRCODE])

try:
self.event['data'] = barcodes[0].data.decode('utf-8')
Expand Down

0 comments on commit 65ff68d

Please sign in to comment.