feat: scrape church registers, i.e. images (#3)
This commit implements a basic version allowing users to scrape
Matricula's most important resource: scanned images of church registers.
With `matricula-online-scraper get church-register <URL|stdin>`, the images
of one or more church registers can be scraped. Use `--help` on that
specific command for details.

Note that the terminology was not settled before: earlier code may refer to
"church registers" as "parish registers". From now on, the former should be
used, as it is the term Matricula itself uses.

The new spider is already grouped under the new `get` command, while the
others remain under `fetch`. This anticipates a future refactoring that will
improve the CLI structure.

#3 (comment)
was not implemented directly as a CLI option, because the command can handle
multiple URLs, either as arguments or read from stdin.
In a future refactoring, the other commands should be refined so that chaining
is possible: `command1 | command2`. This would allow piping a list of all
church registers from e.g. one specific parish into the image scraper. A
usage sketch of what already works follows.
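
For illustration, a minimal usage sketch (the `--directory`/`-d` option and
the stdin behavior are taken from the command's help; `urls.txt` is a
hypothetical example file with one URL per line):

```sh
# scrape a single church register by URL
matricula-online-scraper get church-register \
  https://data.matricula-online.eu/de/deutschland/augsburg/aach/1-THS/

# or feed URLs in via stdin
cat urls.txt | matricula-online-scraper get church-register -d ./images
```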
lsg551 committed Dec 11, 2024
1 parent 470dd28 commit bf18fb2
Showing 6 changed files with 311 additions and 0 deletions.
35 changes: 35 additions & 0 deletions matricula_online_scraper/cli/cli_utils/badges.py
@@ -0,0 +1,35 @@
from dataclasses import dataclass
from rich.text import Text
from rich.style import Style
from .color import Color


@dataclass
class Badge:
Error = Text(
" ERROR ",
style=Style(bgcolor=Color.Red, bold=True),
justify="center",
end="",
)

Warning = Text(
" WARNING ",
style=Style(bgcolor=Color.Orange, bold=True),
justify="center",
end="",
)

Info = Text(
" INFO ",
style=Style(bgcolor=Color.Blue, bold=True),
justify="center",
end="",
)

Success = Text(
" SUCCESS ",
style=Style(bgcolor=Color.Green, bold=True),
justify="center",
end="",
)
9 changes: 9 additions & 0 deletions matricula_online_scraper/cli/cli_utils/color.py
@@ -0,0 +1,9 @@
from dataclasses import dataclass


@dataclass
class Color:
Red = "red"
Green = "green"
Blue = "dodger_blue1"
Orange = "orange1"
102 changes: 102 additions & 0 deletions matricula_online_scraper/cli/get.py
@@ -0,0 +1,102 @@
from typing import Annotated, List, Optional
import sys
import typer
from pathlib import Path
from rich import console
from rich.text import Text
from .cli_utils.badges import Badge
from .cli_utils.color import Color
import logging
from ..spiders.church_register import ChurchRegisterSpider
from scrapy import crawler
import select
# from ..utils.pipeline_observer import PipelineObserver


logger = logging.getLogger(__name__)
stderr = console.Console(stderr=True)

app = typer.Typer()


@app.command()
def church_register(
urls: Annotated[
Optional[List[str]],
typer.Argument(
help=(
"One or more URLs to church register pages,"
" for example https://data.matricula-online.eu/de/deutschland/augsburg/aach/1-THS/"
" '/1-THS' is the identifier of one church register from Aach, a parish in Augsburg, Germany."
" Note that the parameter '?pg=1' may or may not be included in the URL."
" It will by ignored anyway, because it does not alter the behavior of the scraper."
" If no URL is provided, this argument is expected to be read from stdin."
)
),
] = None,
directory: Annotated[
Path,
typer.Option(
"--directory",
"-d",
help="Directory to save the image files in.",
),
] = Path.cwd() / "church_register_images",
debug: Annotated[
bool,
typer.Option(
help="Enable debug mode for scrapy.",
),
] = False,
):
# timeout in seconds
TIMEOUT = 0.1

if not urls:
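# probe stdin without blocking forever: select() reports stdin as readable
# as soon as piped data arrives, and gives up after TIMEOUT seconds otherwise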
readable, _, _ = select.select([sys.stdin], [], [], TIMEOUT)

if readable:
urls = sys.stdin.read().splitlines()
else:
stderr.print(
Badge.Error,
Text("No URLs provided via stdin.", style=Color.Red),
"Please provide at least one URL as argument or via stdin.",
"Use the --help flag for more information.",
)
raise typer.Exit(1)

# can still happen, e.g. when stdin was readable but contained no URLs
if not urls:
stderr.print(Badge.Error, Text("No URLs provided.", style=Color.Red))
raise typer.Exit(1)

# observer = PipelineObserver(start_urls=urls)

try:
process = crawler.CrawlerProcess(
settings={
"LOG_ENABLED": debug,
"LOG_LEVEL": "DEBUG" if debug else "CRITICAL",
"ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
"IMAGES_STORE": directory.resolve(),
}
)
process.crawl(
ChurchRegisterSpider,
# observer=observer,
start_urls=urls,
)
process.start()

except Exception as err:
raise typer.Exit(1) from err

else:
stderr.print(
Badge.Success,
Text("Finished scraping church register images.", style=Color.Green),
)

# finally:
# observer.update_initiator_statuses()
# stderr.print(observer.start_urls)
5 changes: 5 additions & 0 deletions matricula_online_scraper/main.py
@@ -8,6 +8,7 @@
import pkg_resources
import typer
from matricula_online_scraper.cli.fetch import app as fetch_app
from matricula_online_scraper.cli.get import app as get_app

app = typer.Typer(
help="Command Line Interface tool for scraping Matricula Online https://data.matricula-online.eu.",
@@ -17,6 +18,10 @@
fetch_app,
name="fetch",
)
app.add_typer(
get_app,
name="get",
)


@app.callback()
67 changes: 67 additions & 0 deletions matricula_online_scraper/middlewares/Catch404.py
@@ -0,0 +1,67 @@
from typing import Any, Iterable
import logging

from scrapy.spidermiddlewares.httperror import HttpError
from scrapy import Spider
from scrapy.http.response import Response

from rich import console
from rich.text import Text
from matricula_online_scraper.cli.cli_utils.color import Color
from matricula_online_scraper.cli.cli_utils.badges import Badge
from matricula_online_scraper.utils.pipeline_observer import PipelineObserver

logger = logging.getLogger(__name__)
stderr = console.Console(stderr=True)


class Catch404:
# This runs directly after the scrapy.spidermiddlewares.httperror.HttpErrorMiddleware
# It catches 404 errors and prints a message to the user
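# `process_spider_exception` fires when a spider callback or an earlier
# middleware raises; `HttpError` is what HttpErrorMiddleware raises for
# responses with a non-successful status code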

def process_spider_exception(
self, response: Response, exception: Exception, spider: Spider
) -> Iterable[Any] | None:
# try:
# observer: PipelineObserver | None = spider.__dict__["pipeline_observer"]

# if observer is None or not isinstance(observer, PipelineObserver):
# raise AttributeError()

# except AttributeError as err:
# observer = None
# logger.exception(f"PipelineObserver not found in spider: {err}")

# if observer:
# url = response.url
# status = "failed"
# try:
# observer.update(url, status)
# except Exception as err:
# logger.exception(
# f"Failed to update observer for {url} with new status '{status}': {err}"
# )

if isinstance(exception, HttpError):
if exception.response.status == 404:
stderr.print(
Badge.Error,
Text(
f"The URL {exception.response.url} returned a 404 status code."
" This is likely due to the page not existing or the URL being incorrect."
" Please check the URL and try again.",
style=Color.Red,
),
)

else:
stderr.print(
Badge.Error,
Text(
f"The URL {exception.response.url} returned a {exception.response.status} status code.",
style=Color.Red,
),
)

return None # pass to next middleware
93 changes: 93 additions & 0 deletions matricula_online_scraper/spiders/church_register.py
@@ -0,0 +1,93 @@
"""
Scrapy spider to scrape church registers (= scanned church books) from Matricula Online.
"""

import re
import scrapy
import json
import base64
from rich import console
# from ..utils.pipeline_observer import PipelineObserver

stderr = console.Console(stderr=True)


class ChurchRegisterSpider(scrapy.Spider):
name = "church_register"

# see the order of middleware here: https://doc.scrapy.org/en/latest/topics/settings.html#std-setting-SPIDER_MIDDLEWARES_BASE
# 51 is right after the built-in middleware `HttpErrorMiddleware` which handles 404s
custom_settings = {
"SPIDER_MIDDLEWARES": {
"matricula_online_scraper.middlewares.Catch404.Catch404": 51
},
# "DOWNLOADER_MIDDLEWARES": {
# "matricula_online_scraper.middlewares.DownloadMiddleware.DownloadMiddleware": 901
# },
}

# def __init__(self, *args, observer: PipelineObserver, **kwargs):
# super().__init__(*args, **kwargs)
# self.pipeline_observer = observer

def parse(self, response):
# Note: a "church register url" like https://data.matricula-online.eu/de/deutschland/aachen/aachen-hl-kreuz/KB+001/?pg=1
# leads to a page where the image with some page number is embedded in a canvas. The user can navigate to the next page,
# manipulate the image etc.
# Unfortunately, there are no direct URLs pointing to a PNG file (see https://github.com/lsg551/matricula-online-scraper/issues/3)
# which could be easily used to scrape the source image.
# Instead, Matricula encodes those paths in base64 and loads them via JavaScript. Each page's (whether `?pg=2` or `?pg=3`) HTML
# has a variable `dv1` in a script tag. This variable contains the base64-encoded image paths to all scanned images of
# the church register in question. This needs to be extracted and decoded to obtain a list of URLs to the images.
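# Illustration (hypothetical shape, not a real register): an entry in `files`
# looks like "/image/<base64 payload>/"; stripping the "/image/" prefix and
# the trailing slash and base64-decoding the payload yields a direct URL to
# one scanned image.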

# self.pipeline_observer.mark_as_started(response.url)

# found in the last script tag in the body of the HTML
dv1_var = response.xpath("//body/script[last()]/text()").get()

# this regex matches the JavaScript variable `dv1` and extracts the values from it
# keys `labels` and `files` are JSON fields in the variable `dv1`
# `dv1 = new arc.imageview.MatriculaDocView("document", { "labels": […], "files": […] })`
pattern = r"dv1\s*=\s*new\s+arc\.imageview\.MatriculaDocView\(\"document\",\s*\{[^}]*\"labels\"\s*:\s*(\[[^\]]*\]),[^}]*\"files\"\s*:\s*(\[[^\]]*\])"
if dv1_var is None:
self.logger.error("Could not find the script tag containing the 'dv1' variable")
return

matches = re.search(pattern, dv1_var, re.DOTALL)

if not matches:
self.logger.error(
"Could not extract 'labels' and 'files' from JavaScript variable 'dv1'"
)
return

labels = matches.group(1)
labels = json.loads(labels)

files = matches.group(2)
files = json.loads(files)
# [7:][:-1] removes the leading `/image/` and trailing `/`
# files = [base64.b64decode(file[7:][:-1]).decode("utf-8") for file in files]
for idx, file in enumerate(files):
try:
raw_base64_str = file[7:][:-1]
# counteract malformed base64 strings with padding
missing_padding = len(raw_base64_str) % 4
if missing_padding:
raw_base64_str += "=" * (4 - missing_padding)
files[idx] = base64.b64decode(raw_base64_str).decode("utf-8")
except Exception as err:
self.logger.error(
f"Could not decode base64-encoded image URL {file}. Error {err}",
exc_info=True,
)
continue

# TODO: implement option `--dump-decoded-urls-only` to only output the decoded URLs and labels
# if dump_decoded_urls_only:
# yield from (
# {"label": label, "file": file} for label, file in zip(labels, files)
# )

# if len(files) > 0:
# for file, label in zip(files, labels):
# self.pipeline_observer.observe(file, label, initiator=response.url)
# self.pipeline_observer.mark_as_in_process(response.url)

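# yielding "image_urls" hands the decoded URLs to Scrapy's ImagesPipeline,
# which is enabled via ITEM_PIPELINES in cli/get.py and downloads each file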
yield {"image_urls": files}
