From bf18fb22d4372fb11d0aa97a6accb66659b7f59e Mon Sep 17 00:00:00 2001 From: Luis Schulte <63458548+lsg551@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:24:02 +0100 Subject: [PATCH] feat: scrape church registers, i.e. images (#3) This commit implements a basic version allowing users to scrape Matricula's most important resource: scanned images of church registers. With `matricula-online-scraper get church-register ` the images of one or more church registers can be scraped. Use `--help` on that specific command for details. Note that the terminology wasn't fixed. Instead of "church registers", previous code might refer to this as "parish registers". From now on, the former should be used. This is the term Matricula uses itself. The spider is already grouped within the new command `get`, while others are still grouped in `fetch`. This conforms with a future refactoring, enhancing the CLI structure. https://github.com/lsg551/matricula-online-scraper/issues/3#issuecomment-2498586139 was not implemented directly as a CLI option, because the command can handle multiple URLs, either per argument or read from stdin. In a future refactoring the other commands should be refined so that chaining is possible: `command1 | command2`. This allows piping a list of all church registers from e.g. one specific parish into the image scraper. 
--- .../cli/cli_utils/badges.py | 35 ++++++ .../cli/cli_utils/color.py | 9 ++ matricula_online_scraper/cli/get.py | 102 ++++++++++++++++++ matricula_online_scraper/main.py | 5 + .../middlewares/Catch404.py | 67 ++++++++++++ .../spiders/church_register.py | 93 ++++++++++++++++ 6 files changed, 311 insertions(+) create mode 100644 matricula_online_scraper/cli/cli_utils/badges.py create mode 100644 matricula_online_scraper/cli/cli_utils/color.py create mode 100644 matricula_online_scraper/cli/get.py create mode 100644 matricula_online_scraper/middlewares/Catch404.py create mode 100644 matricula_online_scraper/spiders/church_register.py diff --git a/matricula_online_scraper/cli/cli_utils/badges.py b/matricula_online_scraper/cli/cli_utils/badges.py new file mode 100644 index 0000000..25cf414 --- /dev/null +++ b/matricula_online_scraper/cli/cli_utils/badges.py @@ -0,0 +1,35 @@ +from attr import dataclass +from rich.text import Text +from rich.style import Style +from .color import Color + + +@dataclass +class Badge: + Error = Text( + " ERROR ", + style=Style(bgcolor=Color.Red, bold=True), + justify="center", + end="", + ) + + Warning = Text( + " WARNING ", + style=Style(bgcolor=Color.Orange, bold=True), + justify="center", + end="", + ) + + Info = Text( + " INFO ", + style=Style(bgcolor=Color.Blue, bold=True), + justify="center", + end="", + ) + + Success = Text( + " SUCCESS ", + style=Style(bgcolor=Color.Green, bold=True), + justify="center", + end="", + ) diff --git a/matricula_online_scraper/cli/cli_utils/color.py b/matricula_online_scraper/cli/cli_utils/color.py new file mode 100644 index 0000000..9a3410e --- /dev/null +++ b/matricula_online_scraper/cli/cli_utils/color.py @@ -0,0 +1,9 @@ +from attr import dataclass + + +@dataclass +class Color: + Red = "red" + Green = "green" + Blue = "dodger_blue1" + Orange = "orange1" diff --git a/matricula_online_scraper/cli/get.py b/matricula_online_scraper/cli/get.py new file mode 100644 index 0000000..e2f9388 --- /dev/null +++ 
b/matricula_online_scraper/cli/get.py @@ -0,0 +1,102 @@ +from typing import Annotated, List, Optional +import sys +import typer +from pathlib import Path +from rich import console +from rich.text import Text +from .cli_utils.badges import Badge +from .cli_utils.color import Color +import logging +from ..spiders.church_register import ChurchRegisterSpider +from scrapy import crawler +import select +# from ..utils.pipeline_observer import PipelineObserver + + +logger = logging.getLogger(__name__) +stderr = console.Console(stderr=True) + +app = typer.Typer() + + +@app.command() +def church_register( + urls: Annotated[ + Optional[List[str]], + typer.Argument( + help=( + "One or more URLs to church register pages," + " for example https://data.matricula-online.eu/de/deutschland/augsburg/aach/1-THS/" + " '/1-THS' is the identifier of one church register from Aach, a parish in Augsburg, Germany." + " Note that the parameter '?pg=1' may or may not be included in the URL." + " It will by ignored anyway, because it does not alter the behavior of the scraper." + " If no URL is provided, this argument is expected to be read from stdin." 
+ ) + ), + ] = None, + directory: Annotated[ + Path, + typer.Option( + "--directory", + "-d", + help="Directory to save the image files in.", + ), + ] = Path.cwd() / "church_register_images", + debug: Annotated[ + bool, + typer.Option( + help="Enable debug mode for scrapy.", + ), + ] = False, +): + # timeout in seconds + TIMEOUT = 0.1 + + if not urls: + readable, _, _ = select.select([sys.stdin], [], [], TIMEOUT) + + if readable: + urls = sys.stdin.read().splitlines() + else: + stderr.print( + Badge.Error, + Text("No URLs provided via stdin.", style=Color.Red), + "Please provide at least one URL as argument or via stdin.", + "Use the --help flag for more information.", + ) + raise typer.Exit(1) + + # won't happen, only to satisfy the type checker + if not urls: + raise NotImplementedError() + + # observer = PipelineObserver(start_urls=urls) + + try: + process = crawler.CrawlerProcess( + settings={ + "LOG_ENABLED": debug, + "LOG_LEVEL": "DEBUG" if debug else "CRITICAL", + "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1}, + "IMAGES_STORE": directory.resolve(), + } + ) + process.crawl( + ChurchRegisterSpider, + # observer=observer, + start_urls=urls, + ) + process.start() + + except Exception as err: + raise typer.Exit(1) from err + + else: + stderr.print( + Badge.Success, + Text("Finished scraping church register images.", style=Color.Green), + ) + + # finally: + # observer.update_initiator_statuses() + # stderr.print(observer.start_urls) diff --git a/matricula_online_scraper/main.py b/matricula_online_scraper/main.py index 80b33d2..65ce104 100755 --- a/matricula_online_scraper/main.py +++ b/matricula_online_scraper/main.py @@ -8,6 +8,7 @@ import pkg_resources import typer from matricula_online_scraper.cli.fetch import app as fetch_app +from matricula_online_scraper.cli.get import app as get_app app = typer.Typer( help="Command Line Interface tool for scraping Matricula Online https://data.matricula-online.eu.", @@ -17,6 +18,10 @@ fetch_app, 
name="fetch", ) +app.add_typer( + get_app, + name="get", +) @app.callback() diff --git a/matricula_online_scraper/middlewares/Catch404.py b/matricula_online_scraper/middlewares/Catch404.py new file mode 100644 index 0000000..b901fd0 --- /dev/null +++ b/matricula_online_scraper/middlewares/Catch404.py @@ -0,0 +1,67 @@ +from typing import Any, Iterable +import logging + +import pip +from scrapy.spidermiddlewares.httperror import HttpError +from scrapy import Spider +from scrapy.http.response import Response + +from rich import console +from rich.text import Text +from matricula_online_scraper.cli.cli_utils.color import Color +from matricula_online_scraper.cli.cli_utils.badges import Badge +from matricula_online_scraper.utils.pipeline_observer import PipelineObserver + +logger = logging.getLogger(__name__) +stderr = console.Console(stderr=True) + + +class Catch404: + # This runs directly after the scrapy.spidermiddlewares.httperror.HttpErrorMiddleware + # It catches 404 errors and prints a message to the user + + def process_spider_exception( + self, response: Response, exception: Exception, spider: Spider + ) -> Iterable[Any] | None: + # try: + # observer: PipelineObserver | None = spider.__dict__["pipeline_observer"] + + # if observer is None or not isinstance(observer, PipelineObserver): + # raise AttributeError() + + # except AttributeError as err: + # observer = None + # logger.exception(f"PipelineObserver not found in spider: {err}") + + # if observer: + # url = response.url + # status = "failed" + # try: + # observer.update(url, status) + # except Exception as err: + # logger.exception( + # f"Failed to update observer for {url} with new status '{status}': {err}" + # ) + + if isinstance(exception, HttpError): + if exception.response.status == 404: + stderr.print( + Badge.Error, + Text( + f"The URL {exception.response.url} returned a 404 status code." + " This is likely due to the page not existing or the URL being incorrect." 
+ " Please check the URL and try again.", + style=Color.Red, + ), + ) + + else: + stderr.print( + Badge.Error, + Text( + f"The URL {exception.response.url} returned a {exception.response.status} status code.", + style=Color.Red, + ), + ) + + return None # pass to next middleware diff --git a/matricula_online_scraper/spiders/church_register.py b/matricula_online_scraper/spiders/church_register.py new file mode 100644 index 0000000..f849b2b --- /dev/null +++ b/matricula_online_scraper/spiders/church_register.py @@ -0,0 +1,93 @@ +""" +Scrapy spider to scrape church registers (= scanned church books) from Matricula Online. +""" + +import re +import scrapy +import json +import base64 +from rich import console +# from ..utils.pipeline_observer import PipelineObserver + +stderr = console.Console(stderr=True) + + +class ChurchRegisterSpider(scrapy.Spider): + name = "church_register" + + # see the order of middleware here: https://doc.scrapy.org/en/latest/topics/settings.html#std-setting-SPIDER_MIDDLEWARES_BASE + # 51 is right after the built-in middleware `HttpErrorMiddleware` which handles 404s + custom_settings = { + "SPIDER_MIDDLEWARES": { + "matricula_online_scraper.middlewares.Catch404.Catch404": 51 + }, + # "DOWNLOADER_MIDDLEWARES": { + # "matricula_online_scraper.middlewares.DownloadMiddleware.DownloadMiddleware": 901 + # }, + } + + # def __init__(self, *args, observer: PipelineObserver, **kwargs): + # super().__init__(*args, **kwargs) + # self.pipeline_observer = observer + + def parse(self, response): + # Note: a "church register url" like https://data.matricula-online.eu/de/deutschland/aachen/aachen-hl-kreuz/KB+001/?pg=1 + # leads to a page where the image with some page number is embedded in a canvas. The user can navigate to the next page, + # manipulate the image etc. + # Unfortunatly, there are no direct URLs pointing to a PNG file (see https://github.com/lsg551/matricula-online-scraper/issues/3) + # which could be easily used to scrape the source image. 
+ # Instead, Matricula encodes those paths in base64 and loads them via JavaScript. Each page's (whether `?pg=2` or `?pg=3`) HTML + # has a variable `dv1` in a script tag. This variable contains the base64-encoded image paths to all scanned images of + # the church register in question. This needs to be extracted and decoded to obtain a list of URLs to the images. + + # self.pipeline_observer.mark_as_started(response.url) + + # found in the last script tag in the body of the HTML + dv1_var = response.xpath("//body/script[last()]/text()").get() + + # this regex matches the JavaScript variable `dv1` and extracts the values from it + # keys `labels` and `files` are JSON fields in the variable `dv1` + # `dv1 = new arc.imageview.MatriculaDocView("document", { "labels": […], "files": […] })` + pattern = r"dv1\s*=\s*new\s+arc\.imageview\.MatriculaDocView\(\"document\",\s*\{[^}]*\"labels\"\s*:\s*(\[[^\]]*\]),[^}]*\"files\"\s*:\s*(\[[^\]]*\])" + matches = re.search(pattern, dv1_var, re.DOTALL) + + if not matches: + self.logger.error( + "Could not extract 'labels' and 'files' from JavaScript variable 'dv1'" + ) + return + + labels = matches.group(1) + labels = json.loads(labels) + + files = matches.group(2) + files = json.loads(files) + # [7:][:-1] removes the leading `/image/` and trailing `/` + # files = [base64.b64decode(file[7:][:-1]).decode("utf-8") for file in files] + for idx, file in enumerate(files): + try: + raw_base64_str = file[7:][:-1] + # counteract malformed base64 strings with padding + missing_padding = len(raw_base64_str) % 4 + if missing_padding: + raw_base64_str += "=" * (4 - missing_padding) + files[idx] = base64.b64decode(raw_base64_str).decode("utf-8") + except Exception as err: + self.logger.error( + f"Could not decode base64-encoded image URL {file}. 
Error {err}", + exc_info=True, + ) + continue + + # TODO: implement option `--dump-decoded-urls-only` to only output the decoded URLs and labels + # if dump_decoded_urls_only: + # yield from ( + # {"label": label, "file": file} for label, file in zip(labels, files) + # ) + + # if len(files) > 0: + # for file, label in zip(files, labels): + # self.pipeline_observer.observe(file, label, initiator=response.url) + # self.pipeline_observer.mark_as_in_process(response.url) + + yield {"image_urls": files}