diff --git a/matricula_online_scraper/cli/cli_utils/badges.py b/matricula_online_scraper/cli/cli_utils/badges.py
new file mode 100644
index 0000000..25cf414
--- /dev/null
+++ b/matricula_online_scraper/cli/cli_utils/badges.py
@@ -0,0 +1,46 @@
+from rich.style import Style
+from rich.text import Text
+
+from .color import Color
+
+
+class Badge:
+    """Pre-styled status badges for console output."""
+
+    Error = Text(
+        " ERROR ",
+        style=Style(bgcolor=Color.Red, bold=True),
+        justify="center",
+        end="",
+    )
+
+    Warning = Text(
+        " WARNING ",
+        style=Style(bgcolor=Color.Orange, bold=True),
+        justify="center",
+        end="",
+    )
+
+    Info = Text(
+        " INFO ",
+        style=Style(bgcolor=Color.Blue, bold=True),
+        justify="center",
+        end="",
+    )
+
+    Success = Text(
+        " SUCCESS ",
+        style=Style(bgcolor=Color.Green, bold=True),
+        justify="center",
+        end="",
+    )
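+
+
+# A minimal usage sketch (illustrative, not executed here): badges are meant
+# to be printed through a rich Console together with a message, e.g.
+#
+#   from rich.console import Console
+#   console = Console(stderr=True)
+#   console.print(Badge.Error, "something went wrong")
+#
+# `end=""` keeps the badge and the message on the same line.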
diff --git a/matricula_online_scraper/cli/cli_utils/color.py b/matricula_online_scraper/cli/cli_utils/color.py
new file mode 100644
index 0000000..9a3410e
--- /dev/null
+++ b/matricula_online_scraper/cli/cli_utils/color.py
@@ -0,0 +1,7 @@
+class Color:
+    """Named colors used across the CLI, mapped to rich color strings."""
+
+    Red = "red"
+    Green = "green"
+    Blue = "dodger_blue1"
+    Orange = "orange1"
diff --git a/matricula_online_scraper/cli/get.py b/matricula_online_scraper/cli/get.py
new file mode 100644
index 0000000..e2f9388
--- /dev/null
+++ b/matricula_online_scraper/cli/get.py
@@ -0,0 +1,117 @@
+import logging
+import select
+import sys
+from pathlib import Path
+from typing import Annotated, List, Optional
+
+import typer
+from rich import console
+from rich.text import Text
+from scrapy import crawler
+
+from ..spiders.church_register import ChurchRegisterSpider
+from .cli_utils.badges import Badge
+from .cli_utils.color import Color
+
+# from ..utils.pipeline_observer import PipelineObserver
+
+logger = logging.getLogger(__name__)
+stderr = console.Console(stderr=True)
+
+app = typer.Typer()
+
+
+@app.command()
+def church_register(
+    urls: Annotated[
+        Optional[List[str]],
+        typer.Argument(
+            help=(
+                "One or more URLs to church register pages,"
+                " for example https://data.matricula-online.eu/de/deutschland/augsburg/aach/1-THS/"
+                " where '/1-THS' identifies one church register from Aach, a parish in Augsburg, Germany."
+                " The query parameter '?pg=1' may or may not be included in the URL;"
+                " it is ignored either way because it does not alter the scraper's behavior."
+                " If no URL is provided, this argument is read from stdin."
+            )
+        ),
+    ] = None,
+    directory: Annotated[
+        Path,
+        typer.Option(
+            "--directory",
+            "-d",
+            help="Directory to save the image files in.",
+        ),
+    ] = Path.cwd() / "church_register_images",
+    debug: Annotated[
+        bool,
+        typer.Option(
+            help="Enable debug mode for scrapy.",
+        ),
+    ] = False,
+):
+    # timeout in seconds when polling stdin for input
+    TIMEOUT = 0.1
+
+    if not urls:
+        # NOTE: select() on sys.stdin works on POSIX systems only
+        readable, _, _ = select.select([sys.stdin], [], [], TIMEOUT)
+
+        if readable:
+            urls = sys.stdin.read().splitlines()
+        else:
+            stderr.print(
+                Badge.Error,
+                Text("No URLs provided via stdin.", style=Color.Red),
+                "Please provide at least one URL as argument or via stdin.",
+                "Use the --help flag for more information.",
+            )
+            raise typer.Exit(1)
+
+    # unreachable, only narrows Optional[List[str]] for the type checker
+    if not urls:
+        raise NotImplementedError()
+
+    # observer = PipelineObserver(start_urls=urls)
+
+    try:
+        process = crawler.CrawlerProcess(
+            settings={
+                "LOG_ENABLED": debug,
+                "LOG_LEVEL": "DEBUG" if debug else "CRITICAL",
+                "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
+                # IMAGES_STORE expects a plain string path
+                "IMAGES_STORE": str(directory.resolve()),
+            }
+        )
+        process.crawl(
+            ChurchRegisterSpider,
+            # observer=observer,
+            start_urls=urls,
+        )
+        process.start()
+
+    except Exception as err:
+        raise typer.Exit(1) from err
+
+    else:
+        stderr.print(
+            Badge.Success,
+            Text("Finished scraping church register images.", style=Color.Green),
+        )
+
+    # finally:
+    #     observer.update_initiator_statuses()
+    #     stderr.print(observer.start_urls)
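+
+
+# Example invocation (executable and command names are illustrative; typer
+# derives the command name from the function name):
+#
+#   matricula-online-scraper get church-register \
+#       https://data.matricula-online.eu/de/deutschland/augsburg/aach/1-THS/ \
+#       --directory ./church_register_images
+#
+# or with the URLs piped via stdin:
+#
+#   cat urls.txt | matricula-online-scraper get church-register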
diff --git a/matricula_online_scraper/main.py b/matricula_online_scraper/main.py
index 80b33d2..65ce104 100755
--- a/matricula_online_scraper/main.py
+++ b/matricula_online_scraper/main.py
@@ -8,6 +8,7 @@ import pkg_resources
 import typer
 from matricula_online_scraper.cli.fetch import app as fetch_app
+from matricula_online_scraper.cli.get import app as get_app
 
 app = typer.Typer(
     help="Command Line Interface tool for scraping Matricula Online https://data.matricula-online.eu.",
 )
@@ -17,6 +18,10 @@
     fetch_app,
     name="fetch",
 )
+app.add_typer(
+    get_app,
+    name="get",
+)
 
 
 @app.callback()
diff --git a/matricula_online_scraper/middlewares/Catch404.py b/matricula_online_scraper/middlewares/Catch404.py
new file mode 100644
index 0000000..b901fd0
--- /dev/null
+++ b/matricula_online_scraper/middlewares/Catch404.py
@@ -0,0 +1,64 @@
+import logging
+from typing import Any, Iterable
+
+from rich import console
+from rich.text import Text
+from scrapy import Spider
+from scrapy.http.response import Response
+from scrapy.spidermiddlewares.httperror import HttpError
+
+from matricula_online_scraper.cli.cli_utils.badges import Badge
+from matricula_online_scraper.cli.cli_utils.color import Color
+
+# from matricula_online_scraper.utils.pipeline_observer import PipelineObserver
+
+logger = logging.getLogger(__name__)
+stderr = console.Console(stderr=True)
+
+
+class Catch404:
+    # This runs directly after `scrapy.spidermiddlewares.httperror.HttpErrorMiddleware`.
+    # It catches HTTP error responses (e.g. 404) and prints a message to the user.
+
+    def process_spider_exception(
+        self, response: Response, exception: Exception, spider: Spider
+    ) -> Iterable[Any] | None:
+        # try:
+        #     observer: PipelineObserver | None = spider.__dict__["pipeline_observer"]
+        #     if observer is None or not isinstance(observer, PipelineObserver):
+        #         raise AttributeError()
+        # except AttributeError as err:
+        #     observer = None
+        #     logger.exception(f"PipelineObserver not found in spider: {err}")
+
+        # if observer:
+        #     url = response.url
+        #     status = "failed"
+        #     try:
+        #         observer.update(url, status)
+        #     except Exception as err:
+        #         logger.exception(
+        #             f"Failed to update observer for {url} with new status '{status}': {err}"
+        #         )
+
+        if isinstance(exception, HttpError):
+            if exception.response.status == 404:
+                stderr.print(
+                    Badge.Error,
+                    Text(
+                        f"The URL {exception.response.url} returned a 404 status code."
+                        " This is likely due to the page not existing or the URL being incorrect."
+                        " Please check the URL and try again.",
+                        style=Color.Red,
+                    ),
+                )
+            else:
+                stderr.print(
+                    Badge.Error,
+                    Text(
+                        f"The URL {exception.response.url} returned a {exception.response.status} status code.",
+                        style=Color.Red,
+                    ),
+                )
+
+        return None  # pass the exception on to the next middleware
diff --git a/matricula_online_scraper/spiders/church_register.py b/matricula_online_scraper/spiders/church_register.py
new file mode 100644
index 0000000..f849b2b
--- /dev/null
+++ b/matricula_online_scraper/spiders/church_register.py
@@ -0,0 +1,109 @@
+"""
+Scrapy spider to scrape church registers (= scanned church books) from Matricula Online.
+"""
+
+import base64
+import json
+import re
+
+import scrapy
+
+# from ..utils.pipeline_observer import PipelineObserver
+
+
+class ChurchRegisterSpider(scrapy.Spider):
+    name = "church_register"
+
+    # see the order of middlewares here: https://doc.scrapy.org/en/latest/topics/settings.html#std-setting-SPIDER_MIDDLEWARES_BASE
+    # 51 is right after the built-in `HttpErrorMiddleware` (50), which handles 404s
+    custom_settings = {
+        "SPIDER_MIDDLEWARES": {
+            "matricula_online_scraper.middlewares.Catch404.Catch404": 51
+        },
+        # "DOWNLOADER_MIDDLEWARES": {
+        #     "matricula_online_scraper.middlewares.DownloadMiddleware.DownloadMiddleware": 901
+        # },
+    }
+
+    # def __init__(self, *args, observer: PipelineObserver, **kwargs):
+    #     super().__init__(*args, **kwargs)
+    #     self.pipeline_observer = observer
+
+    def parse(self, response):
+        # Note: a "church register URL" like https://data.matricula-online.eu/de/deutschland/aachen/aachen-hl-kreuz/KB+001/?pg=1
+        # leads to a page where the image for some page number is embedded in a canvas. The user can navigate to the next page,
+        # manipulate the image, etc.
+        # Unfortunately, there are no direct URLs pointing to a PNG file (see https://github.com/lsg551/matricula-online-scraper/issues/3)
+        # that could easily be used to scrape the source images.
+        # Instead, Matricula encodes those paths in base64 and loads them via JavaScript. Each page's HTML (whether `?pg=2` or `?pg=3`)
+        # contains a variable `dv1` in a script tag, holding the base64-encoded image paths for all scanned images of
+        # the church register in question. It needs to be extracted and decoded to obtain a list of URLs to the images.
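+
+        # For example, an entry in `files` has the shape "/image/<base64>/";
+        # stripping the prefix and suffix and decoding the remainder yields a
+        # plain URL (the value below is illustrative):
+        #
+        #   >>> import base64
+        #   >>> base64.b64decode("aHR0cHM6Ly9leGFtcGxlLm9yZy9pbWcvMDAwMS5qcGc=").decode("utf-8")
+        #   'https://example.org/img/0001.jpg'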
+
+        # self.pipeline_observer.mark_as_started(response.url)
+
+        # `dv1` is found in the last script tag in the body of the HTML
+        dv1_var = response.xpath("//body/script[last()]/text()").get()
+
+        if dv1_var is None:
+            self.logger.error("Could not find the JavaScript variable 'dv1' in the page")
+            return
+
+        # this regex matches the JavaScript variable `dv1` and extracts the values of
+        # its JSON fields `labels` and `files`:
+        # `dv1 = new arc.imageview.MatriculaDocView("document", { "labels": […], "files": […] })`
+        pattern = r"dv1\s*=\s*new\s+arc\.imageview\.MatriculaDocView\(\"document\",\s*\{[^}]*\"labels\"\s*:\s*(\[[^\]]*\]),[^}]*\"files\"\s*:\s*(\[[^\]]*\])"
+        matches = re.search(pattern, dv1_var, re.DOTALL)
+
+        if not matches:
+            self.logger.error(
+                "Could not extract 'labels' and 'files' from JavaScript variable 'dv1'"
+            )
+            return
+
+        labels = json.loads(matches.group(1))
+        files = json.loads(matches.group(2))
+
+        decoded_files = []
+        for file in files:
+            try:
+                # [7:][:-1] removes the leading `/image/` and the trailing `/`
+                raw_base64_str = file[7:][:-1]
+                # counteract malformed base64 strings by restoring missing padding
+                missing_padding = len(raw_base64_str) % 4
+                if missing_padding:
+                    raw_base64_str += "=" * (4 - missing_padding)
+                decoded_files.append(base64.b64decode(raw_base64_str).decode("utf-8"))
+            except Exception as err:
+                self.logger.error(
+                    f"Could not decode base64-encoded image URL {file}. Error: {err}",
+                    exc_info=True,
+                )
+                continue
+
+        # TODO: implement an option `--dump-decoded-urls-only` to only output the decoded URLs and labels
+        # if dump_decoded_urls_only:
+        #     yield from (
+        #         {"label": label, "file": file} for label, file in zip(labels, decoded_files)
+        #     )
+
+        # if len(decoded_files) > 0:
+        #     for file, label in zip(decoded_files, labels):
+        #         self.pipeline_observer.observe(file, label, initiator=response.url)
+        #     self.pipeline_observer.mark_as_in_process(response.url)
+
+        yield {"image_urls": decoded_files}
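+
+        # The item above feeds scrapy's built-in ImagesPipeline (enabled in
+        # `get.py` via ITEM_PIPELINES), which downloads every URL listed under
+        # the `image_urls` key and stores the files below IMAGES_STORE.
+        # A decoded item might look like this (URL shape is illustrative):
+        #
+        #   {"image_urls": ["https://data.matricula-online.eu/…/0001.jpg", …]}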