From 14f1f3697c6f5b39563c12b4aad0c9dcb5b34ad6 Mon Sep 17 00:00:00 2001 From: Wesley van Lee Date: Fri, 10 Jan 2025 11:31:53 +0100 Subject: [PATCH] Adjust logic to always use downloader middleware when SW_WACZ_SOURCE_URI is configured --- docs/settings.md | 2 +- pyproject.toml | 2 +- scrapy_webarchive/downloadermiddlewares.py | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/settings.md b/docs/settings.md index de5160d..cdbb072 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -57,4 +57,4 @@ This setting defines the location of the WACZ file that should be used as a sour SW_WACZ_CRAWL = True ``` -Setting to control the scraping behavior. If set to `False`, the scraper will bypass the WACZ middleware/downloadermiddleware during the crawling process. +Setting to ignore original `start_requests`, just yield all responses found in WACZ. diff --git a/pyproject.toml b/pyproject.toml index 9fa60d8..45820d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ description = "A webarchive extension for Scrapy" readme = "README.md" keywords = ["Scrapy", "Webarchive", "WARC", "WACZ"] classifiers = [ - "Development Status :: 3 - Alpha", + "Development Status :: 4 - Beta", "Programming Language :: Python :: 3", "Programming Language :: Python", ] diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py index a920813..244222f 100644 --- a/scrapy_webarchive/downloadermiddlewares.py +++ b/scrapy_webarchive/downloadermiddlewares.py @@ -39,12 +39,8 @@ def _check_ignore_conditions(self, request: Request, spider: Spider) -> None: def process_request(self, request: Request, spider: Spider): """Called for each request that goes through the downloader.""" - # Continue default crawl behaviour. - if not self.crawl: - return None - # If the attribute has not been set, none of the WACZ could be opened. - if self.crawl and not hasattr(self, "wacz"): + if not hasattr(self, "wacz"): raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.") # Check if the request should be ignored.