Skip to content

Commit

Permalink
feat(cl_back_scrape_opinions): add optional wait argument
Browse files Browse the repository at this point in the history
Solves #4564
  • Loading branch information
grossir committed Oct 18, 2024
1 parent ec8bc90 commit 6af9bf9
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
21 changes: 20 additions & 1 deletion cl/scrapers/management/commands/cl_back_scrape_opinions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import time

from juriscraper import AbstractSite
from juriscraper.AbstractSite import logger
from juriscraper.lib.importer import site_yielder
Expand Down Expand Up @@ -27,6 +29,14 @@ def add_backscraper_arguments(parser) -> None:
"imposes a limit of returned documents",
type=int,
)
parser.add_argument(
"--backscrape-wait",
type=int,
default=0,
help="Seconds to wait after consuming each element "
"of the backscrape iterable. Useful to avoid overloading"
" a target server when backscraping.",
)


class Command(cl_scrape_opinions.Command):
Expand All @@ -41,7 +51,7 @@ def parse_and_scrape_site(
) -> None:
"""Parse the site and scrape it using the backscraper
:param mod: The jusriscraper Site object to scrape
:param mod: The juriscraper Site object to scrape
:param options: argparse kwargs dictionary. May contain the following keys:
- full_crawl: Whether or not to do a full crawl (Ignored value)
- backscrape_start: string which may be a date, year, index, etc.
Expand All @@ -50,6 +60,8 @@ def parse_and_scrape_site(
- backscrape_end: end value for backscraper range
- days_interval: days between each (start, end) date pairs in the
Site.back_scrape_iterable
- backscrape_wait: Seconds to wait after consuming each element
of the backscrape iterable
:return: None
"""
Expand All @@ -66,5 +78,12 @@ def parse_and_scrape_site(
site.parse()
self.scrape_court(site, full_crawl=True)

if wait := options["backscrape_wait"]:
logger.info(
"Sleeping for %s seconds before continuing backscrape",
wait,
)
time.sleep(wait)

def save_everything(self, items, index=False, backscrape=True):
super().save_everything(items, index, backscrape)
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import time

from juriscraper.AbstractSite import logger
from juriscraper.lib.importer import site_yielder

Expand Down Expand Up @@ -26,3 +28,10 @@ def parse_and_scrape_site(self, mod, options: dict):
):
site.parse()
self.scrape_court(site, full_crawl=True, backscrape=True)

if wait := options["backscrape_wait"]:
logger.info(
"Sleeping for %s seconds before continuing backscrape",
wait,
)
time.sleep(wait)

0 comments on commit 6af9bf9

Please sign in to comment.