
Commit

Merge pull request #71 from lunakv/fix-docs-scraper
Fix WPN docs scraper
lunakv authored Feb 7, 2024
2 parents 11f7fb3 + 614c3ed commit 40c124d
Showing 2 changed files with 11 additions and 8 deletions.
src/scraper/docs_scraper.py (12 changes: 6 additions & 6 deletions)
@@ -42,12 +42,15 @@ def parse_nuxt_object(page):
     obj = match.group(2)

     # replace function parameters (key:a) with null values (key:null) to make this parseable
+    # also replace parameter values in arrays ([a,a,b])
     for param in params:
         # some values have a trailing $ at the end for some reason (key:a$)
         obj = re.sub(":" + param + r"\b\$?", ":null", obj)
+        obj = re.sub(r"\b" + param + r"([,\]])", r"null\1", obj)

     # similarly, some values are just $, which breaks the parser
-    obj = obj.replace(":$,", ":null,")
+    # there's also a "void 0" value somewhere in there
+    obj = obj.replace(":$,", ":null,").replace("void 0", "0")

     try:
         parsed = hjson.loads(obj)
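The substitutions above rewrite the minified NUXT payload's bare variable references into valid hjson before parsing. A minimal sketch of the effect (the payload string and the params list are invented here; the real ones are extracted from the scraped page):

    import re

    import hjson

    params = ["a", "b"]  # placeholder names taken from the NUXT function wrapper
    obj = "{title:a,tags:[a,b],weird:a$,empty:$,count:void 0}"

    for param in params:
        obj = re.sub(":" + param + r"\b\$?", ":null", obj)        # key:a / key:a$ -> key:null
        obj = re.sub(r"\b" + param + r"([,\]])", r"null\1", obj)  # [a,b] -> [null,null]

    obj = obj.replace(":$,", ":null,").replace("void 0", "0")
    print(obj)  # {title:null,tags:[null,null],weird:null,empty:null,count:0}
    print(hjson.loads(obj)["count"])  # 0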
@@ -58,8 +61,6 @@ def parse_nuxt_object(page):
     docs = parsed.get("fetch", {}).get("DocumentationDownload:0", {}).get("documents")
     if not docs:
         notify_scrape_error("List of policy documents not found in parsed NUXT object")
-        logger.error("List of policy documents not found in parsed NUXT object")
-        logger.error(parsed)
         return None

     return docs
@@ -96,7 +97,7 @@ def can_scrape(session: Session):
 def scrape_docs_page():
     with SessionLocal() as session:
         with session.begin():
-            if not (can_scrape(session)):
+            if not can_scrape(session):
                 logger.info("Skipping broken scrape, retry moved to daily")
                 return

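can_scrape and set_broken are defined elsewhere in the repository; the pattern is that once a scrape run is marked broken, the regular job skips itself and a daily job retries instead. A rough sketch of that gating under an assumed single-row status table (the table and column are hypothetical, not the repo's actual schema):

    from sqlalchemy import text
    from sqlalchemy.orm import Session

    def can_scrape(session: Session) -> bool:
        # Hypothetical: the regular scrape runs only while the broken flag is unset.
        broken = session.execute(text("SELECT broken FROM scrape_status")).scalar()
        return not broken

    def set_broken(session: Session) -> None:
        # Hypothetical: flag the scraper as broken so retries move to the daily schedule.
        session.execute(text("UPDATE scrape_status SET broken = TRUE"))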
@@ -113,7 +114,7 @@ def scrape_docs_page():
             response = requests.get(docs_page_uri)
             if response.status_code != requests.codes.ok:
                 notify_scrape_error(f"Couldn't fetch WPN docs page (code {response.status_code})")
-                logger.error("Couldn't fetch WPN docs page: %s", response.reason)
+                logger.error(response.reason)
                 set_broken(session)
                 return

@@ -132,7 +133,6 @@ def scrape_docs_page():
             if len(found) != len(docs):
                 # not all links were found correctly, so we don't wanna update anything to be safe
                 notify_scrape_error("Couldn't find link for all WPN documents")
-                logger.error("Couldn't find link for all WPN documents")
                 logger.error(found)
                 set_broken(session)
                 return
src/utils/notifier.py (7 changes: 5 additions & 2 deletions)
@@ -2,12 +2,14 @@

 import requests

+from src.utils import logger
+
 _uri = "https://api.pushover.net/1/messages.json"
 _token = os.environ.get("PUSHOVER_APP_TOKEN")
 _user = os.environ.get("PUSHOVER_USER_KEY")


-def notify(message, title=None, uri=None, uri_title=None, formatted=None):
+def notify(message, title=None, uri=None, uri_title=None, formatted=None, log_level="debug"):
     if os.environ.get("USE_PUSHOVER") != "1":
         return
     payload = {"token": _token, "user": _user, "message": message}
@@ -21,10 +23,11 @@ def notify(message, title=None, uri=None, uri_title=None, formatted=None):
         payload["html"] = 1

     requests.post(_uri, data=payload)
+    logger.log(log_level, "Sending notification: %s", message)


 def notify_scrape_error(message):
-    notify(message, title="Scraping Error")
+    notify(message, title="Scraping Error", log_level="error")


 def _confirm_refresh_uri(doctype):
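With this change every notification is also written to the application log: ordinary notifications at the default "debug" level, scrape errors at "error" (src.utils.logger is the repo's own logging module and evidently accepts level names as strings). A hypothetical call site:

    from src.utils.notifier import notify, notify_scrape_error

    # Pushed via Pushover when USE_PUSHOVER=1 and logged at the default "debug" level.
    notify("Daily scrape finished", title="WPN docs")

    # Same machinery, but logged at "error" via the new log_level parameter.
    notify_scrape_error("Couldn't fetch WPN docs page (code 503)")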

0 comments on commit 40c124d
