Commit

temporary
amadolid committed Nov 28, 2023
1 parent 76c8c48 commit cc1ea5f
Showing 1 changed file with 94 additions and 15 deletions.
109 changes: 94 additions & 15 deletions in jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -1,9 +1,18 @@
 from jaseci.jsorc.live_actions import jaseci_action
 from playwright.sync_api import sync_playwright, Page
+from typing import Union
+from re import match
 
 
 @jaseci_action(act_group=["ws"], allow_remote=True)
-def scrape(urls: str, depth: int = 1):
+def scrape(
+    urls: set,
+    scripts: dict = {},
+    filters: list = [],
+    depth: int = 1,
+    detailed: bool = False,
+    exclude: list = ["script", "style", "link", "noscript"],
+):
     all_content = ""
 
     scraped = set()
@@ -12,38 +21,91 @@ def scrape(urls: str, depth: int = 1):
         page = browser.new_page()
 
         while depth > 0:
-            content, urls = scraping(page, urls, scraped)
+            content, urls = scraping(page, urls, scripts, filters, scraped, exclude)
             all_content += f"\n{content}"
             depth -= 1
 
         browser.close()
 
-    return " ".join(all_content.split())
+    contents = " ".join(all_content.split())
+
+    if detailed:
+        return {"contents": contents, "scraped": scraped}
+    return contents
 
 
-def load_and_save(page: Page, target: str, scraped: set):
+def load_and_save(
+    page: Page, target: str, script: Union[dict, str], scraped: set, exclude
+):
+    wait_for = script.get("wait_for")
+    selector = script.get("selector")
+    custom = script.get("custom")
+
+    pre = script.get("pre") or {}
+    post = script.get("post") or {}
+
     print("#############################")
     try:
         scraped.add(target)
         print(f"loading {target} ...")
         page.goto(target, wait_until="networkidle")
 
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        run_script(page, pre, "pre")
+
         # print(f"capturing {target} ...")
        # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)
 
-        print(f"getting relevant content {target} ...")
-        return page.evaluate(
-            """() =>
-                document.body.textContent;
-            """
-        )
+        exclusion = ""
+        for exc in exclude:
+            exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n'
+
+        query = f"""{{
+            clone = document.body.cloneNode(true);
+            {exclusion}
+            return clone.textContent;
+        }}"""
+        if custom:
+            query = f"{{{custom}}}"
+        elif selector:
+            query = f"""
+                Array.prototype.map.call(
+                    document.querySelectorAll("{selector}"),
+                    d => {{
+                        clone = d.cloneNode(true);
+                        {exclusion}
+                        return clone.textContent;
+                    }}).join("\n");
+            """
+
+        print(f"getting relevant content using {query} ...")
+        content = page.evaluate(f"() =>{query}")
+
+        run_script(page, post, "post")
+
+        return content
     except Exception as e:
         print(
             f"Error occurs when trying to load and save {target} ...\n{e}",
         )
         return ""
 
 
+def run_script(page: Page, script: dict, title: str):
+    if script:
+        expr = script["expr"]
+        print(f"running {title} script {expr}")
+        page.evaluate(f"() =>{{{expr}}}")
+
+        wait_for = script.get("wait_for") or {}
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        page.wait_for_load_state("networkidle")
+
+
 def crawling(page: Page):
     try:
         return page.query_selector_all("a[href]")
@@ -52,20 +114,37 @@ def crawling(page: Page):
         return []
 
 
-def scraping(page: Page, urls: set, scraped: set):
+def scraping(
+    page: Page, urls: set, scripts: dict, filters: list, scraped: set, exclude: list
+):
     content = ""
     next_scrape = set()
 
     while urls:
         url: str = urls.pop()
         if url not in scraped:
-            content += load_and_save(page, url, scraped)
+            script = {}
+            for key, val in scripts.items():
+                if match(key, url):
+                    script = val
+                    break
+
+            content += load_and_save(page, url, script, scraped, exclude)
 
             for ahref in crawling(page):
                 href = ahref.get_attribute("href")
+                if href.startswith("/"):
+                    href = f"{url}{href}"
+
                 if href.startswith("http"):
-                    next_scrape.add(href)
-                elif href.startswith("/"):
-                    next_scrape.add(f"{url}{href}")
+                    included = True
+
+                    for filter in filters:
+                        if match(filter, href):
+                            included = False
+                            break
+
+                    if included:
+                        next_scrape.add(href)
 
     return content, next_scrape
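
For orientation, here is a hedged usage sketch of the updated scrape action; it is not part of the commit, and the URLs, regex patterns, and selectors are made up. The parameter shapes are inferred from load_and_save and run_script above: scripts keys are regexes matched against each URL with re.match, wait_for is forwarded as keyword arguments to page.wait_for_selector, pre and post carry an expr string evaluated in the page, selector limits extraction to matching nodes, and filters are regexes that drop matching hrefs from the crawl queue.

# Hypothetical invocation; every name and value below is illustrative only.
result = scrape(
    urls={"https://example.com/docs"},
    scripts={
        r"https://example\.com/docs.*": {
            "wait_for": {"selector": "main"},      # kwargs for page.wait_for_selector
            "pre": {"expr": "window.scrollTo(0, document.body.scrollHeight);"},
            "selector": "main article",            # extract text only from these nodes
        },
    },
    filters=[r".*\.(png|jpg|pdf)$"],                # hrefs matching these are not crawled
    depth=2,                                        # number of scrape/crawl passes
    detailed=True,                                  # return {"contents": ..., "scraped": ...}
)
print(result["scraped"])                            # set of URLs actually visited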

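As a second sketch, also not in the commit, the snippet below replays the string formatting from load_and_save to show roughly what page.evaluate receives when neither custom nor selector is set; a two-tag exclude list is used only to keep the output short.

# Rebuild the default extraction query for exclude=["script", "style"].
exclude = ["script", "style"]
exclusion = ""
for exc in exclude:
    exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n'

query = f"""{{
    clone = document.body.cloneNode(true);
    {exclusion}
    return clone.textContent;
}}"""
print(f"() =>{query}")
# The browser then evaluates roughly:
# () =>{
#     clone = document.body.cloneNode(true);
#     clone.querySelectorAll("script").forEach(d => d.remove());
# clone.querySelectorAll("style").forEach(d => d.remove());
#
#     return clone.textContent;
# }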