Commit

Merge branch 'feature-request/scraper' into zsb/test
amadolid committed Nov 29, 2023
2 parents bf8c2cf + 075a598 commit e2dd5b6
Showing 5 changed files with 168 additions and 1 deletion.
2 changes: 1 addition & 1 deletion jaseci_ai_kit/install.sh
@@ -2,7 +2,7 @@
 
 JAC_NLP_MODULES=("bart_sum" "cl_summer" "ent_ext" "fast_enc" "sbert_sim" "t5_sum" "text_seg" "tfm_ner" "use_enc" "use_qa" "zs_classifier" "bi_enc" "topic_ext" "gpt2" "gpt3" "dolly" "llm")
 JAC_SPEECH_MODULES=("stt" "vc_tts")
-JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain")
+JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain" "scraper")
 JAC_VISION_MODULES=("detr" "rftm" "yolos" "dpt")
 
 install_modules() {
1 change: 1 addition & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
@@ -0,0 +1 @@
from .scraper import * # noqa
1 change: 1 addition & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
@@ -0,0 +1 @@
playwright>=1.39.0
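
Note: the playwright package on PyPI provides only the Python client; the browser binaries are installed separately (typically via python -m playwright install chromium), so this pip requirement alone is not enough for the scraper to launch Chromium.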
164 changes: 164 additions & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -0,0 +1,164 @@
from jaseci.jsorc.live_actions import jaseci_action
from playwright.sync_api import sync_playwright, Page
from re import search


@jaseci_action(act_group=["ws"], allow_remote=True)
def scrape(
urls: set,
scripts: dict = {},
url_filters: list = [],
timeout: int = 60000,
depth: int = 1,
detailed: bool = False,
excluded_elem: list = ["script", "style", "link", "noscript"],
):
all_content = ""

scraped = set()
with sync_playwright() as spw:
browser = spw.chromium.launch()
page = browser.new_page()

        # each depth level scrapes the current frontier of urls and queues
        # the links discovered on those pages as the next frontier
        while depth > 0:
content, urls = scraping(
page, urls, scripts, url_filters, timeout, scraped, excluded_elem
)
all_content += f"\n{content}"
depth -= 1

browser.close()

contents = " ".join(all_content.split())

if detailed:
return {"contents": contents, "scraped": scraped}
return contents


def load_and_save(
page: Page,
target: str,
    script: dict,
timeout: int,
scraped: set,
excluded_elem: list,
):
    # per-url script config: optional wait target, extraction selector,
    # custom JS body, and pre/post expressions to run around extraction
    wait_for = script.get("wait_for")
selector = script.get("selector")
custom = script.get("custom")

pre = script.get("pre") or {}
post = script.get("post") or {}

print("#############################")
try:
scraped.add(target)
print(f"loading {target} ...")
page.goto(target, wait_until="networkidle", timeout=timeout)

if wait_for:
page.wait_for_selector(**wait_for)

run_script(page, pre, "pre")

# print(f"capturing {target} ...")
# page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)

exclusion = ""
for exc in excluded_elem:
exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n'

query = f"""{{
clone = document.body.cloneNode(true);
{exclusion}
return clone.textContent;
}}"""
if custom:
query = f"{{{custom}}}"
elif selector:
query = f"""
Array.prototype.map.call(
document.querySelectorAll("{selector}"),
d => {{
clone = d.cloneNode(true);
{exclusion}
return clone.textContent;
}}).join("\n");
"""

print(f"getting relevant content using {query} ...")
content = page.evaluate(f"() =>{query}")

run_script(page, post, "post")

return content
except Exception as e:
print(
f"Error occurs when trying to load and save {target} ...\n{e}",
)
return ""


def run_script(page: Page, script: dict, title: str):
if script:
expr = script["expr"]
print(f"running {title} script {expr}")
page.evaluate(f"() =>{{{expr}}}")

wait_for = script.get("wait_for") or {}
if wait_for:
page.wait_for_selector(**wait_for)

page.wait_for_load_state("networkidle")


def crawling(page: Page):
try:
return page.query_selector_all("a[href]")
except Exception as e:
print(f"Error occurs when trying to crawl {page.url} !\n{e}")
return []


def scraping(
page: Page,
urls: set,
scripts: dict,
url_filters: list,
timeout: int,
scraped: set,
excluded_elem: list,
):
content = ""
next_scrape = set()

while urls:
url: str = urls.pop()
if url not in scraped:
            # use the first script config whose regex matches this url
            script = {}
            for key, val in scripts.items():
                if search(key, url):
                    script = val
                    break

content += load_and_save(page, url, script, timeout, scraped, excluded_elem)

            # queue links found on the page for the next depth level
            for ahref in crawling(page):
                href = ahref.get_attribute("href")
                # resolve root-relative links against the current url
                if href.startswith("/"):
                    href = f"{url}{href}"

                if href.startswith("http"):
                    included = True

                    # drop links matching any exclusion pattern
                    for pattern in url_filters:
                        if search(pattern, href):
                            included = False
                            break

                    if included:
                        next_scrape.add(href)

return content, next_scrape
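
A minimal usage sketch for the new action (hypothetical, not part of this commit): it calls scrape directly as a Python function for illustration, and the URL, selector, and script values are made-up placeholders showing the shapes the code expects. scripts keys are regex patterns matched against each url via re.search, and wait_for entries are forwarded as keyword arguments to Playwright's wait_for_selector.

# Hypothetical example; all urls and selectors are placeholders.
from jac_misc.scraper import scrape

result = scrape(
    urls={"https://example.com"},
    scripts={
        r"example\.com": {
            "wait_for": {"selector": "main", "timeout": 10000},
            "selector": "article",  # extract text only from matching elements
            "pre": {"expr": "window.scrollTo(0, document.body.scrollHeight);"},
        }
    },
    url_filters=[r"\.pdf$", r"/login"],  # drop links matching these patterns
    depth=2,        # also scrape links discovered on the first pass
    detailed=True,  # return the visited urls alongside the text
)
print(result["contents"][:200])
print(result["scraped"])

With detailed=False (the default) only the whitespace-normalized text is returned; depth=1 scrapes just the given urls without following links.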
1 change: 1 addition & 0 deletions jaseci_ai_kit/jac_misc/setup.py
@@ -11,6 +11,7 @@
     "huggingface",
     "langchain",
     "forecast",
+    "scraper",
 ]


