From 76c8c48196a32f6018df63513a766f9cf22f7f69 Mon Sep 17 00:00:00 2001
From: "Alexie (Boyong) Madolid"
Date: Fri, 10 Nov 2023 14:13:56 +0800
Subject: [PATCH 1/2] [SCRAPER]: Initial implementation

---
 jaseci_ai_kit/install.sh                      |  2 +-
 .../jac_misc/jac_misc/scraper/__init__.py     |  1 +
 .../jac_misc/scraper/requirements.txt         |  1 +
 .../jac_misc/jac_misc/scraper/scraper.py      | 71 +++++++++++++++++++
 jaseci_ai_kit/jac_misc/setup.py               |  1 +
 5 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
 create mode 100644 jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
 create mode 100644 jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py

diff --git a/jaseci_ai_kit/install.sh b/jaseci_ai_kit/install.sh
index 157ea5e092..443592344c 100644
--- a/jaseci_ai_kit/install.sh
+++ b/jaseci_ai_kit/install.sh
@@ -2,7 +2,7 @@
 JAC_NLP_MODULES=("bart_sum" "cl_summer" "ent_ext" "fast_enc" "sbert_sim" "t5_sum" "text_seg" "tfm_ner" "use_enc" "use_qa" "zs_classifier" "bi_enc" "topic_ext" "gpt2" "gpt3" "dolly" "llm")
 JAC_SPEECH_MODULES=("stt" "vc_tts")
-JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain")
+JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain" "scraper")
 JAC_VISION_MODULES=("detr" "rftm" "yolos" "dpt")
 
 install_modules() {
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
new file mode 100644
index 0000000000..8c161d389a
--- /dev/null
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
@@ -0,0 +1 @@
+from .scraper import * # noqa
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt b/jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
new file mode 100644
index 0000000000..8b3fba4a13
--- /dev/null
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
@@ -0,0 +1 @@
+playwright>=1.39.0
\ No newline at end of file
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
new file mode 100644
index 0000000000..a6d97ca8b9
--- /dev/null
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -0,0 +1,71 @@
+from jaseci.jsorc.live_actions import jaseci_action
+from playwright.sync_api import sync_playwright, Page
+
+
+@jaseci_action(act_group=["ws"], allow_remote=True)
+def scrape(urls: set, depth: int = 1):
+    all_content = ""
+
+    scraped = set()
+    with sync_playwright() as spw:
+        browser = spw.chromium.launch()
+        page = browser.new_page()
+
+        while depth > 0:
+            content, urls = scraping(page, urls, scraped)
+            all_content += f"\n{content}"
+            depth -= 1
+
+        browser.close()
+
+    return " ".join(all_content.split())
+
+
+def load_and_save(page: Page, target: str, scraped: set):
+    print("#############################")
+    try:
+        scraped.add(target)
+        print(f"loading {target} ...")
+        page.goto(target, wait_until="networkidle")
+
+        # print(f"capturing {target} ...")
+        # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)
+
+        print(f"getting relevant content from {target} ...")
+        return page.evaluate(
+            """() =>
+            document.body.textContent;
+            """
+        )
+    except Exception as e:
+        print(
+            f"Error occurred while trying to load and save {target} ...\n{e}",
+        )
+        return ""
+
+
+def crawling(page: Page):
+    try:
+        return page.query_selector_all("a[href]")
+    except Exception as e:
+        print(f"Error occurred while trying to crawl {page.url}!\n{e}")
+        return []
+
+
+def scraping(page: Page, urls: set, scraped: set):
+    content = ""
+    next_scrape = set()
+
+    while urls:
+        url: str = urls.pop()
+        if url not in scraped:
+            content += load_and_save(page, url, scraped)
+
+            for ahref in crawling(page):
+                href = ahref.get_attribute("href")
+                if href.startswith("http"):
+                    next_scrape.add(href)
+                elif href.startswith("/"):
+                    next_scrape.add(f"{url}{href}")
+
+    return content, next_scrape
diff --git a/jaseci_ai_kit/jac_misc/setup.py b/jaseci_ai_kit/jac_misc/setup.py
index 453a07f003..a3d260dc63 100644
--- a/jaseci_ai_kit/jac_misc/setup.py
+++ b/jaseci_ai_kit/jac_misc/setup.py
@@ -11,6 +11,7 @@
     "huggingface",
     "langchain",
     "forecast",
+    "scraper",
 ]
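Before the follow-up patch, a minimal sketch of how this initial action could be exercised directly from Python, assuming the module is installed, Playwright's browser binaries are present (`playwright install chromium`), and the decorated function remains directly callable; the URL and import path below are illustrative only:

    # Hypothetical usage of the patch-1 API: crawl a seed page plus the
    # pages it links to (depth=2) and get back whitespace-normalized text.
    from jac_misc.scraper import scrape

    text = scrape(urls={"https://example.com"}, depth=2)
    print(text[:200])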
From 075a59870c69261ee62d4a37583bf13adbf52388 Mon Sep 17 00:00:00 2001
From: "Alexie (Boyong) Madolid"
Date: Tue, 28 Nov 2023 17:36:47 +0800
Subject: [PATCH 2/2] [SCRAPER]: Complex conditions

---
 .../jac_misc/jac_misc/scraper/scraper.py      | 125 +++++++++++++++---
 1 file changed, 109 insertions(+), 16 deletions(-)

diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
index a6d97ca8b9..0b950e3c75 100644
--- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -1,9 +1,19 @@
 from jaseci.jsorc.live_actions import jaseci_action
 from playwright.sync_api import sync_playwright, Page
+from typing import Union
+from re import search
 
 
 @jaseci_action(act_group=["ws"], allow_remote=True)
-def scrape(urls: set, depth: int = 1):
+def scrape(
+    urls: set,
+    scripts: dict = {},
+    url_filters: list = [],
+    timeout: int = 60000,
+    depth: int = 1,
+    detailed: bool = False,
+    excluded_elem: list = ["script", "style", "link", "noscript"],
+):
     all_content = ""
 
     scraped = set()
@@ -12,31 +22,78 @@ def scrape(urls: set, depth: int = 1):
         page = browser.new_page()
 
         while depth > 0:
-            content, urls = scraping(page, urls, scraped)
+            content, urls = scraping(
+                page, urls, scripts, url_filters, timeout, scraped, excluded_elem
+            )
             all_content += f"\n{content}"
             depth -= 1
 
         browser.close()
 
-    return " ".join(all_content.split())
+    contents = " ".join(all_content.split())
+    if detailed:
+        return {"contents": contents, "scraped": scraped}
+    return contents
+
+
+def load_and_save(
+    page: Page,
+    target: str,
+    script: Union[dict, str],
+    timeout: int,
+    scraped: set,
+    excluded_elem: list,
+):
+    wait_for = script.get("wait_for")
+    selector = script.get("selector")
+    custom = script.get("custom")
+
+    pre = script.get("pre") or {}
+    post = script.get("post") or {}
 
-
-def load_and_save(page: Page, target: str, scraped: set):
     print("#############################")
     try:
         scraped.add(target)
         print(f"loading {target} ...")
-        page.goto(target, wait_until="networkidle")
+        page.goto(target, wait_until="networkidle", timeout=timeout)
+
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        run_script(page, pre, "pre")
 
         # print(f"capturing {target} ...")
         # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)
 
-        print(f"getting relevant content from {target} ...")
-        return page.evaluate(
-            """() =>
-            document.body.textContent;
-            """
-        )
+        exclusion = ""
+        for exc in excluded_elem:
+            exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n'
+
+        query = f"""{{
+            clone = document.body.cloneNode(true);
+            {exclusion}
+            return clone.textContent;
+        }}"""
+        if custom:
+            query = f"{{{custom}}}"
+        elif selector:
+            query = f"""
+                Array.prototype.map.call(
+                    document.querySelectorAll("{selector}"),
+                    d => {{
+                        clone = d.cloneNode(true);
+                        {exclusion}
+                        return clone.textContent;
+                    }}).join("\\n");
+            """
+
+        print(f"getting relevant content using {query} ...")
+        content = page.evaluate(f"() =>{query}")
+
+        run_script(page, post, "post")
+
+        return content
     except Exception as e:
         print(
             f"Error occurred while trying to load and save {target} ...\n{e}",
         )
@@ -44,6 +101,19 @@ def load_and_save(page: Page, target: str, scraped: set):
         return ""
 
 
+def run_script(page: Page, script: dict, title: str):
+    if script:
+        expr = script["expr"]
+        print(f"running {title} script {expr}")
+        page.evaluate(f"() =>{{{expr}}}")
+
+        wait_for = script.get("wait_for") or {}
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        page.wait_for_load_state("networkidle")
+
+
 def crawling(page: Page):
     try:
         return page.query_selector_all("a[href]")
@@ -52,20 +122,43 @@ def crawling(page: Page):
     except Exception as e:
         print(f"Error occurred while trying to crawl {page.url}!\n{e}")
         return []
 
 
-def scraping(page: Page, urls: set, scraped: set):
+def scraping(
+    page: Page,
+    urls: set,
+    scripts: dict,
+    url_filters: list,
+    timeout: int,
+    scraped: set,
+    excluded_elem: list,
+):
     content = ""
     next_scrape = set()
 
     while urls:
         url: str = urls.pop()
         if url not in scraped:
-            content += load_and_save(page, url, scraped)
+            script = {}
+            for key, val in scripts.items():
+                if search(key, url):
+                    script = val
+                    break
+
+            content += load_and_save(page, url, script, timeout, scraped, excluded_elem)
 
             for ahref in crawling(page):
                 href = ahref.get_attribute("href")
+                if href.startswith("/"):
+                    href = f"{url}{href}"
+
                 if href.startswith("http"):
-                    next_scrape.add(href)
-                elif href.startswith("/"):
-                    next_scrape.add(f"{url}{href}")
+                    included = True
+
+                    for filter in url_filters:
+                        if search(filter, href):
+                            included = False
+                            break
+
+                    if included:
+                        next_scrape.add(href)
 
     return content, next_scrape
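The second patch turns the one-shot scraper into a configurable crawler: `scripts` maps URL regexes (matched with `re.search`) to per-page instructions, where `wait_for` is expanded into `page.wait_for_selector(**wait_for)`, `pre`/`post` are JavaScript snippets run by `run_script` before and after extraction, and `selector`/`custom` narrow what text is extracted; `url_filters` lists regexes for links to drop while crawling, and `detailed=True` returns the visited URLs alongside the text. A sketch of a call against this surface, with a hypothetical site, selectors, and filters:

    # Illustrative only: the URL, CSS selectors, and regexes below are
    # placeholders, not part of the patch.
    from jac_misc.scraper import scrape

    result = scrape(
        urls={"https://example.com/docs"},
        scripts={
            r"example\.com/docs": {
                # wait for the main element before extracting
                "wait_for": {"selector": "main", "timeout": 10000},
                # dismiss a (hypothetical) cookie banner first
                "pre": {"expr": "document.querySelector('#cookie-ok')?.click();"},
                # extract only the article bodies instead of the whole <body>
                "selector": "main article",
            },
        },
        url_filters=[r"\.(png|jpg|pdf)$", r"/login"],  # links to skip
        timeout=30000,
        depth=2,
        detailed=True,
    )
    print(sorted(result["scraped"]))
    print(result["contents"][:200])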