From 76c8c48196a32f6018df63513a766f9cf22f7f69 Mon Sep 17 00:00:00 2001
From: "Alexie (Boyong) Madolid"
Date: Fri, 10 Nov 2023 14:13:56 +0800
Subject: [PATCH 1/2] [SCRAPER]: Initial implementation

---
 jaseci_ai_kit/install.sh                      |  2 +-
 .../jac_misc/jac_misc/scraper/__init__.py     |  1 +
 .../jac_misc/scraper/requirements.txt         |  1 +
 .../jac_misc/jac_misc/scraper/scraper.py      | 71 +++++++++++++++++++
 jaseci_ai_kit/jac_misc/setup.py               |  1 +
 5 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
 create mode 100644 jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
 create mode 100644 jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py

diff --git a/jaseci_ai_kit/install.sh b/jaseci_ai_kit/install.sh
index 157ea5e092..443592344c 100644
--- a/jaseci_ai_kit/install.sh
+++ b/jaseci_ai_kit/install.sh
@@ -2,7 +2,7 @@
 JAC_NLP_MODULES=("bart_sum" "cl_summer" "ent_ext" "fast_enc" "sbert_sim" "t5_sum" "text_seg" "tfm_ner" "use_enc" "use_qa" "zs_classifier" "bi_enc" "topic_ext" "gpt2" "gpt3" "dolly" "llm")
 JAC_SPEECH_MODULES=("stt" "vc_tts")
-JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain")
+JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain" "scraper")
 JAC_VISION_MODULES=("detr" "rftm" "yolos" "dpt")
 
 install_modules() {
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
new file mode 100644
index 0000000000..8c161d389a
--- /dev/null
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
@@ -0,0 +1 @@
+from .scraper import * # noqa
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt b/jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
new file mode 100644
index 0000000000..8b3fba4a13
--- /dev/null
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
@@ -0,0 +1 @@
+playwright>=1.39.0
\ No newline at end of file
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
new file mode 100644
index 0000000000..a6d97ca8b9
--- /dev/null
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -0,0 +1,71 @@
+from jaseci.jsorc.live_actions import jaseci_action
+from playwright.sync_api import sync_playwright, Page
+
+
+@jaseci_action(act_group=["ws"], allow_remote=True)
+def scrape(urls: set, depth: int = 1):
+    all_content = ""
+
+    scraped = set()
+    with sync_playwright() as spw:
+        browser = spw.chromium.launch()
+        page = browser.new_page()
+
+        while depth > 0:
+            content, urls = scraping(page, urls, scraped)
+            all_content += f"\n{content}"
+            depth -= 1
+
+        browser.close()
+
+    return " ".join(all_content.split())
+
+
+def load_and_save(page: Page, target: str, scraped: set):
+    print("#############################")
+    try:
+        scraped.add(target)
+        print(f"loading {target} ...")
+        page.goto(target, wait_until="networkidle")
+
+        # print(f"capturing {target} ...")
+        # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)
+
+        print(f"getting relevant content from {target} ...")
+        return page.evaluate(
+            """() =>
+            document.body.textContent;
+            """
+        )
+    except Exception as e:
+        print(
+            f"Error occurred while trying to load and save {target} ...\n{e}",
+        )
+        return ""
+
+
+def crawling(page: Page):
+    try:
+        return page.query_selector_all("a[href]")
+    except Exception as e:
+        print(f"Error occurred while trying to crawl {page.url}!\n{e}")
+        return []
+
+
+def scraping(page: Page, urls: set, scraped: set):
+    content = ""
+    next_scrape = set()
+
+    while urls:
+        url: str = urls.pop()
+        if url not in scraped:
+            content += load_and_save(page, url, scraped)
+
+            for ahref in crawling(page):
+                href = ahref.get_attribute("href")
+                if href.startswith("http"):
+                    next_scrape.add(href)
+                elif href.startswith("/"):
+                    next_scrape.add(f"{url}{href}")
+
+    return content, next_scrape
diff --git a/jaseci_ai_kit/jac_misc/setup.py b/jaseci_ai_kit/jac_misc/setup.py
index 453a07f003..a3d260dc63 100644
--- a/jaseci_ai_kit/jac_misc/setup.py
+++ b/jaseci_ai_kit/jac_misc/setup.py
@@ -11,6 +11,7 @@
     "huggingface",
     "langchain",
     "forecast",
+    "scraper",
 ]
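Before the follow-up patch, a minimal sketch of how this initial action could be exercised directly from Python, assuming the module is installed, Playwright's browser binaries are present (`playwright install chromium`), and the decorated function remains directly callable; the URL and import path below are illustrative only:

    # Hypothetical usage of the patch-1 API: crawl a seed page plus the
    # pages it links to (depth=2) and get back whitespace-normalized text.
    from jac_misc.scraper import scrape

    text = scrape(urls={"https://example.com"}, depth=2)
    print(text[:200])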
From 075a59870c69261ee62d4a37583bf13adbf52388 Mon Sep 17 00:00:00 2001
From: "Alexie (Boyong) Madolid"
Date: Tue, 28 Nov 2023 17:36:47 +0800
Subject: [PATCH 2/2] [SCRAPER]: Complex conditions

---
 .../jac_misc/jac_misc/scraper/scraper.py      | 125 +++++++++++++++---
 1 file changed, 109 insertions(+), 16 deletions(-)

diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
index a6d97ca8b9..0b950e3c75 100644
--- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -1,9 +1,19 @@
 from jaseci.jsorc.live_actions import jaseci_action
 from playwright.sync_api import sync_playwright, Page
+from typing import Union
+from re import search
 
 
 @jaseci_action(act_group=["ws"], allow_remote=True)
-def scrape(urls: set, depth: int = 1):
+def scrape(
+    urls: set,
+    scripts: dict = {},
+    url_filters: list = [],
+    timeout: int = 60000,
+    depth: int = 1,
+    detailed: bool = False,
+    excluded_elem: list = ["script", "style", "link", "noscript"],
+):
     all_content = ""
 
     scraped = set()
@@ -12,31 +22,78 @@ def scrape(urls: set, depth: int = 1):
         page = browser.new_page()
 
         while depth > 0:
-            content, urls = scraping(page, urls, scraped)
+            content, urls = scraping(
+                page, urls, scripts, url_filters, timeout, scraped, excluded_elem
+            )
             all_content += f"\n{content}"
             depth -= 1
 
         browser.close()
 
-    return " ".join(all_content.split())
+    contents = " ".join(all_content.split())
+    if detailed:
+        return {"contents": contents, "scraped": scraped}
+    return contents
+
+
+def load_and_save(
+    page: Page,
+    target: str,
+    script: Union[dict, str],
+    timeout: int,
+    scraped: set,
+    excluded_elem: list,
+):
+    wait_for = script.get("wait_for")
+    selector = script.get("selector")
+    custom = script.get("custom")
+
+    pre = script.get("pre") or {}
+    post = script.get("post") or {}
 
-
-def load_and_save(page: Page, target: str, scraped: set):
     print("#############################")
     try:
         scraped.add(target)
         print(f"loading {target} ...")
-        page.goto(target, wait_until="networkidle")
+        page.goto(target, wait_until="networkidle", timeout=timeout)
+
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        run_script(page, pre, "pre")
 
         # print(f"capturing {target} ...")
         # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)
 
-        print(f"getting relevant content from {target} ...")
-        return page.evaluate(
-            """() =>
-            document.body.textContent;
-            """
-        )
+        exclusion = ""
+        for exc in excluded_elem:
+            exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n'
+
+        query = f"""{{
+            clone = document.body.cloneNode(true);
+            {exclusion}
+            return clone.textContent;
+        }}"""
+        if custom:
+            query = f"{{{custom}}}"
+        elif selector:
+            query = f"""
+                Array.prototype.map.call(
+                    document.querySelectorAll("{selector}"),
+                    d => {{
+                        clone = d.cloneNode(true);
+                        {exclusion}
+                        return clone.textContent;
+                    }}).join("\\n");
+            """
+
+        print(f"getting relevant content using {query} ...")
+        content = page.evaluate(f"() =>{query}")
+
+        run_script(page, post, "post")
+
+        return content
     except Exception as e:
         print(
             f"Error occurred while trying to load and save {target} ...\n{e}",
         )
@@ -44,6 +101,19 @@ def load_and_save(page: Page, target: str, scraped: set):
         return ""
 
 
+def run_script(page: Page, script: dict, title: str):
+    if script:
+        expr = script["expr"]
+        print(f"running {title} script {expr}")
+        page.evaluate(f"() =>{{{expr}}}")
+
+        wait_for = script.get("wait_for") or {}
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        page.wait_for_load_state("networkidle")
+
+
 def crawling(page: Page):
     try:
         return page.query_selector_all("a[href]")
@@ -52,20 +122,43 @@ def crawling(page: Page):
     except Exception as e:
         print(f"Error occurred while trying to crawl {page.url}!\n{e}")
         return []
 
 
-def scraping(page: Page, urls: set, scraped: set):
+def scraping(
+    page: Page,
+    urls: set,
+    scripts: dict,
+    url_filters: list,
+    timeout: int,
+    scraped: set,
+    excluded_elem: list,
+):
     content = ""
     next_scrape = set()
 
     while urls:
         url: str = urls.pop()
         if url not in scraped:
-            content += load_and_save(page, url, scraped)
+            script = {}
+            for key, val in scripts.items():
+                if search(key, url):
+                    script = val
+                    break
+
+            content += load_and_save(page, url, script, timeout, scraped, excluded_elem)
 
             for ahref in crawling(page):
                 href = ahref.get_attribute("href")
+                if href.startswith("/"):
+                    href = f"{url}{href}"
+
                 if href.startswith("http"):
-                    next_scrape.add(href)
-                elif href.startswith("/"):
-                    next_scrape.add(f"{url}{href}")
+                    included = True
+
+                    for filter in url_filters:
+                        if search(filter, href):
+                            included = False
+                            break
+
+                    if included:
+                        next_scrape.add(href)
 
     return content, next_scrape
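The second patch turns the one-shot scraper into a configurable crawler: `scripts` maps URL regexes (matched with `re.search`) to per-page instructions, where `wait_for` is expanded into `page.wait_for_selector(**wait_for)`, `pre`/`post` are JavaScript snippets run by `run_script` before and after extraction, and `selector`/`custom` narrow what text is extracted; `url_filters` lists regexes for links to drop while crawling, and `detailed=True` returns the visited URLs alongside the text. A sketch of a call against this surface, with a hypothetical site, selectors, and filters:

    # Illustrative only: the URL, CSS selectors, and regexes below are
    # placeholders, not part of the patch.
    from jac_misc.scraper import scrape

    result = scrape(
        urls={"https://example.com/docs"},
        scripts={
            r"example\.com/docs": {
                # wait for the main element before extracting
                "wait_for": {"selector": "main", "timeout": 10000},
                # dismiss a (hypothetical) cookie banner first
                "pre": {"expr": "document.querySelector('#cookie-ok')?.click();"},
                # extract only the article bodies instead of the whole <body>
                "selector": "main article",
            },
        },
        url_filters=[r"\.(png|jpg|pdf)$", r"/login"],  # links to skip
        timeout=30000,
        depth=2,
        detailed=True,
    )
    print(sorted(result["scraped"]))
    print(result["contents"][:200])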