Commit

temporary
amadolid committed Nov 28, 2023
1 parent 76c8c48 commit cc1ea5f
Showing 1 changed file with 94 additions and 15 deletions.
109 changes: 94 additions & 15 deletions in jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -1,9 +1,18 @@
 from jaseci.jsorc.live_actions import jaseci_action
 from playwright.sync_api import sync_playwright, Page
+from typing import Union
+from re import match
 
 
 @jaseci_action(act_group=["ws"], allow_remote=True)
-def scrape(urls: str, depth: int = 1):
+def scrape(
+    urls: set,
+    scripts: dict = {},
+    filters: list = [],
+    depth: int = 1,
+    detailed: bool = False,
+    exclude: list = ["script", "style", "link", "noscript"],
+):
     all_content = ""
 
     scraped = set()
@@ -12,38 +21,91 @@ def scrape(urls: str, depth: int = 1):
         page = browser.new_page()
 
         while depth > 0:
-            content, urls = scraping(page, urls, scraped)
+            content, urls = scraping(page, urls, scripts, filters, scraped, exclude)
             all_content += f"\n{content}"
             depth -= 1
 
         browser.close()
 
-    return " ".join(all_content.split())
+    contents = " ".join(all_content.split())
+
+    if detailed:
+        return {"contents": contents, "scraped": scraped}
+    return contents
 
 
-def load_and_save(page: Page, target: str, scraped: set):
+def load_and_save(
+    page: Page, target: str, script: Union[dict, str], scraped: set, exclude
+):
+    wait_for = script.get("wait_for")
+    selector = script.get("selector")
+    custom = script.get("custom")
+
+    pre = script.get("pre") or {}
+    post = script.get("post") or {}
+
     print("#############################")
     try:
         scraped.add(target)
         print(f"loading {target} ...")
         page.goto(target, wait_until="networkidle")
 
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        run_script(page, pre, "pre")
+
         # print(f"capturing {target} ...")
        # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)
 
-        print(f"getting relevant content {target} ...")
-        return page.evaluate(
-            """() =>
-                document.body.textContent;
-            """
-        )
+        exclusion = ""
+        for exc in exclude:
+            exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n'
+
+        query = f"""{{
+            clone = document.body.cloneNode(true);
+            {exclusion}
+            return clone.textContent;
+        }}"""
+        if custom:
+            query = f"{{{custom}}}"
+        elif selector:
+            query = f"""
+                Array.prototype.map.call(
+                    document.querySelectorAll("{selector}"),
+                    d => {{
+                        clone = d.cloneNode(true);
+                        {exclusion}
+                        return clone.textContent;
+                    }}).join("\n");
+            """
+
+        print(f"getting relevant content using {query} ...")
+        content = page.evaluate(f"() =>{query}")
+
+        run_script(page, post, "post")
+
+        return content
     except Exception as e:
         print(
             f"Error occurs when trying to load and save {target} ...\n{e}",
         )
         return ""
 
 
+def run_script(page: Page, script: dict, title: str):
+    if script:
+        expr = script["expr"]
+        print(f"running {title} script {expr}")
+        page.evaluate(f"() =>{{{expr}}}")
+
+        wait_for = script.get("wait_for") or {}
+        if wait_for:
+            page.wait_for_selector(**wait_for)
+
+        page.wait_for_load_state("networkidle")
+
+
 def crawling(page: Page):
     try:
         return page.query_selector_all("a[href]")
@@ -52,20 +114,37 @@ def crawling(page: Page):
         return []
 
 
-def scraping(page: Page, urls: set, scraped: set):
+def scraping(
+    page: Page, urls: set, scripts: dict, filters: list, scraped: set, exclude: list
+):
     content = ""
     next_scrape = set()
 
     while urls:
         url: str = urls.pop()
         if url not in scraped:
-            content += load_and_save(page, url, scraped)
+            script = {}
+            for key, val in scripts.items():
+                if match(key, url):
+                    script = val
+                    break
+
+            content += load_and_save(page, url, script, scraped, exclude)
 
             for ahref in crawling(page):
                 href = ahref.get_attribute("href")
+                if href.startswith("/"):
+                    href = f"{url}{href}"
+
                 if href.startswith("http"):
-                    next_scrape.add(href)
-                elif href.startswith("/"):
-                    next_scrape.add(f"{url}{href}")
+                    included = True
+
+                    for filter in filters:
+                        if match(filter, href):
+                            included = False
+                            break
+
+                    if included:
+                        next_scrape.add(href)
 
     return content, next_scrape
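
For orientation, here is a hedged usage sketch of the updated scrape action; it is not part of the commit, and the URLs, regex patterns, and selectors are made up. The parameter shapes are inferred from load_and_save and run_script above: scripts keys are regexes matched against each URL with re.match, wait_for is forwarded as keyword arguments to page.wait_for_selector, pre and post carry an expr string evaluated in the page, selector limits extraction to matching nodes, and filters are regexes that drop matching hrefs from the crawl queue.

# Hypothetical invocation; every name and value below is illustrative only.
result = scrape(
    urls={"https://example.com/docs"},
    scripts={
        r"https://example\.com/docs.*": {
            "wait_for": {"selector": "main"},      # kwargs for page.wait_for_selector
            "pre": {"expr": "window.scrollTo(0, document.body.scrollHeight);"},
            "selector": "main article",            # extract text only from these nodes
        },
    },
    filters=[r".*\.(png|jpg|pdf)$"],                # hrefs matching these are not crawled
    depth=2,                                        # number of scrape/crawl passes
    detailed=True,                                  # return {"contents": ..., "scraped": ...}
)
print(result["scraped"])                            # set of URLs actually visited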

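As a second sketch, also not in the commit, the snippet below replays the string formatting from load_and_save to show roughly what page.evaluate receives when neither custom nor selector is set; a two-tag exclude list is used only to keep the output short.

# Rebuild the default extraction query for exclude=["script", "style"].
exclude = ["script", "style"]
exclusion = ""
for exc in exclude:
    exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n'

query = f"""{{
    clone = document.body.cloneNode(true);
    {exclusion}
    return clone.textContent;
}}"""
print(f"() =>{query}")
# The browser then evaluates roughly:
# () =>{
#     clone = document.body.cloneNode(true);
#     clone.querySelectorAll("script").forEach(d => d.remove());
# clone.querySelectorAll("style").forEach(d => d.remove());
#
#     return clone.textContent;
# }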