diff --git a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py index 53d48e80af..ab796e8427 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py +++ b/jaseci_ai_kit/jac_misc/jac_misc/elastic_retrieval/elastic_retrieval.py @@ -1,5 +1,5 @@ from openai import OpenAI -from os import environ, unlink +from os import getenv, unlink from datetime import datetime from requests import get from uuid import uuid4 @@ -12,21 +12,19 @@ ES_CLIENT = None CONFIG = { "elastic": { - "url": environ.get("ELASTICSEARCH_URL", "http://localhost:9200"), - "key": environ.get("ELASTICSEARCH_API_KEY"), + "url": getenv("ELASTICSEARCH_URL", "http://localhost:9200"), + "key": getenv("ELASTICSEARCH_API_KEY"), "index_template": { - "name": environ.get("ELASTICSEARCH_INDEX_TEMPLATE") or "openai-embeddings", + "name": getenv("ELASTICSEARCH_INDEX_TEMPLATE") or "openai-embeddings", "index_patterns": ( - environ.get("ELASTICSEARCH_INDEX_PATTERNS") or "oai-emb-*" + getenv("ELASTICSEARCH_INDEX_PATTERNS") or "oai-emb-*" ).split(","), "priority": 500, "version": 1, "template": { "settings": { - "number_of_shards": int(environ.get("ELASTICSEARCH_SHARDS", "1")), - "number_of_replicas": int( - environ.get("ELASTICSEARCH_REPLICAS", "1") - ), + "number_of_shards": int(getenv("ELASTICSEARCH_SHARDS", "1")), + "number_of_replicas": int(getenv("ELASTICSEARCH_REPLICAS", "1")), "refresh_interval": "1s", }, "mappings": { @@ -35,13 +33,9 @@ "id": {"type": "keyword"}, "embedding": { "type": "dense_vector", - "dims": int( - environ.get("ELASTICSEARCH_VECTOR_SIZE", "1536") - ), + "dims": int(getenv("ELASTICSEARCH_VECTOR_SIZE", "1536")), "index": True, - "similarity": environ.get( - "ELASTICSEARCH_SIMILARITY", "cosine" - ), + "similarity": getenv("ELASTICSEARCH_SIMILARITY", "cosine"), }, "version": {"type": "keyword"}, }, @@ -49,17 +43,17 @@ }, }, }, - "openai": {"api_key": environ.get("OPENAI_API_KEY")}, + "openai": {"api_key": getenv("OPENAI_API_KEY")}, "openai_embedding": { - "model": environ.get("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"), + "model": getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"), }, "chunk_config": { - "chunk_size": int(environ.get("CHUNK_SIZE", "200")), - "min_chunk_size_chars": int(environ.get("MIN_CHUNK_SIZE_CHARS", "350")), - "min_chunk_length_to_embed": int(environ.get("MIN_CHUNK_LENGTH_TO_EMBED", "5")), - "max_num_chunks": int(environ.get("MAX_NUM_CHUNKS", "10000")), + "chunk_size": int(getenv("CHUNK_SIZE", "200")), + "min_chunk_size_chars": int(getenv("MIN_CHUNK_SIZE_CHARS", "350")), + "min_chunk_length_to_embed": int(getenv("MIN_CHUNK_LENGTH_TO_EMBED", "5")), + "max_num_chunks": int(getenv("MAX_NUM_CHUNKS", "10000")), }, - "batch_size": int(environ.get("BATCH_SIZE", "100")), + "batch_size": int(getenv("BATCH_SIZE", "100")), } diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/README.md b/jaseci_ai_kit/jac_misc/jac_misc/scraper/README.md index 22475eeb4d..132f9741a3 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/README.md +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/README.md @@ -1,26 +1,18 @@ # **SCRAPER (`Playwright Python`)** - -## wbs.**`url_to_filename`** -> **`Arguments`:** \ -> **url**: str -> -> **`Return`:** \ -> str -> -> **`Usage`:** \ -> To convert url to a valid file name -> - -##### **`HOW TO TRIGGER`** -```js -wbs.url_to_filename("https://google.com") +## **`!IMPORTANT!`** +remote action scraper needs to be run via `FastAPI` instead of `jaseci 
serv_action`. The remote action runs under uvicorn, which requires Playwright to be driven asynchronously; since remote actions currently only support synchronous functions, a dedicated async scrape API is provided. This keeps both local and remote scraping available.
+To run remotely:
+```bash
+export SCRAPER_SOCKET_URL=ws://your-websocket/ws # defaults to ws://jaseci-socket/ws
+uvicorn jac_misc.scraper:app --host 0.0.0.0 --port 80 --reload
 ```
 ## wbs.**`scrape`**
 > **`Arguments`:** \
 > **pages**: list (structure below) \
 > **pre_configs**: list (structure below)\
-> **detailed**: bool = False
+> **detailed**: bool = False\
+> **target**: str = None
 >
 > **`Return`:** \
 > str or dict
@@ -29,7 +21,8 @@ wbs.url_to_filename("https://google.com")
 > To scrape specified url
 >
 > **`Remarks`:** \
-> **detailed** true will return dict with scanned/scraped urls
+> **detailed** true will return dict with scanned/scraped urls\
+> **target** optional client id for websocket progress notifications
 >
 ##### **`STRUCTURE`**
 ```python
diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py
new file mode 100644
index 0000000000..f45a34b4b3
--- /dev/null
+++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py
@@ -0,0 +1,210 @@
+import asyncio
+import websocket
+from os import getenv
+from re import search
+from orjson import dumps
+from playwright.async_api import async_playwright, Page
+from websocket import create_connection as _create_connection
+
+from jac_misc.scraper.utils import (
+    add_url,
+    add_crawl,
+    get_script,
+    get_hostname,
+)
+
+
+def create_connection() -> websocket:
+    if getenv("SCRAPER_SOCKET_ENABLED") == "true":
+        ws = _create_connection(getenv("SCRAPER_SOCKET_URL", "ws://jaseci-socket/ws"))
+        ws.send(dumps({"type": "client_connect", "data": {}}))
+        return ws
+    return None
+
+
+def notify_client(
+    websocket: websocket,
+    target: str,
+    pages: list,
+    urls: dict,
+    processing: dict,
+    content=None,
+):
+    if websocket and target:
+        data = {
+            "processing": processing,
+            "pending": [p["goto"]["url"] for p in pages],
+            "scanned": list(urls["scanned"]),
+        }
+        if content:
+            data["response"] = content
+
+        websocket.send(
+            dumps(
+                {
+                    "type": "notify_client",
+                    "data": {
+                        "target": target,
+                        "data": {"type": "scraper", "data": data},
+                    },
+                }
+            )
+        )
+
+
+async def scrape(
+    pages: list, pre_configs: list = [], detailed: bool = False, target: str = None
+):
+    content = ""
+    urls = {"scanned": set(), "scraped": set(), "crawled": set()}
+
+    ws = create_connection()
+
+    async with async_playwright() as aspw:
+        browser = await aspw.chromium.launch()
+        page = await browser.new_page()
+
+        while pages:
+            pg: dict = pages.pop(0)
+
+            pg_goto = pg.get("goto") or {}
+            url = pg_goto.get("url") or "N/A"
+
+            notify_client(ws, target, pages, urls, {"url": url, "status": "started"})
+
+            await goto(page, pg_goto, urls)
+
+            content += await getters(page, pg.get("getters") or [], urls)
+
+            await crawler(page, pg.get("crawler") or {}, urls, pages, pre_configs)
+
+            notify_client(ws, target, pages, urls, {"url": url, "status": "completed"})
+
+        await browser.close()
+
+    content = " ".join(content.split())
+
+    if detailed:
+        content = {
+            "content": content,
+            "scanned": list(urls["scanned"]),
+            "scraped": list(urls["scraped"]),
+        }
+
+    notify_client(ws, target, pages, urls, None, content)
+    if ws: ws.close()
+
+    return content
+
+
+async def goto(page: Page, specs: dict, urls: dict):
+    if specs:
+        post = get_script(specs, "post")
+        await run_scripts(page, 
get_script(specs, "pre"), urls) + + print(f'[goto]: loading {specs["url"]}') + + await page.goto(**specs) + add_url(page, urls) + + await run_scripts(page, post, urls) + + +async def getters(page: Page, specss: list[dict], urls: dict): + content = "" + for specs in specss: + if specs: + post = get_script(specs, "post") + await run_scripts(page, get_script(specs, "pre"), urls) + + exel_str = "" + for exel in ( + specs.get("excluded_element", ["script", "style", "link", "noscript"]) + or [] + ): + exel_str += ( + f'clone.querySelectorAll("{exel}").forEach(d => d.remove());\n' + ) + + method = specs.get("method") + if method == "selector": + expression = f""" + Array.prototype.map.call( + document.querySelectorAll("{specs.get("expression")}"), + d => {{ + clone = d.cloneNode(true); + {exel_str} + return clone.textContent; + }}).join("\n"); + """ + elif method == "custom": + expression = f'{{{specs.get("expression")}}}' + elif method == "none": + expression = '""' + else: + expression = f"""{{ + clone = document.body.cloneNode(true); + {exel_str} + return clone.textContent; + }}""" + + if expression: + print(f"[getters]: getting content from {page.url}") + content += await page.evaluate(f"() =>{expression}") + add_url(page, urls, expression) + + await run_scripts(page, post, urls) + + return content + + +async def crawler(page: Page, specs: dict, urls: dict, pages: list, pre_configs: list): + if specs: + post = get_script(specs, "post") + await run_scripts(page, get_script(specs, "pre"), urls) + + queries = specs.get("queries") or [{"selector": "a[href]", "attribute": "href"}] + filters = specs.get("filters") or [] + depth = specs.get("depth", 1) or 0 + + if depth > 0: + for query in queries: + for node in await page.query_selector_all( + query.get("selector") or "a[href]" + ): + url = await node.get_attribute(query.get("attribute") or "href") + c_url = get_hostname(page) + + if url.startswith("/"): + url = f"{c_url}{url}" + + if url.startswith("http") and url not in urls["crawled"]: + included = not bool(filters) + + for filter in filters: + if search(filter, url): + included = True + break + + if included: + add_crawl( + pages, + pre_configs, + urls, + url, + { + "queries": queries, + "depth": depth - 1, + "filters": filters, + }, + ) + + await run_scripts(page, post, urls) + + +async def run_scripts(page: Page, scripts: list[dict], urls: dict): + for script in scripts: + method = script.pop("method", "evalutate") or "evaluate" + print(f"[script]: running method {method}\n{str(script)}") + await getattr(page, method)(**script) + add_url(page, urls) diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py index d98bca4803..4b5713742e 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py @@ -1,365 +1,54 @@ -from jaseci.jsorc.live_actions import jaseci_action -from playwright.sync_api import sync_playwright, Page -from typing import Union -from re import search -from copy import deepcopy - - -@jaseci_action(act_group=["wbs"], allow_remote=True) -def setup(): - pass +import asyncio +from sys import argv +from fastapi import FastAPI +from fastapi.responses import Response +from pydantic import BaseModel - -@jaseci_action(act_group=["wbs"], allow_remote=True) -def url_to_filename(url: str): - return "".join(c for c in url if c.isalnum()) +from jaseci.jsorc.live_actions import jaseci_action +from jac_misc.scraper.sync_scraper import scrape as sync_scrape +from 
jac_misc.scraper.async_scraper import scrape as async_scrape -@jaseci_action(act_group=["wbs"], allow_remote=True) -def scrape(pages: list, pre_configs: list = [], detailed: bool = False): - content = "" - urls = {"scanned": set(), "scraped": set(), "crawled": set()} - with sync_playwright() as spw: - browser = spw.chromium.launch() - page = browser.new_page() +if any(["uvicorn" in arg for arg in argv]): - while pages: - pg: dict = pages.pop(0) + class ScraperRequest(BaseModel): + pages: list + pre_configs: list = [] + detailed: bool = False + target: str = None + is_async: bool = False - goto(page, pg.get("goto") or {}, urls) - content += getters(page, pg.get("getters") or [], urls) - crawler(page, pg.get("crawler") or {}, urls, pages, pre_configs) + app = FastAPI() - browser.close() + @app.post("/setup/") + def setup(): + pass - content = " ".join(content.split()) + @app.post("/scrape/") + async def scrape(sr: ScraperRequest): + if sr.is_async: + task = asyncio.create_task( + async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target) + ) + return {"task": task.get_name()} + else: + return await async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target) - if detailed: + @app.get("/jaseci_actions_spec/") + def action_list(): return { - "content": content, - "scanned": list(urls["scanned"]), - "scraped": list(urls["scraped"]), + "wbs.setup": [], + "wbs.scrape": ["pages", "pre_configs", "detailed", "target", "is_async"], } - return content - - -def goto(page: Page, specs: dict, urls: dict): - if specs: - post = get_script(specs, "post") - run_scripts(page, get_script(specs, "pre"), urls) - - print(f'[goto]: loading {specs["url"]}') - - page.goto(**specs) - add_url(page, urls) - - run_scripts(page, post, urls) - - -def getters(page: Page, specss: list[dict], urls: dict): - content = "" - for specs in specss: - if specs: - post = get_script(specs, "post") - run_scripts(page, get_script(specs, "pre"), urls) - - exel_str = "" - for exel in ( - specs.get("excluded_element", ["script", "style", "link", "noscript"]) - or [] - ): - exel_str += ( - f'clone.querySelectorAll("{exel}").forEach(d => d.remove());\n' - ) - - method = specs.get("method") - if method == "selector": - expression = f""" - Array.prototype.map.call( - document.querySelectorAll("{specs.get("expression")}"), - d => {{ - clone = d.cloneNode(true); - {exel_str} - return clone.textContent; - }}).join("\n"); - """ - elif method == "custom": - expression = f'{{{specs.get("expression")}}}' - elif method == "none": - expression = '""' - else: - expression = f"""{{ - clone = document.body.cloneNode(true); - {exel_str} - return clone.textContent; - }}""" - - if expression: - print(f"[getters]: getting content from {page.url}") - content += page.evaluate(f"() =>{expression}") - add_url(page, urls, expression) - - run_scripts(page, post, urls) - return content - - -def crawler(page: Page, specs: dict, urls: dict, pages: list, pre_configs: list): - if specs: - post = get_script(specs, "post") - run_scripts(page, get_script(specs, "pre"), urls) - - queries = specs.get("queries") or [{"selector": "a[href]", "attribute": "href"}] - filters = specs.get("filters") or [] - depth = specs.get("depth", 1) or 0 - - if depth > 0: - for query in queries: - for node in page.query_selector_all(query.get("selector") or "a[href]"): - url = node.get_attribute(query.get("attribute") or "href") - c_url = get_hostname(page) - - if url.startswith("/"): - url = f"{c_url}{url}" - - if url.startswith("http") and url not in urls["crawled"]: - included = not 
bool(filters) - - for filter in filters: - if search(filter, url): - included = True - break - - if included: - add_crawl( - pages, - pre_configs, - urls, - url, - { - "queries": queries, - "depth": depth - 1, - "filters": filters, - }, - ) - - run_scripts(page, post, urls) - - -def get_script(specs: dict, name: str): - return specs.pop(f"{name}_scripts", []) or [] - - -def run_scripts(page: Page, scripts: list[dict], urls: dict): - for script in scripts: - method = script.pop("method", "evalutate") or "evaluate" - print(f"[script]: running method {method}\n{str(script)}") - getattr(page, method)(**script) - add_url(page, urls) - - -def add_url(page: Page, urls: dict, scraped: bool = False): - url = page.url - if url: - if url not in urls["scanned"]: - urls["scanned"].add(url) - - if scraped and url not in urls["scraped"]: - urls["scraped"].add(url) - - -def add_crawl(pages: list, pre_configs: list, urls: dict, url: str, def_crawl: dict): - urls["crawled"].add(url) - scraper = { - "goto": { - "url": url, - "wait_until": "networkidle", - "pre_scripts": [], - "post_scripts": [], - }, - "getters": [{"method": "default"}], - "crawler": def_crawl, - } - for pconf in pre_configs: - if search(pconf["regex"], url): - scraper = deepcopy(pconf["scraper"]) - (scraper.get("goto") or {})["url"] = url - scraper["crawler"] = scraper.get("crawler") or def_crawl - break - - pages.append(scraper) - - -def get_hostname(page: Page): - url = page.url - if url: - splitter = url.split("//") - protocol = splitter[0] - hostname = splitter[1].split("/")[0] - return f"{protocol}//{hostname}" - return url - - -########################################################################### -# OLD SCRAPER # -########################################################################### - - -@jaseci_action(act_group=["wbs"], allow_remote=True) -def old_scrape( - urls: set, - scripts: dict = {}, - url_filters: list = [], - timeout: int = 60000, - depth: int = 1, - detailed: bool = False, - excluded_elem: list = ["script", "style", "link", "noscript"], -): - all_content = "" - - scraped = set() - with sync_playwright() as spw: - browser = spw.chromium.launch() - page = browser.new_page() - - while depth > 0: - content, urls = scraping( - page, urls, scripts, url_filters, timeout, scraped, excluded_elem - ) - all_content += f"\n{content}" - depth -= 1 - - browser.close() - - contents = " ".join(all_content.split()) - - if detailed: - return {"contents": contents, "scraped": list(scraped)} - return contents - - -def load_and_save( - page: Page, - target: str, - script: Union[dict, str], - timeout: int, - scraped: set, - excluded_elem: list, -): - wait_for = script.get("wait_for") - selector = script.get("selector") - custom = script.get("custom") - - pre = script.get("pre") or {} - post = script.get("post") or {} - - print("#############################") - try: - scraped.add(target) - print(f"loading {target} ...") - page.goto(target, wait_until="networkidle", timeout=timeout) - - if wait_for: - page.wait_for_selector(**wait_for) - - run_script(page, pre, "pre") - - # print(f"capturing {target} ...") - # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True) - - exclusion = "" - for exc in excluded_elem: - exclusion += f'clone.querySelectorAll("{exc}").forEach(d => d.remove());\n' - - query = f"""{{ - clone = document.body.cloneNode(true); - {exclusion} - return clone.textContent; - }}""" - if custom: - query = f"{{{custom}}}" - elif selector: - query = f""" - Array.prototype.map.call( - 
document.querySelectorAll("{selector}"), - d => {{ - clone = d.cloneNode(true); - {exclusion} - return clone.textContent; - }}).join("\n"); - """ - - print(f"getting relevant content using {query} ...") - content = page.evaluate(f"() =>{query}") - - run_script(page, post, "post") - - return content - except Exception as e: - print( - f"Error occurs when trying to load and save {target} ...\n{e}", - ) - return "" - - -def run_script(page: Page, script: dict, title: str): - if script: - expr = script["expr"] - print(f"running {title} script {expr}") - page.evaluate(f"() =>{{{expr}}}") - - wait_for = script.get("wait_for") or {} - if wait_for: - page.wait_for_selector(**wait_for) - - page.wait_for_load_state("networkidle") - - -def crawling(page: Page): - try: - return page.query_selector_all("a[href]") - except Exception as e: - print(f"Error occurs when trying to crawl {page.url} !\n{e}") - return [] - - -def scraping( - page: Page, - urls: set, - scripts: dict, - url_filters: list, - timeout: int, - scraped: set, - excluded_elem: list, -): - content = "" - next_scrape = set() - - while urls: - url: str = urls.pop() - if url not in scraped: - script = {} - for key, val in scripts.items(): - if search(key, url): - script = val - break - - content += load_and_save(page, url, script, timeout, scraped, excluded_elem) - - for ahref in crawling(page): - href = ahref.get_attribute("href") - if href.startswith("/"): - href = f"{url}{href}" - - if href.startswith("http"): - included = True - - for filter in url_filters: - if search(filter, href): - included = False - break +else: - if included: - next_scrape.add(href) + @jaseci_action(act_group=["wbs"]) + def setup(): + pass - return content, next_scrape + @jaseci_action(act_group=["wbs"]) + def scrape( + pages: list, pre_configs: list = [], detailed: bool = False, target: str = None + ): + return sync_scrape(pages, pre_configs, detailed, target) diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py new file mode 100644 index 0000000000..5e844a898c --- /dev/null +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py @@ -0,0 +1,179 @@ +from re import search +from playwright.sync_api import sync_playwright, Page + +from jaseci.jsorc.jsorc import JsOrc +from jaseci.extens.svc import SocketService as Ss +from jac_misc.scraper.utils import ( + add_url, + add_crawl, + get_script, + get_hostname, +) + + +def notify_client(target: str, pages: list, urls: dict, processing: dict, content=None): + if target: + socket = JsOrc.svc("socket", Ss) + if socket.is_running(): + data = { + "processing": processing, + "pending": [p["goto"]["url"] for p in pages], + "scanned": list(urls["scanned"]), + } + if content: + data["response"] = content + + socket.notify("client", target, {"type": "scraper", "data": data}) + + +def scrape( + pages: list, pre_configs: list = [], detailed: bool = False, target: str = None +): + content = "" + urls = {"scanned": set(), "scraped": set(), "crawled": set()} + + with sync_playwright() as spw: + browser = spw.chromium.launch() + page = browser.new_page() + + while pages: + pg: dict = pages.pop(0) + + pg_goto = pg.get("goto") or {} + url = pg_goto.get("url") or "N/A" + + notify_client(target, pages, urls, {"url": url, "status": "started"}) + + goto(page, pg_goto, urls) + + content += getters(page, pg.get("getters") or [], urls) + + crawler(page, pg.get("crawler") or {}, urls, pages, pre_configs) + + notify_client(target, pages, urls, {"url": url, "status": 
"completed"}) + + browser.close() + + content = " ".join(content.split()) + + if detailed: + content = { + "content": content, + "scanned": list(urls["scanned"]), + "scraped": list(urls["scraped"]), + } + + notify_client(target, pages, urls, None, content) + + return content + + +def goto(page: Page, specs: dict, urls: dict): + if specs: + post = get_script(specs, "post") + run_scripts(page, get_script(specs, "pre"), urls) + + print(f'[goto]: loading {specs["url"]}') + + page.goto(**specs) + add_url(page, urls) + + run_scripts(page, post, urls) + + +def getters(page: Page, specss: list[dict], urls: dict): + content = "" + for specs in specss: + if specs: + post = get_script(specs, "post") + run_scripts(page, get_script(specs, "pre"), urls) + + exel_str = "" + for exel in ( + specs.get("excluded_element", ["script", "style", "link", "noscript"]) + or [] + ): + exel_str += ( + f'clone.querySelectorAll("{exel}").forEach(d => d.remove());\n' + ) + + method = specs.get("method") + if method == "selector": + expression = f""" + Array.prototype.map.call( + document.querySelectorAll("{specs.get("expression")}"), + d => {{ + clone = d.cloneNode(true); + {exel_str} + return clone.textContent; + }}).join("\n"); + """ + elif method == "custom": + expression = f'{{{specs.get("expression")}}}' + elif method == "none": + expression = '""' + else: + expression = f"""{{ + clone = document.body.cloneNode(true); + {exel_str} + return clone.textContent; + }}""" + + if expression: + print(f"[getters]: getting content from {page.url}") + content += page.evaluate(f"() =>{expression}") + add_url(page, urls, expression) + + run_scripts(page, post, urls) + + return content + + +def crawler(page: Page, specs: dict, urls: dict, pages: list, pre_configs: list): + if specs: + post = get_script(specs, "post") + run_scripts(page, get_script(specs, "pre"), urls) + + queries = specs.get("queries") or [{"selector": "a[href]", "attribute": "href"}] + filters = specs.get("filters") or [] + depth = specs.get("depth", 1) or 0 + + if depth > 0: + for query in queries: + for node in page.query_selector_all(query.get("selector") or "a[href]"): + url = node.get_attribute(query.get("attribute") or "href") + c_url = get_hostname(page) + + if url.startswith("/"): + url = f"{c_url}{url}" + + if url.startswith("http") and url not in urls["crawled"]: + included = not bool(filters) + + for filter in filters: + if search(filter, url): + included = True + break + + if included: + add_crawl( + pages, + pre_configs, + urls, + url, + { + "queries": queries, + "depth": depth - 1, + "filters": filters, + }, + ) + + run_scripts(page, post, urls) + + +def run_scripts(page: Page, scripts: list[dict], urls: dict): + for script in scripts: + method = script.pop("method", "evalutate") or "evaluate" + print(f"[script]: running method {method}\n{str(script)}") + getattr(page, method)(**script) + add_url(page, urls) diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/tests/fixtures/scraper.jac b/jaseci_ai_kit/jac_misc/jac_misc/scraper/tests/fixtures/scraper.jac index 67eff40ecb..e4936c734d 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/tests/fixtures/scraper.jac +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/tests/fixtures/scraper.jac @@ -1,6 +1,6 @@ walker test_scraper { - can wbs.scrape, wbs.url_to_filename; - report wbs.url_to_filename("https://www.google.com/search?q=speed+test"); + can wbs.scrape, file.url_to_filename; + report file.url_to_filename("https://www.google.com/search?q=speed+test"); report wbs.scrape(pages = [{ "goto": { "url": 
"https://www.google.com/search?q=speed+test" diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/utils.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/utils.py new file mode 100644 index 0000000000..30a010828d --- /dev/null +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/utils.py @@ -0,0 +1,48 @@ +from re import search +from copy import deepcopy + + +def add_url(page, urls: dict, scraped: bool = False): + url = page.url + if url: + if url not in urls["scanned"]: + urls["scanned"].add(url) + + if scraped and url not in urls["scraped"]: + urls["scraped"].add(url) + + +def add_crawl(pages: list, pre_configs: list, urls: dict, url: str, def_crawl: dict): + urls["crawled"].add(url) + scraper = { + "goto": { + "url": url, + "wait_until": "networkidle", + "pre_scripts": [], + "post_scripts": [], + }, + "getters": [{"method": "default"}], + "crawler": def_crawl, + } + for pconf in pre_configs: + if search(pconf["regex"], url): + scraper = deepcopy(pconf["scraper"]) + (scraper.get("goto") or {})["url"] = url + scraper["crawler"] = scraper.get("crawler") or def_crawl + break + + pages.append(scraper) + + +def get_hostname(page): + url = page.url + if url: + splitter = url.split("//") + protocol = splitter[0] + hostname = splitter[1].split("/")[0] + return f"{protocol}//{hostname}" + return url + + +def get_script(specs: dict, name: str): + return specs.pop(f"{name}_scripts", []) or [] diff --git a/jaseci_core/jaseci/extens/act_lib/file.py b/jaseci_core/jaseci/extens/act_lib/file.py index 266124395b..363cacf72d 100644 --- a/jaseci_core/jaseci/extens/act_lib/file.py +++ b/jaseci_core/jaseci/extens/act_lib/file.py @@ -74,3 +74,8 @@ def delete(fn: str): return True else: return False + + +@jaseci_action() +def url_to_filename(url: str): + return "".join(c for c in url if c.isalnum()) diff --git a/jaseci_core/jaseci/jsorc/jsorc_settings.py b/jaseci_core/jaseci/jsorc/jsorc_settings.py index 8fc9791e11..1fd4647441 100644 --- a/jaseci_core/jaseci/jsorc/jsorc_settings.py +++ b/jaseci_core/jaseci/jsorc/jsorc_settings.py @@ -1,4 +1,4 @@ -import os +from os import getenv from time import time from jaseci.jsorc.jsorc_utils import load_default_yaml, get_service_map @@ -30,10 +30,10 @@ class JsOrcSettings: # -------------------------------------------------- KUBE --------------------------------------------------- # ############################################################################################################### - KUBE_NAMESPACE = os.getenv("KUBE_NAMESPACE", f"jaseci-{int(time() * 100000)}") + KUBE_NAMESPACE = getenv("KUBE_NAMESPACE", f"jaseci-{int(time() * 100000)}") KUBE_CONFIG = { - "enabled": bool(os.getenv("KUBE_NAMESPACE")), + "enabled": bool(getenv("KUBE_NAMESPACE")), "quiet": False, "automated": False, "namespace": KUBE_NAMESPACE, @@ -49,9 +49,9 @@ class JsOrcSettings: "enabled": True, "quiet": False, "automated": False, - "host": os.getenv("REDIS_HOST", "localhost"), - "port": os.getenv("REDIS_PORT", "6379"), - "db": os.getenv("REDIS_DB", "1"), + "host": getenv("REDIS_HOST", "localhost"), + "port": getenv("REDIS_PORT", "6379"), + "db": getenv("REDIS_DB", "1"), } REDIS_MANIFEST = load_default_yaml("redis") @@ -61,8 +61,8 @@ class JsOrcSettings: ############################################################################################################### DEFAULT_REDIS_URL = ( - f'redis://{os.getenv("REDIS_HOST", "localhost")}' - f':{os.getenv("REDIS_PORT", "6379")}/{os.getenv("REDIS_DB", "1")}' + f'redis://{getenv("REDIS_HOST", "localhost")}' + f':{getenv("REDIS_PORT", 
"6379")}/{getenv("REDIS_DB", "1")}' ) TASK_CONFIG = { @@ -125,12 +125,12 @@ class JsOrcSettings: ############################################################################################################### PROME_CONFIG = { - "enabled": bool(os.getenv("PROME_HOST")), + "enabled": bool(getenv("PROME_HOST")), "quiet": False, "automated": True, "url": ( - f'http://{os.getenv("PROME_HOST", "localhost")}' - f':{os.getenv("PROME_PORT", "9090")}' + f'http://{getenv("PROME_HOST", "localhost")}' + f':{getenv("PROME_PORT", "9090")}' ), } @@ -162,14 +162,14 @@ class JsOrcSettings: } ELASTIC_CONFIG = { - "enabled": bool(os.getenv("ELASTIC_HOST")), + "enabled": bool(getenv("ELASTIC_HOST")), "quiet": False, "automated": True, "url": ( - f'https://{os.getenv("ELASTIC_HOST", "localhost")}' - f':{os.getenv("ELASTIC_PORT", "9200")}' + f'https://{getenv("ELASTIC_HOST", "localhost")}' + f':{getenv("ELASTIC_PORT", "9200")}' ), - "auth": os.getenv("ELASTIC_AUTH"), + "auth": getenv("ELASTIC_AUTH"), "common_index": f"{KUBE_NAMESPACE}-common", "activity_index": f"{KUBE_NAMESPACE}-activity", "core_log_index": "core", @@ -200,12 +200,12 @@ class JsOrcSettings: ############################################################################################################### DB_REGEN_CONFIG = { - "enabled": os.environ.get("JSORC_DB_REGEN") == "true", - "host": os.environ.get("POSTGRES_HOST", "jaseci-db"), - "db": os.environ.get("DBNAME", "postgres"), - "user": os.environ.get("POSTGRES_USER"), - "password": os.environ.get("POSTGRES_PASSWORD"), - "port": os.getenv("POSTGRES_PORT", 5432), + "enabled": getenv("JSORC_DB_REGEN") == "true", + "host": getenv("POSTGRES_HOST", "jaseci-db"), + "db": getenv("DBNAME", "postgres"), + "user": getenv("POSTGRES_USER"), + "password": getenv("POSTGRES_PASSWORD"), + "port": getenv("POSTGRES_PORT", 5432), } DB_REGEN_MANIFEST = load_default_yaml("database") diff --git a/jaseci_core/jaseci/jsorc/jsorc_utils.py b/jaseci_core/jaseci/jsorc/jsorc_utils.py index e00b943611..dfe58b6944 100644 --- a/jaseci_core/jaseci/jsorc/jsorc_utils.py +++ b/jaseci_core/jaseci/jsorc/jsorc_utils.py @@ -245,10 +245,14 @@ def poke(self, cast: T = None, msg: str = None) -> Union[T, Any]: return ( self if cast and cast.__name__ == self.__class__.__name__ else self.app ) - raise Exception( + exec = Exception( msg or f"{self.__class__.__name__} is disabled or not yet configured!" ) + if self.error: + raise exec from self.error + raise exec + def is_ready(self): return self.state.is_ready() and self.app is None