diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py index f3d0f3cf7b..5a1220e473 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py @@ -17,6 +17,8 @@ get_hostname, ) +MAX_LENGTH = 1000000 # 1,048,576 (48, 576 as buffer) + class Client: def __init__(self) -> None: @@ -43,16 +45,19 @@ def close(self): def notify_client( self, target: str, pages: list, urls: dict, processing: dict, content=None ): - if self.enabled and self.socket and target: - data = { - "processing": processing, - "pending": [p["goto"]["url"] for p in pages], - "scanned": urls["scanned"], - "scraped": urls["scraped"], - } - if content: - data["content"] = content + data = { + "processing": processing, + "pending": [p["goto"]["url"] for p in pages], + "scanned": urls["scanned"], + "scraped": urls["scraped"], + } + if content: + data["content"] = content + + self.custom_notify_client(target, {"type": "scraper", "data": data}) + def custom_notify_client(self, target: str, data: dict, trial: int = 0): + if self.enabled and self.socket and target and trial < 5: try: callback = uuid4() self.socket.send( @@ -62,7 +67,7 @@ def notify_client( "data": { "target": target, "callback": callback, - "data": {"type": "scraper", "data": data}, + "data": data, }, } ) @@ -75,7 +80,7 @@ def notify_client( except Exception: exception("Error sending notification!") self.create_connection() - self.notify_client(target, pages, urls, processing, content) + self.custom_notify_client(target, data, trial + 1) async def scrape( @@ -244,16 +249,58 @@ async def run_scripts(page: Page, scripts: list[dict], urls: dict): add_url(page, urls) -async def scrape_preview(page: dict): +async def scrape_preview(page: dict, target: str = None): + ws = Client() + async with async_playwright() as aspw: browser = await aspw.chromium.launch() b_page = await browser.new_page() pg_goto = page.get("goto") or {} post_scripts = pg_goto.pop("post_scripts") or [] + ws.custom_notify_client( + target, + {"type": "scraper-preview", "data": {"status": "started", "data": pg_goto}}, + ) + await b_page.goto(**pg_goto) for script in post_scripts: + ws.custom_notify_client( + target, + { + "type": "scraper-preview", + "data": {"status": "processing", "data": script}, + }, + ) + method = script.pop("method", "evalutate") or "evaluate" await getattr(b_page, method)(**script) - return await b_page.evaluate(f"() => document.documentElement.outerHTML") + content = await b_page.evaluate(f"() => document.documentElement.outerHTML") + + size = len(content) + seq = 0 + max_seq = size - 1 + for chunks in ( + content[0 + i : MAX_LENGTH + i] for i in range(0, size, MAX_LENGTH) + ): + ws.custom_notify_client( + target, + { + "type": "scraper-preview", + "data": { + "status": "finalizing", + "chunk": seq, + "max": max_seq, + "data": chunks, + }, + }, + ) + seq += 1 + + ws.custom_notify_client( + target, + {"type": "scraper-preview", "data": {"status": "done"}}, + ) + + return content diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py index 876875e7c6..0d0b95ee77 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py @@ -25,6 +25,8 @@ class ScraperRequest(BaseModel): class ScraperPreviewRequest(BaseModel): page: dict + target: str = None + is_async: bool = False app = FastAPI() @@ -39,12 +41,14 @@ async def scrape(sr: ScraperRequest): async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target) ) return {"task": task.get_name()} - else: - return await async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target) + return await async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target) @app.post("/scrape_preview/") async def scrape_preview(spr: ScraperPreviewRequest): - return await async_scrape_preview(spr.page) + if spr.is_async: + task = asyncio.create_task(async_scrape_preview(spr.page, spr.target)) + return {"task": task.get_name()} + return await async_scrape_preview(spr.page, spr.target) @app.get("/jaseci_actions_spec/") def action_list(): @@ -67,5 +71,5 @@ def scrape( return sync_scrape(pages, pre_configs, detailed, target) @jaseci_action(act_group=["wbs"]) - def scrape_preview(page: dict): - return sync_scrape_preview(page) + def scrape_preview(page: dict, target: str = None): + return sync_scrape_preview(page, target) diff --git a/jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py b/jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py index 5ee4a44bbd..e4c1207f58 100644 --- a/jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py +++ b/jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py @@ -9,21 +9,27 @@ get_hostname, ) +MAX_LENGTH = 1000000 # 1,048,576 (48, 576 as buffer) -def notify_client(target: str, pages: list, urls: dict, processing: dict, content=None): + +def custom_notify_client(target: str, data: dict): if target: socket = JsOrc.svc("socket") if socket.is_running(): - data = { - "processing": processing, - "pending": [p["goto"]["url"] for p in pages], - "scanned": urls["scanned"], - "scraped": urls["scraped"], - } - if content: - data["content"] = content + socket.notify("client", target, data) + + +def notify_client(target: str, pages: list, urls: dict, processing: dict, content=None): + data = { + "processing": processing, + "pending": [p["goto"]["url"] for p in pages], + "scanned": urls["scanned"], + "scraped": urls["scraped"], + } + if content: + data["content"] = content - socket.notify("client", target, {"type": "scraper", "data": data}) + custom_notify_client(target, {"type": "scraper", "data": data}) def scrape( @@ -33,7 +39,7 @@ def scrape( urls = {"scanned": [], "scanned_urls": set(), "scraped": [], "crawled": set()} with sync_playwright() as spw: - browser = spw.chromium.launch() + browser = spw.chromium.launch(headless=False) page = browser.new_page() while pages: @@ -184,16 +190,56 @@ def run_scripts(page: Page, scripts: list[dict], urls: dict): add_url(page, urls) -def scrape_preview(page: dict): +def scrape_preview(page: dict, target: str): with sync_playwright() as spw: browser = spw.chromium.launch() b_page = browser.new_page() pg_goto = page.get("goto") or {} post_scripts = pg_goto.pop("post_scripts") or [] + custom_notify_client( + target, + {"type": "scraper-preview", "data": {"status": "started", "data": pg_goto}}, + ) + b_page.goto(**pg_goto) for script in post_scripts: + custom_notify_client( + target, + { + "type": "scraper-preview", + "data": {"status": "processing", "data": script}, + }, + ) + method = script.pop("method", "evalutate") or "evaluate" getattr(b_page, method)(**script) - return b_page.evaluate(f"() => document.documentElement.outerHTML") + content = b_page.evaluate(f"() => document.documentElement.outerHTML") + + size = len(content) + seq = 0 + max_seq = size - 1 + for chunks in ( + content[0 + i : MAX_LENGTH + i] for i in range(0, size, MAX_LENGTH) + ): + custom_notify_client( + target, + { + "type": "scraper-preview", + "data": { + "status": "finalizing", + "chunk": seq, + "max": max_seq, + "data": chunks, + }, + }, + ) + seq += 1 + + custom_notify_client( + target, + {"type": "scraper-preview", "data": {"status": "done"}}, + ) + + return content diff --git a/jaseci_serv/templates/examples/social_auth.html b/jaseci_serv/templates/examples/social_auth.html index 6a2a8e75ba..3e779d2212 100644 --- a/jaseci_serv/templates/examples/social_auth.html +++ b/jaseci_serv/templates/examples/social_auth.html @@ -75,7 +75,7 @@

Google Identity Services Authorization Token model

{% endif %} - + @@ -159,10 +159,11 @@

Google Identity Services Authorization Token model

function notify_server() { socket.send(JSON.stringify({"message": "test"})) } - +zsb_doc = document.querySelector("#zsb").contentWindow.document; window.onmessage = function(e) { data = e.data; if (data.type == 'iframe-scraper-target') { + console.log(zsb_doc.querySelectorAll(data.target)); alert(data.target); } };