Skip to content

Commit

Permalink
[SCRAPER]: Experimental html preview websocket
Browse files Browse the repository at this point in the history
  • Loading branch information
amadolid committed Jan 24, 2024
1 parent 544a990 commit d31850e
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 33 deletions.
73 changes: 60 additions & 13 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
get_hostname,
)

MAX_LENGTH = 1000000  # max chars per websocket chunk; presumably kept under a 1,048,576-byte frame limit, leaving 48,576 as headroom — TODO confirm limit


class Client:
def __init__(self) -> None:
Expand All @@ -43,16 +45,19 @@ def close(self):
def notify_client(
    self, target: str, pages: list, urls: dict, processing: dict, content=None
):
    """Push a scrape-progress snapshot to the websocket client.

    Builds a status payload (what is currently processing, which page specs
    are still pending, and the scanned/scraped url lists) and forwards it to
    ``custom_notify_client`` wrapped as a ``{"type": "scraper", ...}`` event.

    :param target: websocket client id to notify (no-op when falsy downstream)
    :param pages: pending page specs; each must carry a ``goto.url`` entry
    :param urls: progress dict with ``scanned`` and ``scraped`` lists
    :param processing: descriptor of the page currently being scraped
    :param content: optional scraped content to attach to the payload
    """
    data = {
        "processing": processing,
        "pending": [p["goto"]["url"] for p in pages],
        "scanned": urls["scanned"],
        "scraped": urls["scraped"],
    }
    if content:
        data["content"] = content

    self.custom_notify_client(target, {"type": "scraper", "data": data})

def custom_notify_client(self, target: str, data: dict, trial: int = 0):
if self.enabled and self.socket and target and trial < 5:
try:
callback = uuid4()
self.socket.send(
Expand All @@ -62,7 +67,7 @@ def notify_client(
"data": {
"target": target,
"callback": callback,
"data": {"type": "scraper", "data": data},
"data": data,
},
}
)
Expand All @@ -75,7 +80,7 @@ def notify_client(
except Exception:
exception("Error sending notification!")
self.create_connection()
self.notify_client(target, pages, urls, processing, content)
self.custom_notify_client(target, data, trial + 1)


async def scrape(
Expand Down Expand Up @@ -244,16 +249,58 @@ async def run_scripts(page: Page, scripts: list[dict], urls: dict):
add_url(page, urls)


async def scrape_preview(page: dict, target: str = None):
    """Render a single page with Playwright and stream its HTML to a client.

    ``page["goto"]`` supplies keyword arguments for Playwright's ``goto``,
    plus an optional ``post_scripts`` list of page-method calls (each a dict
    with a ``method`` name and that method's kwargs) to run after the load.

    Progress is pushed over the websocket as ``scraper-preview`` events:
    ``started`` → ``processing`` (per script) → ``finalizing`` (HTML split
    into MAX_LENGTH-char chunks so each frame stays under the transport
    limit) → ``done``. Notifications are no-ops when ``target`` is falsy.

    :return: the full outer HTML of the rendered document.
    """
    ws = Client()

    async with async_playwright() as aspw:
        browser = await aspw.chromium.launch()
        b_page = await browser.new_page()
        pg_goto = page.get("goto") or {}
        # default to None so a spec without "post_scripts" doesn't KeyError
        post_scripts = pg_goto.pop("post_scripts", None) or []

        ws.custom_notify_client(
            target,
            {"type": "scraper-preview", "data": {"status": "started", "data": pg_goto}},
        )

        await b_page.goto(**pg_goto)
        for script in post_scripts:
            ws.custom_notify_client(
                target,
                {
                    "type": "scraper-preview",
                    "data": {"status": "processing", "data": script},
                },
            )

            # fix: the fallback default used to be the misspelled "evalutate",
            # which is truthy and therefore survived the `or` — a script
            # without "method" then hit getattr(page, "evalutate") and raised
            # AttributeError. Defaulting to None lets the `or` supply
            # "evaluate" as intended.
            method = script.pop("method", None) or "evaluate"
            await getattr(b_page, method)(**script)

        content = await b_page.evaluate("() => document.documentElement.outerHTML")

        size = len(content)
        seq = 0
        # NOTE(review): "max" is the last *character* index, not the last
        # chunk index — kept as-is for protocol compatibility; confirm what
        # clients expect before changing.
        max_seq = size - 1
        for chunk in (content[i : i + MAX_LENGTH] for i in range(0, size, MAX_LENGTH)):
            ws.custom_notify_client(
                target,
                {
                    "type": "scraper-preview",
                    "data": {
                        "status": "finalizing",
                        "chunk": seq,
                        "max": max_seq,
                        "data": chunk,
                    },
                },
            )
            seq += 1

        ws.custom_notify_client(
            target,
            {"type": "scraper-preview", "data": {"status": "done"}},
        )

        return content
14 changes: 9 additions & 5 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class ScraperRequest(BaseModel):

class ScraperPreviewRequest(BaseModel):
    """Request body for the /scrape_preview/ endpoint."""

    # spec for the single page to render; presumably carries a "goto" entry
    # with Playwright goto kwargs — see scrape_preview
    page: dict
    # websocket client id to stream progress events to; None disables streaming
    target: str = None
    # when True, run the preview as a background task and return its task name
    is_async: bool = False

app = FastAPI()

Expand All @@ -39,12 +41,14 @@ async def scrape(sr: ScraperRequest):
async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target)
)
return {"task": task.get_name()}
else:
return await async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target)
return await async_scrape(sr.pages, sr.pre_configs, sr.detailed, sr.target)

@app.post("/scrape_preview/")
async def scrape_preview(spr: ScraperPreviewRequest):
    """Preview a single page's rendered HTML.

    When ``is_async`` is set, the preview runs as a fire-and-forget asyncio
    task (progress is observable only via the ``target`` websocket) and the
    task name is returned immediately; otherwise the call blocks and returns
    the rendered HTML.
    """
    if spr.is_async:
        task = asyncio.create_task(async_scrape_preview(spr.page, spr.target))
        return {"task": task.get_name()}
    return await async_scrape_preview(spr.page, spr.target)

@app.get("/jaseci_actions_spec/")
def action_list():
Expand All @@ -67,5 +71,5 @@ def scrape(
return sync_scrape(pages, pre_configs, detailed, target)

@jaseci_action(act_group=["wbs"])
def scrape_preview(page: dict, target: str = None):
    """Jaseci action wrapper: synchronously preview a page's rendered HTML.

    :param page: page spec (see sync_scrape_preview for the expected shape)
    :param target: optional websocket client id for progress notifications
    """
    return sync_scrape_preview(page, target)
72 changes: 59 additions & 13 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,27 @@
get_hostname,
)

MAX_LENGTH = 1000000  # max chars per websocket chunk; presumably kept under a 1,048,576-byte frame limit, leaving 48,576 as headroom — TODO confirm limit


def custom_notify_client(target: str, data: dict):
    """Best-effort push of an arbitrary payload to a websocket client.

    Silently does nothing when ``target`` is falsy or the socket service is
    not running, so callers can notify unconditionally.

    :param target: websocket client id to notify
    :param data: payload forwarded verbatim to the socket service
    """
    if target:
        socket = JsOrc.svc("socket")
        if socket.is_running():
            socket.notify("client", target, data)


def notify_client(target: str, pages: list, urls: dict, processing: dict, content=None):
    """Push a scrape-progress snapshot to the websocket client.

    Builds a status payload (current page, pending page specs, and the
    scanned/scraped url lists) and forwards it via ``custom_notify_client``
    wrapped as a ``{"type": "scraper", ...}`` event.

    :param target: websocket client id (no-op when falsy downstream)
    :param pages: pending page specs; each must carry a ``goto.url`` entry
    :param urls: progress dict with ``scanned`` and ``scraped`` lists
    :param processing: descriptor of the page currently being scraped
    :param content: optional scraped content to attach to the payload
    """
    data = {
        "processing": processing,
        "pending": [p["goto"]["url"] for p in pages],
        "scanned": urls["scanned"],
        "scraped": urls["scraped"],
    }
    if content:
        data["content"] = content

    custom_notify_client(target, {"type": "scraper", "data": data})


def scrape(
Expand All @@ -33,7 +39,7 @@ def scrape(
urls = {"scanned": [], "scanned_urls": set(), "scraped": [], "crawled": set()}

with sync_playwright() as spw:
browser = spw.chromium.launch()
browser = spw.chromium.launch(headless=False)
page = browser.new_page()

while pages:
Expand Down Expand Up @@ -184,16 +190,56 @@ def run_scripts(page: Page, scripts: list[dict], urls: dict):
add_url(page, urls)


def scrape_preview(page: dict, target: str):
    """Render a single page with Playwright and stream its HTML to a client.

    Synchronous twin of the async scraper's ``scrape_preview``.
    ``page["goto"]`` supplies keyword arguments for Playwright's ``goto``,
    plus an optional ``post_scripts`` list of page-method calls (each a dict
    with a ``method`` name and that method's kwargs) to run after the load.

    Progress is pushed over the websocket as ``scraper-preview`` events:
    ``started`` → ``processing`` (per script) → ``finalizing`` (HTML split
    into MAX_LENGTH-char chunks so each frame stays under the transport
    limit) → ``done``. Notifications are no-ops when ``target`` is falsy.

    :return: the full outer HTML of the rendered document.
    """
    with sync_playwright() as spw:
        browser = spw.chromium.launch()
        b_page = browser.new_page()
        pg_goto = page.get("goto") or {}
        # default to None so a spec without "post_scripts" doesn't KeyError
        post_scripts = pg_goto.pop("post_scripts", None) or []

        custom_notify_client(
            target,
            {"type": "scraper-preview", "data": {"status": "started", "data": pg_goto}},
        )

        b_page.goto(**pg_goto)
        for script in post_scripts:
            custom_notify_client(
                target,
                {
                    "type": "scraper-preview",
                    "data": {"status": "processing", "data": script},
                },
            )

            # fix: the fallback default used to be the misspelled "evalutate",
            # which is truthy and therefore survived the `or` — a script
            # without "method" then hit getattr(page, "evalutate") and raised
            # AttributeError. Defaulting to None lets the `or` supply
            # "evaluate" as intended.
            method = script.pop("method", None) or "evaluate"
            getattr(b_page, method)(**script)

        content = b_page.evaluate("() => document.documentElement.outerHTML")

        size = len(content)
        seq = 0
        # NOTE(review): "max" is the last *character* index, not the last
        # chunk index — kept as-is for protocol compatibility; confirm what
        # clients expect before changing.
        max_seq = size - 1
        for chunk in (content[i : i + MAX_LENGTH] for i in range(0, size, MAX_LENGTH)):
            custom_notify_client(
                target,
                {
                    "type": "scraper-preview",
                    "data": {
                        "status": "finalizing",
                        "chunk": seq,
                        "max": max_seq,
                        "data": chunk,
                    },
                },
            )
            seq += 1

        custom_notify_client(
            target,
            {"type": "scraper-preview", "data": {"status": "done"}},
        )

        return content
5 changes: 3 additions & 2 deletions jaseci_serv/templates/examples/social_auth.html
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ <h1>Google Identity Services Authorization Token model</h1>
</div>
{% endif %}
</a>
<iframe id="zsb" src="about:blank" height="500px" width="500px" sandbox="allow-scripts allow-same-origin"></iframe>
<iframe id="zsb" src="about:blank" height="500px" width="500px" sandbox="allow-scripts allow-same-origin" style="display: none;"></iframe>
</div>
</div>
</div>
Expand Down Expand Up @@ -159,10 +159,11 @@ <h1>Google Identity Services Authorization Token model</h1>
function notify_server() {
socket.send(JSON.stringify({"message": "test"}))
}

zsb_doc = document.querySelector("#zsb").contentWindow.document;
window.onmessage = function(e) {
data = e.data;
if (data.type == 'iframe-scraper-target') {
console.log(zsb_doc.querySelectorAll(data.target));
alert(data.target);
}
};
Expand Down

0 comments on commit d31850e

Please sign in to comment.