From 83c7e07629f29bc3c98bc2541ad5c2b24151b91e Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 20 Mar 2024 12:35:09 -0400
Subject: [PATCH] feat(openai): add openai flag dynamic js snippets

---
 Cargo.toml             | 11 ++++---
 book/src/website.md    | 41 +++++++++++++++++++++++++
 examples/builder.py    |  9 ++++--
 examples/screenshot.py | 29 +++++++++++++++++
 pyproject.toml         |  6 +++-
 src/lib.rs             |  2 ++
 src/utils.rs           | 70 ++++++++++++++++++++++++++++++++++++++++++
 src/website.rs         | 70 ++++++++++++++++++++++++++++++++++--------
 8 files changed, 217 insertions(+), 21 deletions(-)
 create mode 100644 examples/screenshot.py
 create mode 100644 src/utils.rs

diff --git a/Cargo.toml b/Cargo.toml
index 818dd9f..a6de24a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,9 +1,9 @@
 [package]
 edition = "2021"
 name = "spider_rs"
-version = "0.0.27"
-description = "The fastest web crawler written in Rust ported to nodejs."
-repository = "https://github.com/spider-rs/spider-nodejs"
+version = "0.0.30"
+repository = "https://github.com/spider-rs/spider-py"
+license = "MIT"
 
 [lib]
 crate-type = ["cdylib"]
@@ -11,9 +11,10 @@ crate-type = ["cdylib"]
 [dependencies]
 indexmap = "2.1.0"
 num_cpus = "1.16.0"
-spider = { version = "1.85.4", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
-pyo3 = { version = "0.20.3", features = ["extension-module"] }
+spider = { version = "1.86.11", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache", "serde", "openai" ] }
+pyo3 = { version = "0.20.3", features = ["extension-module", "serde"] }
 pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
+serde_json = "1.0.114"
 
 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9.96", features = ["vendored"] }
diff --git a/book/src/website.md b/book/src/website.md
index 616a691..5c64b04 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -236,6 +236,47 @@ async def main():
 asyncio.run(main())
 ```
 
+### OpenAI
+
+Use OpenAI to generate dynamic JavaScript snippets to run with headless Chrome. Make sure to set the `OPENAI_API_KEY` env variable.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_openai({"model": "gpt-3.5-turbo", "prompt": "Search for movies", "max_tokens": 300})
+
+asyncio.run(main())
+```
+
+### Screenshots
+
+Take a screenshot of each page on the crawl when using headless Chrome.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = (
+        Website("https://choosealicense.com", False)
+        .with_screenshot({
+            "params": {
+                "cdp_params": None,
+                "full_page": True,
+                "omit_background": False
+            },
+            "bytes": False,
+            "save": True,
+            "output_dir": None
+        })
+    )
+
+asyncio.run(main())
+```
+
+
 ### Http2 Prior Knowledge
 
 Use http2 to connect if you know the website servers supports this.
diff --git a/examples/builder.py b/examples/builder.py
index e518072..77d8501 100644
--- a/examples/builder.py
+++ b/examples/builder.py
@@ -3,8 +3,13 @@
 from spider_rs import Website
 
 async def main():
-    website = Website("https://choosealicense.com", False).with_agent("BotBot").with_headers({ "authorization": "Something "})
+    website = (
+        Website("https://choosealicense.com", False)
+        .with_user_agent("BotBot")
+        .with_headers({"authorization": "Something "})
+    )
     website.crawl()
     print(website.get_links())
 
-asyncio.run(main())
\ No newline at end of file
+
+asyncio.run(main())
diff --git a/examples/screenshot.py b/examples/screenshot.py
new file mode 100644
index 0000000..c027556
--- /dev/null
+++ b/examples/screenshot.py
@@ -0,0 +1,29 @@
+import asyncio
+
+from spider_rs import Website
+
+async def main():
+    website = (
+        Website("https://choosealicense.com", False)
+        .with_screenshot({
+            "params": {
+                "cdp_params": {
+                    "format": None,
+                    "quality": None,
+                    "clip": None,
+                    "from_surface": None,
+                    "capture_beyond_viewport": None
+                },
+                "full_page": True,
+                "omit_background": False
+            },
+            "bytes": False,
+            "save": True,
+            "output_dir": None
+        })
+    )
+    website.crawl(None, None, True)
+    print(website.get_links())
+
+
+asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
index 2df10fc..83459e0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,14 @@
 [build-system]
-requires = ["maturin>=1,<2"]
+requires = ["maturin>=1.0,<2.0"]
 build-backend = "maturin"
 
+[tool.maturin]
+features = ["pyo3/extension-module"]
+
 [project]
 name = "spider_rs"
 requires-python = ">=3.7"
+description = "The fastest web crawler written in Rust"
 classifiers = [
     "Programming Language :: Rust",
     "Programming Language :: Python :: Implementation :: CPython",
diff --git a/src/lib.rs b/src/lib.rs
index 6f1f2ab..c9d9536 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,11 +11,13 @@ pub mod npage;
 pub mod nwebsite;
 pub mod page;
 pub mod shortcut;
+pub mod utils;
 pub mod website;
 
 pub use npage::{new_page, page_title, NPage};
 pub use nwebsite::NWebsite;
 pub use page::Page;
+pub use utils::pydict_to_json_value;
 pub use website::Website;
 
 #[pyfunction]
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..3704560
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,70 @@
+use pyo3::types::{PyAny, PyDict, PyList};
+use pyo3::PyResult;
+use serde_json::Value as JsonValue;
+
+/// Convert a Python object into a serde_json value.
+pub fn pyobj_to_json_value(obj: &PyAny) -> PyResult<JsonValue> {
+  // Handle None
+  if obj.is_none() {
+    return Ok(JsonValue::Null);
+  }
+  // Handle boolean
+  else if let Ok(val) = obj.extract::<bool>() {
+    return Ok(JsonValue::Bool(val));
+  }
+  // Handle integers
+  else if let Ok(val) = obj.extract::<i64>() {
+    return Ok(JsonValue::Number(val.into()));
+  }
+  // Handle floats
+  else if let Ok(val) = obj.extract::<f64>() {
+    if let Some(num) = serde_json::Number::from_f64(val) {
+      return Ok(JsonValue::Number(num));
+    } else {
+      return Err(pyo3::exceptions::PyValueError::new_err(
+        "Float value out of range",
+      ));
+    }
+  }
+  // Handle strings
+  else if let Ok(val) = obj.extract::<&str>() {
+    return Ok(JsonValue::String(val.to_string()));
+  }
+  // Handle lists
+  else if let Ok(list) = obj.downcast::<PyList>() {
+    let mut vec = Vec::new();
+    for item in list.iter() {
+      vec.push(pyobj_to_json_value(item)?);
+    }
+    return Ok(JsonValue::Array(vec));
+  }
+  // Handle dictionaries
+  else if let Ok(dict) = obj.downcast::<PyDict>() {
+    let mut map = serde_json::Map::new();
+    for (k, v) in dict.iter() {
+      let key: &str = k.extract()?;
+      let value = pyobj_to_json_value(v)?;
+      map.insert(key.to_string(), value);
+    }
+    return Ok(JsonValue::Object(map));
+  }
+  // Catch-all for unsupported types
+  else {
+    Err(pyo3::exceptions::PyTypeError::new_err(
+      "Unsupported Python type",
+    ))
+  }
+}
+
+/// Convert a Python dict into a serde_json value.
+pub fn pydict_to_json_value(py_dict: &pyo3::types::PyDict) -> PyResult<JsonValue> {
+  let mut map = serde_json::Map::new();
+
+  for (k, v) in py_dict.iter() {
+    let key: &str = k.extract()?;
+    let value: JsonValue = pyobj_to_json_value(v)?;
+    map.insert(key.to_string(), value);
+  }
+
+  Ok(serde_json::Value::Object(map))
+}
diff --git a/src/website.rs b/src/website.rs
index dca877f..439dbb6 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -1,6 +1,7 @@
-use crate::{new_page, NPage, BUFFER};
 use indexmap::IndexMap;
 use pyo3::prelude::*;
+use crate::{new_page, pydict_to_json_value, NPage, BUFFER};
+use pyo3::types::PyDict;
 use spider::compact_str::CompactString;
 use spider::configuration::WaitForIdleNetwork;
 use spider::tokio::select;
@@ -741,24 +742,67 @@ impl Website {
     slf
   }
 
+  /// Take a screenshot of the page when using chrome.
+  pub fn with_screenshot<'a>(
+    mut slf: PyRefMut<'a, Self>,
+    screenshot_configs: Option<&'a PyDict>,
+  ) -> PyRefMut<'a, Self> {
+    if let Some(py_obj) = screenshot_configs {
+      if let Ok(config_json) = pydict_to_json_value(py_obj) {
+        match serde_json::from_value::<spider::configuration::ScreenShotConfig>(config_json) {
+          Ok(configs) => {
+            slf.inner.with_screenshot(Some(configs));
+          }
+          Err(e) => {
+            spider::utils::log("", e.to_string());
+          }
+        }
+      } else {
+        spider::utils::log("Error converting PyDict to JSON for the screenshot config", "");
+      }
+    }
+
+    slf
+  }
+
+  /// Use OpenAI to generate dynamic javascript snippets. Make sure to set the `OPENAI_API_KEY` env variable.
+  pub fn with_openai<'a>(
+    mut slf: PyRefMut<'a, Self>,
+    openai_configs: Option<&'a PyDict>,
+  ) -> PyRefMut<'a, Self> {
+    if let Some(py_obj) = openai_configs {
+      if let Ok(config_json) = pydict_to_json_value(py_obj) {
+        match serde_json::from_value::<spider::configuration::GPTConfigs>(config_json) {
+          Ok(configs) => {
+            slf.inner.with_openai(Some(configs));
+          }
+          Err(e) => {
+            spider::utils::log("", e.to_string());
+          }
+        }
+      } else {
+        spider::utils::log("Error converting PyDict to JSON for the OpenAI config", "");
+      }
+    }
+
+    slf
+  }
+
   /// Regex black list urls from the crawl
   pub fn with_blacklist_url(
     mut slf: PyRefMut<'_, Self>,
     blacklist_url: Option<Vec<String>>,
   ) -> PyRefMut<'_, Self> {
-    slf
-      .inner
-      .configuration
-      .with_blacklist_url(match blacklist_url {
-        Some(v) => {
-          let mut blacklist: Vec<CompactString> = Vec::new();
-          for item in v {
-            blacklist.push(CompactString::new(item));
-          }
-          Some(blacklist)
+    slf.inner.with_blacklist_url(match blacklist_url {
+      Some(v) => {
+        let mut blacklist: Vec<CompactString> = Vec::new();
+        for item in v {
+          blacklist.push(CompactString::new(item));
        }
-        _ => None,
-      });
+        Some(blacklist)
+      }
+      _ => None,
+    });
 
     slf
  }
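
Below is a combined usage sketch of the two builders this patch introduces, assembled from the book examples and `examples/screenshot.py` above rather than taken verbatim from the patch. It assumes the dict keys mirror the snake_case fields of the underlying Rust `GPTConfigs` and `ScreenShotConfig` structs, and that the third argument to `crawl()` requests headless Chrome, as in `examples/screenshot.py`.

```py
import asyncio

from spider_rs import Website

async def main():
    # Builder sketch combining the new options; config keys assume the
    # snake_case fields of the Rust GPTConfigs/ScreenShotConfig structs.
    website = (
        Website("https://choosealicense.com", False)
        .with_openai({
            "model": "gpt-3.5-turbo",
            "prompt": "Search for movies",
            "max_tokens": 300
        })
        .with_screenshot({
            "params": {
                "cdp_params": None,
                "full_page": True,
                "omit_background": False
            },
            "bytes": False,
            "save": True,
            "output_dir": None
        })
    )
    # Third argument assumed to request headless Chrome, as in examples/screenshot.py.
    website.crawl(None, None, True)
    print(website.get_links())

asyncio.run(main())
```

Because `with_openai` and `with_screenshot` each return the builder, they chain with the existing `with_*` options such as `with_user_agent` and `with_headers`.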