From 83c7e07629f29bc3c98bc2541ad5c2b24151b91e Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 20 Mar 2024 12:35:09 -0400
Subject: [PATCH] feat(openai): add openai flag dynamic js snippets

---
 Cargo.toml             | 11 ++++---
 book/src/website.md    | 41 +++++++++++++++++++++++++
 examples/builder.py    |  9 ++++--
 examples/screenshot.py | 29 +++++++++++++++++
 pyproject.toml         |  6 +++-
 src/lib.rs             |  2 ++
 src/utils.rs           | 70 ++++++++++++++++++++++++++++++++++++++++++
 src/website.rs         | 70 ++++++++++++++++++++++++++++++++++--------
 8 files changed, 217 insertions(+), 21 deletions(-)
 create mode 100644 examples/screenshot.py
 create mode 100644 src/utils.rs

diff --git a/Cargo.toml b/Cargo.toml
index 818dd9f..a6de24a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,9 +1,9 @@
 [package]
 edition = "2021"
 name = "spider_rs"
-version = "0.0.27"
-description = "The fastest web crawler written in Rust ported to nodejs."
-repository = "https://github.com/spider-rs/spider-nodejs"
+version = "0.0.30"
+repository = "https://github.com/spider-rs/spider-py"
+license = "MIT"
 
 [lib]
 crate-type = ["cdylib"]
@@ -11,9 +11,10 @@ crate-type = ["cdylib"]
 [dependencies]
 indexmap = "2.1.0"
 num_cpus = "1.16.0"
-spider = { version = "1.85.4", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache" ] }
-pyo3 = { version = "0.20.3", features = ["extension-module"] }
+spider = { version = "1.86.11", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache", "serde", "openai" ] }
+pyo3 = { version = "0.20.3", features = ["extension-module", "serde"] }
 pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
+serde_json = "1.0.114"
 
 [target.x86_64-unknown-linux-gnu.dependencies]
 openssl-sys = { version = "0.9.96", features = ["vendored"] }
diff --git a/book/src/website.md b/book/src/website.md
index 616a691..5c64b04 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -236,6 +236,47 @@ async def main():
 asyncio.run(main())
 ```
 
+### OpenAI
+
+Use OpenAI to generate dynamic JavaScript snippets to run with headless Chrome. Make sure to set the `OPENAI_API_KEY` env variable.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_openai({"model": "gpt-3.5-turbo", "prompt": "Search for movies", "max_tokens": 300})
+
+asyncio.run(main())
+```
+
+### Screenshots
+
+Take a screenshot of each page on the crawl when using headless Chrome.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = (
+        Website("https://choosealicense.com", False)
+        .with_screenshot({
+            "params": {
+                "cdp_params": None,
+                "full_page": True,
+                "omit_background": False
+            },
+            "bytes": False,
+            "save": True,
+            "output_dir": None
+        })
+    )
+
+asyncio.run(main())
+```
+
+
 ### Http2 Prior Knowledge
 
 Use http2 to connect if you know the website servers supports this.
diff --git a/examples/builder.py b/examples/builder.py
index e518072..77d8501 100644
--- a/examples/builder.py
+++ b/examples/builder.py
@@ -3,8 +3,13 @@
 from spider_rs import Website
 
 async def main():
-    website = Website("https://choosealicense.com", False).with_agent("BotBot").with_headers({ "authorization": "Something "})
+    website = (
+        Website("https://choosealicense.com", False)
+        .with_user_agent("BotBot")
+        .with_headers({"authorization": "Something "})
+    )
     website.crawl()
     print(website.get_links())
 
-asyncio.run(main())
\ No newline at end of file
+
+asyncio.run(main())
diff --git a/examples/screenshot.py b/examples/screenshot.py
new file mode 100644
index 0000000..c027556
--- /dev/null
+++ b/examples/screenshot.py
@@ -0,0 +1,29 @@
+import asyncio
+
+from spider_rs import Website
+
+async def main():
+    website = (
+        Website("https://choosealicense.com", False)
+        .with_screenshot({
+            "params": {
+                "cdp_params": {
+                    "format": None,
+                    "quality": None,
+                    "clip": None,
+                    "from_surface": None,
+                    "capture_beyond_viewport": None
+                },
+                "full_page": True,
+                "omit_background": False
+            },
+            "bytes": False,
+            "save": True,
+            "output_dir": None
+        })
+    )
+    website.crawl(None, None, True)
+    print(website.get_links())
+
+
+asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
index 2df10fc..83459e0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,14 @@
 [build-system]
-requires = ["maturin>=1,<2"]
+requires = ["maturin>=1.0,<2.0"]
 build-backend = "maturin"
 
+[tool.maturin]
+features = ["pyo3/extension-module"]
+
 [project]
 name = "spider_rs"
 requires-python = ">=3.7"
+description = "The fastest web crawler written in Rust"
 classifiers = [
     "Programming Language :: Rust",
     "Programming Language :: Python :: Implementation :: CPython",
diff --git a/src/lib.rs b/src/lib.rs
index 6f1f2ab..c9d9536 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,11 +11,13 @@ pub mod npage;
 pub mod nwebsite;
 pub mod page;
 pub mod shortcut;
+pub mod utils;
 pub mod website;
 
 pub use npage::{new_page, page_title, NPage};
 pub use nwebsite::NWebsite;
 pub use page::Page;
+pub use utils::pydict_to_json_value;
 pub use website::Website;
 
 #[pyfunction]
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..3704560
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,70 @@
+use pyo3::types::{PyAny, PyDict, PyList};
+use pyo3::PyResult;
+use serde_json::Value as JsonValue;
+
+/// Convert a Python object into a serde_json value.
+pub fn pyobj_to_json_value(obj: &PyAny) -> PyResult<JsonValue> {
+  // Handle None
+  if obj.is_none() {
+    return Ok(JsonValue::Null);
+  }
+  // Handle boolean
+  else if let Ok(val) = obj.extract::<bool>() {
+    return Ok(JsonValue::Bool(val));
+  }
+  // Handle integers
+  else if let Ok(val) = obj.extract::<i64>() {
+    return Ok(JsonValue::Number(val.into()));
+  }
+  // Handle floats
+  else if let Ok(val) = obj.extract::<f64>() {
+    if let Some(num) = serde_json::Number::from_f64(val) {
+      return Ok(JsonValue::Number(num));
+    } else {
+      return Err(pyo3::exceptions::PyValueError::new_err(
+        "Float value out of range",
+      ));
+    }
+  }
+  // Handle strings
+  else if let Ok(val) = obj.extract::<&str>() {
+    return Ok(JsonValue::String(val.to_string()));
+  }
+  // Handle lists
+  else if let Ok(list) = obj.downcast::<PyList>() {
+    let mut vec = Vec::new();
+    for item in list.iter() {
+      vec.push(pyobj_to_json_value(item)?);
+    }
+    return Ok(JsonValue::Array(vec));
+  }
+  // Handle dictionaries
+  else if let Ok(dict) = obj.downcast::<PyDict>() {
+    let mut map = serde_json::Map::new();
+    for (k, v) in dict.iter() {
+      let key: &str = k.extract()?;
+      let value = pyobj_to_json_value(v)?;
+      map.insert(key.to_string(), value);
+    }
+    return Ok(JsonValue::Object(map));
+  }
+  // Catch-all for unsupported types
+  else {
+    Err(pyo3::exceptions::PyTypeError::new_err(
+      "Unsupported Python type",
+    ))
+  }
+}
+
+/// Convert a Python dict into a serde_json value.
+pub fn pydict_to_json_value(py_dict: &pyo3::types::PyDict) -> PyResult<JsonValue> {
+  let mut map = serde_json::Map::new();
+
+  for (k, v) in py_dict.iter() {
+    let key: &str = k.extract()?;
+    let value: JsonValue = pyobj_to_json_value(v)?;
+    map.insert(key.to_string(), value);
+  }
+
+  Ok(serde_json::Value::Object(map))
+}
diff --git a/src/website.rs b/src/website.rs
index dca877f..439dbb6 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -1,6 +1,7 @@
-use crate::{new_page, NPage, BUFFER};
 use indexmap::IndexMap;
 use pyo3::prelude::*;
+use crate::{new_page, pydict_to_json_value, NPage, BUFFER};
+use pyo3::types::PyDict;
 use spider::compact_str::CompactString;
 use spider::configuration::WaitForIdleNetwork;
 use spider::tokio::select;
@@ -741,24 +742,67 @@ impl Website {
     slf
   }
 
+  /// Take a screenshot of the page when using chrome.
+  pub fn with_screenshot<'a>(
+    mut slf: PyRefMut<'a, Self>,
+    screenshot_configs: Option<&'a PyDict>,
+  ) -> PyRefMut<'a, Self> {
+    if let Some(py_obj) = screenshot_configs {
+      if let Ok(config_json) = pydict_to_json_value(py_obj) {
+        match serde_json::from_value::<spider::configuration::ScreenShotConfig>(config_json) {
+          Ok(configs) => {
+            slf.inner.with_screenshot(Some(configs));
+          }
+          Err(e) => {
+            spider::utils::log("", e.to_string());
+          }
+        }
+      } else {
+        spider::utils::log("Error converting PyDict to JSON for the screenshot config", "");
+      }
+    }
+
+    slf
+  }
+
+  /// Use OpenAI to generate dynamic javascript snippets. Make sure to set the `OPENAI_API_KEY` env variable.
+  pub fn with_openai<'a>(
+    mut slf: PyRefMut<'a, Self>,
+    openai_configs: Option<&'a PyDict>,
+  ) -> PyRefMut<'a, Self> {
+    if let Some(py_obj) = openai_configs {
+      if let Ok(config_json) = pydict_to_json_value(py_obj) {
+        match serde_json::from_value::<spider::configuration::GPTConfigs>(config_json) {
+          Ok(configs) => {
+            slf.inner.with_openai(Some(configs));
+          }
+          Err(e) => {
+            spider::utils::log("", e.to_string());
+          }
+        }
+      } else {
+        spider::utils::log("Error converting PyDict to JSON for the OpenAI config", "");
+      }
+    }
+
+    slf
+  }
+
   /// Regex black list urls from the crawl
   pub fn with_blacklist_url(
     mut slf: PyRefMut<'_, Self>,
     blacklist_url: Option<Vec<String>>,
   ) -> PyRefMut<'_, Self> {
-    slf
-      .inner
-      .configuration
-      .with_blacklist_url(match blacklist_url {
-        Some(v) => {
-          let mut blacklist: Vec<CompactString> = Vec::new();
-          for item in v {
-            blacklist.push(CompactString::new(item));
-          }
-          Some(blacklist)
+    slf.inner.with_blacklist_url(match blacklist_url {
+      Some(v) => {
+        let mut blacklist: Vec<CompactString> = Vec::new();
+        for item in v {
+          blacklist.push(CompactString::new(item));
        }
-        _ => None,
-      });
+        Some(blacklist)
+      }
+      _ => None,
+    });
 
     slf
  }
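
Below is a combined usage sketch of the two builders this patch introduces, assembled from the book examples and `examples/screenshot.py` above rather than taken verbatim from the patch. It assumes the dict keys mirror the snake_case fields of the underlying Rust `GPTConfigs` and `ScreenShotConfig` structs, and that the third argument to `crawl()` requests headless Chrome, as in `examples/screenshot.py`.

```py
import asyncio

from spider_rs import Website

async def main():
    # Builder sketch combining the new options; config keys assume the
    # snake_case fields of the Rust GPTConfigs/ScreenShotConfig structs.
    website = (
        Website("https://choosealicense.com", False)
        .with_openai({
            "model": "gpt-3.5-turbo",
            "prompt": "Search for movies",
            "max_tokens": 300
        })
        .with_screenshot({
            "params": {
                "cdp_params": None,
                "full_page": True,
                "omit_background": False
            },
            "bytes": False,
            "save": True,
            "output_dir": None
        })
    )
    # Third argument assumed to request headless Chrome, as in examples/screenshot.py.
    website.crawl(None, None, True)
    print(website.get_links())

asyncio.run(main())
```

Because `with_openai` and `with_screenshot` each return the builder, they chain with the existing `with_*` options such as `with_user_agent` and `with_headers`.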