From 13403984491cea892badaa3ee3e3b7d0df8f6824 Mon Sep 17 00:00:00 2001 From: djkato Date: Mon, 8 Jul 2024 20:20:36 +0200 Subject: [PATCH] almost finished first sitemap logic I think --- Cargo.lock | 1 + rust-toolchain.toml | 2 +- sitemap-generator/Cargo.toml | 2 + sitemap-generator/src/app.rs | 4 +- sitemap-generator/src/main.rs | 2 +- sitemap-generator/src/sitemap/category.rs | 1 + sitemap-generator/src/sitemap/collection.rs | 1 + .../src/sitemap/event_handler.rs | 192 ++++++++++++++++-- sitemap-generator/src/sitemap/mod.rs | 87 +++++--- sitemap-generator/src/sitemap/page.rs | 1 + sitemap-generator/src/sitemap/product.rs | 1 + sitemap-generator/src/test.rs | 30 ++- 12 files changed, 276 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bcf0ee4..d97cf4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3938,6 +3938,7 @@ dependencies = [ "serde_json", "surf", "tera", + "thiserror", "tinytemplate", "tokio", "tower", diff --git a/rust-toolchain.toml b/rust-toolchain.toml index afae8a9..c7dd9c8 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "nightly-2024-04-25" +channel = "nightly-2024-06-20" ## Toggle to this one for sdk releases # channel = "stable" targets = ["x86_64-unknown-linux-gnu", "wasm32-unknown-unknown"] diff --git a/sitemap-generator/Cargo.toml b/sitemap-generator/Cargo.toml index 5c202b5..1cd79da 100644 --- a/sitemap-generator/Cargo.toml +++ b/sitemap-generator/Cargo.toml @@ -28,6 +28,7 @@ tower-http = { workspace = true, features = ["fs", "trace"] } surf.workspace = true cynic = { workspace = true, features = ["http-surf"] } cynic-codegen.workspace = true +thiserror.workspace = true tera = { version = "1.19.1", default-features = false } fd-lock = "4.0.2" quick-xml = { version = "0.34.0", features = ["serialize"] } @@ -37,6 +38,7 @@ chrono = { version = "0.4.34", features = ["serde"] } serde_cbor = "0.11.2" pico-args = "0.5.0" rayon = "1.10.0" +# itertools = "0.13.0" [build-dependencies] cynic-codegen.workspace = true diff --git a/sitemap-generator/src/app.rs b/sitemap-generator/src/app.rs index 82cf6c9..d92cbae 100644 --- a/sitemap-generator/src/app.rs +++ b/sitemap-generator/src/app.rs @@ -10,6 +10,8 @@ use saleor_app_sdk::{config::Config, manifest::AppManifest, SaleorApp}; use serde::{Deserialize, Serialize}; use tracing::level_filters::LevelFilter; +use crate::queries::event_subjects_updated::Event; + // Make our own error that wraps `anyhow::Error`. pub struct AppError(anyhow::Error); @@ -60,7 +62,7 @@ pub struct AppState { pub target_channel: String, pub sitemap_config: SitemapConfig, pub manifest: AppManifest, - pub task_queue_sender: Sender, + pub task_queue_sender: Sender, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/sitemap-generator/src/main.rs b/sitemap-generator/src/main.rs index a7db11a..b6add7e 100644 --- a/sitemap-generator/src/main.rs +++ b/sitemap-generator/src/main.rs @@ -31,7 +31,7 @@ use tracing::{debug, error, info}; use crate::{ app::{trace_to_std, AppState, SitemapConfig}, queries::event_subjects_updated::EVENTS_QUERY, - routes::{create_routes, register::regenerate}, + routes::create_routes, }; #[tokio::main] diff --git a/sitemap-generator/src/sitemap/category.rs b/sitemap-generator/src/sitemap/category.rs index e69de29..8b13789 100644 --- a/sitemap-generator/src/sitemap/category.rs +++ b/sitemap-generator/src/sitemap/category.rs @@ -0,0 +1 @@ + diff --git a/sitemap-generator/src/sitemap/collection.rs b/sitemap-generator/src/sitemap/collection.rs index e69de29..8b13789 100644 --- a/sitemap-generator/src/sitemap/collection.rs +++ b/sitemap-generator/src/sitemap/collection.rs @@ -0,0 +1 @@ + diff --git a/sitemap-generator/src/sitemap/event_handler.rs b/sitemap-generator/src/sitemap/event_handler.rs index e0e7da0..43e79e2 100644 --- a/sitemap-generator/src/sitemap/event_handler.rs +++ b/sitemap-generator/src/sitemap/event_handler.rs @@ -1,30 +1,129 @@ +use quick_xml::DeError; use rayon::prelude::*; use std::{ - fs::{read_dir, File}, + fs::{self, read_dir, File}, io::BufReader, + path::PathBuf, }; +use tinytemplate::TinyTemplate; -use crate::queries::event_subjects_updated::Event; +use crate::{app::SitemapConfig, queries::event_subjects_updated::Event, sitemap::Url}; use tokio::{sync::mpsc::Receiver, task::JoinHandle}; -use tracing::warn; +use tracing::{debug, error, trace, warn}; -use super::UrlSet; +use super::{RefType, UrlSet}; + +// 10k links google says, but there's also a size limit and my custom params might be messing with +// that? Rather split prematurely to be sure. +const MAX_URL_IN_SET: usize = 6000; pub struct EventHandler { - receiver: Receiver, + receiver: Receiver<(Event, SitemapConfig)>, } impl EventHandler { - pub fn start(receiver: Receiver) -> JoinHandle<()> { + pub fn start(receiver: Receiver<(Event, SitemapConfig)>) -> JoinHandle<()> { let mut s = Self { receiver }; tokio::spawn(s.listen()) } async fn listen(mut self) { - while let Some(message) = self.receiver.recv().await { + while let Some((message, sitemap_config)) = self.receiver.recv().await { match message { Event::ProductCreated(product) => {} - Event::ProductUpdated(product) => {} + Event::ProductUpdated(product) => { + if let Some(product) = product.product { + let mut url_sets = read_xmls(&sitemap_config.target_folder).await; + let mut was_any_set_affected = false; + + //in case no sitemaps exist yet, create first urlset + if url_sets.is_empty() { + let url_set = UrlSet::new(); + url_sets.push(( + url_set, + std::path::Path::new(&format!( + "{}/0.xml", + sitemap_config.target_folder + )) + .to_path_buf(), + )); + } + + // check if any url_sets contain affected urls + for (set, path) in &mut url_sets { + let mut affected_urls = set.find_urls(product.id.inner()); + + if affected_urls.len() == 0 { + trace!("Product doesn't exist in url_set {:?}", path); + continue; + } + was_any_set_affected = true; + + // Update affected urls + affected_urls.iter_mut().for_each(|url| { + let mut templater = TinyTemplate::new(); + templater + .add_template("product", &sitemap_config.product_template) + .expect("Check your url templates!"); + let new_loc = templater + .render("product", &product) + .expect("Check your url templates!"); + debug!("updated `{}` to `{}`", &url.loc, new_loc); + url.loc = new_loc; + }); + } + + //create product url if no set contained url with it + if !was_any_set_affected { + debug!("Product isn't in any sitemap, creating..."); + if let Some((last_url_set, _)) = url_sets.last_mut() { + if product.category.is_none() { + debug!("product missing category, hopefully not needed in url template?"); + } + last_url_set.url.push(Url::new_with_ref( + product.id.inner().to_owned(), + product.slug, + RefType::Product, + product.category.clone().map(|c| c.id.inner().to_owned()), + product.category.clone().map(|c| c.slug), + Some(RefType::Category), + )); + } + } + + let mut split_url_sets = vec![]; + //write first time, if some throw too long error, split and try in second + //loop + for url_set in url_sets { + if let Err(e) = write_urlset_to_file(&url_set).await { + match e { + WriteUrlSetToFileErr::UrlSetTooLong(l) => { + debug!("url set too large ({l}), splitting..."); + if let Some(mut new_url_sets) = + split_urlset_to_new_file(url_set).await + { + split_url_sets.append(&mut new_url_sets); + } + } + e => error!("{:?}", e), + } + }; + } + + //the second attempt + for url_set in split_url_sets { + if let Err(e) = write_urlset_to_file(&url_set).await { + match e { + WriteUrlSetToFileErr::UrlSetTooLong(l) => { + error!("url set STILL too large?? ({l}), ignoring url set {:?}...", url_set); + } + e => error!("{:?}", e), + } + }; + } + } + warn!("Event::ProductCreated missing product"); + } Event::ProductDeleted(product) => {} Event::CategoryCreated(category) => {} Event::CategoryUpdated(category) => {} @@ -41,9 +140,9 @@ impl EventHandler { } } -async fn read_xmls() { - let paths = read_dir(std::env::var("SITEMAP_TARGET_FOLDER").unwrap()).unwrap(); - let mut all_urls: Vec = paths +async fn read_xmls(target_folder: &str) -> Vec<(UrlSet, PathBuf)> { + let paths = read_dir(target_folder).unwrap(); + let all_urls: Vec<(UrlSet, PathBuf)> = paths .into_iter() .par_bridge() .filter_map(|path| { @@ -51,10 +150,79 @@ async fn read_xmls() { if path.path().is_file() { let file = File::open(path.path()).expect("Unable to open file"); let reader = BufReader::new(file); - return Some(quick_xml::de::from_reader(reader).unwrap()); + return Some((quick_xml::de::from_reader(reader).unwrap(), path.path())); } } return None; }) .collect(); + all_urls +} + +/** +* fails `if url_set.url.len() > MAX_URL_IN_SET` +*/ +async fn split_urlset_to_new_file(union: (UrlSet, PathBuf)) -> Option> { + let (url_set, path) = union; + + if url_set.url.len() < MAX_URL_IN_SET { + return None; + } + + let mut was_original_file_assigned = false; + let chunks = url_set.url.chunks(MAX_URL_IN_SET).collect::>(); + + let mut file_number = path + .file_stem() + .unwrap() + .to_str() + .unwrap() + .parse::() + .unwrap(); + + return Some( + chunks + .into_iter() + .map(|urls| { + let folder = path.clone().parent().unwrap().to_str().unwrap().to_owned(); + + //keep incrementing file number till a file with that number is free to use + if !was_original_file_assigned { + was_original_file_assigned = true + } else { + while !std::path::Path::new(&format!("{folder}/{file_number}.xml")).exists() { + file_number = file_number + 1; + } + } + + let mut url_set = UrlSet::new(); + url_set.url = urls.into(); + ( + url_set, + std::path::Path::new(&format!("{folder}/{file_number}.xml")).to_path_buf(), + ) + }) + .collect::>(), + ); +} + +async fn write_urlset_to_file( + url_set_n_path: &(UrlSet, PathBuf), +) -> Result<(), WriteUrlSetToFileErr> { + let (url_set, path) = url_set_n_path; + if url_set.url.len() > MAX_URL_IN_SET { + return Err(WriteUrlSetToFileErr::UrlSetTooLong(url_set.url.len())); + } + fs::write(path, &quick_xml::se::to_string(&url_set)?)?; + Ok(()) +} + +#[derive(thiserror::Error, Debug)] +pub enum WriteUrlSetToFileErr { + #[error("writing error")] + IoResult(#[from] std::io::Error), + #[error("Url set length exeeds xml standard of 10k entries per file")] + UrlSetTooLong(usize), + #[error("{0}")] + DeError(#[from] DeError), } diff --git a/sitemap-generator/src/sitemap/mod.rs b/sitemap-generator/src/sitemap/mod.rs index d27dfb8..d92af64 100644 --- a/sitemap-generator/src/sitemap/mod.rs +++ b/sitemap-generator/src/sitemap/mod.rs @@ -8,13 +8,10 @@ use chrono::{DateTime, FixedOffset, SubsecRound}; use quick_xml::DeError; use serde::{Deserialize, Serialize}; - const SITEMAP_XMLNS: &str = "http://sitemaps.org/schemas/sitemap/0.9"; const SALEOR_REF_XMLNS: &str = "http://app-sitemap-generator.kremik.sk/xml-schemas/saleor-ref.xsd"; - - -#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)] #[serde(rename = "urlset")] pub struct UrlSet { #[serde(rename = "@xmlns:saleor")] @@ -24,28 +21,44 @@ pub struct UrlSet { pub url: Vec, } -#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)] pub struct Url { pub loc: String, pub lastmod: DateTime, #[serde(rename = "saleor:ref")] pub saleor_ref: SaleorRef, } + +#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)] pub enum RefType { Product, - + Category, + Collection, + Page, } -#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)] pub struct SaleorRef { #[serde(rename = "saleor:id")] pub id: String, #[serde(rename = "saleor:type")] - pub typ: String, - #[serde(rename = "saleor:category-id")] + pub typ: RefType, + /** + Related items come first in url, if present. eg: + site.com/{page} : typ = RefType::Page + site.com/{category}/{product} : typ= Product, related_typ: Category + */ + #[serde(rename = "saleor:related-id")] + #[serde(skip_serializing_if = "Option::is_none")] + pub related_id: Option, + /** + Related items come first in url, if present. eg: + site.com/{page} : typ = RefType::Page + site.com/{category}/{product} : typ= Product, related_typ: Category + */ + #[serde(rename = "saleor:related-typ")] #[serde(skip_serializing_if = "Option::is_none")] - pub category_id: Option, - pub + pub related_typ: Option, } impl UrlSet { @@ -76,14 +89,25 @@ impl UrlSet { url: vec![], } } + + pub fn find_urls(&mut self, id: &str) -> Vec<&mut Url> { + self.url + .iter_mut() + .filter(|url| { + url.saleor_ref.id == id || url.saleor_ref.related_id == Some(id.to_owned()) + }) + .collect() + } } impl Url { - pub fn new_generic_url(id: String, slug: String) -> Self { + pub fn new(id: String, slug: String, typ: RefType) -> Self { Self { saleor_ref: SaleorRef { - product_id: None, - category_id: Some(id), + id, + typ, + related_id: None, + related_typ: None, }, lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1), // Have template string determine the url @@ -91,20 +115,35 @@ impl Url { } } - pub fn new_product_url( - category_id: String, - product_id: String, - category_slug: String, - product_slug: String, + /** + For exaple: product/category, product/collection + */ + pub fn new_with_ref( + id: String, + slug: String, + typ: RefType, + related_id: Option, + related_slug: Option, + related_typ: Option, ) -> Self { + let loc = match related_slug { + Some(r_s) => { + format!("https://example.com/{r_s}/{slug}") + } + None => { + format!("https://example.com/{slug}") + } + }; Self { - // Have template string determine the url - loc: format!("https://example.com/{category_slug}/{product_slug}"), - lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1), saleor_ref: SaleorRef { - product_id: Some(product_id), - category_id: Some(category_id), + id, + typ, + related_id, + related_typ, }, + lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1), + // Have template string determine the url + loc, } } } diff --git a/sitemap-generator/src/sitemap/page.rs b/sitemap-generator/src/sitemap/page.rs index e69de29..8b13789 100644 --- a/sitemap-generator/src/sitemap/page.rs +++ b/sitemap-generator/src/sitemap/page.rs @@ -0,0 +1 @@ + diff --git a/sitemap-generator/src/sitemap/product.rs b/sitemap-generator/src/sitemap/product.rs index e69de29..8b13789 100644 --- a/sitemap-generator/src/sitemap/product.rs +++ b/sitemap-generator/src/sitemap/product.rs @@ -0,0 +1 @@ + diff --git a/sitemap-generator/src/test.rs b/sitemap-generator/src/test.rs index de4c1cd..af63b61 100644 --- a/sitemap-generator/src/test.rs +++ b/sitemap-generator/src/test.rs @@ -1,23 +1,35 @@ #[cfg(test)] mod test { - use crate::sitemap::{Url, UrlSet}; + use crate::sitemap::{RefType, Url, UrlSet}; fn urlset_serialisation_isnt_lossy() { let mut url_set = UrlSet::new(); url_set.url.append(&mut vec![ - Url::new_generic_url("category1coolid".to_string(), "category1".to_string()), - Url::new_generic_url("category2coolid".to_string(), "category2".to_string()), - Url::new_product_url( + Url::new( "category1coolid".to_string(), "category1".to_string(), - "product1coolid".to_string(), - "product1".to_string(), + RefType::Category, ), - Url::new_product_url( + Url::new( + "Collection1".to_string(), + "Collection1coolid".to_string(), + RefType::Collection, + ), + Url::new_with_ref( + "category1coolid".to_string(), + "category1".to_string(), + RefType::Product, + Some("product1coolid".to_string()), + Some("product1".to_string()), + Some(RefType::Category), + ), + Url::new_with_ref( "category2coolid".to_string(), "category2".to_string(), - "product2coolid".to_string(), - "product2".to_string(), + RefType::Product, + Some("product2coolid".to_string()), + Some("product2".to_string()), + Some(RefType::Category), ), ]); let file_str = url_set.to_file().unwrap();