From 46d901efd9ff756d33a634d0cc108b1d485bff9c Mon Sep 17 00:00:00 2001 From: Alex Kristiansen Date: Thu, 26 Oct 2023 17:23:26 -0700 Subject: [PATCH 1/2] number of bugfixes, add historical_metadata --- kirum/src/entries.rs | 28 ++++++-- kirum/src/files.rs | 1 + kirum/src/generate.rs | 5 +- kirum/src/ingest/json.rs | 14 +++- kirum/src/main.rs | 5 +- kirum/src/new.rs | 3 + kirum/src/tmpl.rs | 1 + libkirum/src/kirum.rs | 130 +++++++++++++++++++++++++++++++++++-- libkirum/src/matching.rs | 3 + libkirum/src/transforms.rs | 2 +- readme.md | 1 + 11 files changed, 174 insertions(+), 19 deletions(-) diff --git a/kirum/src/entries.rs b/kirum/src/entries.rs index c079df4..5311793 100644 --- a/kirum/src/entries.rs +++ b/kirum/src/entries.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use anyhow::{anyhow, Result}; use libkirum::{word::{PartOfSpeech, Etymology}, kirum::Lexis, transforms::{TransformFunc, Transform}, matching::LexisMatch, lemma::Lemma}; use serde::{Serialize, Deserialize}; use serde_with::skip_serializing_none; @@ -46,8 +47,12 @@ pub struct RawLexicalEntry { #[serde(default = "default_archaic")] /// Optional user tagging pub archaic: bool, + /// Optional tags used for user-filtering pub tags: Option>, - /// A tag that tells Kirum to generate the word based on the phonetic rule set specified by the tag + /// Optional metadata values used for filtering, and ordering. + /// Unlike tags, historical_metadata will be copied to any derivative words, and can be used for templating, filtering, etc + pub historical_metadata: Option>, + /// A key that tells Kirum to generate the word based on the phonetic rule set specified by the tag pub generate: Option, /// Words that will be added as a derivative of the enclosing Lexis; any value not specified will be taken from the enclosing entry. pub derivatives: Option> @@ -82,6 +87,7 @@ impl From for Lexis{ definition: source.definition, archaic: source.archaic, tags: source.tags.unwrap_or(Vec::new()), + historical_metadata: source.historical_metadata.unwrap_or(HashMap::new()), word_create: source.generate } } @@ -97,6 +103,7 @@ impl From for RawLexicalEntry{ etymology: None, archaic: value.archaic, tags: if !value.tags.is_empty() {Some(value.tags)} else {None}, + historical_metadata: if !value.historical_metadata.is_empty() {Some(value.historical_metadata)} else {None}, derivatives: None, generate: value.word_create } @@ -104,7 +111,10 @@ impl From for RawLexicalEntry{ } /// take the output of a call to to_vec_etymons() and structure it like a graph json file structure -pub fn create_json_graph(lex: Vec<(Lexis, Etymology)>,mut key_gen: F) -> WordGraph +/// If render_metadata is false, any historical_metadata fields will not be copied. +/// This is useful in situations where we're writing out derivative values, and don't want metadata that will be +/// re-derived during ingest to get copied over +pub fn create_json_graph(lex: Vec<(Lexis, Etymology)>,mut key_gen: F, render_metadata: bool) -> Result where F: FnMut(Lexis) -> String { let mut graph: HashMap = HashMap::new(); @@ -112,9 +122,15 @@ pub fn create_json_graph(lex: Vec<(Lexis, Etymology)>,mut key_gen: F) -> Word for (word, ety) in lex{ let base: RawLexicalEntry = word.clone().into(); let found_ety = if !ety.etymons.is_empty() {Some(ety)} else {None}; - let complete = RawLexicalEntry{etymology: found_ety, ..base}; + let mut complete = RawLexicalEntry{etymology: found_ety, ..base}; + if !render_metadata{ + complete.historical_metadata = None + } let key = key_gen(word); - graph.insert(key, complete); - } - WordGraph { words: graph } + let found = graph.insert(key.clone(), complete.clone()); + if let Some(existing) = found{ + return Err(anyhow!("Key {} already exists in map; existing: '{}' \n new:' '{}'", key, existing.definition, complete.definition)) + } + }; + Ok( WordGraph { words: graph }) } \ No newline at end of file diff --git a/kirum/src/files.rs b/kirum/src/files.rs index 5f1fbdb..c674d23 100644 --- a/kirum/src/files.rs +++ b/kirum/src/files.rs @@ -89,6 +89,7 @@ pub fn read_tree_files(files: &Vec) -> Result Result<()> { }, cli::Format::Json => { let words = computed.to_vec_etymons(|_|true); - let word_data = create_json_graph(words, |l| l.id); + let word_data = create_json_graph(words, |l| l.id, false) + .context("could not create map from language data")?; serde_json::to_string_pretty(&word_data)? } diff --git a/kirum/src/new.rs b/kirum/src/new.rs index 6ee6e1c..d28a221 100644 --- a/kirum/src/new.rs +++ b/kirum/src/new.rs @@ -49,12 +49,14 @@ pub fn create_new_project(name: &str) -> Result<()> { etymology: None, archaic: true, tags: None, + historical_metadata: None, derivatives: None, generate: None, }); word_map.insert("latin_example".into(), RawLexicalEntry { word: None, word_type: Some("word".into()), + historical_metadata: None, language: Some("Latin".into()), definition: "an instance, model, example".into(), part_of_speech: Some(libkirum::word::PartOfSpeech::Noun), @@ -71,6 +73,7 @@ pub fn create_new_project(name: &str) -> Result<()> { etymology: None, archaic: true, tags: None, + historical_metadata: None, derivatives: None, generate: None, }, diff --git a/kirum/src/tmpl.rs b/kirum/src/tmpl.rs index a4fb69d..47042a5 100644 --- a/kirum/src/tmpl.rs +++ b/kirum/src/tmpl.rs @@ -7,6 +7,7 @@ use anyhow::{Result, Context, anyhow}; /// Render a dictionary from a list of words, and a template pub fn generate_from_tmpl(rendered_lang: Vec, template_file: String, rhai_files: Option>) -> Result { let mut reg = Handlebars::new(); + reg.register_escape_fn(handlebars::no_escape); reg.register_helper("string_eq", Box::new(string_eq)); reg.register_template_file("tmpl", &template_file).context(format!("could not add template file {}", template_file))?; if let Some(files) = rhai_files{ diff --git a/libkirum/src/kirum.rs b/libkirum/src/kirum.rs index 91a31b0..13eaa86 100644 --- a/libkirum/src/kirum.rs +++ b/libkirum/src/kirum.rs @@ -11,7 +11,7 @@ use petgraph::stable_graph::NodeIndex; use petgraph::Graph; use log::{trace, debug}; -#[derive(Clone, Default, PartialEq, serde::Deserialize, serde::Serialize)] +#[derive(Clone, Default, serde::Deserialize, serde::Serialize)] /// A Lexis represents a headword in Kirum's lexicon, be it a word, word stem, morpheme, etc. pub struct Lexis { /// Optional ID for the lex, used by connect_etymology_id @@ -32,12 +32,34 @@ pub struct Lexis { /// Optional user-supplied tags //#[serde(skip)] pub tags: Vec, + /// Optional user-supplied metadata. Unlike tags, historical_metadata will trickle down to any derivative words. + /// This shared metadata can be used to track common qualities of words, for filtering, templating, etc + pub historical_metadata: HashMap, /// Optional field that can be used to randomly generate a word value if none exists, separate from any etymology. /// If the given word has no etymology, this value takes prescience. /// The string value is used to generate a word based on the underlying phonology rules supplied to the TreeEtymology structure. pub word_create: Option } +// this custom implementation exists because we don't want history metadata to count towards equality +// as the metadata field might shift while the graph is still being built. +impl PartialEq for Lexis { + fn eq(&self, other: &Self) -> bool { + self.id == other.id && + self.word == other.word && + self.language == other.language && + self.pos == other.pos && + self.lexis_type == other.lexis_type && + self.definition == other.definition && + self.archaic == other.archaic && + self.tags == other.tags && + self.word_create == other.word_create + + } + fn ne(&self, other: &Self) -> bool { + ! self.eq(other) + } +} impl std::fmt::Debug for Lexis { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -194,11 +216,21 @@ impl LanguageTree { } } + + if ety_idx.is_none(){ + ety_idx = Some(self.graph.add_node(etymon)); + } + if lex_idx.is_none(){ lex_idx = Some(self.graph.add_node(lex)); } - if ety_idx.is_none(){ - ety_idx = Some(self.graph.add_node(etymon)); + + // trickle down metadata + let ety_metadata = &self.graph[ety_idx.unwrap()].historical_metadata; + if !ety_metadata.is_empty() { + let mut new = ety_metadata.clone(); + new.extend(self.graph[lex_idx.unwrap()].historical_metadata.iter().map(|(k, v)| (k.clone(), v.clone()))); + self.graph[lex_idx.unwrap()].historical_metadata = new; } self.graph.add_edge(ety_idx.unwrap(), lex_idx.unwrap(), TreeEtymology { transforms: trans, intermediate_word: None, agglutination_order }); @@ -216,6 +248,8 @@ impl LanguageTree { None => false } } + + /// Fill out the graph, walking the structure until all possible lexii have been generated or updated. /// This method is idempotent, and can be run any time to calculate unpopulated or incorrect lexii in the language tree. pub fn compute_lexicon(&mut self) { @@ -267,13 +301,16 @@ impl LanguageTree { self.graph[node].word = Some(rendered_word); updated.insert(node, true); + + // merge upstream historical metadata + self.combine_maps_for_lex_idx(&node); // check global transforms if let Some(gt) = &self.global_transforms { let mut updating = self.graph[node].clone(); + let etys: Vec<&Lexis> = self.graph.neighbors_directed(node, Direction::Incoming).map(|e| &self.graph[e]).collect(); for trans in gt { // collect the upstream etymons - let etys: Vec<&Lexis> = self.graph.neighbors_directed(node, Direction::Incoming).map(|e| &self.graph[e]).collect(); - trans.transform(&mut updating, Some(etys)); + trans.transform(&mut updating, Some(&etys)); trace!("updated word {:?} with global transform ", self.graph[node].id); } self.graph[node] = updating; @@ -315,6 +352,17 @@ impl LanguageTree { } } + fn combine_maps_for_lex_idx(&mut self, id: &NodeIndex) { + let etys: Vec = self.graph.neighbors_directed(*id, Direction::Incoming).map(|e| self.graph[e].clone()).collect(); + for ety in etys { + if !ety.historical_metadata.is_empty(){ + self.graph[*id].historical_metadata.extend(ety.historical_metadata.iter().map(|(k, v)| (k.clone(), v.clone()))); + } + + } + } + + /// Walk through each word in the tree, applying the walk_function closure. The closure takes a Lexis value, and returns a tuple of two optional Lexis and Transform values. /// If the closure returns `Some()` for the Lexis value, the enclosed Lexis will be added as a derivative word to the tree. pub fn walk_create_derivatives(&mut self, mut walk_function: impl FnMut(Lexis)->(Option, Option)){ @@ -418,6 +466,7 @@ impl LanguageTree { } + fn join_string_vectors(words: &mut [(i32, Lemma)]) -> Lemma{ words.sort_by_key(|k| k.0); let merged: Vec = words.iter().flat_map(|s| s.1.clone().chars()).collect(); @@ -435,8 +484,10 @@ mod tests { fn create_basic_words() -> LanguageTree { - let parent = Lexis{id: "parent".to_string(), word: Some("wrh".into()), language: "gauntlet".to_string(), lexis_type: "root".to_string(), ..Default::default()}; - let derivative_one = Lexis{id: "derivative_one".to_string(), word: None, lexis_type: "word".to_string(), ..parent.clone()}; + let parent = Lexis{id: "parent".to_string(), word: Some("wrh".into()), language: "gauntlet".to_string(), + historical_metadata: HashMap::from([("test".to_string(), "t".to_string())]), lexis_type: "root".to_string(), ..Default::default()}; + let derivative_one = Lexis{id: "derivative_one".to_string(), word: None, + historical_metadata: HashMap::from([("derivative".to_string(), "one".to_string())]), lexis_type: "word".to_string(), ..parent.clone()}; let derivative_two = Lexis{id: "derivative_two".to_string(), word: None, lexis_type: "word".to_string(), ..parent.clone()}; let transform_one = Transform{name: "first_transform".to_string(), @@ -486,7 +537,72 @@ mod tests { test_tree.compute_lexicon(); let test_word = test_tree.to_vec_etymons(|f| f.language == "New Gauntlet".to_string()); assert_eq!(test_word[0].0.word.clone().unwrap(), Lemma::from("kasurauwarh")) + } + + #[test] + fn test_metadata_derives(){ + let mut test_tree = create_basic_with_globals(); + test_tree.compute_lexicon(); + + let final_dict = test_tree.to_vec(); + for word in final_dict { + assert_eq!((Some(&"t".to_string())), word.historical_metadata.get("test")) + } + } + + #[test] + fn metadata_multiple_object() { + let mut test_tree = create_basic_with_globals(); + test_tree.compute_lexicon(); + + let final_dict = test_tree.to_vec(); + for word in final_dict { + match word.id.as_str() { + "parent" => { + assert_eq!(HashMap::from([("test".to_string(), "t".to_string())]), word.historical_metadata) + }, + "derivative_one"=> { + assert_eq!(HashMap::from([("test".to_string(), "t".to_string()), ("derivative".to_string(), "one".to_string())]), word.historical_metadata) + }, + "derivative_two" => { + assert_eq!(HashMap::from([("test".to_string(), "t".to_string()), ("derivative".to_string(), "one".to_string())]), word.historical_metadata) + } + _ => {assert!(false, "bad map value in test")} + } + } + } + + #[test] + fn metadata_out_of_order() { + let parent = Lexis{id: "parent".to_string(), word: Some("wrh".into()), language: "gauntlet".to_string(), + historical_metadata: HashMap::from([("test".to_string(), "t".to_string())]), lexis_type: "root".to_string(), ..Default::default()}; + let derivative_one = Lexis{id: "derivative_one".to_string(), word: None, + historical_metadata: HashMap::from([("derivative".to_string(), "one".to_string())]), lexis_type: "word".to_string(), ..parent.clone()}; + let derivative_two = Lexis{id: "derivative_two".to_string(), word: None, lexis_type: "word".to_string(), ..parent.clone()}; + + let transform_one = Transform{name: "first_transform".to_string(), + lex_match: None, + transforms: vec![TransformFunc::LetterArray { letters: vec![LetterArrayValues::Place(0), LetterArrayValues::Char("a".into()), LetterArrayValues::Place(1), LetterArrayValues::Place(2)] }] + }; + + let transform_two = Transform{name: "second_transform".to_string(), + lex_match: None, + transforms: vec![TransformFunc::Prefix { value: "au".into() }], + }; + let mut tree = LanguageTree::new(); + + tree.add_lexis(derivative_one.clone()); + tree.connect_etymology_id(derivative_two, derivative_one.id.clone(), vec![transform_two], None); + tree.connect_etymology(derivative_one, parent, vec![transform_one], None); + + + tree.compute_lexicon(); + + let final_dict = tree.to_vec(); + for word in final_dict { + assert_eq!((Some(&"t".to_string())), word.historical_metadata.get("test")) + } } #[test] diff --git a/libkirum/src/matching.rs b/libkirum/src/matching.rs index fdbff01..b513782 100644 --- a/libkirum/src/matching.rs +++ b/libkirum/src/matching.rs @@ -187,6 +187,8 @@ pub enum WhenMatch{ #[cfg(test)] mod tests { + use std::collections::HashMap; + use crate::errors::LangError; use crate::kirum::Lexis; use crate::matching::{Value, ValueMatch, LexisMatch, EqualValue}; @@ -203,6 +205,7 @@ mod tests { definition: "".to_string(), archaic: false, tags: vec!["tag1".to_string(), "tag2".to_string()], + historical_metadata: HashMap::new(), word_create: None }; diff --git a/libkirum/src/transforms.rs b/libkirum/src/transforms.rs index 1fa0703..e478dd9 100644 --- a/libkirum/src/transforms.rs +++ b/libkirum/src/transforms.rs @@ -15,7 +15,7 @@ pub struct GlobalTransform { impl GlobalTransform { /// Transform the given lexis, or return the original unaltered lexis if the specified lexii don't meet the match statements - pub fn transform(&self, lex: &mut Lexis, etymon: Option>) { + pub fn transform(&self, lex: &mut Lexis, etymon: Option<&Vec<&Lexis>>) { // check to see if the etymon should allow us to transform let should_trans = if let Some(ety) = etymon { if let Some(ety_match) = &self.etymon_match { diff --git a/readme.md b/readme.md index dbe1e68..022876e 100644 --- a/readme.md +++ b/readme.md @@ -74,6 +74,7 @@ A Tree file is a JSON object of `Lexis` objects, a maximal example of which is p ] }, "archaic": true, //optional. Used only for sorting and filtering. + "historical_metadata": {"metadata_value":"value"} // Optional historical metadata. Unlike tags, historical metadata is inherited from any etymons. Can also be used for sorting and templates. "tags": [ // optional, user-supplied tags. "example", "default" From 11597d64b44d4e58f889dd943274db4f55d818aa Mon Sep 17 00:00:00 2001 From: Alex Kristiansen Date: Thu, 26 Oct 2023 17:33:19 -0700 Subject: [PATCH 2/2] remove old metadata handler --- libkirum/src/kirum.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/libkirum/src/kirum.rs b/libkirum/src/kirum.rs index 13eaa86..fc35858 100644 --- a/libkirum/src/kirum.rs +++ b/libkirum/src/kirum.rs @@ -225,14 +225,6 @@ impl LanguageTree { lex_idx = Some(self.graph.add_node(lex)); } - // trickle down metadata - let ety_metadata = &self.graph[ety_idx.unwrap()].historical_metadata; - if !ety_metadata.is_empty() { - let mut new = ety_metadata.clone(); - new.extend(self.graph[lex_idx.unwrap()].historical_metadata.iter().map(|(k, v)| (k.clone(), v.clone()))); - self.graph[lex_idx.unwrap()].historical_metadata = new; - } - self.graph.add_edge(ety_idx.unwrap(), lex_idx.unwrap(), TreeEtymology { transforms: trans, intermediate_word: None, agglutination_order }); }