diff --git a/Makefile b/Makefile index 5e114d4..81e5bba 100644 --- a/Makefile +++ b/Makefile @@ -32,18 +32,18 @@ create_all_data: create_sandhi_rules: RUST_LOG=info cargo run --release --bin create_sandhi_rules -- \ - --data-dir data/build/vidyut-0.2.0 + --data-dir data/build/vidyut-latest # Creates a koshas and write it to disk. create_kosha: RUST_LOG=info cargo run --release --bin create_kosha -- \ - --input-dir data/raw/lex --output-dir data/build/vidyut-0.2.0 + --input-dir data/raw/lex --output-dir data/build/vidyut-latest # Trains a padaccheda model and saves important features to disk. # NOTE: when training, exclude the file paths used in `make eval`. train_cheda: cargo run --release --bin train_cheda -- \ - --vidyut-dir "data/build/vidyut-0.2.0" \ + --vidyut-dir "data/build/vidyut-latest" \ --include "data/raw/dcs/conllu/files/**/*.conllu" \ --exclude "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-088*.conllu" \ --exclude "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-089*.conllu" \ @@ -55,13 +55,13 @@ train_cheda: # Runs basic end-to-end tests against the given kosha. test_kosha: - RUST_LOG=info cargo run --release --bin test_kosha -- --data-dir data/build/vidyut-0.2.0/kosha + RUST_LOG=info cargo run --release --bin test_kosha -- --data-dir data/build/vidyut-latest/kosha # Evaluate our parsing quality on a large sample of text. 
eval_cheda: cargo run --release --bin eval_cheda -- \ - --vidyut-dir "data/build/vidyut-0.2.0" \ + --vidyut-dir "data/build/vidyut-latest" \ --paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-088*.conllu" \ --paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-089*.conllu" \ --paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-0900-MBh, 6, BhaGī 18-7707.conllu" diff --git a/scripts/create_all_data.sh b/scripts/create_all_data.sh index e047b20..1d81bb8 100755 --- a/scripts/create_all_data.sh +++ b/scripts/create_all_data.sh @@ -10,10 +10,10 @@ rm -Rf dcs-data 2&> /dev/null set -e # Create necessary directories. -mkdir -p "data/build/${1}" +OUTPUT_DIR="data/build/vidyut-latest" echo "=========================" -echo "| DCS corpus data |" +echo "Data fetch" echo "=========================" echo if [ -e "data/raw/dcs" ]; then @@ -26,10 +26,6 @@ else rm -Rf dcs-data fi echo -echo "=========================" -echo "| Linguistic data fetch |" -echo "=========================" -echo if [ -e "data/raw/lex" ]; then echo "Lexical data already exists -- skipping fetch." else @@ -42,12 +38,38 @@ else fi echo echo "=========================" -echo "| Vidyut build |" +echo "vidyut-chandas" echo "=========================" +mkdir -p "${OUTPUT_DIR}/chandas" +cp -r vidyut-chandas/data "${OUTPUT_DIR}/chandas" +echo "Copied files to output dir." echo +echo "=========================" +echo "vidyut-kosha" +echo "=========================" make create_kosha make test_kosha +echo +echo "=========================" +echo "vidyut-lipi" +echo "=========================" +echo "(no data files needed)" +echo +echo "=========================" +echo "vidyut-prakriya" +echo "=========================" +mkdir -p "${OUTPUT_DIR}/prakriya" +cp -r "vidyut-prakriya/data/" "${OUTPUT_DIR}/prakriya" +echo "Copied files to output dir." 
+echo +echo "=========================" +echo "vidyut-sandhi" +echo "=========================" make create_sandhi_rules +echo +echo "=========================" +echo "vidyut-cheda" +echo "=========================" make train_cheda make eval_cheda echo diff --git a/src/bin/create_sandhi_rules.rs b/src/bin/create_sandhi_rules.rs index daac8db..11c29ec 100644 --- a/src/bin/create_sandhi_rules.rs +++ b/src/bin/create_sandhi_rules.rs @@ -26,7 +26,7 @@ fn write_rules(rules: &[Rule], path: &Path) -> Result<()> { fn main() { let args = Args::parse(); let rules = generate_rules(); - let config = Config::new(&args.data_dir); + let config = Config::new(args.data_dir); if let Err(err) = write_rules(&rules, config.sandhi()) { println!("{}", err); diff --git a/src/bin/train_cheda.rs b/src/bin/train_cheda.rs index aff9b8e..d16a438 100644 --- a/src/bin/train_cheda.rs +++ b/src/bin/train_cheda.rs @@ -70,7 +70,7 @@ fn process_sentence(tokens: &[Token], s: &mut Statistics) { let c = s .transitions .entry(prev_state) - .or_insert_with(HashMap::new) + .or_default() .entry(cur_state) .or_insert(0); *c += 1; @@ -81,7 +81,7 @@ fn process_sentence(tokens: &[Token], s: &mut Statistics) { let c = s .emissions .entry(cur_state) - .or_insert_with(HashMap::new) + .or_default() .entry(to_slp1(lemma)) .or_insert(0); *c += 1; diff --git a/vidyut-chandas/src/akshara.rs b/vidyut-chandas/src/akshara.rs index 0e7092f..2e819c2 100644 --- a/vidyut-chandas/src/akshara.rs +++ b/vidyut-chandas/src/akshara.rs @@ -3,9 +3,9 @@ use crate::sounds; /// The weight of an akshara. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub enum Weight { - /// A heavy syllable. + /// A *guru* or heavy syllable. G, - /// A light syllable. + /// A *laghu* or light syllable. L, } @@ -18,7 +18,7 @@ pub enum Weight { /// - It must not start with an anusvara or visarga. /// /// Together, these three rurles mean that an input string has exactly one division into aksharas. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Eq, PartialEq)] pub struct Akshara { pub(crate) text: String, pub(crate) weight: Weight, @@ -43,7 +43,7 @@ impl Akshara { } /// The length of this akshara in matras. - pub fn num_matras(&self) -> usize { + pub fn num_matras(&self) -> i32 { match self.weight { Weight::L => 1, Weight::G => 2, @@ -55,6 +55,7 @@ impl Akshara { /// /// Any text that is not a valid Sanskrit sound in SLP1 will be ignored. pub fn scan_line(text: impl AsRef) -> Vec { + // Split into aksharas. let mut akshara_strs = Vec::new(); let mut cur = String::new(); for c in text.as_ref().chars() { @@ -71,6 +72,7 @@ pub fn scan_line(text: impl AsRef) -> Vec { if let Some(prev) = akshara_strs.last_mut() { prev.push(c); } + // `else` means `M` and `H` follow a non-vowel, which indicates an error. } // Skip all other punctuation, spaces, etc. @@ -85,8 +87,10 @@ pub fn scan_line(text: impl AsRef) -> Vec { // Case 2: extend old syllable last.push_str(&cur); } + // `else` means that `text` contains only consonants, which indicates an error. } + // Calculate weights. akshara_strs .iter() .enumerate() @@ -97,10 +101,12 @@ pub fn scan_line(text: impl AsRef) -> Vec { false }; - let weight = if !cur.ends_with(sounds::is_hrasva) || next_is_samyogadi { - Weight::G - } else { + let has_hrasva = cur.chars().any(sounds::is_hrasva); + let has_visarga_or_anusvara = matches!(cur.chars().last(), Some('M') | Some('H')); + let weight = if has_hrasva && !next_is_samyogadi && !has_visarga_or_anusvara { Weight::L + } else { + Weight::G }; Akshara::new(cur.to_string(), weight) }) @@ -254,15 +260,14 @@ mod tests { #[test] fn test_scan_block_with_laghu_weight_change() { let scan = scan_lines("anIkam".lines()); - assert_eq!(weights(&scan[0]), vec![L, G, G]); + assert_eq!(weights(&scan[0]), vec![L, G, L]); // Last syllable of `anIkam` becomes guru due to following samyoga. 
let scan = scan_lines("anIkam\nvyUQam".lines()); assert_eq!(weights(&scan[0]), vec![L, G, G]); // Last syllable of `anIka` stays laghu due to following vowel. - // TODO: this is buggy. - // let scan = scan_block("anIkam\neva"); - // assert_eq!(weights(&scan[0]), vec![L, G, L]); + let scan = scan_lines("anIkam\neva".lines()); + assert_eq!(weights(&scan[0]), vec![L, G, L]); } } diff --git a/vidyut-chandas/src/chandas.rs b/vidyut-chandas/src/chandas.rs index 8dcffc6..758246c 100644 --- a/vidyut-chandas/src/chandas.rs +++ b/vidyut-chandas/src/chandas.rs @@ -1,20 +1,39 @@ use crate::akshara::{scan_lines, Akshara}; -use crate::vrtta::{MatchType, Vrtta}; -use std::error::Error; +use crate::error::Result; +use crate::padya::{Jati, JatiKind, MatchType, Vrtta}; use std::fs; use std::path::{Path, PathBuf}; +/// Models a padya type. +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub enum Padya { + Vrtta(Vrtta), + Jati(Jati), +} + +impl Padya { + pub fn name(&self) -> &str { + use Padya::*; + + match self { + Vrtta(v) => v.name(), + Jati(j) => j.name(), + } + } +} + /// Describes a result of classifying an input string with `Chandas`. -pub struct MatchResult { - vrtta: Option, +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Match { + padya: Option, match_type: MatchType, aksharas: Vec>, } -impl MatchResult { - /// The vrtta match for this query. - pub fn vrtta(&self) -> &Option { - &self.vrtta +impl Match { + /// The padya match for this query. + pub fn padya(&self) -> &Option { + &self.padya } /// The match type for this query. @@ -28,6 +47,31 @@ impl MatchResult { } } +/// Describes a result of classifying an input string with `Chandas`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Matches { + padyas: Vec, + match_types: Vec, + aksharas: Vec>, +} + +impl Matches { + /// The padya matches for the query. + pub fn padyas(&self) -> &Vec { + &self.padyas + } + + /// The match type for this query. 
+ pub fn match_types(&self) -> &Vec { + &self.match_types + } + + /// The aksharas in this query. + pub fn aksharas(&self) -> &Vec> { + &self.aksharas + } +} + /// A metrical classifier. /// /// @@ -48,31 +92,49 @@ impl MatchResult { /// assert_eq!(result.vrtta().as_ref().unwrap().name(), "vasantatilakA"); /// assert_eq!(result.match_type(), MatchType::Pada); /// ``` -#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[derive(Clone, Debug, Default, Eq, Hash, PartialEq)] pub struct Chandas { vrttas: Vec, + jatis: Vec, } impl Chandas { /// Creates a new `Chandas` instance. pub fn new(vrttas: Vec) -> Chandas { - Self { vrttas } + // List is from M. R. Kale's *A Higher Sanskrit Grammar*. + // Order is roughly based on priority -- items earlier in the list should block items + // later. + let jatis = vec![ + Jati::with_kind("vEtAlIyam", vec![14, 16, 14, 16], JatiKind::Vaitaliyam), + Jati::new("upagIti", vec![12, 15, 12, 15]), + Jati::new("AryAgIti", vec![12, 20, 12, 20]), + Jati::new("gIti", vec![12, 18, 12, 18]), + Jati::new("udgIti", vec![12, 15, 12, 18]), + Jati::with_kind( + "OpacCandasikam", + vec![16, 18, 16, 18], + JatiKind::Aupacchandasikam, + ), + Jati::new("AryA", vec![12, 18, 12, 15]), + ]; + + Self { vrttas, jatis } } /// Creates a new `Chandas` instance by defining meters from the given text data. /// /// We recommend using this constructor when the program does not have access to the /// filesystem, e.g. when using this code in WebAssembly. - pub fn from_text(data: &str) -> Result> { - let vrttas: Result, _> = data.lines().map(Vrtta::try_from).collect(); + pub fn from_text(data: &str) -> Result { + let vrttas: Result> = data.lines().map(Vrtta::try_from).collect(); Ok(Self::new(vrttas?)) } /// Creates a new classifier from the given data path. 
- pub fn from_file(path: &Path) -> Result> { + pub fn from_file(path: &Path) -> Result { let path = PathBuf::from(path).join(path); let data = fs::read_to_string(path)?; - let vrttas: Result, _> = data.lines().map(Vrtta::try_from).collect(); + let vrttas: Result> = data.lines().map(Vrtta::try_from).collect(); Ok(Self::new(vrttas?)) } @@ -82,12 +144,23 @@ impl Chandas { &self.vrttas } + /// The jatis available to this classifier. + pub fn jatis(&self) -> &Vec { + &self.jatis + } + /// Classifies the input string against an internal list of meters. /// - /// Currently, this function supports only simple samavrtta. - pub fn classify(&self, text: impl AsRef) -> MatchResult { - let aksharas = scan_lines(text.as_ref().lines()); + /// Vrttas are tried first; if no vrtta matches, each jati is tried in order and the first full match is returned. + pub fn classify(&self, text: impl AsRef) -> Match { + self.classify_inner(text.as_ref()) + } + fn classify_inner(&self, text: &str) -> Match { + let aksharas = scan_lines(text.lines()); + + // Try vrttas first because these are more exact and can be confused for certain jati + // types. let mut best_match = MatchType::None; let mut i_best = None; for (i, vrtta) in self.vrttas.iter().enumerate() { @@ -99,18 +172,70 @@ impl Chandas { } if let Some(i) = i_best { - MatchResult { - vrtta: Some(self.vrttas[i].clone()), + return Match { + padya: Some(Padya::Vrtta(self.vrttas[i].clone())), match_type: best_match, aksharas, + }; + } + + for jati in &self.jatis { + let aksharas = scan_lines(text.lines()); + let flattened_aksharas: Vec<_> = aksharas.clone().into_iter().flatten().collect(); + let res = jati.try_match(&flattened_aksharas); + if res == MatchType::Full { + return Match { + padya: Some(Padya::Jati(jati.clone())), + match_type: MatchType::Full, + aksharas, + }; } - } else { - MatchResult { - vrtta: None, - match_type: best_match, - aksharas, + } + + // No luck -- return.
+ Match { + padya: None, + match_type: MatchType::None, + aksharas, + } + } + + /// Classifies the input string against an internal list of meters and returns all possible + /// matches. + /// + /// Every vrtta that matches at all and every jati that matches fully is included. + pub fn classify_all(&self, text: impl AsRef) -> Matches { + self.classify_all_inner(text.as_ref()) + } + + fn classify_all_inner(&self, text: &str) -> Matches { + let aksharas = scan_lines(text.lines()); + let mut padyas = Vec::new(); + let mut match_types = Vec::new(); + + for vrtta in &self.vrttas { + let match_type = vrtta.try_match(&aksharas); + if match_type != MatchType::None { + padyas.push(Padya::Vrtta(vrtta.clone())); + match_types.push(match_type); + } + } + + for jati in &self.jatis { + let flattened_aksharas: Vec<_> = aksharas.clone().into_iter().flatten().collect(); + let res = jati.try_match(&flattened_aksharas); + if res == MatchType::Full { + padyas.push(Padya::Jati(jati.clone())); + match_types.push(MatchType::Full); } } + + // Return every match we collected (possibly none). + Matches { + padyas, + match_types, + aksharas, + } } } @@ -118,9 +243,10 @@ impl Chandas { mod tests { use super::*; - fn assert_has_vrtta(c: &Chandas, text: &str, expected: &str) { + fn assert_has_padya(c: &Chandas, text: &str, expected: &str) { let res = c.classify(text); - assert_eq!(res.vrtta().as_ref().unwrap().name(), expected); + assert!(res.padya().is_some()); + assert_eq!(res.padya().clone().unwrap().name(), expected); } fn new_chandas() -> Chandas { @@ -136,17 +262,26 @@ mod tests { ]) } + #[test] + fn chandas_struct() { + let c = new_chandas(); + // Check that `vrttas()` is defined and returns real results. + assert_eq!(c.vrttas().len(), 4); + // Check that `jatis()` is defined and returns real results.
+ assert_eq!(c.jatis().len(), 7); + } + #[test] fn classify_samavrtta_single_pada() { let c = new_chandas(); - assert_has_vrtta(&c, "mAtaH samastajagatAM maDukEwaBAreH", "vasantatilakA"); - assert_has_vrtta(&c, "mAtaH\nsamastajagatAM\nmaDukEwaBAreH", "vasantatilakA"); + assert_has_padya(&c, "mAtaH samastajagatAM maDukEwaBAreH", "vasantatilakA"); + assert_has_padya(&c, "mAtaH\nsamastajagatAM\nmaDukEwaBAreH", "vasantatilakA"); } #[test] fn classify_samavrtta_full_verse() { let c = new_chandas(); - assert_has_vrtta( + assert_has_padya( &c, "kaScitkAntAvirahaguruRA svADikArapramattaH zApenAstaMgamitamahimA varzaBogyeRa BartuH . @@ -154,24 +289,24 @@ mod tests { snigDacCAyAtaruzu vasatiM rAmagiryASramezu .. 1 ..", "mandAkrAntA", ); - assert!(c.classify("mo mo go go vidyunmAlA").vrtta().is_none()); + assert!(c.classify("mo mo go go vidyunmAlA").padya().is_none()); } #[test] fn classify_ardhasamavrtta_pada_1() { let c = new_chandas(); - assert_has_vrtta(&c, "aTa madanavaDUrupaplavAntaM", "puzpitAgrA"); + assert_has_padya(&c, "aTa madanavaDUrupaplavAntaM", "puzpitAgrA"); } #[test] fn classify_ardhasamavrtta_half() { let c = new_chandas(); - assert_has_vrtta( + assert_has_padya( &c, "aTa madanavaDUrupaplavAntaM vyasanakfSA paripAlayAmbaBUva", "puzpitAgrA", ); - assert_has_vrtta( + assert_has_padya( &c, "aTa\nmadanavaDUrupaplavAntaM\nvyasanakfSA\nparipAlayAmbaBUva", "puzpitAgrA", @@ -181,10 +316,12 @@ mod tests { #[test] fn classify_ardhasamavrtta_full_verse() { let c = new_chandas(); - assert_has_vrtta( + assert_has_padya( &c, - "aTa madanavaDUrupaplavAntaM vyasanakfSA paripAlayAmbaBUva | - SaSina iva divAtanasya leKA kiraRaparikzayaDUsarA pradozam ||", + concat!( + "aTa madanavaDUrupaplavAntaM vyasanakfSA paripAlayAmbaBUva |", + "SaSina iva divAtanasya leKA kiraRaparikzayaDUsarA pradozam ||", + ), "puzpitAgrA", ); } @@ -192,17 +329,83 @@ mod tests { #[test] fn classify_vishamavrtta_pada_1() { let c = new_chandas(); - assert_has_vrtta(&c, "aTa vAsavasya vacanena", 
"udgatA"); + assert_has_padya(&c, "aTa vAsavasya vacanena", "udgatA"); } #[test] fn classify_vishamavrtta_full_verse() { let c = new_chandas(); - assert_has_vrtta( + assert_has_padya( &c, - "aTa vAsavasya vacanena ruciravadanastrilocanam | - klAntirahitamaBirADayituM viDivattapAMsi vidaDe DanaMjayaH ||", + concat!( + "aTa vAsavasya vacanena ruciravadanastrilocanam |", + "klAntirahitamaBirADayituM viDivattapAMsi vidaDe DanaMjayaH ||", + ), "udgatA", ); } + + #[test] + fn classify_jati() { + // Examples are from M. R. Kale's *A Higher Sanskrit Grammar* + let c = new_chandas(); + + let text = concat!( + "yenAmandamarande daladaravinde dinAnyanAyizata |", + "kuwaje Kalu tenehA tenehA maDukareRa kaTam ||" + ); + assert_has_padya(&c, text, "AryA"); + + let text = concat!( + "pAwIra tava pawIyAnkaH paripAwImimAmurIkartum |", + "yatpiMzatAmapi nfRAM pizwo'pi tanozi parimalEH puzwim ||", + ); + assert_has_padya(&c, text, "gIti"); + + let text = concat!( + "navagopasuMdarIRAM rAsollAse murArAtim |", + "asmArayadupagItiH svargakuraNgIdfSAM gIteH ||" + ); + assert_has_padya(&c, text, "upagIti"); + + let text = concat!( + "nArAyaRasya saMtatamudgItiH saMsmftirBaktyA |", + "arcAyAmAsaktirdustarasaMsArasAgare taraRiH ||", + ); + assert_has_padya(&c, text, "udgIti"); + + let text = concat!( + "cArusamIraRavipine hariRakalaNkakiraNAvalI savilAsA |", + "AbadDarAmamohA velAmUle viBAvarI parihIna ||", + ); + assert_has_padya(&c, text, "AryAgIti"); + } + + #[test] + fn classify_jati_vaitaliyam() { + let c = new_chandas(); + let text = concat!( + "kuzalaM Kalu tuByameva tat vacanaM kfzRayadaByaDAmaham |", + "upadeSaparAH parezvapi svavinASABimuKezu sADavaH ||", + ); + assert_has_padya(&c, text, "vEtAlIyam"); + } + + #[test] + fn classify_jati_aupacchandasikam() { + let c = new_chandas(); + let text = concat!( + "AtanvAnaM surArikAntAsvOpacCandasikaM hfdo vinodam |", + "kaMsaM yo nirjaGAna devo vande taM jagatAM sTitiM daDAnam ||", + ); + assert_has_padya(&c, text, "OpacCandasikam"); + } 
+ + #[test] + fn classify_all() { + let c = new_chandas(); + let ret = c.classify_all("kaScitkAntA"); + assert_eq!(ret.padyas().len(), 1); + assert_eq!(ret.padyas()[0].name(), "mandAkrAntA"); + } } diff --git a/vidyut-chandas/src/error.rs b/vidyut-chandas/src/error.rs index 8ba0570..879be98 100644 --- a/vidyut-chandas/src/error.rs +++ b/vidyut-chandas/src/error.rs @@ -1,19 +1,29 @@ -use std::error::Error; use std::fmt; #[allow(unused)] pub(crate) type Result = std::result::Result; #[allow(unused)] -#[derive(Clone, Debug)] +#[derive(Debug)] pub enum ChandasError { ParseError, + IoError(std::io::Error), } -impl Error for ChandasError {} +impl From for ChandasError { + #[inline] + fn from(err: std::io::Error) -> ChandasError { + ChandasError::IoError(err) + } +} impl fmt::Display for ChandasError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Could not parse meter.") + use ChandasError::*; + + match self { + ParseError => write!(f, "Could not parse meter."), + IoError(_) => write!(f, "Could not open input file."), + } } } diff --git a/vidyut-chandas/src/lib.rs b/vidyut-chandas/src/lib.rs index f125865..72819bd 100644 --- a/vidyut-chandas/src/lib.rs +++ b/vidyut-chandas/src/lib.rs @@ -5,10 +5,11 @@ mod akshara; mod chandas; mod error; +mod padya; mod sounds; -mod vrtta; + mod wasm; pub use akshara::{Akshara, Weight}; -pub use chandas::{Chandas, MatchResult}; -pub use vrtta::{Jati, MatchType, Vrtta}; +pub use chandas::{Chandas, Match, Matches}; +pub use padya::{Jati, MatchType, Vrtta}; diff --git a/vidyut-chandas/src/vrtta.rs b/vidyut-chandas/src/padya.rs similarity index 61% rename from vidyut-chandas/src/vrtta.rs rename to vidyut-chandas/src/padya.rs index 90301a2..33a0e01 100644 --- a/vidyut-chandas/src/vrtta.rs +++ b/vidyut-chandas/src/padya.rs @@ -1,5 +1,5 @@ use crate::akshara::{Akshara, Weight}; -use std::error::Error; +use crate::error::{ChandasError, Result}; /// Models the weights that a vrtta can accept. 
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] @@ -74,12 +74,6 @@ impl Gana { } } -fn to_counts(text: &str) -> Vec { - text.split_whitespace() - .filter_map(|n| n.parse().ok()) - .collect() -} - /// Models a *pāda*, which is one of the four "feet" or "legs" of a verse. /// A *pāda* defines a specific pattern of light and heavy syllables and /// might also define one or more *yati*s (caesuras). @@ -126,15 +120,12 @@ impl Vrtta { pub(crate) fn try_match(&self, aksharas: &[Vec]) -> MatchType { use PatternWeight::*; - eprintln!("Testing against: {}", self.name); for row in aksharas { let mut s = Vec::new(); for a in row { s.push(a.text.clone()); } - eprintln!("{}", s.join(" ")); } - eprintln!(); let mut full = Vec::new(); @@ -152,8 +143,7 @@ impl Vrtta { *last = Any; } - let pattern_flat: Vec = - full.iter().map(|x| x.to_owned()).flatten().collect(); + let pattern_flat: Vec = full.iter().flat_map(|x| x.to_owned()).collect(); let aksharas_flat: Vec<&Akshara> = aksharas.iter().flatten().collect(); let contains_aksharas = if pattern_flat.len() >= aksharas_flat.len() { @@ -227,9 +217,9 @@ impl Vrtta { } impl TryFrom<&str> for Pada { - type Error = Box; + type Error = ChandasError; - fn try_from(text: &str) -> Result { + fn try_from(text: &str) -> Result { let weights: Vec = text .chars() .filter_map(|c| match c { @@ -249,58 +239,173 @@ impl TryFrom<&str> for Pada { } impl TryFrom<&str> for Vrtta { - type Error = Box; + type Error = ChandasError; - fn try_from(text: &str) -> Result { - let fields: Vec<_> = text.split("\t").collect(); + fn try_from(text: &str) -> Result { + let fields: Vec<_> = text.split('\t').collect(); debug_assert_eq!(fields.len(), 3); let name = fields[0]; let _ = fields[1]; let pattern_str = fields[2]; - let padas: Result, Box> = - pattern_str.split("/").map(|x| x.try_into()).collect(); + let padas: Result> = pattern_str.split('/').map(|x| x.try_into()).collect(); let padas = padas?; Ok(Vrtta::new(name, padas)) } } +#[derive(Copy, Clone, Debug, Eq, 
Hash, PartialEq)] +pub(crate) enum JatiKind { + /// A default jati. + Basic, + /// Requires that each pada ends in ra-la-ga (_._._) + Vaitaliyam, + /// Requires that each pada ends in ra-ya (_._.__) + Aupacchandasikam, +} + /// Models a *jāti*, which defines a specific pattern of *mātrā*s (morae). -#[allow(unused)] #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct Jati { + /// The name of this jati. name: String, - matras: Vec>, + /// The matras required for this jati. + matras: Vec, + /// Any special conditions the jati must follow. + kind: JatiKind, } impl Jati { /// Creates a new `Jati` with the given name and matra pattern. - pub fn new(name: impl AsRef, matras: Vec>) -> Self { + pub fn new(name: impl AsRef, matras: Vec) -> Self { Self { name: name.as_ref().to_string(), matras, + kind: JatiKind::Basic, } } - #[allow(unused)] - pub(crate) fn matras(&self) -> &Vec> { + /// Creates a new `Jati` with the given name and matra pattern. + pub(crate) fn with_kind(name: impl AsRef, matras: Vec, kind: JatiKind) -> Self { + Self { + name: name.as_ref().to_string(), + matras, + kind, + } + } + + /// The name of this meter. + pub fn name(&self) -> &String { + &self.name + } + + /// The matras that define this meter. The returned `Vec` has length 4. + pub fn matras(&self) -> &Vec { &self.matras } + + pub(crate) fn kind(&self) -> JatiKind { + self.kind + } + + pub(crate) fn try_match(&self, aksharas: &[Akshara]) -> MatchType { + let mut cur_matras = 0; + let mut akshara_padas = Vec::new(); + let mut i_offset = 0; + + for (i, a) in aksharas.iter().enumerate() { + let i_pada = akshara_padas.len(); + if let Some(pada_matras) = self.matras().get(i_pada) { + cur_matras += a.num_matras(); + + if cur_matras == *pada_matras || (i_pada % 2 == 1 && cur_matras + 1 == *pada_matras) + { + akshara_padas.push(aksharas[i_offset..=i].to_vec()); + i_offset = i + 1; + cur_matras = 0; + } + } else { + // More aksharas than padas -- not a match. 
+ return MatchType::None; + } + } + + // Incomplete match. + // TODO: decide how to handle this. Prefix? + if akshara_padas.len() != 4 { + return MatchType::None; + } + + match self.kind() { + JatiKind::Vaitaliyam => { + // Each pada must end with ra-la-ga (_._._) + let all_match = + akshara_padas + .iter() + .enumerate() + .all(|(i, pada)| match pada.as_slice() { + [.., a, b, c, d, e] => { + use Weight::*; + a.weight() == G + && b.weight() == L + && c.weight() == G + && d.weight() == L + // Laghu OK at end of even pada. + && (e.weight() == G || (i % 2 == 1)) + } + _ => false, + }); + if all_match { + MatchType::Full + } else { + MatchType::None + } + } + JatiKind::Aupacchandasikam => { + // Each pada must end with ra-ya (_._.__) + let all_match = + akshara_padas + .iter() + .enumerate() + .all(|(i, pada)| match pada.as_slice() { + [.., a, b, c, d, e, f] => { + use Weight::*; + a.weight() == G + && b.weight() == L + && c.weight() == G + && d.weight() == L + && e.weight() == G + // Laghu OK at end of even pada. 
+ && (f.weight() == G || (i % 2 == 1)) + } + _ => false, + }); + if all_match { + MatchType::Full + } else { + MatchType::None + } + } + _ => MatchType::Full, + } + } } +/* impl TryFrom<&str> for Jati { - type Error = Box; + type Error = ChandasError; - fn try_from(text: &str) -> Result { - let fields: Vec<_> = text.split("\t").collect(); + fn try_from(text: &str) -> Result { + let fields: Vec<_> = text.split('\t').collect(); debug_assert_eq!(fields.len(), 2); let name = fields[0]; let pattern_str = fields[1]; - let counts = pattern_str.split("/").map(to_counts).collect(); + let counts = pattern_str.split('/').map(|n| n.parse().unwrap()).collect(); Ok(Jati::new(name, counts)) } } +*/ #[cfg(test)] mod tests { diff --git a/vidyut-chandas/src/wasm.rs b/vidyut-chandas/src/wasm.rs index 9d4d720..e84bfd6 100644 --- a/vidyut-chandas/src/wasm.rs +++ b/vidyut-chandas/src/wasm.rs @@ -56,14 +56,14 @@ impl From for MatchType { #[allow(non_snake_case)] #[wasm_bindgen] #[derive(Serialize)] -pub struct MatchResult { +pub struct Match { vrtta: Option, matchType: MatchType, aksharas: Vec>, } -impl From for MatchResult { - fn from(m: rs::MatchResult) -> Self { +impl From for Match { + fn from(m: rs::Match) -> Self { let mut aksharas = Vec::new(); for rs_row in m.aksharas() { let mut row = Vec::new(); @@ -72,11 +72,8 @@ impl From for MatchResult { } aksharas.push(row); } - MatchResult { - vrtta: match m.vrtta() { - Some(v) => Some(v.name().to_string()), - None => None, - }, + Match { + vrtta: m.padya().as_ref().map(|v| v.name().to_string()), matchType: m.match_type().into(), aksharas, } @@ -104,7 +101,7 @@ impl Chandas { } pub fn classify(&self, text: &str) -> JsValue { - let res: MatchResult = self.0.classify(text).into(); + let res: Match = self.0.classify(text).into(); serde_wasm_bindgen::to_value(&res).expect("wasm") } } diff --git a/vidyut-cheda/src/segmenting.rs b/vidyut-cheda/src/segmenting.rs index 79cf0e6..8c5f645 100644 --- a/vidyut-cheda/src/segmenting.rs +++ 
b/vidyut-cheda/src/segmenting.rs @@ -288,7 +288,7 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { new.score = ctx.model.score(&new, &token_pool); viterbi_cache .entry(new.remaining.clone()) - .or_insert_with(HashMap::new) + .or_default() .insert("STATE".to_string(), new.clone()); let new_score = new.score; @@ -329,7 +329,7 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { // Use state "STATE" for now since we don't have any states implemented. let maybe_rival = viterbi_cache .entry(new.remaining.clone()) - .or_insert_with(HashMap::new) + .or_default() .get("STATE"); let new_score = new.score; if let Some(rival) = maybe_rival { @@ -339,7 +339,7 @@ fn segment(raw_text: &str, ctx: &Chedaka) -> Result> { }; viterbi_cache .entry(new.remaining.clone()) - .or_insert_with(HashMap::new) + .or_default() .insert("STATE".to_string(), new.clone()); pq.push(new, new_score); } diff --git a/vidyut-prakriya/README.md b/vidyut-prakriya/README.md index 4c36602..fd5453a 100644 --- a/vidyut-prakriya/README.md +++ b/vidyut-prakriya/README.md @@ -4,6 +4,7 @@ (Published as [A fast prakriyā generator][paper] at ISCLS 2024.) + [paper]: https://iscls.github.io/assets/files/proceedings/2024.iscls.7.pdf `vidyut-prakriya` generates Sanskrit words with their prakriyās (derivations) diff --git a/vidyut-prakriya/src/args/pratipadika.rs b/vidyut-prakriya/src/args/pratipadika.rs index fbab724..77c3b83 100644 --- a/vidyut-prakriya/src/args/pratipadika.rs +++ b/vidyut-prakriya/src/args/pratipadika.rs @@ -62,7 +62,7 @@ impl Pratipadika { impl From<&str> for Pratipadika { fn from(s: &str) -> Self { - Self::basic(s.to_string()) + Self::basic(s) } } diff --git a/vidyut-prakriya/src/core/term.rs b/vidyut-prakriya/src/core/term.rs index 25e3fb8..1a15fb7 100644 --- a/vidyut-prakriya/src/core/term.rs +++ b/vidyut-prakriya/src/core/term.rs @@ -195,7 +195,7 @@ impl Term { /// Returns the last sound in the term if it exists. 
pub fn antya(&self) -> Option { - self.text.chars().rev().next() + self.text.chars().next_back() } /// Returns the penultimate sound in the term if it exists. @@ -694,8 +694,7 @@ impl Term { .bytes() .enumerate() .rev() - .filter(|(_, c)| sounds::is_ac(*c as char)) - .next(); + .find(|(_, c)| sounds::is_ac(*c as char)); if let Some((i, _)) = result { self.set_at(i, s); } @@ -744,8 +743,8 @@ impl Term { // Don't save asiddha sounds. return; } else { - let sthanivat_antya = self.sthanivat.chars().rev().next().expect("ok"); - let text_antya = self.text.chars().rev().next().expect("ok"); + let sthanivat_antya = self.sthanivat.chars().next_back().expect("ok"); + let text_antya = self.text.chars().next_back().expect("ok"); if sounds::is_ac(sthanivat_antya) { if text_antya == 'y' || text_antya == 'v' { // Don't save changes to the final vowel. diff --git a/vidyut-prakriya/src/it_samjna.rs b/vidyut-prakriya/src/it_samjna.rs index d8cec40..4f1ea25 100644 --- a/vidyut-prakriya/src/it_samjna.rs +++ b/vidyut-prakriya/src/it_samjna.rs @@ -73,7 +73,7 @@ fn is_exempt_from_lakshaku(t: &Term) -> bool { fn get_upadesha(t: &Term) -> Result<&str> { match &t.u { - Some(s) => Ok(&s), + Some(s) => Ok(s), None => Err(Error::invalid_upadesha(&t.text)), } } @@ -207,9 +207,9 @@ pub fn run(p: &mut Prakriya, i_term: usize) -> Result<()> { if let Some(t) = p.get(i_term) { let upadesha = get_upadesha(t)?; - let adi = match get_adi(&upadesha) { + let adi = match get_adi(upadesha) { Some(x) => x, - None => return Err(Error::invalid_upadesha(&upadesha)), + None => return Err(Error::invalid_upadesha(upadesha)), }; if t.is_pratyaya() { diff --git a/vidyut-prakriya/src/sanadi.rs b/vidyut-prakriya/src/sanadi.rs index 23ec0ce..6a6739f 100644 --- a/vidyut-prakriya/src/sanadi.rs +++ b/vidyut-prakriya/src/sanadi.rs @@ -321,7 +321,7 @@ fn try_add(p: &mut Prakriya, sanadi: &Option, is_ardhadhatuka: bool) -> pub fn try_create_namadhatu(p: &mut Prakriya, dhatu: &Namadhatu) -> Option<()> { match 
dhatu.pratipadika() { Pratipadika::Basic(basic) => { - pratipadika_karya::add_basic(p, &basic); + pratipadika_karya::add_basic(p, basic); } _ => panic!("Unsupported type for namadhatu"), } @@ -331,7 +331,7 @@ pub fn try_create_namadhatu(p: &mut Prakriya, dhatu: &Namadhatu) -> Option<()> { su.add_tags(&[T::Pratyaya, T::Sup, T::Vibhakti, T::V1, T::Luk]); p.push(su); - try_add(p, &dhatu.nama_sanadi(), false); + try_add(p, dhatu.nama_sanadi(), false); Some(()) } diff --git a/vidyut-prakriya/src/sounds.rs b/vidyut-prakriya/src/sounds.rs index 83f1e21..22535bb 100644 --- a/vidyut-prakriya/src/sounds.rs +++ b/vidyut-prakriya/src/sounds.rs @@ -208,10 +208,7 @@ fn create_sound_props() -> HashMap { (s("v"), Sthana::DantaOshtha), ]); for k in s("Yam M").to_string().chars() { - sthana - .entry(k) - .or_insert_with(Vec::new) - .push(Sthana::Nasika); + sthana.entry(k).or_default().push(Sthana::Nasika); } let ghosha = flatten(vec![ diff --git a/vidyut-prakriya/src/vyakarana.rs b/vidyut-prakriya/src/vyakarana.rs index 7de74e9..60e69b0 100644 --- a/vidyut-prakriya/src/vyakarana.rs +++ b/vidyut-prakriya/src/vyakarana.rs @@ -324,7 +324,7 @@ impl Vyakarana { /// # Ok::<(), Error>(()) pub fn derive_samasas(&self, args: &Samasa) -> Vec { let mut stack = self.create_prakriya_stack(); - stack.find_all(|p| ashtadhyayi::derive_samasa(p, &args)); + stack.find_all(|p| ashtadhyayi::derive_samasa(p, args)); stack.prakriyas() } diff --git a/vidyut-prakriya/src/wasm.rs b/vidyut-prakriya/src/wasm.rs index 7fc3b96..35da45a 100644 --- a/vidyut-prakriya/src/wasm.rs +++ b/vidyut-prakriya/src/wasm.rs @@ -253,7 +253,7 @@ impl Vidyut { pub fn deriveDhatus(&self, code: &str) -> JsValue { if let Some(dhatu) = self.dhatupatha.get(code) { let v = Vyakarana::new(); - let prakriyas = v.derive_dhatus(&dhatu); + let prakriyas = v.derive_dhatus(dhatu); let web_prakriyas = to_web_prakriyas(&prakriyas); serde_wasm_bindgen::to_value(&web_prakriyas).expect("wasm") } else {