From 2b5a9805a9d14eaa1d83d99efb0c261bb8eb2d44 Mon Sep 17 00:00:00 2001 From: Arun Prasad Date: Wed, 6 Nov 2024 13:37:25 -0800 Subject: [PATCH] [kosha] Substantially increase size and coverage This commit uses data from the Upasargartha-candrika to create a large number of prefixed tinantas and krdantas. In addition, I've fixed a few minor bugs and added more documentation to these crates. --- Cargo.lock | 13 +- Makefile | 4 +- README.md | 4 +- scripts/create_all_data.sh | 2 + src/bin/create_kosha.rs | 913 +++++++++++++-------- src/bin/eval_cheda.rs | 7 +- src/bin/test_kosha.rs | 185 +++-- vidyut-chandas/README.md | 21 +- vidyut-chandas/src/chandas.rs | 31 +- vidyut-cheda/Cargo.toml | 1 + vidyut-cheda/src/dcs.rs | 34 +- vidyut-cheda/src/strict_mode.rs | 2 +- vidyut-kosha/Cargo.toml | 1 + vidyut-kosha/src/kosha.rs | 12 +- vidyut-kosha/src/morph.rs | 326 +++++--- vidyut-kosha/src/packing.rs | 33 +- vidyut-lipi/scripts/create_schemes.py | 6 +- vidyut-lipi/src/autogen_schemes.rs | 1 + vidyut-lipi/src/mapping.rs | 13 +- vidyut-lipi/src/numerals.rs | 2 +- vidyut-lipi/src/scheme.rs | 10 +- vidyut-lipi/src/transliterate.rs | 4 +- vidyut-lipi/src/unicode_norm.rs | 30 +- vidyut-lipi/tests/basic.rs | 18 + vidyut-prakriya/README.md | 45 +- vidyut-prakriya/src/angasya/guna_vrddhi.rs | 6 +- vidyut-prakriya/src/angasya/subanta.rs | 2 +- vidyut-prakriya/src/args.rs | 8 +- vidyut-prakriya/src/args/dhatu.rs | 2 +- vidyut-prakriya/src/args/krt.rs | 59 +- vidyut-prakriya/src/args/pada.rs | 2 +- vidyut-prakriya/src/args/pratipadika.rs | 6 +- vidyut-prakriya/src/args/samasa.rs | 4 +- vidyut-prakriya/src/args/sup.rs | 24 +- vidyut-prakriya/src/args/taddhita.rs | 2 +- vidyut-prakriya/src/args/tin.rs | 28 +- vidyut-prakriya/src/args/unadi.rs | 2 +- vidyut-prakriya/src/ashtadhyayi.rs | 14 +- vidyut-prakriya/src/core/term.rs | 8 +- vidyut-prakriya/src/core/term_view.rs | 16 +- vidyut-prakriya/src/dhatu_karya.rs | 2 +- vidyut-prakriya/src/dhatupatha.rs | 40 +- vidyut-prakriya/src/vikarana.rs | 
19 +- vidyut-prakriya/src/vyakarana.rs | 117 ++- vidyut-prakriya/tests/kashika_3_2.rs | 16 +- vidyut-prakriya/tests/regressions.rs | 11 + 46 files changed, 1342 insertions(+), 764 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 33ee1ca..fdaa9df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -57,9 +57,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.0.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "487f1e0fcbe47deb8b0574e646def1c903389d95241dd1bbcc6ce4a715dfc0c1" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -156,7 +156,7 @@ version = "4.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42dfd32784433290c51d92c438bb72ea5063797fc3cc9a21a8c4346bebbb2098" dependencies = [ - "bitflags 2.0.2", + "bitflags 2.6.0", "clap_derive", "clap_lex", "is-terminal", @@ -798,9 +798,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.1" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "opaque-debug" @@ -1466,6 +1466,7 @@ dependencies = [ "vidyut-cheda", "vidyut-kosha", "vidyut-lipi", + "vidyut-prakriya", "vidyut-sandhi", ] @@ -1499,6 +1500,7 @@ dependencies = [ "tempfile", "vidyut-kosha", "vidyut-lipi", + "vidyut-prakriya", "vidyut-sandhi", ] @@ -1517,6 +1519,7 @@ dependencies = [ "rustc-hash", "serde", "tempfile", + "vidyut-prakriya", ] [[package]] diff --git a/Makefile b/Makefile index 81e5bba..b7ffe87 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,9 @@ create_sandhi_rules: # Creates a koshas and write it to disk. 
create_kosha: RUST_LOG=info cargo run --release --bin create_kosha -- \ - --input-dir data/raw/lex --output-dir data/build/vidyut-latest + --input-dir data/raw/lex \ + --dhatupatha vidyut-prakriya/data/dhatupatha.tsv \ + --output-dir data/build/vidyut-latest # Trains a padaccheda model and saves important features to disk. # NOTE: when training, exclude the file paths used in `make eval`. diff --git a/README.md b/README.md index 631f6a3..d70ce7b 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,9 @@ In Rust, components of this kind are called *crates*. ### [`vidyut-chandas`][vidyut-chandas] -`vidyut-chandas` is an experimental classifier for Sanskrit meters. +`vidyut-chandas` identifies the meter in some piece of Sanskrit text. This +crate is experimental, and while it is useful for common and basic use cases, +it is not a state-of-the-art solution. For details, see the [vidyut-chandas README][vidyut-chandas]. diff --git a/scripts/create_all_data.sh b/scripts/create_all_data.sh index 1d81bb8..84d872b 100755 --- a/scripts/create_all_data.sh +++ b/scripts/create_all_data.sh @@ -22,6 +22,8 @@ else echo "Training data does not exist -- fetching." mkdir -p "data/raw/dcs" git clone --depth 1 https://github.com/OliverHellwig/sanskrit.git dcs-data + # Use a fixed commit to avoid breakages from later changes. + pushd dcs-data && git reset --hard 1bc281e && popd mv dcs-data/dcs/data/conllu data/raw/dcs/conllu rm -Rf dcs-data fi diff --git a/src/bin/create_kosha.rs b/src/bin/create_kosha.rs index cc94279..e20fe5a 100644 --- a/src/bin/create_kosha.rs +++ b/src/bin/create_kosha.rs @@ -1,76 +1,40 @@ //! Creates an FST kosha using our raw linguistic data. //! -//! The slowest part of this process is `add_nominals`, which inflects almost 200,000 nominal -//! stems with all of the endings they allow. +//! This binary is computationally intensive and may take several minutes. +//! +//! TODO: +//! - prefixes +//! - sya-Satf, sya-SAnac, ya-SAnac +//! 
- upasarga + tvA (upAsitvA, etc.) +//! - pada variants +//! - dedupe krdantas with existing nominals +//! - update `morph` encoding for krdantas use clap::Parser; use lazy_static::lazy_static; use log::info; use rayon::prelude::*; -use rustc_hash::FxHashMap; -use std::cmp::Eq; -use std::hash::Hash; +use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::process; use vidyut_cheda::sounds::{is_ac, is_ghosha, is_hal}; use vidyut_cheda::Config; use vidyut_kosha::morph::*; -use vidyut_kosha::{Builder, Kosha}; - -/// A MultiMap that can be unwrapped into its underlying map. -/// We use this custom version of a MultiMap so that we can use Rayon's par_iter on the wrapped -/// map. -struct MultiMap(FxHashMap>); - -impl MultiMap { - fn new() -> Self { - Self(FxHashMap::default()) - } +use vidyut_kosha::Builder; +use vidyut_prakriya::args as vp; +use vidyut_prakriya::dhatupatha::Entry as DhatuEntry; +use vidyut_prakriya::{Dhatupatha, Vyakarana}; - fn with_capacity(size: usize) -> Self { - Self(FxHashMap::with_capacity_and_hasher( - size, - Default::default(), - )) - } - - fn insert(&mut self, key: K, value: V) { - self.0.entry(key).or_default().push(value); - } - - fn extend(&mut self, other: Self) { - for (key, values) in other.0.into_iter() { - for value in values { - self.insert(key.clone(), value); - } - } - } -} +type Result = std::result::Result>; +type UpasargaDhatuMap = HashMap>>; -/// Copied from multimap::MultiMap; -impl FromIterator<(K, V)> for MultiMap -where - K: Eq + Hash + Clone, -{ - fn from_iter>(iterable: T) -> MultiMap { - let iter = iterable.into_iter(); - let hint = iter.size_hint().0; - - let mut multimap = MultiMap::with_capacity(hint); - for (k, v) in iter { - multimap.insert(k, v); - } +/// A list of pratipadikas. +type StemVec = Vec<(String, Pratipadika)>; - multimap - } -} +/// A list of complete padas. +type PadaVec = Vec<(String, Pada)>; -type Result = std::result::Result>; -/// A map of pratipadikas. 
-type StemMap = MultiMap; -/// A map of complete padas. -type PadaMap = MultiMap; -/// A map of sup pratyayas. -type SupMap = MultiMap; +/// A list of sup pratyayas. +type SupVec = Vec<(String, String, Pada)>; #[derive(Parser, Debug)] #[command(author, version, about)] @@ -79,28 +43,25 @@ struct Args { #[arg(short, long)] input_dir: PathBuf, + /// Path to a dhatupatha file (e.g. the one used by vidyut-prakriya) + #[arg(short, long)] + dhatupatha: PathBuf, + /// Path to the Vidyut output directory. #[arg(short, long)] output_dir: PathBuf, } -/// Defines all of the input data paths we use in Vidyut. +/// Defines all of the input data paths we need to construct the FST. pub struct DataPaths { pub indeclinables: PathBuf, pub nominal_endings_compounded: PathBuf, pub nominal_endings_inflected: PathBuf, - pub nominal_stems: PathBuf, - pub nominal_padas: PathBuf, - pub participle_stems: PathBuf, + pub basic_pratipadikas: PathBuf, + pub irregular_subantas: PathBuf, pub prefix_groups: PathBuf, pub prefixed_roots: PathBuf, - pub pronouns: PathBuf, - pub sandhi_rules: PathBuf, - pub unprefixed_roots: PathBuf, - pub verb_endings: PathBuf, - pub verb_prefixes: PathBuf, - pub verbal_indeclinables: PathBuf, - pub verbs: PathBuf, + pub upasarga_dhatus: PathBuf, } impl DataPaths { @@ -110,71 +71,94 @@ impl DataPaths { indeclinables: base.join("indeclinables.csv"), nominal_endings_compounded: base.join("nominal-endings-compounded.csv"), nominal_endings_inflected: base.join("nominal-endings-inflected.csv"), - nominal_stems: base.join("nominal-stems.csv"), - nominal_padas: base.join("nominals-irregular.csv"), - participle_stems: base.join("participle-stems.csv"), + basic_pratipadikas: base.join("nominal-stems.csv"), + irregular_subantas: base.join("nominals-irregular.csv"), prefix_groups: base.join("prefix-groups.csv"), prefixed_roots: base.join("prefixed-roots.csv"), - pronouns: base.join("pronouns.csv"), - sandhi_rules: base.join("sandhi-rules.csv"), - unprefixed_roots: 
base.join("unprefixed-roots.csv"), - verb_endings: base.join("verb-endings.csv"), - verb_prefixes: base.join("verb-prefixes.csv"), - verbal_indeclinables: base.join("verbal-indeclinables.csv"), - verbs: base.join("verbs.csv"), + upasarga_dhatus: base.join("upasarga-dhatus.csv"), } } } -fn parse_stem_linga(code: &str) -> Vec { - match code { - "m" => vec![Linga::Pum], - "f" => vec![Linga::Stri], - "n" => vec![Linga::Napumsaka], - "mf" => vec![Linga::Pum, Linga::Stri], - "fn" => vec![Linga::Stri, Linga::Napumsaka], - "mn" => vec![Linga::Pum, Linga::Napumsaka], - "mfn" => vec![Linga::Pum, Linga::Stri, Linga::Napumsaka], - "none" => vec![], - &_ => panic!("Unknown type {}", code), +/// Creates a collection of (linga, vibhakti, vacana) combinations. +fn linga_vibhakti_vacana_options() -> Vec<(vp::Linga, vp::Vibhakti, vp::Vacana)> { + let mut ret = Vec::new(); + for linga in vp::Linga::iter() { + for vibhakti in vp::Vibhakti::iter() { + for vacana in vp::Vacana::iter() { + ret.push((*linga, *vibhakti, *vacana)) + } + } } + ret } -fn parse_pada_prayoga(code: &str) -> PadaPrayoga { - match code { - "para" => PadaPrayoga::Parasmaipada, - "atma" => PadaPrayoga::AtmanepadaKartari, - "pass" => PadaPrayoga::AtmanepadaNotKartari, - &_ => panic!("Unknown type {}", code), - } +/// Creates a collection of common sanAdi combinations. 
+fn sanadi_options() -> Vec> { + use vp::Sanadi::*; + vec![ + vec![], + vec![Ric], + vec![san], + vec![yaN], + vec![yaNluk], + vec![Ric, san], + vec![san, Ric], + ] } -fn parse_krt_pratyaya(tense: &str, voice: &str) -> KrtPratyaya { - match (tense, voice) { - ("past", "active") => KrtPratyaya::Ktavat, - ("past", "pass") => KrtPratyaya::Kta, - - ("pres", "para") => KrtPratyaya::Shatr, - ("pres", "atma") => KrtPratyaya::Shanac, - ("pres", "pass") => KrtPratyaya::YakShanac, - - ("fut", "para") => KrtPratyaya::SyaShatr, - ("fut", "atma") => KrtPratyaya::SyaShanac, - ("fut", "pass") => KrtPratyaya::Krtya, +fn tinanta_options() -> Vec<( + vp::Prayoga, + vp::DhatuPada, + vp::Lakara, + vp::Purusha, + vp::Vacana, +)> { + let mut ret = Vec::new(); + for prayoga in vp::Prayoga::iter() { + for pada in vp::DhatuPada::iter() { + if *prayoga == vp::Prayoga::Bhave { + // Duplicates karmani -- skip + continue; + } + for lakara in vp::Lakara::iter() { + if *lakara == vp::Lakara::Let { + // Experimental -- skip + continue; + } + for purusha in vp::Purusha::iter() { + for vacana in vp::Vacana::iter() { + ret.push((*prayoga, *pada, *lakara, *purusha, *vacana)); + } + } + } + } + } + ret +} - ("perf", "para") => KrtPratyaya::Kvasu, - ("perf", "atma") => KrtPratyaya::Kanac, - ("perf", "pass") => KrtPratyaya::Kanac, - (&_, &_) => panic!("Unknown type (`{tense}`, `{voice}`)"), +fn parse_stem_linga(code: &str) -> &[Linga] { + use Linga::*; + match code { + "m" => &[Pum], + "f" => &[Stri], + "n" => &[Napumsaka], + "mf" => &[Pum, Stri], + "fn" => &[Stri, Napumsaka], + "mn" => &[Pum, Napumsaka], + "mfn" => &[Pum, Stri, Napumsaka], + "none" => &[], + &_ => panic!("Unknown type {}", code), } } -fn add_indeclinables(path: &Path, padas: &mut PadaMap) -> Result<()> { +/// Adds avyayas scraped from the MW dictionary. 
+fn add_avyayas(path: &Path, padas: &mut PadaVec) -> Result<()> { let mut rdr = csv::Reader::from_path(path)?; for maybe_row in rdr.records() { let r = maybe_row?; let pada = r[0].to_string(); - padas.insert( + padas.push(( pada.clone(), Pada::Avyaya(Avyaya { pratipadika: Pratipadika::Basic { @@ -182,12 +166,15 @@ fn add_indeclinables(path: &Path, padas: &mut PadaMap) -> Result<()> { lingas: Vec::new(), }, }), - ); + )); } Ok(()) } -fn add_nominal_padas(path: &Path, padas: &mut PadaMap) -> Result<()> { +// Adds irregular subantas specified manually. +// +// TODO: can we deprecate this given vidyut-prakriya? +fn add_irregular_subantas(path: &Path, padas: &mut PadaVec) -> Result<()> { let mut rdr = csv::Reader::from_path(path)?; for maybe_row in rdr.records() { let r = maybe_row?; @@ -201,7 +188,7 @@ fn add_nominal_padas(path: &Path, padas: &mut PadaMap) -> Result<()> { let semantics = Pada::Subanta(Subanta { pratipadika: Pratipadika::Basic { text: pratipadika.clone(), - lingas: stem_lingas, + lingas: stem_lingas.to_vec(), }, linga, vibhakti, @@ -209,9 +196,10 @@ fn add_nominal_padas(path: &Path, padas: &mut PadaMap) -> Result<()> { is_purvapada: false, }); - padas.insert(pada.clone(), semantics); + padas.push((pada.clone(), semantics)); } + // `mahA` is common but missing upstream, so add it specially. 
let semantics = Pada::Subanta(Subanta { pratipadika: Pratipadika::Basic { text: "mahat".to_string(), @@ -222,59 +210,15 @@ fn add_nominal_padas(path: &Path, padas: &mut PadaMap) -> Result<()> { vacana: None, is_purvapada: true, }); - padas.insert("mahA".to_string(), semantics); - - Ok(()) -} - -fn add_nominal_endings_compounded(path: &Path, endings: &mut SupMap) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - let stem = r[0].to_string(); - let stem_lingas = parse_stem_linga(&r[1]); - let ending = r[2].to_string(); - let ending_linga = r[3].parse()?; - - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: stem.clone(), - lingas: stem_lingas, - }, - linga: Some(ending_linga), - vibhakti: None, - vacana: None, - is_purvapada: true, - }); - endings.insert(ending, (stem, semantics)); - } - Ok(()) -} - -fn add_nominal_endings_inflected(path: &Path, endings: &mut SupMap) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let r = maybe_row?; + padas.push(("mahA".to_string(), semantics)); - let stem = r[0].to_string(); - let ending = r[2].to_string(); - let linga = r[3].parse()?; - let semantics = Pada::Subanta(Subanta { - pratipadika: Pratipadika::Basic { - text: stem.clone(), - lingas: vec![linga], - }, - linga: Some(linga), - vibhakti: r[4].parse().ok(), - vacana: r[5].parse().ok(), - is_purvapada: false, - }); - endings.insert(ending, (stem, semantics)); - } Ok(()) } -fn add_nominal_stems(path: &Path, padas: &mut StemMap) -> Result<()> { +/// Add simple pratipadikas scraped from the MW dictionary. +/// +/// TODO: deduplicate with our krdantas, etc. 
+fn add_basic_pratipadikas(path: &Path, stems: &mut StemVec) -> Result<()> { let mut rdr = csv::Reader::from_path(path)?; for maybe_row in rdr.records() { let r = maybe_row?; @@ -282,31 +226,17 @@ fn add_nominal_stems(path: &Path, padas: &mut StemMap) -> Result<()> { let lingas = parse_stem_linga(&r[1]); let semantics = Pratipadika::Basic { text: stem.clone(), - lingas, + lingas: lingas.to_vec(), }; - padas.insert(stem, semantics); + stems.push((stem, semantics)); } Ok(()) } -fn add_participle_stems(path: &Path, padas: &mut StemMap) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let r = maybe_row?; - let stem = r[0].to_string(); - let root = r[1].to_string(); - padas.insert( - stem, - Pratipadika::Krdanta { - dhatu: Dhatu(root), - pratyaya: parse_krt_pratyaya(&r[4], &r[5]), - }, - ); - } - Ok(()) -} - -fn add_prefix_groups(path: &Path, padas: &mut PadaMap) -> Result<()> { +/// Adds various common prefix groups. +/// +/// TODO: this doesn't make sense. We aren't storing the split prefixes anywhere ... +fn add_prefix_groups(path: &Path, padas: &mut PadaVec) -> Result<()> { let mut rdr = csv::Reader::from_path(path)?; for maybe_row in rdr.records() { let r = maybe_row?; @@ -318,123 +248,56 @@ fn add_prefix_groups(path: &Path, padas: &mut PadaMap) -> Result<()> { lingas: Vec::new(), }, }); - padas.insert(value.to_string(), semantics); + padas.push((value.to_string(), semantics)); } Ok(()) } -fn add_pronouns(path: &Path, padas: &mut PadaMap) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; +/// TODO: delete this after migrating to vidyut-prakriya for everything. 
+fn read_sup_endings(paths: &DataPaths) -> Result { + let mut endings = SupVec::new(); + + let mut rdr = csv::Reader::from_path(&paths.nominal_endings_compounded)?; for maybe_row in rdr.records() { let r = maybe_row?; - let stem = r[0].to_string(); - let text = r[2].to_string(); - let linga = match &r[3] { - "none" => None, - "_" => None, - s => Some(s.parse()?), - }; - let lingas = match linga { - Some(x) => vec![x], - None => vec![], - }; + let stem_lingas = parse_stem_linga(&r[1]); + let ending = r[2].to_string(); + let ending_linga = r[3].parse()?; - let morph = Pada::Subanta(Subanta { + let semantics = Pada::Subanta(Subanta { pratipadika: Pratipadika::Basic { text: stem.clone(), - lingas, - }, - linga, - vibhakti: r[4].parse().ok(), - vacana: r[5].parse().ok(), - is_purvapada: false, - }); - padas.insert(text, morph); - } - Ok(()) -} - -fn add_verbal_indeclinables(path: &Path, padas: &mut PadaMap) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; - for maybe_row in rdr.records() { - let row = maybe_row?; - let pada = row[0].to_string(); - let root = row[1].to_string(); - let pratyaya = match &row[3] { - "gerund" => { - if pada.ends_with("ya") { - KrtPratyaya::Lyap - } else { - KrtPratyaya::Ktva - } - } - "infinitive" => KrtPratyaya::Tumun, - &_ => panic!("Unknown indeclinable type `{}`", &row[3]), - }; - let semantics = Pada::Avyaya(Avyaya { - pratipadika: Pratipadika::Krdanta { - dhatu: Dhatu(root), - pratyaya, + lingas: stem_lingas.to_vec(), }, + linga: Some(ending_linga), + vibhakti: None, + vacana: None, + is_purvapada: true, }); - - padas.insert(pada, semantics); + endings.push((ending, stem, semantics)); } - Ok(()) -} -fn add_verbs(path: &Path, padas: &mut PadaMap) -> Result<()> { - let mut rdr = csv::Reader::from_path(path)?; + let mut rdr = csv::Reader::from_path(&paths.nominal_endings_inflected)?; for maybe_row in rdr.records() { let r = maybe_row?; - let text = r[0].to_string(); - let root = r[1].to_string(); - - let purusha = match 
&r[4] { - "3" => Purusha::Prathama, - "2" => Purusha::Madhyama, - "1" => Purusha::Uttama, - &_ => panic!("Unknown type `{}`", &r[4]), - }; - - let vacana = r[5].parse()?; - - let lakara = match &r[6] { - "pres" => Lakara::Lat, - "ipft" => Lakara::Lan, - "sfut" => Lakara::Lrt, - "opt" => Lakara::VidhiLin, - "ben" => Lakara::AshirLin, - "inj" => Lakara::LunNoAgama, - "pfut" => Lakara::Lut, - "impv" => Lakara::Lot, - "perf" => Lakara::Lit, - "aor" => Lakara::Lun, - "cond" => Lakara::Lrn, - &_ => panic!("Unknown type {}", &r[6]), - }; - let pada = parse_pada_prayoga(&r[7]); - - padas.insert( - text, - Pada::Tinanta(Tinanta { - dhatu: Dhatu(root), - purusha, - vacana, - lakara, - pada, - }), - ); + let stem = r[0].to_string(); + let ending = r[2].to_string(); + let linga = r[3].parse()?; + let semantics = Pada::Subanta(Subanta { + pratipadika: Pratipadika::Basic { + text: stem.clone(), + lingas: vec![linga], + }, + linga: Some(linga), + vibhakti: r[4].parse().ok(), + vacana: r[5].parse().ok(), + is_purvapada: false, + }); + endings.push((ending, stem, semantics)); } - Ok(()) -} -fn read_nominal_endings(paths: &DataPaths) -> Result { - let mut endings = SupMap::new(); - add_nominal_endings_compounded(&paths.nominal_endings_compounded, &mut endings)?; - add_nominal_endings_inflected(&paths.nominal_endings_inflected, &mut endings)?; Ok(endings) } @@ -464,18 +327,15 @@ fn get_variants(text: &str) -> Vec { variants } -fn read_stems(paths: &DataPaths) -> Result { - let mut stems = StemMap::new(); - add_nominal_stems(&paths.nominal_stems, &mut stems)?; - add_participle_stems(&paths.participle_stems, &mut stems)?; +fn read_stems(paths: &DataPaths) -> Result { + let mut stems = StemVec::new(); + add_basic_pratipadikas(&paths.basic_pratipadikas, &mut stems)?; // Add simple support for variants. 
- let mut variants = StemMap::new(); - for (k, vs) in stems.0.iter() { - for k_variant in get_variants(k) { - for v in vs { - variants.insert(k_variant.clone(), v.clone()); - } + let mut variants = StemVec::new(); + for (k, v) in &stems { + for k_variant in get_variants(&k) { + variants.push((k_variant.clone(), v.clone())); } } stems.extend(variants); @@ -483,22 +343,17 @@ fn read_stems(paths: &DataPaths) -> Result { Ok(stems) } -fn read_padas(paths: &DataPaths) -> Result { - let mut padas = PadaMap::with_capacity(20_000_000); - add_indeclinables(&paths.indeclinables, &mut padas).expect("Could not find indeclinables"); +fn read_padas(paths: &DataPaths) -> Result { + let mut padas = PadaVec::with_capacity(20_000_000); + add_avyayas(&paths.indeclinables, &mut padas).expect("Could not find indeclinables"); add_prefix_groups(&paths.prefix_groups, &mut padas).expect("Could not find prefix groups"); - add_pronouns(&paths.pronouns, &mut padas).expect("Could not find pronouns"); - add_verbal_indeclinables(&paths.verbal_indeclinables, &mut padas) - .expect("Could not find verbal indeclinables"); - add_verbs(&paths.verbs, &mut padas).expect("Could not find verbs"); - add_nominal_padas(&paths.nominal_padas, &mut padas).expect("Could not find irregular nominals"); - - let mut variants = PadaMap::new(); - for (k, vs) in padas.0.iter() { - for k_variant in get_variants(k) { - for v in vs { - variants.insert(k_variant.clone(), v.clone()); - } + add_irregular_subantas(&paths.irregular_subantas, &mut padas) + .expect("Could not find irregular subantas"); + + let mut variants = PadaVec::new(); + for (k, v) in &padas { + for k_variant in get_variants(&k) { + variants.push((k_variant.clone(), v.clone())); } } padas.extend(variants); @@ -539,22 +394,25 @@ fn inflect_halanta_stem(stem: &str, sup: &str) -> String { } // Generates all nominal padas and adds them to the pada map. 
-fn add_nominals(stems: &StemMap, endings: &SupMap, padas: &mut PadaMap) { - let stem_to_endings = endings - .0 - .iter() - .flat_map(|(ending, vs)| { - vs.iter() - .map(|(stem, pada)| (stem.clone(), (ending.clone(), pada.clone()))) - }) - .collect::>(); +fn add_nominals(stems: &StemVec, endings: &SupVec, padas: &mut PadaVec) { + let mut stem_to_endings = HashMap::new(); + for (ending, stem, semantics) in endings { + if !stem_to_endings.contains_key(stem) { + let stem = stem.clone(); + stem_to_endings.insert(stem, vec![]); + } + stem_to_endings + .get_mut(stem) + .unwrap() + .push((ending.clone(), semantics.clone())); + } // For all stems, ... - for (stem_text, all_stem_semantics) in stems.0.iter() { + for (stem_text, stem_semantics) in stems { let mut was_inserted = false; // And all stem endings ... - for (stem_ending, sup_pratyayas) in stem_to_endings.0.iter() { + for (stem_ending, sup_pratyayas) in stem_to_endings.iter() { // If the stem ends in this ending ... if let Some(prefix) = stem_text.strip_suffix(stem_ending) { // Then for all pratyayas that the ending allows, ... @@ -562,14 +420,12 @@ fn add_nominals(stems: &StemMap, endings: &SupMap, padas: &mut PadaMap) { let pada_text = prefix.to_string() + sup_text; if let Pada::Subanta(sup_semantics) = sup_semantics { - for stem_semantics in all_stem_semantics { - // Create and insert the corresponding pada. - let pada_semantics = Pada::Subanta(Subanta { - pratipadika: stem_semantics.clone(), - ..sup_semantics.clone() - }); - padas.insert(pada_text.clone(), pada_semantics); - } + // Create and insert the corresponding pada. + let pada_semantics = Pada::Subanta(Subanta { + pratipadika: stem_semantics.clone(), + ..sup_semantics.clone() + }); + padas.push((pada_text.clone(), pada_semantics)); } } was_inserted = true; @@ -580,59 +436,410 @@ fn add_nominals(stems: &StemMap, endings: &SupMap, padas: &mut PadaMap) { // If the stem is a special consonant ending ... 
if is_hal(stem_text.chars().last().unwrap()) { let pratyayas = stem_to_endings - .0 .get("_") .expect("`_` ending should be defined"); for (sup_text, sup_semantics) in pratyayas { let pada_text = inflect_halanta_stem(stem_text, sup_text); if let Pada::Subanta(sup_semantics) = sup_semantics { - for stem_semantics in all_stem_semantics { - // Create and insert the corresponding pada. - let pada_semantics = Pada::Subanta(Subanta { - pratipadika: stem_semantics.clone(), - ..sup_semantics.clone() + // Create and insert the corresponding pada. + let pada_semantics = Pada::Subanta(Subanta { + pratipadika: stem_semantics.clone(), + ..sup_semantics.clone() + }); + padas.push((pada_text.clone(), pada_semantics)); + } + } + } + } + } +} + +fn create_sarvanamas(padas: &mut PadaVec) { + // Data copied from vidyut-prakriya. + const SARVANAMA: &[&str] = &[ + // qatara, qatama + // TODO: actually detect qatarac/qatamac in vidyut-prakriya. + "katara", "yatara", "tatara", "ekatara", "katama", "yatama", "tatama", "ekatama", + // sarvAdi + "sarva", "viSva", "uBa", "uBaya", "qatara", "qatama", "anya", "anyatara", "itara", "tvat", + "tva", "nema", "sama", "sima", "pUrva", "para", "avara", "dakziRa", "uttara", "apara", + "aDara", "sva", "antara", "tyad", "tad", "yad", "etad", "idam", "adas", "eka", "dvi", + "yuzmad", "asmad", "Bavatu~", "kim", + ]; + + let linga_vibhakti_vacana = linga_vibhakti_vacana_options(); + + let v = Vyakarana::builder() + .log_steps(false) + .is_chandasi(true) + .build(); + for stem in SARVANAMA { + let prati = vp::Pratipadika::basic(stem); + let lingas = vec![Linga::Pum, Linga::Stri, Linga::Napumsaka]; + + for (linga, vibhakti, vacana) in &linga_vibhakti_vacana { + let args = vp::Subanta::new(prati.clone(), *linga, *vibhakti, *vacana); + let prakriyas = v.derive_subantas(&args); + for p in prakriyas { + let morph = Pada::Subanta(Subanta { + pratipadika: Pratipadika::Basic { + text: stem.to_string(), + lingas: lingas.clone(), + }, + linga: 
Some(Linga::from(*linga)), + vibhakti: Some(Vibhakti::from(*vibhakti)), + vacana: Some(Vacana::from(*vacana)), + is_purvapada: false, + }); + let text = p.text(); + padas.push((text, morph)); + } + } + } +} + +/// Creates all tinantas. +/// +/// This function generates the following combinations: +/// +/// (upasarga, dhatu, sanadi, pada, lakara, purusha, vacana) +/// +/// - `upasarga` comes from the Upasargartha-candrika. +/// - `dhatu` comes from the Dhatupatha on ashtadhyayi.com +/// +/// TODO: gati, cvi +fn create_tinantas( + entries: &Vec, + upasarga_dhatus: &UpasargaDhatuMap, + padas: &mut PadaVec, +) { + let all_sanadis = sanadi_options(); + let args = tinanta_options(); + + let v = Vyakarana::builder() + .log_steps(false) + .is_chandasi(true) + .build(); + + let results: Vec<_> = entries + .par_iter() + .flat_map(|entry| { + let new = Vec::new(); + let upasarga_groups = upasarga_dhatus.get(entry.code()).unwrap_or(&new); + let mut ret = Vec::new(); + + for group in upasarga_groups { + for sanadi in &all_sanadis { + let dhatu = entry + .dhatu() + .clone() + .with_sanadi(sanadi) + .with_prefixes(group); + + for (prayoga, dhatu_pada, lakara, purusha, vacana) in &args { + let args = vp::Tinanta::builder() + .dhatu(dhatu.clone()) + .prayoga(*prayoga) + .pada(*dhatu_pada) + .lakara(*lakara) + .purusha(*purusha) + .vacana(*vacana) + .build() + .expect("ok"); + + let pada_prayoga = match (dhatu_pada, prayoga) { + (vp::DhatuPada::Parasmai, _) => PadaPrayoga::Parasmaipada, + (vp::DhatuPada::Atmane, vp::Prayoga::Kartari) => { + PadaPrayoga::AtmanepadaKartari + } + (vp::DhatuPada::Atmane, _) => PadaPrayoga::AtmanepadaNotKartari, + }; + + let prakriyas = v.derive_tinantas(&args); + ret.extend(prakriyas.iter().map(|prakriya| { + let text = prakriya.text(); + let semantics = Pada::Tinanta(Tinanta { + dhatu: dhatu.clone().into(), + purusha: Purusha::from(*purusha), + vacana: Vacana::from(*vacana), + lakara: Lakara::from(*lakara), + pada: PadaPrayoga::from(pada_prayoga), }); 
- padas.insert(pada_text.clone(), pada_semantics); + + (text, semantics) + })); + } + } + } + + ret.into_par_iter() + }) + .collect(); + + padas.extend(results); +} + +/// Creates all krdantas that form nominals. +/// +/// This function generates the following combinations: +/// +/// (upasarga, dhatu, sanadi, pada, krt, linga, vibhakti, vacana) +/// +/// - `upasarga` comes from the Upasargartha-candrika. +/// - `dhatu` comes from the Dhatupatha on ashtadhyayi.com +/// +/// TODO: gati, cvi +fn create_inflected_krdantas( + entries: &Vec, + upasarga_dhatus: &UpasargaDhatuMap, + padas: &mut PadaVec, +) { + use vp::BaseKrt as VKrt; + + let linga_vibhakti_vacana = linga_vibhakti_vacana_options(); + let all_sanadis = sanadi_options(); + let all_krts = &[ + // Lit + VKrt::kvasu, + VKrt::kAnac, + // nistha + VKrt::kta, + VKrt::ktavatu, + // Lat + VKrt::Satf, + VKrt::SAnac, + // krtya + VKrt::yat, + VKrt::Ryat, + VKrt::kyap, + VKrt::tavya, + VKrt::anIyar, + // Common + VKrt::Rvul, + VKrt::lyuw, + VKrt::tfc, + // TODO: all all of the others, including unadis. 
+ ]; + + let sat_pratyayas = &[VKrt::Satf, VKrt::SAnac]; + + let v = Vyakarana::builder() + .log_steps(false) + .is_chandasi(true) + .build(); + + let results: Vec<_> = entries + .par_iter() + .flat_map(|entry| { + let new = Vec::new(); + let upasarga_groups = upasarga_dhatus.get(entry.code()).unwrap_or(&new); + let mut ret = Vec::new(); + + for group in upasarga_groups { + for sanadi in &all_sanadis { + let dhatu = entry + .dhatu() + .clone() + .with_sanadi(sanadi) + .with_prefixes(group); + + for krt in all_krts { + for (linga, vibhakti, vacana) in &linga_vibhakti_vacana { + let krdanta = vp::Krdanta::builder() + .dhatu(dhatu.clone()) + .krt(*krt) + .build() + .expect("ok"); + + let args = + vp::Subanta::new(krdanta.clone(), *linga, *vibhakti, *vacana); + + let prakriyas = v.derive_subantas(&args); + ret.extend(prakriyas.iter().map(|p| { + let text = p.text(); + let semantics = Pada::Subanta(Subanta { + pratipadika: Pratipadika::Krdanta { + dhatu: dhatu.clone().into(), + krt: Krt::new(*krt), + }, + linga: Some(Linga::from(*linga)), + vibhakti: Some(Vibhakti::from(*vibhakti)), + vacana: Some(Vacana::from(*vacana)), + is_purvapada: false, + }); + + (text, semantics) + })); + } + } + + // lrt-sat (karizyan, karizyamARaH, ...) 
+ for krt in sat_pratyayas { + for (linga, vibhakti, vacana) in &linga_vibhakti_vacana { + let krdanta = vp::Krdanta::builder() + .dhatu(dhatu.clone()) + .lakara(vp::Lakara::Lrt) + .krt(VKrt::Satf) + .build() + .expect("ok"); + + let args = + vp::Subanta::new(krdanta.clone(), *linga, *vibhakti, *vacana); + + let prakriyas = v.derive_subantas(&args); + ret.extend(prakriyas.iter().map(|p| { + let text = p.text(); + let semantics = Pada::Subanta(Subanta { + pratipadika: Pratipadika::Krdanta { + dhatu: dhatu.clone().into(), + krt: Krt::new(*krt), + }, + linga: Some(Linga::from(*linga)), + vibhakti: Some(Vibhakti::from(*vibhakti)), + vacana: Some(Vacana::from(*vacana)), + is_purvapada: false, + }); + + (text, semantics) + })); } } } } - } + + ret.into_par_iter() + }) + .collect(); + + padas.extend(results); +} + +/// Creates all krdantas that form avyayas. +/// +/// This function generates the following combinations: +/// +/// (upasarga, dhatu, sanadi, krt) +/// +/// - `upasarga` comes from the Upasargartha-candrika. 
+/// - `dhatu` comes from the Dhatupatha on ashtadhyayi.com +/// +/// TODO: gati, cvi +fn create_avyaya_krdantas( + entries: &Vec, + upasarga_dhatus: &UpasargaDhatuMap, + padas: &mut PadaVec, +) { + let all_sanadis = sanadi_options(); + let all_krts = &[vp::BaseKrt::ktvA, vp::BaseKrt::tumun]; + + let v = Vyakarana::builder() + .log_steps(false) + .is_chandasi(true) + .build(); + + let results: Vec<_> = entries + .par_iter() + .flat_map(|entry| { + let new = Vec::new(); + let upasarga_groups = upasarga_dhatus.get(entry.code()).unwrap_or(&new); + let mut ret = Vec::new(); + + for group in upasarga_groups { + for sanadi in &all_sanadis { + let dhatu = entry + .dhatu() + .clone() + .with_sanadi(sanadi) + .with_prefixes(group); + for krt in all_krts { + let args = vp::Krdanta::builder() + .dhatu(dhatu.clone().with_sanadi(sanadi)) + .krt(*krt) + .build() + .expect("ok"); + + let prakriyas = v.derive_krdantas(&args); + ret.extend(prakriyas.iter().map(|p| { + let text = p.text(); + let semantics = Pada::Avyaya(Avyaya { + pratipadika: Pratipadika::Krdanta { + dhatu: dhatu.clone().into(), + krt: Krt::new(*krt), + }, + }); + + (text, semantics) + })); + } + } + } + + ret.into_par_iter() + }) + .collect(); + + padas.extend(results); +} + +/// Maps a dhatu code (e.g. "01.0001") to all lists of prefixes it might take. +fn parse_upasarga_dhatus(path: &Path) -> Result { + let mut rdr = csv::Reader::from_path(path)?; + let mut ret: UpasargaDhatuMap = HashMap::new(); + for maybe_row in rdr.records() { + let r = maybe_row?; + let upasargas: Vec<_> = r[0].split("-").map(|x| x.to_string()).collect(); + let code = r[2].to_string(); + // the empty Vec is for the default case (no prefixes). 
+ ret.entry(code).or_insert(vec![Vec::new()]).push(upasargas); } + + Ok(ret) } fn run(args: Args) -> Result<()> { info!("Reading linguistic data ..."); + let data_paths = DataPaths::new(Path::new(&args.input_dir)); + let dhatupatha = Dhatupatha::from_path(&args.dhatupatha)?; + // let dhatu_entries: Vec = dhatupatha.into_iter().take(200).collect(); + let dhatu_entries: Vec = dhatupatha.into_iter().collect(); + let mut padas = read_padas(&data_paths)?; - let stems = read_stems(&data_paths)?; - let endings = read_nominal_endings(&data_paths)?; - info!("Generating nominals ..."); + info!("Creating tinantas ..."); + let upasarga_dhatus = parse_upasarga_dhatus(&data_paths.upasarga_dhatus)?; + create_tinantas(&dhatu_entries, &upasarga_dhatus, &mut padas); + + info!("Creating krdantas (inflected) ..."); + create_inflected_krdantas(&dhatu_entries, &upasarga_dhatus, &mut padas); + + info!("Creating krdantas (avyaya) ..."); + create_avyaya_krdantas(&dhatu_entries, &upasarga_dhatus, &mut padas); + + info!("Creating plain subantas ..."); + create_sarvanamas(&mut padas); + + let stems = read_stems(&data_paths)?; + let endings = read_sup_endings(&data_paths)?; add_nominals(&stems, &endings, &mut padas); - info!("Sorting kosha keys lexicographically ..."); - let mut padas: Vec<_> = padas.0.into_iter().collect(); - padas.par_sort_by(|x, y| x.0.cmp(&y.0)); + info!("Sorting keys ..."); + padas.par_sort(); info!("Inserting entries ..."); let config = Config::new(&args.output_dir); let mut builder = Builder::new(config.kosha())?; - for (key, pada_vec) in padas { - for pada in pada_vec { - builder.insert(&key, &pada)?; - } + let mut num_words = 0; + for (key, pada) in padas { + builder.insert(&key, &pada)?; + num_words += 1; } info!("Finishing build ..."); builder.finish()?; - // Check that we can load the dict. - let kosha = Kosha::new(config.kosha())?; - assert!(kosha.contains_key("narasya")); - - info!("Complete."); + info!("Complete. 
(Inserted {num_words} entries.)"); Ok(()) } diff --git a/src/bin/eval_cheda.rs b/src/bin/eval_cheda.rs index cc52380..029e096 100644 --- a/src/bin/eval_cheda.rs +++ b/src/bin/eval_cheda.rs @@ -11,6 +11,7 @@ use vidyut_cheda::Result; use vidyut_cheda::{Chedaka, Config, Token}; use vidyut_kosha::morph::*; use vidyut_lipi::{transliterate, Mapping, Scheme}; +use vidyut_prakriya::args as vp; #[derive(Parser, Debug)] #[command(author, version, about)] @@ -85,9 +86,9 @@ fn as_code(w: &Token) -> String { Pada::Avyaya(a) => { let val = match &a.pratipadika { Pratipadika::Basic { .. } => "i", - Pratipadika::Krdanta { pratyaya, .. } => match pratyaya { - KrtPratyaya::Ktva => "ktva", - KrtPratyaya::Tumun => "tumun", + Pratipadika::Krdanta { krt, .. } => match krt.value() { + vp::Krt::Base(vp::BaseKrt::ktvA) => "ktva", + vp::Krt::Base(vp::BaseKrt::tumun) => "tumun", _ => "_", }, }; diff --git a/src/bin/test_kosha.rs b/src/bin/test_kosha.rs index 3e6d072..6e9cbbf 100644 --- a/src/bin/test_kosha.rs +++ b/src/bin/test_kosha.rs @@ -16,47 +16,135 @@ struct Args { fn test_tinantas(k: &Kosha) -> Result<()> { let keys = vec![ - // Basic lakaras (kartari, karmani/bhAve) - "nayati", - "ninAya", - "netA", - "nezyati", - "nayatu", - "anayat", - // "nIyAt", - "nayet", - "anEzIt", - // "anezyat", - "nIyate", - "nIyatAm", - "anIyata", - "nIyeta", - // san dhAtus (kartari, karmani/bhAve) - "ninIzati", - "ninIzatu", - "aninIzat", - "ninIzet", - "ninIzyate", - "ninIzyatAm", - "aninIzyata", - "ninIzyeta", - // Nic dhAtus (kartari, karmani/bhAve) - "nAyayati", - "nAyayatu", - "anAyayat", - "nAyayet", - "nAyyate", - "nAyyatAm", - "anAyyata", - "nAyyeta", - // TODO: yaG + // Basic lakaras (kartari) + "Bavati", + "baBUva", + "BavitA", + "Bavizyati", + "Bavatu", + "aBavat", + "BUyAt", + "Bavet", + "aBUt", + "aBavizyat", + // Basic lakaras (karmani) + "BUyate", + "baBUve", + "BavitA", + "BAvitA", + "Bavizyate", + "BAvizyate", + "BavyatAm", + "aBUyata", + "BUyeta", + "BavizIzwa", + "BAvizIzwa", + 
"aBAvi", + "aBavizyata", + "aBAvizyata", + // sannanta (kartari) + "buBUzati", + "buBUzAmbaBUva", + "buBUzAYcakAra", + "buBUzAmAsa", + "buBUzitA", + "buBUzizyati", + "buBUzatu", + "abuBUzat", + "buBUzet", + "buBUzyAt", + "abuBUzIt", + "abuBUzizyat", + // Nijanta (kartari) + "BAvayati", + "BAvayAmbaBUva", + "BAvayAYcakAra", + "BAvayAmAsa", + "BAvayitA", + "BAvayizyati", + "BAvayatu", + "aBAvayat", + "BAvayet", + "BAvyAt", + "abIBavat", + "aBAvayizyat", + // yaGanta (kartari) + "boBUyate", + "boBUyAmbaBUva", + "boBUyAYcakre", + "boBUyAmAsa", + "boBUyitA", + "boBUyizyate", + "boBUyatAm", + "aboBUyata", + "boBUyeta", + "boBUyizIzwa", + "aboBUyizwa", + "aboBUyizyata", + // Prefixes + "aBiBavati", + "praBavati", + // Other tricky tinantas + "saMskaroti", + "saYcaskAra", + "saYcaskrire", ]; + let mut i = 0; for key in &keys { - assert!(k.contains_key(key), "{key}"); + let ok = k.contains_key(key); + if ok { + i += 1; + } else { + println!("FAILED: key {key} is missing"); + } } let n = keys.len(); - println!("{n} / {n} tinanta tests passed."); + println!("{i} / {n} tinanta tests passed."); + + Ok(()) +} + +fn test_krdantas(k: &Kosha) -> Result<()> { + let keys = vec![ + // kta, ktavat + "BUtaH", + "BUtam", + "BUtA", + "BUtavAn", + "BUtavat", + "BUtavatI", + // Satf + "Bavan", + "BavantaH", + "BavantI", + "Bavizyan", + "BavizyantaH", + "BavizyantI", + // krtya + "Bavyam", + "Bavitavyam", + "BavanIyam", + // Other + "BAvakaH", + "Bavanam", + // With prefixes + "aBiBUtam", + "praBUtam", + "saMskftam", + ]; + + let mut i = 0; + for key in &keys { + let ok = k.contains_key(key); + if ok { + i += 1; + } else { + println!("FAILED: key {key} is missing"); + } + } + let n = keys.len(); + println!("{i} / {n} tinanta tests passed."); Ok(()) } @@ -70,8 +158,8 @@ fn test_subantas(k: &Kosha) -> Result<()> { ("gurus", "guru"), ("vaDUs", "vaDU"), ("kartA", "kartf"), - // ("rEs", "rE"), - // "dyOs", + ("rAs", "rE"), + // ("dyOs", "div"), ("nOs", "nO"), ("AtmA", "Atman"), ("manasA", 
"manas"), @@ -87,8 +175,8 @@ fn test_subantas(k: &Kosha) -> Result<()> { // Irregular subantas ("mahAn", "mahat"), - // ("tri", "trayas"), - // ("zaz", "zaRRAm"), + ("trayas", "tri"), + ("zaRRAm", "zaz"), ("sapta", "saptan"), ("daSa", "daSan"), ("pitaras", "pitf"), @@ -99,20 +187,22 @@ fn test_subantas(k: &Kosha) -> Result<()> { ("yUnAm", "yuvan"), ]; + let mut i = 0; for (key, lemma) in &keys { + let present = k.contains_key(key); let entries: std::result::Result, _> = k.get_all(key).iter().map(|x| k.unpack(x)).collect(); let entries = entries?; + let has_lemma = entries.iter().any(|x| &x.lemma() == lemma); - assert!( - entries.iter().any(|x| &x.lemma() == lemma), - "{} {}", - key, - lemma - ); + if present && has_lemma { + i += 1; + } else { + println!("FAILED: key {key} is missing (present={present}, has_lemma={has_lemma})"); + } } let n = keys.len(); - println!("{n} / {n} subanta tests passed."); + println!("{i} / {n} tinanta tests passed."); Ok(()) } @@ -120,6 +210,7 @@ fn test_subantas(k: &Kosha) -> Result<()> { fn run_tests(args: Args) -> Result<()> { let kosha = Kosha::new(args.data_dir)?; test_tinantas(&kosha)?; + test_krdantas(&kosha)?; test_subantas(&kosha)?; Ok(()) } diff --git a/vidyut-chandas/README.md b/vidyut-chandas/README.md index 1198bf6..f38abeb 100644 --- a/vidyut-chandas/README.md +++ b/vidyut-chandas/README.md @@ -3,7 +3,7 @@

A Sanskrit metrical classifier

-`vidyut-chandas` is an experimental classifier for Sanskrit meters. +`vidyut-chandas` identifies the meter in some piece of Sanskrit text. This [crate][crate] is under active development as part of the [Ambuda][ambuda] project. If you enjoy our work and wish to contribute to it, we encourage you @@ -12,6 +12,12 @@ programmers and enthusiasts. An online demo is available [here][demo]. +`vidyut-chandas` is not a state-of-the-art solution, and you might consider +exploring and using these other projects instead: + +- [Skrutable](https://github.com/tylergneill/skrutable) +- [sanskritmetres](https://github.com/shreevatsa/sanskrit) + [crate]: https://doc.rust-lang.org/book/ch07-01-packages-and-crates.html [ambuda]: https://ambuda.org [discord]: https://discord.gg/7rGdTyWY7Z @@ -42,18 +48,11 @@ Usage We recommend using `vidyut-chandas` through our `Chandas` API: -```rust +```rust,no_run use vidyut_chandas::{Chandas, MatchType, Vrtta}; -let vrttas: Vec = vec![ - "vasantatilakA\tvrtta\tGGLGLLLGLLGLGG".try_into().unwrap(), - "mandAkrAntA\tvrtta\tGGGGLLLLLGGLGGLGG".try_into().unwrap(), - "puzpitAgrA\tvrtta\tLLLLLLGLGLGG/LLLLGLLGLGLGG".try_into().unwrap(), - "udgatA\tvrtta\tLLGLGLLLGL/LLLLLGLGLG/GLLLLLLGLLG/LLGLGLLLGLGLG".try_into().unwrap() -]; -let chandas = Chandas::new(vrttas); - +let chandas = Chandas::from_file("/path/to/meters.tsv").unwrap(); let result = chandas.classify("mAtaH samastajagatAM maDukEwaBAreH"); -assert_eq!(result.vrtta().as_ref().unwrap().name(), "vasantatilakA"); +assert_eq!(result.padya().as_ref().unwrap().name(), "vasantatilakA"); assert_eq!(result.match_type(), MatchType::Pada); ``` diff --git a/vidyut-chandas/src/chandas.rs b/vidyut-chandas/src/chandas.rs index 758246c..fd35594 100644 --- a/vidyut-chandas/src/chandas.rs +++ b/vidyut-chandas/src/chandas.rs @@ -2,7 +2,7 @@ use crate::akshara::{scan_lines, Akshara}; use crate::error::Result; use crate::padya::{Jati, JatiKind, MatchType, Vrtta}; use std::fs; -use std::path::{Path, PathBuf}; +use 
std::path::Path; /// Models a padya type. #[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -89,7 +89,7 @@ impl Matches { /// let chandas = Chandas::new(vrttas); /// /// let result = chandas.classify("mAtaH samastajagatAM maDukEwaBAreH"); -/// assert_eq!(result.vrtta().as_ref().unwrap().name(), "vasantatilakA"); +/// assert_eq!(result.padya().as_ref().unwrap().name(), "vasantatilakA"); /// assert_eq!(result.match_type(), MatchType::Pada); /// ``` #[derive(Clone, Debug, Default, Eq, Hash, PartialEq)] @@ -125,15 +125,22 @@ impl Chandas { /// /// We recommend using this constructor when the program does not have access to the /// filesystem, e.g. when using this code in WebAssembly. - pub fn from_text(data: &str) -> Result { - let vrttas: Result> = data.lines().map(Vrtta::try_from).collect(); + pub fn from_text(data: impl AsRef) -> Result { + let vrttas: Result> = data.as_ref().lines().map(Vrtta::try_from).collect(); Ok(Self::new(vrttas?)) } /// Creates a new classifier from the given data path. - pub fn from_file(path: &Path) -> Result { - let path = PathBuf::from(path).join(path); - let data = fs::read_to_string(path)?; + /// + /// ### Usage + /// + /// ```no_run + /// use vidyut_chandas::Chandas; + /// + /// let c = Chandas::from_file("/path/to/meters.tsv").unwrap(); + /// ``` + pub fn from_file(path: impl AsRef) -> Result { + let data = fs::read_to_string(path.as_ref())?; let vrttas: Result> = data.lines().map(Vrtta::try_from).collect(); Ok(Self::new(vrttas?)) @@ -151,7 +158,15 @@ impl Chandas { /// Classifies the input string against an internal list of meters. /// - /// Currently, this function supports only vrttas. 
+ /// ### Usage + /// + /// ```no_run + /// use vidyut_chandas::Chandas; + /// + /// let c = Chandas::from_file("/path/to/meters.tsv").unwrap(); + /// let text = "kaScitkAntAvirahaguruRA svADikArapramattaH"; + /// let res = c.classify(text); + /// ``` pub fn classify(&self, text: impl AsRef) -> Match { self.classify_inner(text.as_ref()) } diff --git a/vidyut-cheda/Cargo.toml b/vidyut-cheda/Cargo.toml index 7cc08e9..3eed8ce 100644 --- a/vidyut-cheda/Cargo.toml +++ b/vidyut-cheda/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [dependencies] vidyut-kosha = { path = "../vidyut-kosha" } +vidyut-prakriya = { path = "../vidyut-prakriya" } vidyut-sandhi = { path = "../vidyut-sandhi" } vidyut-lipi = { path = "../vidyut-lipi" } clap = { version = "4.0.12", features = ["derive"] } diff --git a/vidyut-cheda/src/dcs.rs b/vidyut-cheda/src/dcs.rs index 4f09a99..ddc38ed 100644 --- a/vidyut-cheda/src/dcs.rs +++ b/vidyut-cheda/src/dcs.rs @@ -5,6 +5,7 @@ use crate::segmenting::Token; use compact_str::CompactString; use vidyut_kosha::morph::*; use vidyut_lipi::{transliterate, Mapping, Scheme}; +use vidyut_prakriya::args::BaseKrt; fn to_slp1(text: &str) -> String { let mapping = Mapping::new(Scheme::Iast, Scheme::Slp1); @@ -113,7 +114,7 @@ fn parse_verb(t: &EvalToken) -> Result { let lakara = parse_lakara(&t.features)?.unwrap_or(Lakara::Lat); let pada = parse_verb_pada(&t.features); Ok(Pada::Tinanta(Tinanta { - dhatu: Dhatu(root), + dhatu: Dhatu::mula(root), purusha, vacana, lakara, @@ -137,8 +138,8 @@ fn parse_krdanta(t: &EvalToken) -> Result { /// Reshapes a DCS krdanta subanta. 
fn parse_krdanta_subanta(t: &EvalToken) -> Result { let stem = Pratipadika::Krdanta { - dhatu: Dhatu(standardize_lemma(&t.lemma)), - pratyaya: parse_krt_pratyaya(&t.features)?.unwrap_or(KrtPratyaya::Kta), + dhatu: Dhatu::mula(standardize_lemma(&t.lemma)), + krt: parse_krt_pratyaya(&t.features)?.unwrap_or(Krt::new(BaseKrt::kta)), }; let linga = parse_linga(&t.features)?; let vibhakti = parse_vibhakti(&t.features)?; @@ -157,9 +158,9 @@ fn parse_krdanta_subanta(t: &EvalToken) -> Result { /// Reshapes a DCS krdanta avyaya. fn parse_krdanta_avyaya(t: &EvalToken) -> Result { let stem = Pratipadika::Krdanta { - dhatu: Dhatu(standardize_lemma(&t.lemma)), + dhatu: Dhatu::mula(standardize_lemma(&t.lemma)), // Use an arbitrary default. - pratyaya: parse_krt_pratyaya(&t.features)?.unwrap_or(KrtPratyaya::Kta), + krt: parse_krt_pratyaya(&t.features)?.unwrap_or(Krt::new(BaseKrt::kta)), }; Ok(Pada::Avyaya(Avyaya { pratipadika: stem })) @@ -174,13 +175,13 @@ fn parse_stem(t: &EvalToken) -> Pratipadika { } /// Reshapes a DCS tense into a Vidyut tense. -fn parse_krt_pratyaya(f: &TokenFeatures) -> Result> { +fn parse_krt_pratyaya(f: &TokenFeatures) -> Result> { let val = match f.get("Tense") { Some(s) => match s.as_str() { // FIXME: not enough information to reconstruct. 
- "Pres" => Some(KrtPratyaya::Shatr), - "Past" => Some(KrtPratyaya::Kta), - "Fut" => Some(KrtPratyaya::SyaShatr), + "Pres" => Some(Krt::new(BaseKrt::Satf)), + "Past" => Some(Krt::new(BaseKrt::kta)), + "Fut" => Some(Krt::new(BaseKrt::Satf)), &_ => return Err(Error::parse_dcs("Tense", s)), }, None => None, @@ -207,13 +208,13 @@ fn parse_vibhakti(f: &TokenFeatures) -> Result> { use Vibhakti::*; let val = match f.get("Case") { Some(s) => match s.as_str() { - "Nom" => Some(V1), - "Acc" => Some(V2), - "Ins" => Some(V3), - "Dat" => Some(V4), - "Abl" => Some(V5), - "Gen" => Some(V6), - "Loc" => Some(V7), + "Nom" => Some(Prathama), + "Acc" => Some(Dvitiya), + "Ins" => Some(Trtiya), + "Dat" => Some(Caturthi), + "Abl" => Some(Panchami), + "Gen" => Some(Sasthi), + "Loc" => Some(Saptami), "Voc" => Some(Sambodhana), "Cpd" => None, &_ => return Err(Error::parse_dcs("Case", s)), @@ -278,6 +279,7 @@ fn parse_lakara(f: &TokenFeatures) -> Result> { ("Aor", "Jus") => Lakara::LunNoAgama, ("Aor", "Prec") => Lakara::AshirLin, ("Fut", "Cond") => Lakara::Lrn, + ("Fut", "Pot") => Lakara::Lrn, ("Fut", "Ind") => Lakara::Lrt, ("Impf", "Ind") => Lakara::Lan, ("Perf", "Ind") => Lakara::Lit, diff --git a/vidyut-cheda/src/strict_mode.rs b/vidyut-cheda/src/strict_mode.rs index 2204970..4aca339 100644 --- a/vidyut-cheda/src/strict_mode.rs +++ b/vidyut-cheda/src/strict_mode.rs @@ -120,7 +120,7 @@ mod tests { }, linga: Some(Linga::Pum), vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::V7), + vibhakti: Some(Vibhakti::Saptami), is_purvapada: false, }); diff --git a/vidyut-kosha/Cargo.toml b/vidyut-kosha/Cargo.toml index 849c1a3..f29e2a6 100644 --- a/vidyut-kosha/Cargo.toml +++ b/vidyut-kosha/Cargo.toml @@ -19,6 +19,7 @@ fst = "0.4.7" modular-bitfield = "0.11.2" rustc-hash = "1.1.0" serde = { version = "1.0.152", optional = true, features = ["derive"] } +vidyut-prakriya = { path = "../vidyut-prakriya" } [dev-dependencies] bencher = "0.1.5" diff --git a/vidyut-kosha/src/kosha.rs 
b/vidyut-kosha/src/kosha.rs index 6174d86..1d7dc6f 100644 --- a/vidyut-kosha/src/kosha.rs +++ b/vidyut-kosha/src/kosha.rs @@ -3,6 +3,7 @@ //! //! Implementation //! -------------- +//! //! We implement our kosha as a finite state transducer using the `fst` crate. Finite state //! transducers are a generalization of tries in that they support both shared prefixes and shared //! suffixes. @@ -306,6 +307,7 @@ mod tests { use crate::morph::*; use fst::Streamer; use tempfile::tempdir; + use vidyut_prakriya::args as vp; type TestResult = Result<()>; @@ -322,7 +324,7 @@ mod tests { #[test] fn write_and_load() -> TestResult { let tin = Pada::Tinanta(Tinanta { - dhatu: Dhatu("gam".to_string()), + dhatu: Dhatu::mula("gam".to_string()), purusha: Purusha::Prathama, vacana: Vacana::Eka, lakara: Lakara::Lat, @@ -330,12 +332,12 @@ mod tests { }); let krdanta = Pada::Subanta(Subanta { pratipadika: Pratipadika::Krdanta { - dhatu: Dhatu("gam".to_string()), - pratyaya: KrtPratyaya::Shatr, + dhatu: Dhatu::mula("gam".to_string()), + krt: Krt::new(vp::BaseKrt::Satf), }, linga: Some(Linga::Pum), vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::V2), + vibhakti: Some(Vibhakti::Dvitiya), is_purvapada: false, }); let sup = Pada::Subanta(Subanta { @@ -345,7 +347,7 @@ mod tests { }, linga: Some(Linga::Pum), vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::V2), + vibhakti: Some(Vibhakti::Dvitiya), is_purvapada: false, }); diff --git a/vidyut-kosha/src/morph.rs b/vidyut-kosha/src/morph.rs index acac3ad..5163389 100644 --- a/vidyut-kosha/src/morph.rs +++ b/vidyut-kosha/src/morph.rs @@ -1,4 +1,4 @@ -//! Models the morphology of Sanskrit words, including their stems and endings. +//! Models the morphology of Sanskrit words, including their bases and endings. //! //! For details on how we represent morphological data, see the `Pada` enum and its comments. //! 
@@ -20,6 +20,7 @@ use modular_bitfield::prelude::*; use std::collections::HashMap; use std::fmt::{Display, Formatter, Result as FmtResult}; use std::str::FromStr; +use vidyut_prakriya::args as vp; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -74,10 +75,24 @@ macro_rules! enum_boilerplate { } } +macro_rules! from_vidyut_prakriya { + ($Enum:ident, [ $( $variant:ident ),* $(,)? ]) => { + impl From for $Enum { + fn from(val: vp::$Enum) -> Self { + match val { + $( + vp::$Enum::$variant => $Enum::$variant, + )* + } + } + } + } +} + /// Lemma for `None` semantics or any other case where the lemma is unknown. pub const NONE_LEMMA: &str = "[none]"; -/// Utility struct for reading complex serialized enums. +/// Utility struct for reading complex serialized data. struct FeatureMap(HashMap); impl FeatureMap { @@ -102,8 +117,8 @@ impl FeatureMap { } /// The *liṅga* (gender) of a *subanta*. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] #[bits = 2] pub enum Linga { /// The masculine gender. @@ -120,9 +135,11 @@ enum_boilerplate!(Linga, { Napumsaka => "n", }); +from_vidyut_prakriya!(Linga, [Pum, Stri, Napumsaka]); + /// The *vacana* (number) of a *subanta* or *tiṅanta*. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] #[bits = 2] pub enum Vacana { /// The singular. @@ -139,46 +156,53 @@ enum_boilerplate!(Vacana, { Bahu => "p", }); +from_vidyut_prakriya!(Vacana, [Eka, Dvi, Bahu]); + /// The *vibhakti* (case) of a *subanta*. /// /// The term *vibhakti* refers generally to any triad of inflectional endings for a *subanta* /// or *tiṅanta*. Here, `Vibhakti` refers specifically to the *subanta* tridas. 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[bits = 4] pub enum Vibhakti { /// The first *vibhakti* (nominative case). - V1, + Prathama, /// The second *vibhakti* (accusative case). - V2, + Dvitiya, /// The third *vibhakti* (instrumental case). - V3, + Trtiya, /// The fourth *vibhakti* (dative case). - V4, + Caturthi, /// The fifth *vibhakti* (ablative case). - V5, + Panchami, /// The sixth *vibhakti* (genitive case). - V6, + Sasthi, /// The seventh *vibhakti* (locative case). - V7, + Saptami, /// The first *vibhakti* in the condition of *sambodhana* (vocative case). Sambodhana, } enum_boilerplate!(Vibhakti, { - V1 => "1", - V2 => "2", - V3 => "3", - V4 => "4", - V5 => "5", - V6 => "6", - V7 => "7", + Prathama => "1", + Dvitiya => "2", + Trtiya => "3", + Caturthi => "4", + Panchami => "5", + Sasthi => "6", + Saptami => "7", Sambodhana => "8", }); +from_vidyut_prakriya!( + Vibhakti, + [Prathama, Dvitiya, Trtiya, Caturthi, Panchami, Sasthi, Saptami, Sambodhana] +); + /// The *puruṣa* (person) of a *tiṅanta*. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] #[bits = 2] pub enum Purusha { /// The first *puruṣa* (third person). @@ -195,12 +219,14 @@ enum_boilerplate!(Purusha, { Uttama => "1", }); +from_vidyut_prakriya!(Purusha, [Prathama, Madhyama, Uttama]); + /// The *lakāra* (tense/mood) of a *tiṅanta*. /// /// The *lakāras* are morphological categories, but each typically expresses a specific meaning. /// For example, *laṭ-lakāra* almost always expresses an action in the present tense. 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] #[bits = 4] pub enum Lakara { /// *laṭ-lakāra* (present indicative). @@ -244,6 +270,11 @@ enum_boilerplate!(Lakara, { Lrn => "lrn", }); +from_vidyut_prakriya!( + Lakara, + [Lat, Lit, Lut, Lrt, Let, Lot, Lan, VidhiLin, AshirLin, Lun, Lrn] +); + /// A *pratyaya* (suffix) that creates a new *dhātu* (verb root) #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -257,63 +288,9 @@ pub enum DhatuPratyaya { Yan, } -/// A *kṛt-pratyaya* (root or primary suffix). -/// -/// This list is not exhaustive. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum KrtPratyaya { - /// The *-tum* suffix (infinitive). - Tumun, - /// The *-tvā* suffix (unprefixed gerund). - Ktva, - /// The *-ya* suffix (prefixed gerund). - Lyap, - - /// The *-vas* suffix (perfect participle). - Kvasu, - /// The -*āna* suffix (perfect participle). - Kanac, - - /// The *-ta* suffix (past passive participle). - Kta, - /// The *-tavat* suffix (past active participle). - Ktavat, - - /// The *-at* suffix (present active participle). - Shatr, - /// The *-āna* suffix (present middle participle). - Shanac, - /// The *-ya vikaraṇa* followed by the *-āna* suffix (present passive participle). - YakShanac, - - /// The *-sya vikaraṇa* followed by the *-at* suffix (future active participle). - SyaShatr, - /// The *-sya vikaraṇa* followed by the *-āna* suffix (future middle participle). - SyaShanac, - /// The *-tavya*, *-anīya*, and *-ya* suffixes, etc. (future past participle, gerundive). 
- Krtya, -} - -enum_boilerplate!(KrtPratyaya, { - Tumun => "tumun", - Ktva => "ktvA", - Lyap => "lyap", - Kvasu => "kvasu", - Kanac => "kAnac", - Kta => "kta", - Ktavat => "ktavat", - Shatr => "Satf", - Shanac => "SAnac", - YakShanac => "yak-SAnac", - SyaShatr => "sya-Satf", - SyaShanac => "sya-SAnac", - Krtya => "kftya", -}); - /// The *pada* and *prayoga* of the *tiṅanta*. Roughly, these correspond respectively to the /// concepts of "voice" and "thematic relation." -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, BitfieldSpecifier)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, BitfieldSpecifier)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[bits = 2] pub enum PadaPrayoga { @@ -332,21 +309,108 @@ enum_boilerplate!(PadaPrayoga, { }); /// Models the semantics of a *dhātu* (verb root). -#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct Dhatu(pub String); +pub struct Dhatu { + /// The prefixes that this dhatu uses. + prefixes: Vec, + /// The sanAdi-pratyayas that this dhatu uses. + sanadi: Vec, + /// The base text of the dhatu. + text: String, +} impl Dhatu { + /// Creates a new `Dhatu`. + pub fn mula(text: String) -> Self { + Self { + prefixes: Vec::new(), + sanadi: Vec::new(), + text, + } + } + + /// Sets prefixes on the dhatu. + pub fn with_prefixes(mut self, prefixes: Vec) -> Self { + self.prefixes = prefixes; + self + } + + /// Sets sanAdi-pratyayas on the dhatu. + pub fn with_sanadi(mut self, sanadi: Vec) -> Self { + self.sanadi = sanadi; + self + } + + /// Returns the prefixes that this dhatu uses. + pub fn prefixes(&self) -> &[String] { + &self.prefixes + } + + /// Returns the sanAdi-pratyayas that this dhatu uses. + pub fn sanadi(&self) -> &[vp::Sanadi] { + &self.sanadi + } + /// The text of this dhatu. 
pub fn text(&self) -> &String { - &self.0 + &self.text + } + + /// Returns a string representation of this dhatu. + pub fn as_str(&self) -> String { + let prefixes = self.prefixes.join("-"); + let sanadi_strings: Vec<_> = self.sanadi.iter().map(|s| s.to_string()).collect(); + let text = self.text(); + let sanadi = sanadi_strings.join("-"); + format!("{prefixes},{text},{sanadi}") + } +} + +impl From for Dhatu { + fn from(vp: vp::Dhatu) -> Self { + Dhatu { + prefixes: vp.prefixes().clone(), + sanadi: vp.sanadi().clone(), + text: match vp.upadesha() { + Some(s) => s.to_string(), + None => String::new(), + }, + } + } +} + +impl FromStr for Dhatu { + type Err = Error; + + /// Parses the string representation of this dhatu. + fn from_str(text: &str) -> Result { + let fields: Vec<_> = text.split(',').collect(); + + let prefixes = fields.get(0).map_or(Vec::new(), |s| { + if s.is_empty() { + Vec::new() + } else { + s.split("-").map(|s| s.to_string()).collect() + } + }); + let text = fields.get(1).map_or(String::new(), |s| s.to_string()); + let sanadi: Vec = fields.get(2).map_or(Vec::new(), |s| { + s.split("-").flat_map(|s| vp::Sanadi::from_str(s)).collect() + }); + + Ok(Dhatu { + prefixes, + sanadi, + text, + }) } } /// Models the semantics of a *prātipadika*. /// /// An *prātipadika* is generally synonymous with a nominal base. -#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Pratipadika { /// A basic *prātipadika* that cannot be analyzed further. @@ -361,16 +425,34 @@ pub enum Pratipadika { /// The dhatu on which this krdanta is based. dhatu: Dhatu, /// The pratyaya that created this krdanta. - pratyaya: KrtPratyaya, + krt: Krt, }, } +/// A *kṛt-pratyaya* (root or primary suffix). +/// +/// This list is not exhaustive. 
+#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq, Ord, PartialOrd)] +pub struct Krt(vp::Krt); + +impl Krt { + /// Creates a new `Krt` pratyaya. + pub fn new(k: impl Into) -> Self { + Self(k.into()) + } + + /// Returns the underlying krt-pratyaya. + pub fn value(&self) -> vp::Krt { + self.0 + } +} + impl Pratipadika { /// Returns the lemma that the *prātipadika* is based on. pub fn lemma(&self) -> &str { match &self { Pratipadika::Basic { text, .. } => text, - Pratipadika::Krdanta { dhatu, .. } => &dhatu.0, + Pratipadika::Krdanta { dhatu, .. } => &dhatu.text(), } } @@ -385,8 +467,8 @@ impl Pratipadika { .join(","); format!("basic:text={text}|lingas={lingas}") } - Pratipadika::Krdanta { dhatu, pratyaya } => { - format!("krdanta:dhatu={}|pratyaya={}", dhatu.0, pratyaya.as_str()) + Pratipadika::Krdanta { dhatu, krt } => { + format!("krdanta:dhatu={}|krt={}", dhatu.as_str(), krt.0.as_str()) } } } @@ -413,12 +495,15 @@ impl FromStr for Pratipadika { } else if let Some(s) = s.strip_prefix("krdanta:") { let kv = FeatureMap::from_str(s); - let dhatu = kv.get("dhatu")?.clone(); - let pratyaya = (kv.get("pratyaya")?).parse()?; + let dhatu_str = kv.get("dhatu")?.clone(); + let krt = Krt(vp::BaseKrt::from_str(kv.get("krt")?) + // TODO: expect is dangerous here + .expect("ok") + .into()); Ok(Pratipadika::Krdanta { - dhatu: Dhatu(dhatu), - pratyaya, + dhatu: dhatu_str.parse()?, + krt, }) } else { Err(Error::ParseEnum("Pratipadika", s.to_string())) @@ -468,7 +553,7 @@ enum_boilerplate!(POSTag, { /// | ṅi । os । sup | /// /// For *avyaya*s (indeclinables), see `Avyaya`. -#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Subanta { /// The nominal's stem. @@ -501,7 +586,7 @@ pub struct Subanta { /// /// A *tiṅanta* expresses person, number, tense/mood, and voice in addition to whatever semantics /// are conveyed by the *dhātu* and its prefixes. 
-#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Tinanta { /// The verb's root. @@ -521,8 +606,8 @@ pub struct Tinanta { /// An *avyaya*s (indeclinable) is traditionally modeled as a subtype of the *subanta* that has had /// its *sup* suffix elided. But we model the *avyaya* separately because we felt that doing so /// would be easier to reason about in downstream code. -#[derive(Clone, Debug, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] pub struct Avyaya { /// The indeclinable's stem. pub pratipadika: Pratipadika, @@ -531,8 +616,8 @@ pub struct Avyaya { /// Models the semantics of a Sanskrit *pada* (word). /// /// This enum can be packed into an unsigned integer via the `packing` module. -#[derive(Clone, Debug, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] pub enum Pada { /// Unknown or missing semantics. Unknown, @@ -556,7 +641,7 @@ impl Pada { /// In Sanskrit, a lemma is either a *dhātu* or a *prātipadika*. 
pub fn lemma(&self) -> &str { match &self { - Pada::Tinanta(t) => &t.dhatu.0, + Pada::Tinanta(t) => &t.dhatu.text(), Pada::Subanta(s) => s.pratipadika.lemma(), Pada::Avyaya(a) => a.pratipadika.lemma(), Pada::Unknown => NONE_LEMMA, @@ -600,7 +685,9 @@ mod tests { #[test] fn test_vibhakti_serde() -> TestResult { use Vibhakti::*; - for val in [V1, V2, V3, V4, V5, V6, V7, Sambodhana] { + for val in [ + Prathama, Dvitiya, Trtiya, Caturthi, Panchami, Sasthi, Saptami, Sambodhana, + ] { assert_eq!(val, val.as_str().parse()?); } Ok(()) @@ -626,18 +713,6 @@ mod tests { Ok(()) } - #[test] - fn test_krt_pratyaya_serde() -> TestResult { - use KrtPratyaya::*; - for val in [ - Tumun, Ktva, Lyap, Kvasu, Kanac, Kta, Ktavat, Shatr, Shanac, YakShanac, SyaShatr, - SyaShanac, Krtya, - ] { - assert_eq!(val, val.as_str().parse()?); - } - Ok(()) - } - #[test] fn test_pada_prayoga() -> TestResult { use PadaPrayoga::*; @@ -648,9 +723,24 @@ mod tests { } #[test] - fn test_dhatu() { - let d = Dhatu("BU".to_string()); - assert_eq!(d.text(), "BU"); + fn test_dhatu() -> TestResult { + let bhu = Dhatu::mula("BU".to_string()); + assert_eq!(bhu, bhu.as_str().parse()?); + + let abhibhu = Dhatu::mula("BU".to_string()).with_prefixes(vec!["aBi".to_string()]); + assert_eq!(abhibhu, abhibhu.as_str().parse()?); + + let abhibobhuya = Dhatu::mula("BU".to_string()) + .with_prefixes(vec!["aBi".to_string()]) + .with_sanadi(vec![vp::Sanadi::yaN]); + assert_eq!(abhibobhuya, abhibobhuya.as_str().parse()?); + + let pratyabhibubhushaya = Dhatu::mula("BU".to_string()) + .with_prefixes(vec!["prati".to_string(), "aBi".to_string()]) + .with_sanadi(vec![vp::Sanadi::san, vp::Sanadi::Ric]); + assert_eq!(pratyabhibubhushaya, pratyabhibubhushaya.as_str().parse()?); + + Ok(()) } #[test] @@ -666,8 +756,8 @@ mod tests { #[test] fn test_pratipadika_serde_with_krdanta() -> TestResult { let p = Pratipadika::Krdanta { - dhatu: Dhatu("gam".to_string()), - pratyaya: KrtPratyaya::Shatr, + dhatu: Dhatu::mula("gam".to_string()), + krt: 
Krt(vp::BaseKrt::Satf.into()), }; assert_eq!(p, p.as_str().parse()?); Ok(()) @@ -682,7 +772,7 @@ mod tests { }, linga: Some(Linga::Pum), vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::V2), + vibhakti: Some(Vibhakti::Dvitiya), is_purvapada: false, }); assert_eq!(p.lemma(), "agni"); @@ -692,12 +782,12 @@ mod tests { fn test_subanta_lemma_with_krdanta_stem() { let p = Pada::Subanta(Subanta { pratipadika: Pratipadika::Krdanta { - dhatu: Dhatu("gam".to_string()), - pratyaya: KrtPratyaya::Shatr, + dhatu: Dhatu::mula("gam".to_string()), + krt: Krt(vp::BaseKrt::Satf.into()), }, linga: Some(Linga::Pum), vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::V2), + vibhakti: Some(Vibhakti::Dvitiya), is_purvapada: false, }); assert_eq!(p.lemma(), "gam"); @@ -706,7 +796,7 @@ mod tests { #[test] fn test_tinanta_lemma() { let p = Pada::Tinanta(Tinanta { - dhatu: Dhatu("gam".to_string()), + dhatu: Dhatu::mula("gam".to_string()), purusha: Purusha::Prathama, vacana: Vacana::Eka, lakara: Lakara::Lat, @@ -730,8 +820,8 @@ mod tests { fn test_avyaya_lemma_with_krdanta_stem() { let p = Pada::Avyaya(Avyaya { pratipadika: Pratipadika::Krdanta { - dhatu: Dhatu("gam".to_string()), - pratyaya: KrtPratyaya::Tumun, + dhatu: Dhatu::mula("gam".to_string()), + krt: Krt(vp::BaseKrt::tumun.into()), }, }); assert_eq!(p.lemma(), "gam"); diff --git a/vidyut-kosha/src/packing.rs b/vidyut-kosha/src/packing.rs index a287028..a30d820 100644 --- a/vidyut-kosha/src/packing.rs +++ b/vidyut-kosha/src/packing.rs @@ -121,7 +121,10 @@ impl DhatuTable { let mut ret = Vec::new(); for line in reader.lines() { - ret.push(Dhatu(line?.to_string())); + match line?.parse() { + Ok(s) => ret.push(s), + _ => {} + } } Ok(Self(ret)) } @@ -131,8 +134,8 @@ impl DhatuTable { let data: String = self .0 .iter() - .map(|d| &d.0) - .fold(String::new(), |x, y| x + y + "\n"); + .map(|d| d.as_str()) + .fold(String::new(), |x, y| x + &y + "\n"); std::fs::write(path, data)?; Ok(()) @@ -243,19 +246,19 @@ pub enum PackedVibhakti { 
/// Unknown or missing vibhakti. None, /// The first *vibhakti* (nominative case). - V1, + Prathama, /// The second *vibhakti* (accusative case). - V2, + Dvitiya, /// The third *vibhakti* (instrumental case). - V3, + Trtiya, /// The fourth *vibhakti* (dative case). - V4, + Caturthi, /// The fifth *vibhakti* (ablative case). - V5, + Panchami, /// The sixth *vibhakti* (genitive case). - V6, + Sasthi, /// The seventh *vibhakti* (locative case). - V7, + Saptami, /// The first *vibhakti* in the condition of *sambodhana* (vocative case). Sambodhana, } @@ -263,7 +266,7 @@ pub enum PackedVibhakti { boilerplate!( PackedVibhakti, Vibhakti, - [V1, V2, V3, V4, V5, V6, V7, Sambodhana] + [Prathama, Dvitiya, Trtiya, Caturthi, Panchami, Sasthi, Saptami, Sambodhana] ); /// Semantics for a *subanta*. @@ -551,7 +554,7 @@ mod tests { }, linga: Some(Linga::Pum), vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::V6), + vibhakti: Some(Vibhakti::Sasthi), is_purvapada: false, }); let narasya = Pada::Subanta(Subanta { @@ -561,7 +564,7 @@ mod tests { }, linga: Some(Linga::Pum), vacana: Some(Vacana::Eka), - vibhakti: Some(Vibhakti::V6), + vibhakti: Some(Vibhakti::Sasthi), is_purvapada: false, }); @@ -578,7 +581,7 @@ mod tests { #[test] fn test_tinanta_packing() -> TestResult { let gacchati = Pada::Tinanta(Tinanta { - dhatu: Dhatu("gam".to_string()), + dhatu: Dhatu::mula("gam".to_string()), purusha: Purusha::Prathama, vacana: Vacana::Eka, lakara: Lakara::Lat, @@ -586,7 +589,7 @@ mod tests { }); let carati = Pada::Tinanta(Tinanta { - dhatu: Dhatu("car".to_string()), + dhatu: Dhatu::mula("car".to_string()), purusha: Purusha::Prathama, vacana: Vacana::Eka, lakara: Lakara::Lat, diff --git a/vidyut-lipi/scripts/create_schemes.py b/vidyut-lipi/scripts/create_schemes.py index 3430d10..8d54232 100755 --- a/vidyut-lipi/scripts/create_schemes.py +++ b/vidyut-lipi/scripts/create_schemes.py @@ -300,8 +300,7 @@ def __init__(self, d): "ऴ": None, C.DANDA: ".", C.DOUBLE_DANDA: "..", - # candrabindu - 
"\u0901": "m̐", + C.CANDRABINDU: "m̐", }, "JAVANESE": { C.DANDA: "\ua9c8", @@ -529,6 +528,9 @@ def __init__(self, d): "GURMUKHI": [ (C.ABBREVIATION_SIGN, "\u0a76"), ], + "IAST": [ + (C.CANDRABINDU, "\u0303"), + ], "ITRANS": [ # Vedic anusvara (just render as candrabindu) ("\u0901", "{\\m+}"), diff --git a/vidyut-lipi/src/autogen_schemes.rs b/vidyut-lipi/src/autogen_schemes.rs index 1e218d1..89fc60e 100644 --- a/vidyut-lipi/src/autogen_schemes.rs +++ b/vidyut-lipi/src/autogen_schemes.rs @@ -4951,6 +4951,7 @@ pub const IAST: &[(&str, &str)] = &[ (RR, "r̥̄"), (SIGN_RR, "r̥̄"), (COMBINING_DIGIT_1, "¹"), + (CANDRABINDU, "̃"), (E, "è"), (O, "ò"), (SIGN_E, "è"), diff --git a/vidyut-lipi/src/mapping.rs b/vidyut-lipi/src/mapping.rs index 0662df0..bcdd9d4 100644 --- a/vidyut-lipi/src/mapping.rs +++ b/vidyut-lipi/src/mapping.rs @@ -7,7 +7,7 @@ use rustc_hash::{FxHashMap, FxHashSet}; #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub(crate) struct Token { /// The text of this token. - pub text: String, + text: String, /// The token type. `kind` controls how this token combines with neighboring tokens. pub kind: TokenKind, } @@ -18,6 +18,11 @@ impl Token { Self { text, kind } } + /// Returns the string value of this token. + pub fn text(&self) -> &str { + &self.text + } + /// Returns whether this token represents a consonant. pub fn is_consonant(&self) -> bool { self.kind == TokenKind::Consonant @@ -27,7 +32,7 @@ impl Token { /// Models how a token behaves in relation to other tokens. #[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] pub(crate) enum TokenKind { - /// A consonant. A following vowel generally a vowel mark. + /// A consonant. A following vowel is generally a vowel mark. Consonant, /// A vowel mark, which generally must follow a consonant. 
VowelMark, @@ -410,8 +415,8 @@ impl Mapping { items.sort_by(|x, y| x.0.cmp(y.0)); for (k, v) in items { let k_codes: Vec<_> = k.chars().map(|c| c as u32).collect(); - let v_codes: Vec<_> = v.text.chars().map(|c| c as u32).collect(); - println!("{k} ({k_codes:x?}) --> {} ({v_codes:x?})", v.text); + let v_codes: Vec<_> = v.text().chars().map(|c| c as u32).collect(); + println!("{k} ({k_codes:x?}) --> {} ({v_codes:x?})", v.text()); } } } diff --git a/vidyut-lipi/src/numerals.rs b/vidyut-lipi/src/numerals.rs index 9a7a6c4..e4e883b 100644 --- a/vidyut-lipi/src/numerals.rs +++ b/vidyut-lipi/src/numerals.rs @@ -224,7 +224,7 @@ pub fn transliterate_numeral(buffer: &mut String, numeral: &str, mapping: &Mappi let glyph_str = c.encode_utf8(&mut temp); mapping.all.get(glyph_str) }) { - buffer.push_str(&glyph.text); + buffer.push_str(glyph.text()); } } } diff --git a/vidyut-lipi/src/scheme.rs b/vidyut-lipi/src/scheme.rs index cf94211..5b2df6a 100644 --- a/vidyut-lipi/src/scheme.rs +++ b/vidyut-lipi/src/scheme.rs @@ -335,7 +335,7 @@ impl Scheme { /// ### Usage /// /// ```rust,ignore - /// from vidyut_lipi import Scheme; + /// use vidyut_lipi::Scheme; /// /// for scheme in Scheme::iter() { /// println!("- {scheme}"); @@ -403,7 +403,7 @@ impl Scheme { /// ### Usage /// /// ``` - /// from vidyut_lipi import Scheme; + /// use vidyut_lipi::Scheme; /// /// assert_eq!(Scheme::Devanagari.iso_15924_code(), "Deva"); /// ``` @@ -468,7 +468,7 @@ impl Scheme { /// ### Usage /// /// ``` - /// from vidyut_lipi import Scheme; + /// use vidyut_lipi::Scheme; /// /// assert_eq!(Scheme::Devanagari.iso_15924_numeric_code(), 315); /// ``` @@ -537,9 +537,9 @@ impl Scheme { /// ### Usage /// /// ``` - /// from vidyut_lipi import Scheme; + /// use vidyut_lipi::Scheme; /// - /// assert_eq!(Scheme::Devanagari.icu_code(), 10); + /// assert_eq!(Scheme::Devanagari.icu_numeric_code(), 10); /// ``` pub fn icu_numeric_code(&self) -> u16 { use Scheme::*; diff --git a/vidyut-lipi/src/transliterate.rs 
b/vidyut-lipi/src/transliterate.rs index 0768fce..23ab6b8 100644 --- a/vidyut-lipi/src/transliterate.rs +++ b/vidyut-lipi/src/transliterate.rs @@ -106,7 +106,7 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { output.pop(); } - output += &token.text; + output += &token.text(); if is_to_alphabet && token.is_consonant() { // Add an implicit "a" vowel. @@ -122,7 +122,7 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { output.pop(); had_virama = false; } else { - let mut text = &token.text; + let mut text = token.text(); if had_virama { if let Some(mark) = mapping.marks.get(key) { output.pop(); diff --git a/vidyut-lipi/src/unicode_norm.rs b/vidyut-lipi/src/unicode_norm.rs index cfcfe16..ff0fcb7 100644 --- a/vidyut-lipi/src/unicode_norm.rs +++ b/vidyut-lipi/src/unicode_norm.rs @@ -99,7 +99,7 @@ pub const LATIN_NFD: Table = &[ ]; /// NFD/NFC mapping for Devanagari. -/// Spec: https://unicode.org/charts/PDF/U0900.pdf +/// Spec: <https://unicode.org/charts/PDF/U0900.pdf> /// /// (other Devanagari NFC/NFD combinations are usually exempt.) pub const DEVANAGARI_NFD: Table = &[ @@ -129,7 +129,7 @@ pub const DEVANAGARI_COMPOSITION_EXCLUSIONS: &[&str] = &[ ]; /// NFD/NFC mapping for Bengali. -/// Spec: https://unicode.org/charts/PDF/U0980.pdf +/// Spec: <https://unicode.org/charts/PDF/U0980.pdf> pub const BENGALI_NFD: Table = &[ ("\u{09cb}", "\u{09c7}\u{09be}"), // vowel sign o ("\u{09cc}", "\u{09c7}\u{09d7}"), // vowel sign au @@ -141,13 +141,13 @@ pub const BENGALI_NFD: Table = &[ /// Characters that should not be created during NFD --> NFC. pub const BENGALI_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{09dc}", "\u{09dd}", "\u{09df}"]; -/// Spec: https://unicode.org/charts/PDF/U1000.pdf +/// Spec: <https://unicode.org/charts/PDF/U1000.pdf> pub const MYANMAR_NFD: Table = &[ ("\u{1026}", "\u{1025}\u{102e}"), // uu ]; /// NFD/NFC mapping for Balinese. 
-/// Spec: https://unicode.org/charts/PDF/U1B00.pdf +/// Spec: <https://unicode.org/charts/PDF/U1B00.pdf> pub const BALINESE_NFD: Table = &[ ("\u{1b06}", "\u{1b05}\u{1b35}"), ("\u{1b08}", "\u{1b07}\u{1b35}"), @@ -161,13 +161,13 @@ pub const BALINESE_NFD: Table = &[ ("\u{1b41}", "\u{1b3f}\u{1b35}"), ]; -/// Spec: http://www.unicode.org/charts/PDF/U11300.pdf +/// Spec: <http://www.unicode.org/charts/PDF/U11300.pdf> pub const GRANTHA_NFD: Table = &[ ("\u{1134b}", "\u{11347}\u{1133e}"), // vowel sign oo ("\u{1134c}", "\u{11347}\u{11357}"), // vowel sign au ]; -/// Spec: https://unicode.org/charts/PDF/U0A00.pdf +/// Spec: <https://unicode.org/charts/PDF/U0A00.pdf> pub const GURMUKHI_NFD: Table = &[ ("\u{0a33}", "\u{0a32}\u{0a3c}"), // letter lla ("\u{0a36}", "\u{0a38}\u{0a3c}"), // letter sha @@ -177,12 +177,12 @@ pub const GURMUKHI_NFD: Table = &[ ("\u{0a5e}", "\u{0a2b}\u{0a3c}"), // letter fa ]; -/// Spec: https://unicode.org/charts/PDF/U0A00.pdf +/// Spec: <https://unicode.org/charts/PDF/U0A00.pdf> pub const GURMUKHI_COMPOSITION_EXCLUSIONS: &[&str] = &[ "\u{0a33}", "\u{0a36}", "\u{0a59}", "\u{0a5a}", "\u{0a5b}", "\u{0a5e}", ]; -/// Spec: https://unicode.org/charts/PDF/U0C80.pdf +/// Spec: <https://unicode.org/charts/PDF/U0C80.pdf> pub const KANNADA_NFD: Table = &[ ("\u{0cc0}", "\u{0cbf}\u{0cd5}"), // vowel sign ii ("\u{0cc7}", "\u{0cc6}\u{0cd5}"), // vowel sign ee @@ -191,14 +191,14 @@ pub const KANNADA_NFD: Table = &[ ("\u{0ccb}", "\u{0cc6}\u{cc2}\u{0cd5}"), // vowel sign oo ]; -/// Spec: https://unicode.org/charts/PDF/U0D00.pdf +/// Spec: <https://unicode.org/charts/PDF/U0D00.pdf> pub const MALAYALAM_NFD: Table = &[ ("\u{0d4a}", "\u{0d46}\u{0d3e}"), // vowel sign o ("\u{0d4b}", "\u{0d47}\u{0d3e}"), // vowel sign oo ("\u{0d4c}", "\u{0d46}\u{0d57}"), // vowel sign au ]; -/// Spec: https://unicode.org/charts/PDF/U0B00.pdf +/// Spec: <https://unicode.org/charts/PDF/U0B00.pdf> pub const ORIYA_NFD: Table = &[ ("\u{0b48}", "\u{0b47}\u{0b56}"), // vowel sign ai ("\u{0b4b}", "\u{0b47}\u{0b3e}"), // vowel sign o @@ -209,13 +209,13 @@ pub const ORIYA_NFD: Table = &[ pub const ORIYA_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{0b5c}", "\u{0b5d}"]; -/// Spec: https://unicode.org/charts/PDF/U11580.pdf +/// Spec: <https://unicode.org/charts/PDF/U11580.pdf> pub const SIDDHAM_NFD: Table = &[ ("\u{115ba}", "\u{115b8}\u{115af}"), 
// vowel sign o ("\u{115bb}", "\u{115b9}\u{115af}"), // vowel sign au ]; -/// Spec: https://unicode.org/charts/PDF/U0D80.pdf +/// Spec: <https://unicode.org/charts/PDF/U0D80.pdf> pub const SINHALA_NFD: Table = &[ ("\u{0dda}", "\u{0dd9}\u{0dca}"), // vowel sign ee ("\u{0ddc}", "\u{0dd9}\u{0dcf}"), // vowel sign o @@ -223,7 +223,7 @@ pub const SINHALA_NFD: Table = &[ ("\u{0dde}", "\u{0dd9}\u{0ddf}"), // vowel sign au ]; -/// Spec: https://unicode.org/charts/PDF/U0B80.pdf +/// Spec: <https://unicode.org/charts/PDF/U0B80.pdf> pub const TAMIL_NFD: Table = &[ ("\u{0b94}", "\u{0b92}\u{0bd7}"), // letter au ("\u{0bca}", "\u{0bc6}\u{0bbe}"), // vowel sign o @@ -231,7 +231,7 @@ pub const TAMIL_NFD: Table = &[ ("\u{0bcc}", "\u{0bc6}\u{0bd7}"), // vowel sign au ]; -/// Spec: https://unicode.org/charts/PDF/U0C00.pdf +/// Spec: <https://unicode.org/charts/PDF/U0C00.pdf> pub const TELUGU_NFD: Table = &[ ("\u{0c48}", "\u{0c46}\u{0c56}"), // vowel sign ai ]; @@ -242,7 +242,7 @@ pub const KAITHI_NFD: Table = &[ ("\u{110ab}", "\u{110a5}\u{110ba}"), // Letter va ]; -/// Spec: https://www.unicode.org/charts/PDF/U11480.pdf +/// Spec: <https://www.unicode.org/charts/PDF/U11480.pdf> pub const TIRHUTA_NFD: Table = &[ ("\u{114bb}", "\u{114b9}\u{114ba}"), // vowel sign ai ("\u{114bc}", "\u{114b9}\u{114b0}"), // vowel sign o diff --git a/vidyut-lipi/tests/basic.rs b/vidyut-lipi/tests/basic.rs index 3384460..070621b 100644 --- a/vidyut-lipi/tests/basic.rs +++ b/vidyut-lipi/tests/basic.rs @@ -453,6 +453,24 @@ fn sanskrit_consonants_non_vedic() { ); } +// Test only Latin schemes, since most Brahmic schemes will just use a candrabindu here. +#[test] +fn sanskrit_nasal_semivowels() { + // Example from https://list.indology.info/pipermail/indology/2023-October/058252.html + let deva_text = "त्रील्ँलोकान्"; + + assert_two_way_pairwise(&[ + (Devanagari, deva_text), + (Iast, "trīlm̐lokān"), + (Iso15919, "trīlm̐lōkān"), + (Slp1, "trIl~lokAn"), + ]); + + // Alternate for IAST. + // TODO: or should this be preferred? 
+ assert_transliterate("trīl̃lokān", Iast, Devanagari, deva_text); +} + #[test] fn sanskrit_symbols() { assert_two_way_pairwise(&[ diff --git a/vidyut-prakriya/README.md b/vidyut-prakriya/README.md index 6b959aa..0f84b33 100644 --- a/vidyut-prakriya/README.md +++ b/vidyut-prakriya/README.md @@ -59,7 +59,13 @@ moderate support for *samāsa*s and weak support for accent rules. Usage ----- -To generate all basic tinantas in kartari prayoga, run: +`vidyut-prakriya` supports two modes of use: + +### Command-line use + +The first way to use `vidyut-prakriya` is as a command-line tool for generating +Sanskrit words. For example, you can generate all basic *tiṅanta*s in *kartari +prayoga* by using the following command: ```shell $ make create_tinantas > output.csv @@ -70,7 +76,13 @@ first compile `vidyut-prakriya`. After this initial compilation step, however, subsequent runs will be much faster, and `make create_tinantas` will likely compile and complete within a few seconds. -To generate prakriyas programmatically, you can use the starter code below: +You can find other example commands by exploring the `Makefile` and in +particular the various invocations in `create_test_files`. + +### Programmatic use + +The second way to use `vidyut-prakriya` is programmatically. For example, we +can generate simple verbs like so: ```rust use vidyut_prakriya::Vyakarana; @@ -102,7 +114,7 @@ for p in prakriyas { } ``` -Output of the code above: +Given the code above, the output is as follows: ```text Bavati @@ -129,24 +141,27 @@ Bavati --------------------------- ``` -The left column shows a simple string label for each rule that was applied -during the derivation, and you can find details about what values these labels -can take in the comments on the `Rule` type. We suggest using `ashtadhyayi.com` -to learn more about these rules. +Here, the left column shows a simple string label for each rule that was +applied during the derivation. 
You can find details about what values these +labels can take in the comments on the `Rule` type. To learn more about these +rules, we suggest using [ashtadhyayi.com](https://ashtadhyayi.com). The right column shows the in-progress prakriya. We use an output convention that is common on other Ashtadhyayi websites. The encoding format for this -text is SLP1, which is the encoding format we use throughout the crate. - -[sv]: https://github.com/drdhaval2785/SanskritVerb +text is SLP1, which is the encoding format we use by default in all Vidyut +crates. -For more details, see the following methods on the `Vyakarana` struct: +For specific API details, see the following methods on the `Vyakarana` struct: - `derive_tinantas` (for verbs) - `derive_subantas` (for nominals) - `derive_krdantas` (for verbal suffixes) - `derive_taddhitantas` (for nominal suffixes) +Our test suite also contains numerous examples of invoking various parts of +`vidyut-prakriya`. We also have simpler examples available in the `examples` +directory. + Contributing ------------ @@ -209,11 +224,11 @@ copied value. Then, run `make test_all` again and confirm that all tests pass. Data ---- -This crate includes a Dhatupatha sourced from [ashtadhyayi.com][a-com], -and the author of ashtadhyayi.com has graciously agreed to share this file with -us under an MIT license. +This crate includes a *Dhātupāṭha* sourced from [ashtadhyayi.com][a-com], +and the author of [ashtadhyayi.com][a-com] has graciously agreed to share this +file with us under an MIT license. -For details on the lineage of this Dhatupatha, see our separate [data +For details on the lineage of this *Dhātupāṭha*, see our separate [data README][data-readme]. 
[a-com]: https://ashtadhyayi.com diff --git a/vidyut-prakriya/src/angasya/guna_vrddhi.rs b/vidyut-prakriya/src/angasya/guna_vrddhi.rs index 4c2188f..d5d86e1 100644 --- a/vidyut-prakriya/src/angasya/guna_vrddhi.rs +++ b/vidyut-prakriya/src/angasya/guna_vrddhi.rs @@ -104,10 +104,8 @@ impl<'a> GunaVrddhiPrakriya<'a> { self.block("1.1.5"); } else if anga.has_u_in(&["dIDIN", "vevIN"]) || anga.is_it_agama() { self.block("1.1.6"); - } else { - if !n.has_tag(T::Pratyaya) { - self.done = true; - } + } else if !n.has_tag(T::Pratyaya) { + self.done = true; } } diff --git a/vidyut-prakriya/src/angasya/subanta.rs b/vidyut-prakriya/src/angasya/subanta.rs index 728aed6..ccda5a7 100644 --- a/vidyut-prakriya/src/angasya/subanta.rs +++ b/vidyut-prakriya/src/angasya/subanta.rs @@ -576,7 +576,7 @@ fn try_anga_adesha_before_vibhakti_changes(p: &mut Prakriya, i_anga: usize) -> O let anga = p.get(i_anga)?; if sau - && (anga.text.contains("d") || anga.text.contains("t")) + && (anga.text.contains('d') || anga.text.contains('t')) && (!anga.has_antya('d') && !anga.has_antya('t')) { p.run_at("7.2.106", i_anga, |t| { diff --git a/vidyut-prakriya/src/args.rs b/vidyut-prakriya/src/args.rs index a27cd50..6eda27c 100644 --- a/vidyut-prakriya/src/args.rs +++ b/vidyut-prakriya/src/args.rs @@ -1,10 +1,10 @@ /*! Common arguments for the crate's main functions. -Before we begin a prakriya, we must declare certain morphological information up-front, such as our -desired purusha and vacana, the dhatu we wish to use, and so on. To better document the API and to -help users avoid configuration mistakes, we model this information through the enums and structs in -this module. +Before we begin a *prakriyā*, we must declare certain morphological information up-front, such as +our desired *puruṣa* and *vacana*, the *dhātu* we wish to use, and so on. To better document the +API and to help users avoid configuration mistakes, we model this information through the enums and +structs in this module. 
For extra flexibility, all of the pratyaya enums here provides `as_str` and `from_str` methods. For details on which strings are valid arguments in `from_str`, please read the source code directly. diff --git a/vidyut-prakriya/src/args/dhatu.rs b/vidyut-prakriya/src/args/dhatu.rs index bef2747..5c77f39 100644 --- a/vidyut-prakriya/src/args/dhatu.rs +++ b/vidyut-prakriya/src/args/dhatu.rs @@ -105,7 +105,7 @@ enum_boilerplate!(Antargana, { /// /// For details on what these pratyayas mean and what kinds of words they produce, see the comments /// below. -#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] #[allow(non_camel_case_types)] #[wasm_bindgen] pub enum Sanadi { diff --git a/vidyut-prakriya/src/args/krt.rs b/vidyut-prakriya/src/args/krt.rs index 7af6fa2..efa3c8e 100644 --- a/vidyut-prakriya/src/args/krt.rs +++ b/vidyut-prakriya/src/args/krt.rs @@ -12,7 +12,7 @@ use wasm_bindgen::prelude::wasm_bindgen; /// case explicitly here so that we can name pratyayas more concisely with SLP1. Doing so helps us /// distinguish between pratyayas like `naN` and `nan`. #[allow(dead_code, non_camel_case_types)] -#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq, Ord, PartialOrd)] #[wasm_bindgen] pub enum BaseKrt { /// -a @@ -327,7 +327,7 @@ enum_boilerplate!(BaseKrt, { }); /// Models a krt-pratyaya. -#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq, Ord, PartialOrd)] pub enum Krt { /// An ordinary krt-pratyaya as declared in the Ashtadhyayi. Base(BaseKrt), @@ -347,31 +347,12 @@ impl From for Krt { } } -/// Models the meaning of a krt-pratyaya. -/// -/// krts are often available only in specific senses. A given krt might be allowed in one sense -/// but blocked in another. To model and test this behavior, we use the enum below. 
-#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)] -pub enum KrtArtha { - /// Agent. (3.4.67) - Karta, - /// Existence. (3.3.18) - Bhava, - /// Having a habit, nature, or skill. - TacchilaTaddharmaTatsadhukara, - /// Designation. (3.3.118) - Samjna, - /// Solidity. (3.3.77) - Murti, - /// Location. (3.3.78) - Desha, -} - impl Krt { - /// Returns whether the krt suffix is an ArdhadhAtuka suffix. + /// Returns whether the krt suffix is an *ārdhadhātuka* suffix. /// - /// We must track this explicitly so that we can "look ahead" and potentially add -Aya or other - /// pratyayas for certain dhAtus. For details, see the implementation of rules 3.1.28 - 3.1.31. + /// We must track this explicitly so that we can "look ahead" and potentially add `-Aya` or + /// other *pratyaya*s for certain *dhātu*s. For details, see the implementation of rules 3.1.28 + /// - 3.1.31. pub fn is_ardhadhatuka(&self) -> bool { use BaseKrt::*; match self { @@ -392,10 +373,30 @@ impl Krt { } } -/// The information required to derive a krdanta in the grammar. +/// Models the meaning of a krt-pratyaya. +/// +/// krts are often available only in specific senses. A given krt might be allowed in one sense +/// but blocked in another. To model and test this behavior, we use the enum below. +#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)] +pub enum KrtArtha { + /// Agent. (3.4.67) + Karta, + /// Existence. (3.3.18) + Bhava, + /// Having a habit, nature, or skill. + TacchilaTaddharmaTatsadhukara, + /// Designation. (3.3.118) + Samjna, + /// Solidity. (3.3.77) + Murti, + /// Location. (3.3.78) + Desha, +} + +/// The information required to derive a krdanta. #[derive(Clone, Debug, Hash, Eq, PartialEq)] pub struct Krdanta { - /// The dhatu to add the krt-pratyaya to. + /// The dhatu to which we will add our krt-pratyaya. dhatu: Dhatu, /// The krt-pratyaya to use. krt: Krt, @@ -404,7 +405,7 @@ pub struct Krdanta { /// Whether this krdanta must replace a specific `Lakara`. 
If unset, default to `Lat` if /// necessary. lakara: Option, - /// Whether this krdanta is allowed only with a specific upapada. + /// Whether this krdanta is allowed only with a specific *upapada*. upapada: Option, /// Whether the derived krdanta must have exactly the specified value. require: Option, @@ -435,7 +436,7 @@ impl Krdanta { &self.dhatu } - /// The krt pratyaya to use in the derivation. + /// The krt-pratyaya to use in the derivation. pub fn krt(&self) -> Krt { self.krt } diff --git a/vidyut-prakriya/src/args/pada.rs b/vidyut-prakriya/src/args/pada.rs index 750c52e..8499cce 100644 --- a/vidyut-prakriya/src/args/pada.rs +++ b/vidyut-prakriya/src/args/pada.rs @@ -1,6 +1,6 @@ use crate::args::{Subanta, Tinanta}; -/// Models a Sanskrit pada. +/// The information required to derive a word. #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub enum Pada { /// A nominal word or an indeclinable. diff --git a/vidyut-prakriya/src/args/pratipadika.rs b/vidyut-prakriya/src/args/pratipadika.rs index 77c3b83..6f38c6d 100644 --- a/vidyut-prakriya/src/args/pratipadika.rs +++ b/vidyut-prakriya/src/args/pratipadika.rs @@ -15,13 +15,13 @@ pub enum Pratipadika { Basic(BasicPratipadika), /// A krdanta. Krdanta(Box), - /// A taddhitanta. + /// A *taddhitānta*. Taddhitanta(Box), - /// A samasa. + /// A *samāsa*. Samasa(Box), } -/// Models a basic pratipadika. +/// Models a basic *prātipadika* that is not created with any other *pratyaya*s. #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct BasicPratipadika { pub(crate) text: String, diff --git a/vidyut-prakriya/src/args/samasa.rs b/vidyut-prakriya/src/args/samasa.rs index 0605249..dd69c4d 100644 --- a/vidyut-prakriya/src/args/samasa.rs +++ b/vidyut-prakriya/src/args/samasa.rs @@ -22,7 +22,7 @@ pub enum SamasaType { SamaharaDvandva, } -/// The information required to derive a samasa in the grammar. +/// The information required to derive a *samāsa*. 
#[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct Samasa { /// The items to combine in the samasa. @@ -61,7 +61,7 @@ impl Samasa { } } -/// Convenience struct for building a `SamasaARgs` struct. +/// Convenience struct for building a `SamasaArgs` struct. #[derive(Clone, Debug, Default, Eq, Hash, PartialEq)] pub struct SamasaBuilder { padas: Vec, diff --git a/vidyut-prakriya/src/args/sup.rs b/vidyut-prakriya/src/args/sup.rs index eda876c..20f955a 100644 --- a/vidyut-prakriya/src/args/sup.rs +++ b/vidyut-prakriya/src/args/sup.rs @@ -5,7 +5,7 @@ use crate::core::Tag; use crate::enum_boilerplate; use wasm_bindgen::prelude::wasm_bindgen; -/// The gender of some subanta. +/// The gender of some *subanta*. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[wasm_bindgen] pub enum Linga { @@ -33,7 +33,13 @@ impl Linga { } } -/// The case ending of some subanta. +/// The case ending of some *subanta*. +/// +/// A *vibhakti* is a set of 3 endings that share all of the same properties except for their +/// number (singular, dual, plural). While *tiṅanta*s also have *vibhakti*s, in practice the term +/// *vibhakti* refers more specifically to the endings used with *subanta*s. +/// +/// *Vibhakti* is broadly similar to the Western notion of grammatical case. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[wasm_bindgen] pub enum Vibhakti { @@ -86,7 +92,7 @@ impl Vibhakti { } } -/// The information required to derive a subanta in the grammar. +/// The information required to derive a *subanta*. #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct Subanta { pratipadika: Pratipadika, @@ -114,7 +120,7 @@ impl Subanta { } } - /// Creates a subanta. + /// Defines a *subanta* that is also an *avyaya*. pub fn avyaya(pratipadika: impl Into) -> Self { let pratipadika = pratipadika.into(); Self { @@ -141,17 +147,17 @@ impl Subanta { self.linga } - /// The vacana to use in the derivation. + /// The *vacana* to use in the derivation. 
pub fn vacana(&self) -> Vacana { self.vacana } - /// The vibhakti to use in the derivation. + /// The *vibhakti* to use in the derivation. pub fn vibhakti(&self) -> Vibhakti { self.vibhakti } - /// Returns whether or not this subanta is an avyaya. + /// Returns whether or not this *subanta* is an *avyaya*. pub fn is_avyaya(&self) -> bool { self.is_avyaya } @@ -179,13 +185,13 @@ impl SubantaBuilder { self } - /// Sets the vacana to use in the derivation. + /// Sets the *vacana* to use in the derivation. pub fn vacana(&mut self, val: Vacana) -> &mut Self { self.vacana = Some(val); self } - /// Sets the vibhakti to use in the derivation. + /// Sets the *vibhakti* to use in the derivation. pub fn vibhakti(&mut self, val: Vibhakti) -> &mut Self { self.vibhakti = Some(val); self diff --git a/vidyut-prakriya/src/args/taddhita.rs b/vidyut-prakriya/src/args/taddhita.rs index 31f52dc..8dcc48a 100644 --- a/vidyut-prakriya/src/args/taddhita.rs +++ b/vidyut-prakriya/src/args/taddhita.rs @@ -830,7 +830,7 @@ impl TaddhitaArtha { } } -/// The information required to derive a taddhitanta in the grammar. +/// The information required to derive a *taddhitānta*. #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct Taddhitanta { pratipadika: Pratipadika, diff --git a/vidyut-prakriya/src/args/tin.rs b/vidyut-prakriya/src/args/tin.rs index eb1cff5..5b7af12 100644 --- a/vidyut-prakriya/src/args/tin.rs +++ b/vidyut-prakriya/src/args/tin.rs @@ -4,7 +4,9 @@ use crate::core::Tag; use crate::enum_boilerplate; use wasm_bindgen::prelude::wasm_bindgen; -/// The prayoga of some tiṅanta. +/// The *prayoga* of some *tiṅanta*. +/// +/// *Prayoga* is roughly similar to the Western concept of verb *voice*. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[wasm_bindgen] pub enum Prayoga { @@ -13,7 +15,7 @@ pub enum Prayoga { /// Usage coreferent with the object, e.g. "The village *is gone to* by the horse." Karmani, /// Usage without a referent, e.g. 
"*There is motion* by the horse to the village." - /// bhAve prayoga generally produces the same forms as karmani prayoga. + /// *bhāve prayoga* generally produces the same forms as karmani prayoga. Bhave, } @@ -33,7 +35,7 @@ impl Prayoga { } } -/// The person of some tiṅanta. +/// The person of some *tiṅanta*. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[wasm_bindgen] pub enum Purusha { @@ -61,7 +63,7 @@ impl Purusha { } } -/// The number of some tiṅanta or subanta. +/// The number of some *tiṅanta* or *subanta*. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[wasm_bindgen] pub enum Vacana { @@ -89,7 +91,7 @@ impl Vacana { } } -/// The tense/mood of some tiṅanta. +/// The tense/mood of some *tiṅanta*. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[wasm_bindgen] pub enum Lakara { @@ -154,13 +156,13 @@ impl Lakara { } } -/// The pada of some tiṅanta or kṛdanta. +/// The pada of some *tiṅanta* or *kṛdanta*. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[wasm_bindgen] pub enum DhatuPada { - /// Parasmaipada. + /// *Parasmaipada*. Parasmai, - /// Atmanepada. + /// *Ātmanepada*. Atmane, } @@ -178,12 +180,12 @@ impl DhatuPada { } } -/// The information required to derive a tiṅanta in the grammar. +/// The information required to derive a *tiṅanta*. /// -/// If a tiṅanta were just a matter of prayoga/purusha/lakara/vacana, a struct like this would not -/// be necessary. However, a tiṅanta's derivation can have many other constraints, including: +/// If a *tiṅanta* were just a matter of prayoga/purusha/lakara/vacana, a struct like this would +/// not be necessary. However, a *tiṅanta*'s derivation can have many other constraints, including: /// -/// - specific upasargas or other prefixes +/// - specific *upasarga*s or other prefixes /// - specific sanAdi pratyayas /// - other constraints on the overall derivation /// @@ -240,7 +242,7 @@ impl Tinanta { self.lakara } - /// The vacana to use in the derivation. 
+ /// The *vacana* to use in the derivation. pub fn vacana(&self) -> Vacana { self.vacana } diff --git a/vidyut-prakriya/src/args/unadi.rs b/vidyut-prakriya/src/args/unadi.rs index dfa1c3d..9535e84 100644 --- a/vidyut-prakriya/src/args/unadi.rs +++ b/vidyut-prakriya/src/args/unadi.rs @@ -11,7 +11,7 @@ use wasm_bindgen::prelude::wasm_bindgen; /// /// NOTE: we generated this list programmatically. Many of these pratyayas have typos. #[allow(dead_code, non_camel_case_types)] -#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq, Ord, PartialOrd)] #[wasm_bindgen] pub enum Unadi { /// -a diff --git a/vidyut-prakriya/src/ashtadhyayi.rs b/vidyut-prakriya/src/ashtadhyayi.rs index e23a619..a0a4d05 100644 --- a/vidyut-prakriya/src/ashtadhyayi.rs +++ b/vidyut-prakriya/src/ashtadhyayi.rs @@ -99,7 +99,9 @@ fn prepare_dhatu( dhatu_karya::try_add_prefixes(p, n.prefixes()); sanadi::try_create_namadhatu(p, n); if !p.terms().last().expect("ok").is_dhatu() { - println!("{:#?}", p); + if cfg!(debug_assertions) { + println!("{:#?}", p); + } return Err(Error::Abort(p.rule_choices().clone())); } } @@ -155,7 +157,9 @@ fn prepare_krdanta(p: &mut Prakriya, args: &Krdanta) -> Result<()> { } let added = krt::run(p, args); if !added { - println!("{:#?}", p); + if cfg!(debug_assertions) { + println!("{:#?}", p); + } return Err(Error::Abort(p.rule_choices().clone())); } @@ -327,7 +331,7 @@ fn run_main_rules( p.debug("==== Vikaranas ===="); ardhadhatuka::run_before_vikarana(p, dhatu_args, lakara, is_ardhadhatuka); - vikarana::run(p)?; + vikarana::run(p); samjna::run(p); if let Some(lakara) = lakara { @@ -499,7 +503,9 @@ pub fn derive_subanta(mut prakriya: Prakriya, args: &Subanta) -> Result Result { let p = &mut prakriya; prepare_krdanta(p, args)?; - run_main_rules(p, None, None, true)?; + + let is_ardhadhatuka = p.terms().last().map_or(false, |t| t.is_ardhadhatuka()); + run_main_rules(p, None, None, is_ardhadhatuka)?; tripadi::run(p); Ok(prakriya) 
diff --git a/vidyut-prakriya/src/core/term.rs b/vidyut-prakriya/src/core/term.rs index 1a15fb7..73d840f 100644 --- a/vidyut-prakriya/src/core/term.rs +++ b/vidyut-prakriya/src/core/term.rs @@ -206,7 +206,7 @@ impl Term { } pub fn last_vowel(&self) -> Option { - self.chars().rev().filter(|c| sounds::is_ac(*c)).next() + self.chars().rev().find(|c| sounds::is_ac(*c)) } /// Returns the sound at index `i` if it exists. @@ -339,7 +339,7 @@ impl Term { /// Returns whether the term has a specific aupadeshika form. pub fn has_u(&self, s: &str) -> bool { match &self.u { - Some(u) => u == &s, + Some(u) => u == s, None => false, } } @@ -357,7 +357,7 @@ impl Term { } pub fn has_lakshana(&self, u: &str) -> bool { - self.lakshanas.iter().any(|s| s == &u) + self.lakshanas.iter().any(|s| s == u) } pub fn has_lakshana_in(&self, us: &[&str]) -> bool { @@ -742,7 +742,7 @@ impl Term { } else if self.text.contains('x') { // Don't save asiddha sounds. return; - } else { + } else if !self.sthanivat.is_empty() { let sthanivat_antya = self.sthanivat.chars().next_back().expect("ok"); let text_antya = self.text.chars().next_back().expect("ok"); if sounds::is_ac(sthanivat_antya) { diff --git a/vidyut-prakriya/src/core/term_view.rs b/vidyut-prakriya/src/core/term_view.rs index 6e6b94a..5a768c7 100644 --- a/vidyut-prakriya/src/core/term_view.rs +++ b/vidyut-prakriya/src/core/term_view.rs @@ -15,13 +15,6 @@ use crate::sounds::Pattern; /// /// `TermView` provides an API for working with these sequences. It provides a simple API that /// mirrors the `Term` API, and it provides raw access to its underlying `Term`s as escape hatches. -/// -/// Instead of creating TermView directly, we recommend using the [`pada`], [`nyap_pratipadika`], -/// or [`pratyaya`] methods on `Prakriya. -/// -/// [`pada`]: Prakriya::get -/// [`nyap_pratipadika`]: Prakriya::nyap_pratipadika -/// [`pratyaya`]: Prakriya::view #[derive(Debug)] pub struct TermView<'a> { /// All of the terms in the prakriya. 
We store the entire `Term` list so that our internal @@ -136,12 +129,9 @@ impl<'a> TermView<'a> { /// /// `end_non_empty` is useful if the view ends in an empty pratyaya, such as a kvip-pratyaya. pub fn end_non_empty(&self) -> Option { - for i in (self.start..=self.end).rev() { - if !self.terms.get(i).expect("present").is_empty() { - return Some(i); - } - } - None + (self.start..=self.end) + .rev() + .find(|&i| !self.terms.get(i).expect("present").is_empty()) } /// Returns whether the view's text is empty. diff --git a/vidyut-prakriya/src/dhatu_karya.rs b/vidyut-prakriya/src/dhatu_karya.rs index 5a71add..9637109 100644 --- a/vidyut-prakriya/src/dhatu_karya.rs +++ b/vidyut-prakriya/src/dhatu_karya.rs @@ -266,7 +266,7 @@ pub fn try_add_prefixes(p: &mut Prakriya, prefixes: &[String]) -> Option<()> { // TODO: prefixes that aren't upasargas? for prefix in prefixes { - let t = Term::make_upadesha(&prefix); + let t = Term::make_upadesha(prefix); p.insert_before(i_offset, t); samjna::try_nipata_rules(p, i_offset); diff --git a/vidyut-prakriya/src/dhatupatha.rs b/vidyut-prakriya/src/dhatupatha.rs index a5de060..055b732 100644 --- a/vidyut-prakriya/src/dhatupatha.rs +++ b/vidyut-prakriya/src/dhatupatha.rs @@ -54,10 +54,11 @@ impl Entry { } } -/// An interface to the Dhatupatha used on . +/// An interface to the Dhatupatha. /// /// Different traditional texts might use different dhatupathas. 
This struct manages the data for -/// the dhatupatha on ashtadhyayi.com, which is a superset of the dhatus from five sources: +/// the dhatupatha on <https://ashtadhyayi.com>, which is a superset of the dhatus from five +/// sources: /// /// - the *Siddhāntakaumudī* /// - the *Bṛhaddhātukusumākaraḥ* @@ -65,7 +66,7 @@ impl Entry { /// - the *Kṣīrataraṅgiṇī* /// - the *Dhātupradīpaḥ* /// -/// The specific dhatupatha we use matters: for certain dhatus, we can determine their metadata +/// The specific Dhatupatha we use matters: for certain dhatus, we can determine their metadata /// only if we know exactly where they are located. (For an example, see our implementation of the /// private `maybe_find_antargana` function.) pub struct Dhatupatha(Vec<Entry>); @@ -172,6 +173,11 @@ impl Dhatupatha { Err(_) => None, } } + + /// Returns an iterator over this dhatupatha's contents. + pub fn iter(&self) -> std::slice::Iter<Entry> { + self.0.iter() + } } impl IntoIterator for Dhatupatha { @@ -183,26 +189,34 @@ impl IntoIterator for Dhatupatha { } } +impl<'a> IntoIterator for &'a Dhatupatha { + type Item = &'a Entry; + type IntoIter = std::slice::Iter<'a, Entry>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// Returns the antargana of the dhatu at location `number` within `gana`. +/// +/// We need to check the numeric position explicitly because some dhatus appear multiple times in +/// their respective ganas with identical forms. (We can usually distinguish these dhatus by +/// meaning, but vidyut-prakriya has poor support for modeling and comparing dhatu meanings.) fn maybe_find_antargana(gana: Gana, number: u16) -> Option<Antargana> { if gana == Gana::Bhvadi && (867..=932).contains(&number) { - // Need to check range explicitly because some of these roots appear multiple times in the - // gana, e.g. svana~. Some(Antargana::Ghatadi) } else if gana == Gana::Tudadi && (93..=137).contains(&number) { - // Need to check range explicitly because some of these roots appear multiple times in the - // gana, e.g. 
juqa~. + // juqa~, etc. Some(Antargana::Kutadi) } else if gana == Gana::Curadi && (192..=236).contains(&number) { - // Need to check range explicitly because some of these roots appear multiple times in the - // gana, e.g. lakza~. + // lakza~, etc. Some(Antargana::Akusmiya) } else if gana == Gana::Curadi && (279..=337).contains(&number) { - // Need to check range explicitly because some of these roots appear multiple times in the - // gana, e.g. tuji~. + // tuji~, etc. Some(Antargana::Asvadiya) } else if gana == Gana::Curadi && (338..=388).contains(&number) { - // Need to check range explicitly because some of these roots appear multiple times in the - // gana, e.g. SraTa~. + // SraTa~, etc. Some(Antargana::Adhrshiya) } else { None diff --git a/vidyut-prakriya/src/vikarana.rs b/vidyut-prakriya/src/vikarana.rs index 285b802..5cf96eb 100644 --- a/vidyut-prakriya/src/vikarana.rs +++ b/vidyut-prakriya/src/vikarana.rs @@ -17,7 +17,6 @@ // substitutions by lopa that block the prakarana. use crate::args::Gana::*; -use crate::core::errors::*; use crate::core::operators as op; use crate::core::{Prakriya, Rule, Rule::Varttika, Tag as T, Term}; use crate::dhatu_gana::{DYUT_ADI, PUSH_ADI, TAN_ADI}; @@ -590,17 +589,17 @@ fn try_pratyaya_lopa(p: &mut Prakriya) -> Option<()> { Some(()) } -pub fn run(p: &mut Prakriya) -> Result<()> { +pub fn run(p: &mut Prakriya) -> Option<()> { + p.dump(); + // Skip if a vikarana is already present, e.g. when adding a subanta to a krdanta that has // already been created. 
if p.find_first(T::Vikarana).is_some() { - return Ok(()); + return None; } - let tin = match p.terms().last() { - Some(t) => t, - None => return Ok(()), - }; + let i_tin = p.find_last_where(|t| t.is_sarvadhatuka())?; + let tin = p.get(i_tin)?; if tin.has_lakshana_in(&["lf~w", "lf~N", "lu~w"]) { if tin.has_lakshana_in(&["lf~w", "lf~N"]) { @@ -626,7 +625,7 @@ pub fn run(p: &mut Prakriya) -> Result<()> { try_pratyaya_lopa(p); // Run it-samjna-prakarana only after the lopa phase is complete. if p.has(i_vikarana, |t| !t.is_empty()) { - it_samjna::run(p, i_vikarana)?; + it_samjna::run(p, i_vikarana).ok()?; } } @@ -634,12 +633,12 @@ pub fn run(p: &mut Prakriya) -> Result<()> { // it blocks `AtmanepadezvanataH` && `Ato GitaH`. let i = match p.find_first(T::Dhatu) { Some(i) => i, - None => return Ok(()), + None => return None, }; if p.has(i, |t| t.has_u("gA\\N")) && p.has(i + 1, |t| t.has_text("a")) { p.set(i + 1, |t| t.text.clear()); p.step("6.1.101"); } - Ok(()) + None } diff --git a/vidyut-prakriya/src/vyakarana.rs b/vidyut-prakriya/src/vyakarana.rs index 60e69b0..b3f98a5 100644 --- a/vidyut-prakriya/src/vyakarana.rs +++ b/vidyut-prakriya/src/vyakarana.rs @@ -76,7 +76,7 @@ impl Vyakarana { /// /// ### Examples /// - /// A mula-dhatu from the Dhatupatha: + /// A *mūla-dhātu* from the Dhatupatha: /// /// ``` /// # use vidyut_prakriya::Vyakarana; @@ -87,7 +87,7 @@ impl Vyakarana { /// assert_eq!(prakriyas[0].text(), "vand"); /// ``` /// - /// A mula-dhatu with one or more upasargas: + /// A *mūla-dhātu* with one or more *upasarga*s: /// /// ``` /// # use vidyut_prakriya::Vyakarana; @@ -99,7 +99,7 @@ impl Vyakarana { /// assert_eq!(prakriyas[0].text(), "upasaNgam"); /// ``` /// - /// A mula-dhatu with one or more sanAdi-pratyayas: + /// A *mūla-dhātu* with one or more *sanādi-pratyaya*s: /// /// ``` /// # use vidyut_prakriya::Vyakarana; @@ -128,7 +128,7 @@ impl Vyakarana { /// assert_eq!(prakriyas[0].text(), "vivandizi"); /// ``` /// - /// A nama-dhatu with an optional 
sanAdi-pratyaya: + /// A *nāma-dhātu* with an optional *sanādi-pratyaya*: /// /// ``` /// # use vidyut_prakriya::Vyakarana; @@ -140,7 +140,7 @@ impl Vyakarana { /// assert_eq!(prakriyas[0].text(), "putrIya"); /// ``` /// - /// A nama-dhatu with a mandatory sanAdi-pratyaya from some other sutra: + /// A *nāma-dhātu* with a mandatory *sanādi-pratyaya* from some other sutra: /// /// ``` /// # use vidyut_prakriya::Vyakarana; @@ -164,22 +164,85 @@ impl Vyakarana { /// /// ### Example /// + /// A basic *tiṅanta*: + /// /// ``` - /// # use vidyut_prakriya::Vyakarana; - /// # use vidyut_prakriya::Error; - /// # use vidyut_prakriya::args::*; + /// use vidyut_prakriya::Vyakarana; + /// use vidyut_prakriya::args::*; + /// use vidyut_prakriya::Error; + /// /// let v = Vyakarana::new(); - /// let dhatu = Dhatu::mula("BU", Gana::Bhvadi); + /// + /// let bhu = Dhatu::mula("BU", Gana::Bhvadi); /// let args = Tinanta::builder() - /// .dhatu(dhatu) + /// .dhatu(bhu) /// .lakara(Lakara::Lat) /// .prayoga(Prayoga::Kartari) /// .purusha(Purusha::Prathama) /// .vacana(Vacana::Eka) - /// .build()?; + /// .build() + /// .unwrap(); /// let prakriyas = v.derive_tinantas(&args); /// assert_eq!(prakriyas[0].text(), "Bavati"); - /// # Ok::<(), Error>(()) + /// ``` + /// + /// A *tiṅanta* with one or more *upasarga*s: + /// + /// ``` + /// # use vidyut_prakriya::Vyakarana; + /// # use vidyut_prakriya::args::*; + /// # let v = Vyakarana::new(); + /// let abhibhu = Dhatu::mula("BU", Gana::Bhvadi).with_prefixes(&["aBi"]); + /// let args = Tinanta::builder() + /// .dhatu(abhibhu) + /// .lakara(Lakara::Lat) + /// .prayoga(Prayoga::Kartari) + /// .purusha(Purusha::Prathama) + /// .vacana(Vacana::Eka) + /// .build() + /// .unwrap(); + /// let prakriyas = v.derive_tinantas(&args); + /// assert_eq!(prakriyas[0].text(), "aBiBavati"); + /// ``` + /// + /// A *tiṅanta* whose *dhātu* has one or more *sanādi-pratyaya*s: + /// + /// ``` + /// # use vidyut_prakriya::Vyakarana; + /// # use 
vidyut_prakriya::args::*; + /// # let v = Vyakarana::new(); + /// let bobhuya = Dhatu::mula("BU", Gana::Bhvadi).with_sanadi(&[Sanadi::yaN]); + /// let args = Tinanta::builder() + /// .dhatu(bobhuya) + /// .lakara(Lakara::Lat) + /// .prayoga(Prayoga::Kartari) + /// .purusha(Purusha::Prathama) + /// .vacana(Vacana::Eka) + /// .build() + /// .unwrap(); + /// let prakriyas = v.derive_tinantas(&args); + /// assert_eq!(prakriyas[0].text(), "boBUyate"); + /// ``` + /// + /// A *tiṅanta* that must use *ātmanepada*. If the *dhātu* cannot support the requested *pada*, + /// this method returns no results: + /// + /// ``` + /// # use vidyut_prakriya::Vyakarana; + /// # use vidyut_prakriya::args::*; + /// # let v = Vyakarana::new(); + /// let kr = Dhatu::mula("qukf\\Y", Gana::Tanadi); + /// let args = Tinanta::builder() + /// .dhatu(kr) + /// .lakara(Lakara::Lat) + /// .prayoga(Prayoga::Kartari) + /// .purusha(Purusha::Prathama) + /// .vacana(Vacana::Eka) + /// .pada(DhatuPada::Atmane) + /// .build() + /// .unwrap(); + /// let prakriyas = v.derive_tinantas(&args); + /// assert_eq!(prakriyas[0].text(), "kurute"); /// ``` pub fn derive_tinantas(&self, args: &Tinanta) -> Vec { let mut stack = self.create_prakriya_stack(); @@ -280,9 +343,10 @@ impl Vyakarana { /// let args = Taddhitanta::builder() /// .pratipadika(Pratipadika::basic("nara")) /// .taddhita(Taddhita::matup) - /// .build()?; + /// .build() + /// .unwrap(); /// let prakriyas = v.derive_taddhitantas(&args); - /// # Ok::<(), Error>(()) + /// assert_eq!(prakriyas[0].text(), "naravat"); /// ``` pub fn derive_taddhitantas(&self, spec: &Taddhitanta) -> Vec { let mut stack = self.create_prakriya_stack(); @@ -290,8 +354,8 @@ impl Vyakarana { stack.prakriyas() } - /// Returns all possible stryanta prakriyas that can be derived with the given initial - /// conditions. + /// (Experimental) Returns all possible stryanta prakriyas that can be derived with the given + /// initial conditions. 
/// /// /// ### Example @@ -303,7 +367,6 @@ impl Vyakarana { /// let v = Vyakarana::new(); /// let pratipadika = Pratipadika::basic("nara"); /// let prakriyas = v.derive_stryantas(&pratipadika); - /// # Ok::<(), Error>(()) /// ``` pub fn derive_stryantas(&self, pratipadika: &Pratipadika) -> Vec { let mut stack = self.create_prakriya_stack(); @@ -318,17 +381,31 @@ impl Vyakarana { /// /// ``` /// # use vidyut_prakriya::Vyakarana; - /// # use vidyut_prakriya::Error; /// # use vidyut_prakriya::args::*; /// let v = Vyakarana::new(); - /// # Ok::<(), Error>(()) + /// + /// let rajan = Pratipadika::basic("rAjan"); + /// let purusha = Pratipadika::basic("puruza"); + /// let args = Samasa::builder() + /// .padas(vec![ + /// Subanta::new(rajan, Linga::Pum, Vibhakti::Sasthi, Vacana::Eka), + /// Subanta::new(purusha, Linga::Pum, Vibhakti::Prathama, Vacana::Eka), + /// ]) + /// .samasa_type(SamasaType::Tatpurusha) + /// .build() + /// .unwrap(); + /// + /// let prakriyas = v.derive_samasas(&args); + /// assert_eq!(prakriyas[0].text(), "rAjapuruza"); + /// ``` pub fn derive_samasas(&self, args: &Samasa) -> Vec { let mut stack = self.create_prakriya_stack(); stack.find_all(|p| ashtadhyayi::derive_samasa(p, args)); stack.prakriyas() } - /// Returns all possible sandhi results that follow from the given initial conditions. + /// (Experimental) Returns all possible sandhi results that follow from the given initial + /// conditions. 
/// /// /// ### Example diff --git a/vidyut-prakriya/tests/kashika_3_2.rs b/vidyut-prakriya/tests/kashika_3_2.rs index ddae85b..04a264b 100644 --- a/vidyut-prakriya/tests/kashika_3_2.rs +++ b/vidyut-prakriya/tests/kashika_3_2.rs @@ -1057,7 +1057,6 @@ fn sutra_3_2_123() { assert_has_tip(&[], &d("paWa~", Bhvadi), Lat, &["paWati"]); } -#[ignore] #[test] fn sutra_3_2_124() { let pac = d("qupa\\ca~^z", Bhvadi); @@ -1065,7 +1064,13 @@ fn sutra_3_2_124() { assert_has_krdanta(&[], &pac, Krt::SAnac, &["pacamAna"]); // others assert_has_krdanta(&[], &d("asa~", Adadi), Krt::Satf, &["sat"]); - // TODO: more +} + +#[test] +fn sutra_3_2_125() { + let pac = d("qupa\\ca~^z", Bhvadi); + assert_has_krdanta(&[], &pac, Krt::Satf, &["pacat"]); + assert_has_krdanta(&[], &pac, Krt::SAnac, &["pacamAna"]); } #[test] @@ -1083,6 +1088,13 @@ fn sutra_3_2_128() { assert_has_krdanta(&[], &d("ya\\ja~^", Bhvadi), Krt::SAnan, &["yajamAna"]); } +#[ignore] +#[test] +fn sutra_3_2_130() { + assert_has_krdanta(&["aDi"], &d("i\\N", Adadi), Krt::Satf, &["aDIyat"]); + assert_has_krdanta(&[], &d("Df\\Y", Bhvadi), Krt::Satf, &["DArayat"]); +} + #[test] fn sutra_3_2_131() { let dvishat = krdanta(&[], &d("dvi\\za~^", Adadi), Krt::Satf); diff --git a/vidyut-prakriya/tests/regressions.rs b/vidyut-prakriya/tests/regressions.rs index c6980cf..92cfb36 100644 --- a/vidyut-prakriya/tests/regressions.rs +++ b/vidyut-prakriya/tests/regressions.rs @@ -4,8 +4,10 @@ //! has been fixed and help ensure that the bug does not reappear. extern crate test_utils; use test_utils::*; +use vidyut_prakriya::args::BaseKrt; use vidyut_prakriya::args::Gana::*; use vidyut_prakriya::args::Lakara::*; +use vidyut_prakriya::args::Linga::*; #[test] fn ambibat() { @@ -61,3 +63,12 @@ fn adhyajigapat_adhyapipat() { &["aDyajIgapat", "aDyApipat"], ); } + +// Verifies that Sap-pratyaya is added when Satf follows. 
+#[test] +fn shap_shatr() { + let bhavat = krdanta(&[], &d("BU", Bhvadi), BaseKrt::Satf); + assert_has_sup_1s(&bhavat, Pum, &["Bavan"]); + assert_has_sup_1d(&bhavat, Pum, &["BavantO"]); + assert_has_sup_1p(&bhavat, Pum, &["BavantaH"]); +}