[chandas] Add support for jati meters

- Add support for AryA, upagIti, etc.
- Add an experimental `classify_all` method.
- Remove `dyn Error` from vidyut-chandas in favor of real error types.
- Rename vidyut data dir to `vidyut-latest` to avoid version numbers from vidyut-py.
- Various `clippy` cleanups.
ambuda-org · Nov 4, 2024 · 84c8441 · 84c8441
1 parent ea4112a
commit 84c8441
Show file tree

Hide file tree

Showing 19 changed files with 470 additions and 130 deletions.
diff --git a/Makefile b/Makefile
@@ -32,18 +32,18 @@ create_all_data:
 
 create_sandhi_rules:
 	RUST_LOG=info cargo run --release --bin create_sandhi_rules -- \
-			 --data-dir data/build/vidyut-0.2.0
+			 --data-dir data/build/vidyut-latest
 
 # Creates a kosha and writes it to disk.
 create_kosha:
 	RUST_LOG=info cargo run --release --bin create_kosha -- \
-			 --input-dir data/raw/lex --output-dir data/build/vidyut-0.2.0
+			 --input-dir data/raw/lex --output-dir data/build/vidyut-latest
 
 # Trains a padaccheda model and saves important features to disk.
 # NOTE: when training, exclude the file paths used in `make eval`.
 train_cheda:
 	cargo run --release --bin train_cheda -- \
-		--vidyut-dir "data/build/vidyut-0.2.0" \
+		--vidyut-dir "data/build/vidyut-latest" \
 		--include "data/raw/dcs/conllu/files/**/*.conllu" \
 		--exclude "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-088*.conllu" \
 		--exclude "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-089*.conllu" \
@@ -55,13 +55,13 @@ train_cheda:
 
 # Runs basic end-to-end tests against the given kosha.
 test_kosha:
-	RUST_LOG=info cargo run --release --bin test_kosha -- --data-dir data/build/vidyut-0.2.0/kosha
+	RUST_LOG=info cargo run --release --bin test_kosha -- --data-dir data/build/vidyut-latest/kosha
 
 
 # Evaluate our parsing quality on a large sample of text.
 eval_cheda:
 	cargo run --release --bin eval_cheda -- \
-		--vidyut-dir "data/build/vidyut-0.2.0" \
+		--vidyut-dir "data/build/vidyut-latest" \
 		--paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-088*.conllu" \
 		--paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-089*.conllu" \
 		--paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-0900-MBh, 6, BhaGī 18-7707.conllu"

diff --git a/scripts/create_all_data.sh b/scripts/create_all_data.sh
@@ -10,10 +10,10 @@ rm -Rf dcs-data 2&> /dev/null
 set -e
 
 # Create necessary directories.
-mkdir -p "data/build/${1}"
+OUTPUT_DIR="data/build/vidyut-latest"
 
 echo "========================="
-echo "| DCS corpus data       |"
+echo "Data fetch"
 echo "========================="
 echo
 if [ -e "data/raw/dcs" ]; then
@@ -26,10 +26,6 @@ else
     rm -Rf dcs-data
 fi
 echo
-echo "========================="
-echo "| Linguistic data fetch |"
-echo "========================="
-echo
 if [ -e "data/raw/lex" ]; then
     echo "Lexical data already exists -- skipping fetch."
 else
@@ -42,12 +38,38 @@ else
 fi
 echo
 echo "========================="
-echo "| Vidyut build          |"
+echo "vidyut-chandas"
 echo "========================="
+mkdir -p "${OUTPUT_DIR}/chandas"
+cp -r vidyut-chandas/data "${OUTPUT_DIR}/chandas"
+echo "Copied files to output dir."
 echo
+echo "========================="
+echo "vidyut-kosha"
+echo "========================="
 make create_kosha
 make test_kosha
+echo
+echo "========================="
+echo "vidyut-lipi"
+echo "========================="
+echo "(no data files needed)"
+echo
+echo "========================="
+echo "vidyut-prakriya"
+echo "========================="
+mkdir -p "${OUTPUT_DIR}/prakriya"
+cp -r "vidyut-prakriya/data/" "${OUTPUT_DIR}/prakriya"
+echo "Copied files to output dir."
+echo
+echo "========================="
+echo "vidyut-sandhi"
+echo "========================="
 make create_sandhi_rules
+echo
+echo "========================="
+echo "vidyut-cheda"
+echo "========================="
 make train_cheda
 make eval_cheda
 echo

diff --git a/src/bin/create_sandhi_rules.rs b/src/bin/create_sandhi_rules.rs
@@ -26,7 +26,7 @@ fn write_rules(rules: &[Rule], path: &Path) -> Result<()> {
 fn main() {
     let args = Args::parse();
     let rules = generate_rules();
-    let config = Config::new(&args.data_dir);
+    let config = Config::new(args.data_dir);
 
     if let Err(err) = write_rules(&rules, config.sandhi()) {
         println!("{}", err);

diff --git a/src/bin/train_cheda.rs b/src/bin/train_cheda.rs
@@ -70,7 +70,7 @@ fn process_sentence(tokens: &[Token], s: &mut Statistics) {
         let c = s
             .transitions
             .entry(prev_state)
-            .or_insert_with(HashMap::new)
+            .or_default()
             .entry(cur_state)
             .or_insert(0);
         *c += 1;
@@ -81,7 +81,7 @@ fn process_sentence(tokens: &[Token], s: &mut Statistics) {
         let c = s
             .emissions
             .entry(cur_state)
-            .or_insert_with(HashMap::new)
+            .or_default()
             .entry(to_slp1(lemma))
             .or_insert(0);
         *c += 1;

diff --git a/vidyut-chandas/src/akshara.rs b/vidyut-chandas/src/akshara.rs
@@ -3,9 +3,9 @@ use crate::sounds;
 /// The weight of an akshara.
 #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
 pub enum Weight {
-    /// A heavy syllable.
+    /// A *guru* or heavy syllable.
     G,
-    /// A light syllable.
+    /// A *laghu* or light syllable.
     L,
 }
 
@@ -18,7 +18,7 @@ pub enum Weight {
 /// - It must not start with an anusvara or visarga.
 ///
 /// Together, these three rules mean that an input string has exactly one division into aksharas.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, Eq, PartialEq)]
 pub struct Akshara {
     pub(crate) text: String,
     pub(crate) weight: Weight,
@@ -43,7 +43,7 @@ impl Akshara {
     }
 
     /// The length of this akshara in matras.
-    pub fn num_matras(&self) -> usize {
+    pub fn num_matras(&self) -> i32 {
         match self.weight {
             Weight::L => 1,
             Weight::G => 2,
@@ -55,6 +55,7 @@ impl Akshara {
 ///
 /// Any text that is not a valid Sanskrit sound in SLP1 will be ignored.
 pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
+    // Split into aksharas.
     let mut akshara_strs = Vec::new();
     let mut cur = String::new();
     for c in text.as_ref().chars() {
@@ -71,6 +72,7 @@ pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
             if let Some(prev) = akshara_strs.last_mut() {
                 prev.push(c);
             }
+            // `else` means `M` and `H` follow a non-vowel, which indicates an error.
         }
 
         // Skip all other punctuation, spaces, etc.
@@ -85,8 +87,10 @@ pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
             // Case 2: extend old syllable
             last.push_str(&cur);
         }
+        // `else` means that `text` contains only consonants, which indicates an error.
     }
 
+    // Calculate weights.
     akshara_strs
         .iter()
         .enumerate()
@@ -97,10 +101,12 @@ pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
                 false
             };
 
-            let weight = if !cur.ends_with(sounds::is_hrasva) || next_is_samyogadi {
-                Weight::G
-            } else {
+            let has_hrasva = cur.chars().any(sounds::is_hrasva);
+            let has_visarga_or_anusvara = matches!(cur.chars().last(), Some('M') | Some('H'));
+            let weight = if has_hrasva && !next_is_samyogadi && !has_visarga_or_anusvara {
                 Weight::L
+            } else {
+                Weight::G
             };
             Akshara::new(cur.to_string(), weight)
         })
@@ -254,15 +260,14 @@ mod tests {
     #[test]
     fn test_scan_block_with_laghu_weight_change() {
         let scan = scan_lines("anIkam".lines());
-        assert_eq!(weights(&scan[0]), vec![L, G, G]);
+        assert_eq!(weights(&scan[0]), vec![L, G, L]);
 
         // Last syllable of `anIkam` becomes guru due to following samyoga.
         let scan = scan_lines("anIkam\nvyUQam".lines());
         assert_eq!(weights(&scan[0]), vec![L, G, G]);
 
         // Last syllable of `anIka` stays laghu due to following vowel.
-        // TODO: this is buggy.
-        // let scan = scan_block("anIkam\neva");
-        // assert_eq!(weights(&scan[0]), vec![L, G, L]);
+        let scan = scan_lines("anIkam\neva".lines());
+        assert_eq!(weights(&scan[0]), vec![L, G, L]);
     }
 }