Skip to content

Commit

Permalink
[chandas] Add support for jati meters
Browse files Browse the repository at this point in the history
- Add support for AryA, upagIti, etc.
- Add an experimental `classify_all` method.
- Remove `dyn Error` from vidyut-chandas in favor of real error types.
- Rename vidyut data dir to `vidyut-latest` to avoid version numbers
  from vidyut-py.
- Various `clippy` cleanups.
  • Loading branch information
akprasad committed Nov 4, 2024
1 parent ea4112a commit 84c8441
Show file tree
Hide file tree
Showing 19 changed files with 470 additions and 130 deletions.
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,18 @@ create_all_data:

create_sandhi_rules:
RUST_LOG=info cargo run --release --bin create_sandhi_rules -- \
--data-dir data/build/vidyut-0.2.0
--data-dir data/build/vidyut-latest

# Creates a kosha and writes it to disk.
create_kosha:
RUST_LOG=info cargo run --release --bin create_kosha -- \
--input-dir data/raw/lex --output-dir data/build/vidyut-0.2.0
--input-dir data/raw/lex --output-dir data/build/vidyut-latest

# Trains a padaccheda model and saves important features to disk.
# NOTE: when training, exclude the file paths used in `make eval`.
train_cheda:
cargo run --release --bin train_cheda -- \
--vidyut-dir "data/build/vidyut-0.2.0" \
--vidyut-dir "data/build/vidyut-latest" \
--include "data/raw/dcs/conllu/files/**/*.conllu" \
--exclude "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-088*.conllu" \
--exclude "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-089*.conllu" \
Expand All @@ -55,13 +55,13 @@ train_cheda:

# Runs basic end-to-end tests against the given kosha.
test_kosha:
RUST_LOG=info cargo run --release --bin test_kosha -- --data-dir data/build/vidyut-0.2.0/kosha
RUST_LOG=info cargo run --release --bin test_kosha -- --data-dir data/build/vidyut-latest/kosha


# Evaluate our parsing quality on a large sample of text.
eval_cheda:
cargo run --release --bin eval_cheda -- \
--vidyut-dir "data/build/vidyut-0.2.0" \
--vidyut-dir "data/build/vidyut-latest" \
--paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-088*.conllu" \
--paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-089*.conllu" \
--paths "data/raw/dcs/conllu/files/Mahābhārata/Mahābhārata-0900-MBh, 6, BhaGī 18-7707.conllu"
Expand Down
36 changes: 29 additions & 7 deletions scripts/create_all_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ rm -Rf dcs-data 2&> /dev/null
set -e

# Create necessary directories.
mkdir -p "data/build/${1}"
OUTPUT_DIR="data/build/vidyut-latest"

echo "========================="
echo "| DCS corpus data |"
echo "Data fetch"
echo "========================="
echo
if [ -e "data/raw/dcs" ]; then
Expand All @@ -26,10 +26,6 @@ else
rm -Rf dcs-data
fi
echo
echo "========================="
echo "| Linguistic data fetch |"
echo "========================="
echo
if [ -e "data/raw/lex" ]; then
echo "Lexical data already exists -- skipping fetch."
else
Expand All @@ -42,12 +38,38 @@ else
fi
echo
echo "========================="
echo "| Vidyut build |"
echo "vidyut-chandas"
echo "========================="
mkdir -p "${OUTPUT_DIR}/chandas"
cp -r vidyut-chandas/data "${OUTPUT_DIR}/chandas"
echo "Copied files to output dir."
echo
echo "========================="
echo "vidyut-kosha"
echo "========================="
make create_kosha
make test_kosha
echo
echo "========================="
echo "vidyut-lipi"
echo "========================="
echo "(no data files needed)"
echo
echo "========================="
echo "vidyut-prakriya"
echo "========================="
mkdir -p "${OUTPUT_DIR}/prakriya"
cp -r "vidyut-prakriya/data/" "${OUTPUT_DIR}/prakriya"
echo "Copied files to output dir."
echo
echo "========================="
echo "vidyut-sandhi"
echo "========================="
make create_sandhi_rules
echo
echo "========================="
echo "vidyut-cheda"
echo "========================="
make train_cheda
make eval_cheda
echo
Expand Down
2 changes: 1 addition & 1 deletion src/bin/create_sandhi_rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fn write_rules(rules: &[Rule], path: &Path) -> Result<()> {
fn main() {
let args = Args::parse();
let rules = generate_rules();
let config = Config::new(&args.data_dir);
let config = Config::new(args.data_dir);

if let Err(err) = write_rules(&rules, config.sandhi()) {
println!("{}", err);
Expand Down
4 changes: 2 additions & 2 deletions src/bin/train_cheda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ fn process_sentence(tokens: &[Token], s: &mut Statistics) {
let c = s
.transitions
.entry(prev_state)
.or_insert_with(HashMap::new)
.or_default()
.entry(cur_state)
.or_insert(0);
*c += 1;
Expand All @@ -81,7 +81,7 @@ fn process_sentence(tokens: &[Token], s: &mut Statistics) {
let c = s
.emissions
.entry(cur_state)
.or_insert_with(HashMap::new)
.or_default()
.entry(to_slp1(lemma))
.or_insert(0);
*c += 1;
Expand Down
27 changes: 16 additions & 11 deletions vidyut-chandas/src/akshara.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ use crate::sounds;
/// The weight of an akshara.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Weight {
/// A heavy syllable.
/// A *guru* or heavy syllable.
G,
/// A light syllable.
/// A *laghu* or light syllable.
L,
}

Expand All @@ -18,7 +18,7 @@ pub enum Weight {
/// - It must not start with an anusvara or visarga.
///
/// Together, these three rules mean that an input string has exactly one division into aksharas.
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Akshara {
pub(crate) text: String,
pub(crate) weight: Weight,
Expand All @@ -43,7 +43,7 @@ impl Akshara {
}

/// The length of this akshara in matras.
pub fn num_matras(&self) -> usize {
pub fn num_matras(&self) -> i32 {
match self.weight {
Weight::L => 1,
Weight::G => 2,
Expand All @@ -55,6 +55,7 @@ impl Akshara {
///
/// Any text that is not a valid Sanskrit sound in SLP1 will be ignored.
pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
// Split into aksharas.
let mut akshara_strs = Vec::new();
let mut cur = String::new();
for c in text.as_ref().chars() {
Expand All @@ -71,6 +72,7 @@ pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
if let Some(prev) = akshara_strs.last_mut() {
prev.push(c);
}
// `else` means `M` and `H` follow a non-vowel, which indicates an error.
}

// Skip all other punctuation, spaces, etc.
Expand All @@ -85,8 +87,10 @@ pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
// Case 2: extend old syllable
last.push_str(&cur);
}
// `else` means that `text` contains only consonants, which indicates an error.
}

// Calculate weights.
akshara_strs
.iter()
.enumerate()
Expand All @@ -97,10 +101,12 @@ pub fn scan_line(text: impl AsRef<str>) -> Vec<Akshara> {
false
};

let weight = if !cur.ends_with(sounds::is_hrasva) || next_is_samyogadi {
Weight::G
} else {
let has_hrasva = cur.chars().any(sounds::is_hrasva);
let has_visarga_or_anusvara = matches!(cur.chars().last(), Some('M') | Some('H'));
let weight = if has_hrasva && !next_is_samyogadi && !has_visarga_or_anusvara {
Weight::L
} else {
Weight::G
};
Akshara::new(cur.to_string(), weight)
})
Expand Down Expand Up @@ -254,15 +260,14 @@ mod tests {
#[test]
fn test_scan_block_with_laghu_weight_change() {
let scan = scan_lines("anIkam".lines());
assert_eq!(weights(&scan[0]), vec![L, G, G]);
assert_eq!(weights(&scan[0]), vec![L, G, L]);

// Last syllable of `anIkam` becomes guru due to following samyoga.
let scan = scan_lines("anIkam\nvyUQam".lines());
assert_eq!(weights(&scan[0]), vec![L, G, G]);

// Last syllable of `anIkam` stays laghu due to following vowel.
// TODO: this is buggy.
// let scan = scan_block("anIkam\neva");
// assert_eq!(weights(&scan[0]), vec![L, G, L]);
let scan = scan_lines("anIkam\neva".lines());
assert_eq!(weights(&scan[0]), vec![L, G, L]);
}
}
Loading

0 comments on commit 84c8441

Please sign in to comment.