diff --git a/Links.md b/Links.md
index 94b72a8..8fb8085 100644
--- a/Links.md
+++ b/Links.md
@@ -15,3 +15,4 @@
 * `tokenisers/`
     * [tokeniser-disamb-gt-desc.pmscript](tools-tokenisers-tokeniser-disamb-gt-desc.pmscript.html) ([src](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-disamb-gt-desc.pmscript))
     * [tokeniser-gramcheck-gt-desc.pmscript](tools-tokenisers-tokeniser-gramcheck-gt-desc.pmscript.html) ([src](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-gramcheck-gt-desc.pmscript))
+    * [tokeniser-tts-cggt-desc.pmscript](tools-tokenisers-tokeniser-tts-cggt-desc.pmscript.html) ([src](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-tts-cggt-desc.pmscript))
diff --git a/Makefile.in b/Makefile.in
index c20261c..035b120 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -420,7 +420,7 @@ HEADER = $(srcdir)/index-header.md
 INDEX = $(srcdir)/index.md
 
 # no regenerations while debugging
-doc_DATA = $(INDEX) $(LINKS) $(ALLINONE_MD_PAGE)
+doc_DATA = $(INDEX) $(LINKS) $(ALLINONE_MD_PAGE) lemmacount.json
 DOCC2MDWIKI = $(GTCORE)/scripts/doccomments2ghpages.awk
 DOCC2MDWIKI_CG3 = $(GTCORE)/scripts/doccomments2ghpages-vislcg.awk
 GRAPHPLOTTER = $(GTCORE)/scripts/plot-speller-progress.R
@@ -932,6 +932,11 @@ regenerate-markdown: generate-markdown
 @WANT_SPELLERS_FALSE@	@echo need to configure --enable-spellers to generate statistics
 @WANT_SPELLERS_FALSE@	touch $@
 
+# Generate endpoint json file for shield.io lemma count badge.
+# Only to be stored in the gh-pages branch, ignored in main.
+$(srcdir)/lemmacount.json: $(top_srcdir)/src/fst/stems/*.lexc
+	$(AM_V_GEN)$(GTCORE)/scripts/make-lemmacount.json.sh $(abs_top_srcdir) > $@
+
 # Convert source filenames to extracted documentation filenames, VPATH safe:
 # ../../../src/fst/stems/adverbs.lexc => src-fst-stems-adverbs.lexc.md
 define src2md
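The new `lemmacount.json` rule above delegates the actual counting to `$(GTCORE)/scripts/make-lemmacount.json.sh`, which is not part of this diff. Below is a minimal sketch of what such a generator could look like; the grep heuristic for counting lemma entries in the stem lexica is an assumption, not the real script's logic, and only the output shape is taken from the `lemmacount.json` file added further down in this changeset.

```sh
#!/bin/sh
# Hypothetical stand-in for $(GTCORE)/scripts/make-lemmacount.json.sh:
# count lemma entries in the stem lexica and print a shields.io endpoint JSON.
REPO_ROOT="${1:?usage: $0 path-to-language-repo}"

# Assumption: every non-comment line containing ';' in src/fst/stems/*.lexc
# is one lemma entry; the real script may count differently.
COUNT=$(cat "$REPO_ROOT"/src/fst/stems/*.lexc | grep ';' | grep -cv '^[[:space:]]*!')

# Same JSON shape as the lemmacount.json file added later in this diff.
printf '{ "schemaVersion": 1, "label": "Lemmas", "message": "%s", "color": "black" }\n' "$COUNT"
```

The Makefile rule redirects stdout to `$@`, so a generator in this style only needs to print the JSON.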
diff --git a/hin.md b/hin.md
index ffb83dd..760751d 100644
--- a/hin.md
+++ b/hin.md
@@ -803,3 +803,67 @@ Finally we mark as a token any sequence making up a:
 
 ---
 
+# TTS tokenisation for smj
+
+Requires a recent version of HFST (3.10.0 / git revision >= 3aecdbc).
+Then just:
+```sh
+make
+echo "ja, ja" \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+```
+
+More usage examples:
+```sh
+echo "Juos gorreválggain lea (dárbbašlaš) deavdit gáibádusa \
+boasttu olmmoš, man mielde lahtuid." \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+echo "(gáfe) 'ja' ja 3. ja? ц jaja ukjend \"ukjend\"" \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+echo "márffibiillagáffe" \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+```
+
+Pmatch documentation:
+
+
+Characters which have analyses in the lexicon, but can appear without spaces
+before/after, that is, with no context conditions, and adjacent to words:
+* Punct contains the ASCII punctuation marks
+* The symbol after the m-dash is the soft hyphen `U+00AD`
+* The symbol following {•} is the byte-order mark / zero-width no-break space
+`U+FEFF`.
+
+Whitespace contains ASCII white space, and
+the List contains some Unicode white space characters:
+* En Quad U+2000 to Zero-Width Joiner U+200D
+* Narrow No-Break Space U+202F
+* Medium Mathematical Space U+205F
+* Word Joiner U+2060
+
+Apart from what's in our morphology, there are
+1) unknown word-like forms, and
+2) unmatched strings.
+We want to give 1) a match, but let 2) be treated specially by hfst-tokenise -a
+* select extended Latin symbols
+* select symbols
+* various symbols from the Private Use Area (probably Microsoft),
+so far:
+* U+F0B7 for "x in box"
+
+TODO: Could use something like this, but built-ins don't include šžđčŋ:
+
+Simply give an empty reading when something is unknown:
+hfst-tokenise --giella-cg will treat such empty analyses as unknowns, and
+remove empty analyses from other readings. Empty readings are also
+legal in CG; they get a default baseform equal to the wordform, but
+no tag to check, so it's safer to let hfst-tokenise handle them.
+
+This needs hfst-tokenise to output things differently depending on the tag they get.
+
+* * *
+
+This (part of) documentation was generated from [tools/tokenisers/tokeniser-tts-cggt-desc.pmscript](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-tts-cggt-desc.pmscript)
+
+---
+
diff --git a/index-header.md b/index-header.md
index 5bbc8bb..ba6575a 100644
--- a/index-header.md
+++ b/index-header.md
@@ -1,9 +1,10 @@
 # Hindi documentation
 
 [![Maturity: Experiment](https://img.shields.io/badge/Maturity-Experiment-black.svg)](https://giellalt.github.io/MaturityClassification.html)
+![Endpoint Badge](https://img.shields.io/endpoint?url=https%3A%2F%2Fraw.githubusercontent.com%2Fgiellalt%2Flang-hin%2Fgh-pages%2Flemmacount.json)
 [![License](https://img.shields.io/github/license/giellalt/lang-hin)](https://github.com/giellalt/lang-hin/blob/main/LICENSE)
 [![Issues](https://img.shields.io/github/issues/giellalt/lang-hin)](https://github.com/giellalt/lang-hin/issues)
-[![Build Status](https://divvun-tc.thetc.se/api/github/v1/repository/giellalt/lang-hin/main/badge.svg)](https://github.com/giellalt/lang-hin/actions)
+[![Build Status](https://divvun-tc.giellalt.org/api/github/v1/repository/giellalt/lang-hin/main/badge.svg)](https://github.com/giellalt/lang-hin/actions)
 
 This page documents the work on the **Hindi language model**.
 
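The new endpoint badge only renders once `lemmacount.json` exists on the `gh-pages` branch. A quick way to sanity-check the endpoint, assuming that branch has been published, is to fetch the decoded form of the badge's `url=` parameter:

```sh
# Decoded form of the url= parameter in the endpoint badge above.
curl -s https://raw.githubusercontent.com/giellalt/lang-hin/gh-pages/lemmacount.json
# Expected output, matching the lemmacount.json added in this changeset:
# { "schemaVersion": 1, "label": "Lemmas", "message": "7", "color": "black" }
```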
diff --git a/index.md b/index.md
index 8c9d1a6..8793768 100644
--- a/index.md
+++ b/index.md
@@ -1,9 +1,10 @@
 # Hindi documentation
 
 [![Maturity: Experiment](https://img.shields.io/badge/Maturity-Experiment-black.svg)](https://giellalt.github.io/MaturityClassification.html)
+![Endpoint Badge](https://img.shields.io/endpoint?url=https%3A%2F%2Fraw.githubusercontent.com%2Fgiellalt%2Flang-hin%2Fgh-pages%2Flemmacount.json)
 [![License](https://img.shields.io/github/license/giellalt/lang-hin)](https://github.com/giellalt/lang-hin/blob/main/LICENSE)
 [![Issues](https://img.shields.io/github/issues/giellalt/lang-hin)](https://github.com/giellalt/lang-hin/issues)
-[![Build Status](https://divvun-tc.thetc.se/api/github/v1/repository/giellalt/lang-hin/main/badge.svg)](https://github.com/giellalt/lang-hin/actions)
+[![Build Status](https://divvun-tc.giellalt.org/api/github/v1/repository/giellalt/lang-hin/main/badge.svg)](https://github.com/giellalt/lang-hin/actions)
 
 This page documents the work on the **Hindi language model**.
 
@@ -32,3 +33,4 @@ Below is an autogenerated list of documentation pages built from structured comm
 * `tokenisers/`
     * [tokeniser-disamb-gt-desc.pmscript](tools-tokenisers-tokeniser-disamb-gt-desc.pmscript.html) ([src](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-disamb-gt-desc.pmscript))
     * [tokeniser-gramcheck-gt-desc.pmscript](tools-tokenisers-tokeniser-gramcheck-gt-desc.pmscript.html) ([src](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-gramcheck-gt-desc.pmscript))
+    * [tokeniser-tts-cggt-desc.pmscript](tools-tokenisers-tokeniser-tts-cggt-desc.pmscript.html) ([src](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-tts-cggt-desc.pmscript))
diff --git a/lemmacount.json b/lemmacount.json
new file mode 100644
index 0000000..2e9538e
--- /dev/null
+++ b/lemmacount.json
@@ -0,0 +1 @@
+{ "schemaVersion": 1, "label": "Lemmas", "message": "7", "color": "black" }
diff --git a/tools-tokenisers-tokeniser-tts-cggt-desc.pmscript.md b/tools-tokenisers-tokeniser-tts-cggt-desc.pmscript.md
new file mode 100644
index 0000000..8145ffa
--- /dev/null
+++ b/tools-tokenisers-tokeniser-tts-cggt-desc.pmscript.md
@@ -0,0 +1,64 @@
+# TTS tokenisation for smj
+
+Requires a recent version of HFST (3.10.0 / git revision >= 3aecdbc).
+Then just:
+```sh
+make
+echo "ja, ja" \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+```
+
+More usage examples:
+```sh
+echo "Juos gorreválggain lea (dárbbašlaš) deavdit gáibádusa \
+boasttu olmmoš, man mielde lahtuid." \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+echo "(gáfe) 'ja' ja 3. ja? ц jaja ukjend \"ukjend\"" \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+echo "márffibiillagáffe" \
+| hfst-tokenise --giella-cg tokeniser-disamb-gt-desc.pmhfst
+```
+
+Pmatch documentation:
+
+
+Characters which have analyses in the lexicon, but can appear without spaces
+before/after, that is, with no context conditions, and adjacent to words:
+* Punct contains the ASCII punctuation marks
+* The symbol after the m-dash is the soft hyphen `U+00AD`
+* The symbol following {•} is the byte-order mark / zero-width no-break space
+`U+FEFF`.
+
+Whitespace contains ASCII white space, and
+the List contains some Unicode white space characters:
+* En Quad U+2000 to Zero-Width Joiner U+200D
+* Narrow No-Break Space U+202F
+* Medium Mathematical Space U+205F
+* Word Joiner U+2060
+
+Apart from what's in our morphology, there are
+1) unknown word-like forms, and
+2) unmatched strings.
+We want to give 1) a match, but let 2) be treated specially by hfst-tokenise -a
+* select extended Latin symbols
+* select symbols
+* various symbols from the Private Use Area (probably Microsoft),
+so far:
+* U+F0B7 for "x in box"
+
+TODO: Could use something like this, but built-ins don't include šžđčŋ:
+
+Simply give an empty reading when something is unknown:
+hfst-tokenise --giella-cg will treat such empty analyses as unknowns, and
+remove empty analyses from other readings. Empty readings are also
+legal in CG; they get a default baseform equal to the wordform, but
+no tag to check, so it's safer to let hfst-tokenise handle them.
+
+This needs hfst-tokenise to output things differently depending on the tag they get.
+
+* * *
+
+This (part of) documentation was generated from [tools/tokenisers/tokeniser-tts-cggt-desc.pmscript](https://github.com/giellalt/lang-hin/blob/main/tools/tokenisers/tokeniser-tts-cggt-desc.pmscript)
+
+---
+
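The usage examples in the generated page pipe text through `tokeniser-disamb-gt-desc.pmhfst`. To try the TTS tokeniser documented here, the compiled transducer would normally follow the same naming pattern; a sketch, assuming the usual giellalt layout where each `.pmscript` under `tools/tokenisers/` compiles to a `.pmhfst` of the same basename:

```sh
# Assumes a configured build and that tokeniser-tts-cggt-desc.pmscript
# compiles to tokeniser-tts-cggt-desc.pmhfst; adjust paths if the local
# setup differs.
cd tools/tokenisers
make
echo "ja, ja" \
| hfst-tokenise --giella-cg tokeniser-tts-cggt-desc.pmhfst
```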