From ed859974b1d09ee1278d686a40e2f0069ee5a449 Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Wed, 17 Apr 2024 16:46:24 +0000 Subject: [PATCH 1/9] add test --- test/data/short.aligned.case.nlp | 2 +- test/data/short.aligned.punc.nlp | 2 +- test/data/short.aligned.punc_case.nlp | 2 +- test/data/short_punc.ref.nlp | 66 +++++++++++++-------------- test/fstalign_Test.cc | 2 +- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/test/data/short.aligned.case.nlp b/test/data/short.aligned.case.nlp index d2fd08c..1d10606 100644 --- a/test/data/short.aligned.case.nlp +++ b/test/data/short.aligned.case.nlp @@ -23,7 +23,7 @@ sure|1|0.0000|0.0000|.||LC|[]|[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| -Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| about|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short.aligned.punc.nlp b/test/data/short.aligned.punc.nlp index 421d1ac..8c4cf3c 100644 --- a/test/data/short.aligned.punc.nlp +++ b/test/data/short.aligned.punc.nlp @@ -31,7 +31,7 @@ sure|1|0.0000|0.0000|.||LC|[]|[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| -Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| ,|1|0.0000|0.0000|||||[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short.aligned.punc_case.nlp b/test/data/short.aligned.punc_case.nlp index affb08c..07615c3 100644 --- a/test/data/short.aligned.punc_case.nlp +++ b/test/data/short.aligned.punc_case.nlp @@ -31,7 +31,7 @@ sure|1|0.0000|0.0000|.||LC|[]|[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| -Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| ,|1|0.0000|0.0000|||||[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short_punc.ref.nlp b/test/data/short_punc.ref.nlp index 35e5ec5..1710724 100644 --- a/test/data/short_punc.ref.nlp +++ b/test/data/short_punc.ref.nlp @@ -1,33 +1,33 @@ -token|speaker|ts|endTs|punctuation|case|tags -|2||||LC|[] -Yeah|1|||,|UC|[] -yeah|1|||,|LC|[] -right|1|||.|LC|[] -Yeah|1|||,|UC|[] -all|1||||LC|[] -right|1|||,|LC|[] -probably|1||||LC|[] -just|1||||LC|[] -that|1|||.|LC|[] -Are|3||||UC|[] -there|3||||LC|[] -any|3||||LC|[] -visuals|3||||LC|[] -that|3||||LC|[] -come|3||||LC|[] -to|3||||LC|[] -mind|3||||LC|[] -or-|3||||LC|[] -Yeah|1|||,|UC|[] -sure|1|||.|LC|[] -When|1||||UC|[] -I|1||||CA|[] -hear|1||||LC|[] -Foobar|1|||,|UC|[] -I|1||||CA|[] -think|1||||LC|[] -about|1||||LC|[] -just|1||||LC|[] -that|1|||:|LC|[] -foo|1||||LC|[] -a|1||||LC|[] +token|speaker|ts|endTs|punctuation|case|tags|wer_tags +|2||||LC|[]|[] +Yeah|1|||,|UC|[]|[] +yeah|1|||,|LC|[]|[] +right|1|||.|LC|[]|[] +Yeah|1|||,|UC|[]|[] +all|1||||LC|[]|[] +right|1|||,|LC|[]|[] +probably|1||||LC|[]|[] +just|1||||LC|[]|[] +that|1|||.|LC|[]|[] +Are|3||||UC|[]|[] +there|3||||LC|[]|[] +any|3||||LC|[]|[] +visuals|3||||LC|[]|[] +that|3||||LC|[]|[] +come|3||||LC|[]|[] +to|3||||LC|[]|[] +mind|3||||LC|[]|[] +or-|3||||LC|[]|[] +Yeah|1|||,|UC|[]|[] +sure|1|||.|LC|[]|[] +When|1||||UC|[]|[] +I|1||||CA|[]|[] +hear|1||||LC|[]|[] +Foobar|1|||,|UC|[]|['1', '2'] +I|1||||CA|[]|[] +think|1||||LC|[]|[] +about|1||||LC|[]|[] +just|1||||LC|[]|[] +that|1|||:|LC|[]|[] +foo|1||||LC|[]|[] +a|1||||LC|[]|[] diff --git a/test/fstalign_Test.cc b/test/fstalign_Test.cc index 4b23e63..1a182e3 100644 --- a/test/fstalign_Test.cc +++ b/test/fstalign_Test.cc @@ -681,7 +681,7 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { SECTION("NLP Hypothesis: wer with case and punctuation(nlp output)") { const auto result = - exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case"); + exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case --wer-sidecar short_punc.wer_tag.json"); const auto testFile = std::string{TEST_DATA} + "short.aligned.punc_case.nlp"; REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); From 8f73d42c7957d242f4ea3dc6f58fae893f723815 Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Wed, 17 Apr 2024 19:42:13 +0000 Subject: [PATCH 2/9] add test --- test/data/short.sbs.txt | 125 ++++++++++++++++++++++++++++++ test/data/short_punc.wer_tag.json | 8 ++ test/fstalign_Test.cc | 2 + 3 files changed, 135 insertions(+) create mode 100644 test/data/short.sbs.txt create mode 100644 test/data/short_punc.wer_tag.json diff --git a/test/data/short.sbs.txt b/test/data/short.sbs.txt new file mode 100644 index 0000000..bb1cc40 --- /dev/null +++ b/test/data/short.sbs.txt @@ -0,0 +1,125 @@ + ref_token hyp_token IsErr Class Wer_Tag_Entities + + yeah yeah + , ERR + yeah ERR + , ERR + right right + . ERR + yeah ERR + , ERR + all ERR + right ERR + , i'll ERR + probably do ERR + just just + that that + . ERR + are are + there there + any any + visuals visuals + that that + come come + to to + mind mind + or or + yeah yeah + , ERR + sure sure + . ERR + when when + i i + hear hear + foobar foobar ###1_PROPER_NOUN###|###2_SPACY>ORG###| + , ERR + i i + think think + about about + just just + that that + : ERR + foo foobar ERR + a a +------------------------------------------------------------ + Line Group + 4 , yeah , <-> *** + 8 . yeah , all right , probably <-> i'll do + 17 . <-> *** + 28 , <-> *** + 30 . <-> *** + 35 , <-> *** + 41 : foo <-> foobar +------------------------------------------------------------ + Unigram Prec. Recall + , 0/0 (0.0 %) 0/6 (0.0 %) + . 0/0 (0.0 %) 0/3 (0.0 %) + : 0/0 (0.0 %) 0/1 (0.0 %) + all 0/0 (0.0 %) 0/1 (0.0 %) + do 0/1 (0.0 %) 0/0 (0.0 %) + foo 0/0 (0.0 %) 0/1 (0.0 %) + i'll 0/1 (0.0 %) 0/0 (0.0 %) + probably 0/0 (0.0 %) 0/1 (0.0 %) + foobar 1/2 (50.0 %) 1/1 (100.0 %) + right 1/1 (100.0 %) 1/2 (50.0 %) + yeah 2/2 (100.0 %) 2/4 (50.0 %) + 1/1 (100.0 %) 1/1 (100.0 %) + a 1/1 (100.0 %) 1/1 (100.0 %) + about 1/1 (100.0 %) 1/1 (100.0 %) + any 1/1 (100.0 %) 1/1 (100.0 %) + are 1/1 (100.0 %) 1/1 (100.0 %) + come 1/1 (100.0 %) 1/1 (100.0 %) + hear 1/1 (100.0 %) 1/1 (100.0 %) + i 2/2 (100.0 %) 2/2 (100.0 %) + just 2/2 (100.0 %) 2/2 (100.0 %) + mind 1/1 (100.0 %) 1/1 (100.0 %) + or 1/1 (100.0 %) 1/1 (100.0 %) + sure 1/1 (100.0 %) 1/1 (100.0 %) + that 3/3 (100.0 %) 3/3 (100.0 %) + there 1/1 (100.0 %) 1/1 (100.0 %) + think 1/1 (100.0 %) 1/1 (100.0 %) + to 1/1 (100.0 %) 1/1 (100.0 %) + visuals 1/1 (100.0 %) 1/1 (100.0 %) + when 1/1 (100.0 %) 1/1 (100.0 %) +------------------------------------------------------------ + Bigram Precision Recall + , all 0/0 (0.0 %) 0/1 (0.0 %) + , i 0/0 (0.0 %) 0/1 (0.0 %) + , probably 0/0 (0.0 %) 0/1 (0.0 %) + , right 0/0 (0.0 %) 0/1 (0.0 %) + , sure 0/0 (0.0 %) 0/1 (0.0 %) + , yeah 0/0 (0.0 %) 0/1 (0.0 %) + . are 0/0 (0.0 %) 0/1 (0.0 %) + . when 0/0 (0.0 %) 0/1 (0.0 %) + . yeah 0/0 (0.0 %) 0/1 (0.0 %) + : foo 0/0 (0.0 %) 0/1 (0.0 %) + all right 0/0 (0.0 %) 0/1 (0.0 %) + do just 0/1 (0.0 %) 0/0 (0.0 %) + foo a 0/0 (0.0 %) 0/1 (0.0 %) + foobar , 0/0 (0.0 %) 0/1 (0.0 %) + foobar a 0/1 (0.0 %) 0/0 (0.0 %) + i'll do 0/1 (0.0 %) 0/0 (0.0 %) + probably just 0/0 (0.0 %) 0/1 (0.0 %) + right , 0/0 (0.0 %) 0/1 (0.0 %) + right . 0/0 (0.0 %) 0/1 (0.0 %) + sure . 0/0 (0.0 %) 0/1 (0.0 %) + that . 0/0 (0.0 %) 0/1 (0.0 %) + that : 0/0 (0.0 %) 0/1 (0.0 %) + yeah , 0/0 (0.0 %) 0/4 (0.0 %) + yeah 1/1 (100.0 %) 1/1 (100.0 %) + about just 1/1 (100.0 %) 1/1 (100.0 %) + any visuals 1/1 (100.0 %) 1/1 (100.0 %) + are there 1/1 (100.0 %) 1/1 (100.0 %) + come to 1/1 (100.0 %) 1/1 (100.0 %) + hear foobar 1/1 (100.0 %) 1/1 (100.0 %) + i hear 1/1 (100.0 %) 1/1 (100.0 %) + i think 1/1 (100.0 %) 1/1 (100.0 %) + just that 2/2 (100.0 %) 2/2 (100.0 %) + mind or 1/1 (100.0 %) 1/1 (100.0 %) + or yeah 1/1 (100.0 %) 1/1 (100.0 %) + that come 1/1 (100.0 %) 1/1 (100.0 %) + there any 1/1 (100.0 %) 1/1 (100.0 %) + think about 1/1 (100.0 %) 1/1 (100.0 %) + to mind 1/1 (100.0 %) 1/1 (100.0 %) + visuals that 1/1 (100.0 %) 1/1 (100.0 %) + when i 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/data/short_punc.wer_tag.json b/test/data/short_punc.wer_tag.json new file mode 100644 index 0000000..24f8ba3 --- /dev/null +++ b/test/data/short_punc.wer_tag.json @@ -0,0 +1,8 @@ +{ + "1": { + "entity_type": "PROPER_NOUN" + }, + "2": { + "entity_type": "SPACY>ORG" + } +} diff --git a/test/fstalign_Test.cc b/test/fstalign_Test.cc index 1a182e3..e8ef9f1 100644 --- a/test/fstalign_Test.cc +++ b/test/fstalign_Test.cc @@ -683,8 +683,10 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { const auto result = exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case --wer-sidecar short_punc.wer_tag.json"); const auto testFile = std::string{TEST_DATA} + "short.aligned.punc_case.nlp"; + const auto testSbsFile = std::string{TEST_DATA} + "short.sbs.txt"; REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); + REQUIRE(compareFiles(sbs_output.c_str(), testSbsFile.c_str())); REQUIRE_THAT(result, Contains("WER: 13/42 = 0.3095")); REQUIRE_THAT(result, Contains("WER: INS:2 DEL:7 SUB:4")); } From f1fa887c94c81864077bae7a27a8dfb3f09de669 Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Wed, 17 Apr 2024 19:48:31 +0000 Subject: [PATCH 3/9] fix --- src/Nlp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index 06cdfe8..5e2d015 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -30,7 +30,6 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma // fuse multiple rows that have the same id/label into one entry only for (auto &row : records) { - mNlpRows.push_back(row); auto curr_tk = row.token; auto curr_label = row.best_label; auto curr_label_id = row.best_label_id; @@ -48,6 +47,7 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma } row.wer_tags = real_wer_tags; std::string speaker = row.speakerId; + mNlpRows.push_back(row); if (processLabels && curr_label != "") { if (firstTk || curr_label != last_label) { From 7f8fae1f7ab14fabe38d24ccffdb768a13c2eee0 Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Wed, 17 Apr 2024 21:01:02 +0000 Subject: [PATCH 4/9] Add and use wer tag data structure --- src/Nlp.cpp | 18 +++++++++--------- src/Nlp.h | 9 +++++++-- src/fstalign.cpp | 2 +- src/wer.cpp | 15 ++++++--------- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index 5e2d015..6c4c133 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -28,6 +28,7 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma std::string last_label; bool firstTk = true; + auto logger = logger::GetOrCreateLogger("NlpFstLoader"); // fuse multiple rows that have the same id/label into one entry only for (auto &row : records) { auto curr_tk = row.token; @@ -37,15 +38,13 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma auto curr_row_tags = row.wer_tags; // Update wer tags in records to real string labels - vector real_wer_tags; for (auto &tag : curr_row_tags) { - auto real_tag = tag; if (mWerSidecar != Json::nullValue) { - real_tag = "###" + real_tag + "_" + mWerSidecar[real_tag]["entity_type"].asString() + "###"; + tag.entity_type = mWerSidecar[tag.tag_id]["entity_type"].asString(); + logger->info(tag.entity_type); } - real_wer_tags.push_back(real_tag); } - row.wer_tags = real_wer_tags; + row.wer_tags = curr_row_tags; std::string speaker = row.speakerId; mNlpRows.push_back(row); @@ -411,8 +410,8 @@ std::string NlpReader::GetBestLabel(std::string &labels) { return labels; } -std::vector NlpReader::GetWerTags(std::string &wer_tags_str) { - std::vector wer_tags; +std::vector NlpReader::GetWerTags(std::string &wer_tags_str) { + std::vector wer_tags; if (wer_tags_str == "[]") { return wer_tags; } @@ -420,8 +419,9 @@ std::vector NlpReader::GetWerTags(std::string &wer_tags_str) { int current_pos = 2; auto pos = wer_tags_str.find("'", current_pos); while (pos != -1) { - std::string wer_tag = wer_tags_str.substr(current_pos, pos - current_pos); - wer_tags.push_back(wer_tag); + WerTagEntry entry; + entry.tag_id = wer_tags_str.substr(current_pos, pos - current_pos); + wer_tags.push_back(entry); current_pos = wer_tags_str.find("'", pos + 1) + 1; if (current_pos == 0) { break; diff --git a/src/Nlp.h b/src/Nlp.h index a9757f9..13c246d 100644 --- a/src/Nlp.h +++ b/src/Nlp.h @@ -16,6 +16,11 @@ using namespace std; using namespace fst; +struct WerTagEntry { + string tag_id; + string entity_type; +}; + struct RawNlpRecord { string token; string speakerId; @@ -27,7 +32,7 @@ struct RawNlpRecord { string labels; string best_label; string best_label_id; - vector wer_tags; + vector wer_tags; string confidence; }; @@ -37,7 +42,7 @@ class NlpReader { virtual ~NlpReader(); vector read_from_disk(const std::string &filename); string GetBestLabel(std::string &labels); - vector GetWerTags(std::string &wer_tags_str); + vector GetWerTags(std::string &wer_tags_str); string GetLabelId(std::string &label); }; diff --git a/src/fstalign.cpp b/src/fstalign.cpp index e7cc2fb..96fb33f 100644 --- a/src/fstalign.cpp +++ b/src/fstalign.cpp @@ -619,7 +619,7 @@ void write_stitches_to_nlp(vector& stitches, ofstream &output_nlp_fil << "["; /* for (auto wer_tag : nlpRow.wer_tags) { */ for (auto it = stitch.nlpRow.wer_tags.begin(); it != stitch.nlpRow.wer_tags.end(); ++it) { - output_nlp_file << "'" << *it << "'"; + output_nlp_file << "'" << it->tag_id << "'"; if (std::next(it) != stitch.nlpRow.wer_tags.end()) { output_nlp_file << ", "; } diff --git a/src/wer.cpp b/src/wer.cpp index beca178..5f8f5f2 100644 --- a/src/wer.cpp +++ b/src/wer.cpp @@ -350,19 +350,16 @@ void RecordTagWer(const vector& stitches) { for (const auto &stitch : stitches) { if (!stitch.nlpRow.wer_tags.empty()) { for (auto wer_tag : stitch.nlpRow.wer_tags) { - int tag_start = wer_tag.find_first_not_of('#'); - int tag_end = wer_tag.find('_'); - string wer_tag_id = wer_tag.substr(tag_start, tag_end - tag_start); - wer_results.insert(std::pair(wer_tag_id, {0, 0, 0, 0, 0})); + wer_results.insert(std::pair(wer_tag.tag_id, {0, 0, 0, 0, 0})); // Check with rfind since other comments can be there bool del = stitch.comment.rfind("del", 0) == 0; bool ins = stitch.comment.rfind("ins", 0) == 0; bool sub = stitch.comment.rfind("sub", 0) == 0; - wer_results[wer_tag_id].insertions += ins; - wer_results[wer_tag_id].deletions += del; - wer_results[wer_tag_id].substitutions += sub; + wer_results[wer_tag.tag_id].insertions += ins; + wer_results[wer_tag.tag_id].deletions += del; + wer_results[wer_tag.tag_id].substitutions += sub; if (!ins) { - wer_results[wer_tag_id].numWordsInReference += 1; + wer_results[wer_tag.tag_id].numWordsInReference += 1; } } } @@ -555,7 +552,7 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st string tk_wer_tags = ""; auto wer_tags = p_stitch.nlpRow.wer_tags; for (auto wer_tag: wer_tags) { - tk_wer_tags = tk_wer_tags + wer_tag + "|"; + tk_wer_tags = tk_wer_tags + "###" + wer_tag.tag_id + "_" + wer_tag.entity_type + "###|"; } string ref_tk = p_stitch.reftk; string hyp_tk = p_stitch.hyptk; From 666057ea12e59ca1aead5b99d14548443417096c Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Wed, 17 Apr 2024 21:38:57 +0000 Subject: [PATCH 5/9] fix test --- test/data/short.sbs.txt | 110 +++++++++++++++++++++------------------- test/fstalign_Test.cc | 3 +- 2 files changed, 60 insertions(+), 53 deletions(-) diff --git a/test/data/short.sbs.txt b/test/data/short.sbs.txt index bb1cc40..532b2db 100644 --- a/test/data/short.sbs.txt +++ b/test/data/short.sbs.txt @@ -1,21 +1,21 @@ ref_token hyp_token IsErr Class Wer_Tag_Entities - yeah yeah - , ERR + Yeah Yeah + , , yeah ERR , ERR right right . ERR - yeah ERR + Yeah ERR , ERR all ERR right ERR - , i'll ERR + , I'll ERR probably do ERR just just that that - . ERR - are are + . ? ERR + Are Are there there any any visuals visuals @@ -23,54 +23,57 @@ come come to to mind mind - or or - yeah yeah - , ERR + or or ___100002_SYN_1-1___ + ? ERR + Yeah Yeah + , , sure sure - . ERR - when when - i i + . . + When When + I I hear hear - foobar foobar ###1_PROPER_NOUN###|###2_SPACY>ORG###| - , ERR - i i + Foobar Foobar ###1_PROPER_NOUN###|###2_SPACY>ORG###| + , , + I I think think about about just just that that - : ERR - foo foobar ERR + : : + Foobar ERR + foo , ERR a a ------------------------------------------------------------ Line Group - 4 , yeah , <-> *** - 8 . yeah , all right , probably <-> i'll do - 17 . <-> *** - 28 , <-> *** - 30 . <-> *** - 35 , <-> *** - 41 : foo <-> foobar + 5 yeah , <-> *** + 8 . Yeah , all right , probably <-> I'll do + 17 . <-> ? + 27 *** <-> ? + 43 foo <-> Foobar , ------------------------------------------------------------ Unigram Prec. Recall - , 0/0 (0.0 %) 0/6 (0.0 %) - . 0/0 (0.0 %) 0/3 (0.0 %) - : 0/0 (0.0 %) 0/1 (0.0 %) + ? 0/2 (0.0 %) 0/0 (0.0 %) + I'll 0/1 (0.0 %) 0/0 (0.0 %) all 0/0 (0.0 %) 0/1 (0.0 %) do 0/1 (0.0 %) 0/0 (0.0 %) foo 0/0 (0.0 %) 0/1 (0.0 %) - i'll 0/1 (0.0 %) 0/0 (0.0 %) probably 0/0 (0.0 %) 0/1 (0.0 %) - foobar 1/2 (50.0 %) 1/1 (100.0 %) + yeah 0/0 (0.0 %) 0/1 (0.0 %) + Foobar 1/2 (50.0 %) 1/1 (100.0 %) + , 3/4 (75.0 %) 3/6 (50.0 %) + . 1/1 (100.0 %) 1/3 (33.3 %) right 1/1 (100.0 %) 1/2 (50.0 %) - yeah 2/2 (100.0 %) 2/4 (50.0 %) + Yeah 2/2 (100.0 %) 2/3 (66.7 %) + : 1/1 (100.0 %) 1/1 (100.0 %) 1/1 (100.0 %) 1/1 (100.0 %) + Are 1/1 (100.0 %) 1/1 (100.0 %) + I 2/2 (100.0 %) 2/2 (100.0 %) + When 1/1 (100.0 %) 1/1 (100.0 %) a 1/1 (100.0 %) 1/1 (100.0 %) about 1/1 (100.0 %) 1/1 (100.0 %) any 1/1 (100.0 %) 1/1 (100.0 %) - are 1/1 (100.0 %) 1/1 (100.0 %) come 1/1 (100.0 %) 1/1 (100.0 %) hear 1/1 (100.0 %) 1/1 (100.0 %) - i 2/2 (100.0 %) 2/2 (100.0 %) just 2/2 (100.0 %) 2/2 (100.0 %) mind 1/1 (100.0 %) 1/1 (100.0 %) or 1/1 (100.0 %) 1/1 (100.0 %) @@ -80,46 +83,49 @@ think 1/1 (100.0 %) 1/1 (100.0 %) to 1/1 (100.0 %) 1/1 (100.0 %) visuals 1/1 (100.0 %) 1/1 (100.0 %) - when 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------ Bigram Precision Recall + , a 0/1 (0.0 %) 0/0 (0.0 %) , all 0/0 (0.0 %) 0/1 (0.0 %) - , i 0/0 (0.0 %) 0/1 (0.0 %) , probably 0/0 (0.0 %) 0/1 (0.0 %) , right 0/0 (0.0 %) 0/1 (0.0 %) - , sure 0/0 (0.0 %) 0/1 (0.0 %) , yeah 0/0 (0.0 %) 0/1 (0.0 %) - . are 0/0 (0.0 %) 0/1 (0.0 %) - . when 0/0 (0.0 %) 0/1 (0.0 %) - . yeah 0/0 (0.0 %) 0/1 (0.0 %) - : foo 0/0 (0.0 %) 0/1 (0.0 %) + . Are 0/0 (0.0 %) 0/1 (0.0 %) + . Yeah 0/0 (0.0 %) 0/1 (0.0 %) + : Foobar 0/1 (0.0 %) 0/0 (0.0 %) + ? Are 0/1 (0.0 %) 0/0 (0.0 %) + ? Yeah 0/1 (0.0 %) 0/0 (0.0 %) + I'll do 0/1 (0.0 %) 0/0 (0.0 %) all right 0/0 (0.0 %) 0/1 (0.0 %) do just 0/1 (0.0 %) 0/0 (0.0 %) foo a 0/0 (0.0 %) 0/1 (0.0 %) - foobar , 0/0 (0.0 %) 0/1 (0.0 %) - foobar a 0/1 (0.0 %) 0/0 (0.0 %) - i'll do 0/1 (0.0 %) 0/0 (0.0 %) + or ? 0/1 (0.0 %) 0/0 (0.0 %) probably just 0/0 (0.0 %) 0/1 (0.0 %) right , 0/0 (0.0 %) 0/1 (0.0 %) right . 0/0 (0.0 %) 0/1 (0.0 %) - sure . 0/0 (0.0 %) 0/1 (0.0 %) that . 0/0 (0.0 %) 0/1 (0.0 %) - that : 0/0 (0.0 %) 0/1 (0.0 %) - yeah , 0/0 (0.0 %) 0/4 (0.0 %) - yeah 1/1 (100.0 %) 1/1 (100.0 %) + that ? 0/1 (0.0 %) 0/0 (0.0 %) + yeah , 0/0 (0.0 %) 0/1 (0.0 %) + Foobar , 1/2 (50.0 %) 1/1 (100.0 %) + Yeah , 2/2 (100.0 %) 2/3 (66.7 %) + , I 1/1 (100.0 %) 1/1 (100.0 %) + , sure 1/1 (100.0 %) 1/1 (100.0 %) + . When 1/1 (100.0 %) 1/1 (100.0 %) + Yeah 1/1 (100.0 %) 1/1 (100.0 %) + Are there 1/1 (100.0 %) 1/1 (100.0 %) + I hear 1/1 (100.0 %) 1/1 (100.0 %) + I think 1/1 (100.0 %) 1/1 (100.0 %) + When I 1/1 (100.0 %) 1/1 (100.0 %) about just 1/1 (100.0 %) 1/1 (100.0 %) any visuals 1/1 (100.0 %) 1/1 (100.0 %) - are there 1/1 (100.0 %) 1/1 (100.0 %) come to 1/1 (100.0 %) 1/1 (100.0 %) - hear foobar 1/1 (100.0 %) 1/1 (100.0 %) - i hear 1/1 (100.0 %) 1/1 (100.0 %) - i think 1/1 (100.0 %) 1/1 (100.0 %) + hear Foobar 1/1 (100.0 %) 1/1 (100.0 %) just that 2/2 (100.0 %) 2/2 (100.0 %) mind or 1/1 (100.0 %) 1/1 (100.0 %) - or yeah 1/1 (100.0 %) 1/1 (100.0 %) + sure . 1/1 (100.0 %) 1/1 (100.0 %) + that : 1/1 (100.0 %) 1/1 (100.0 %) that come 1/1 (100.0 %) 1/1 (100.0 %) there any 1/1 (100.0 %) 1/1 (100.0 %) think about 1/1 (100.0 %) 1/1 (100.0 %) to mind 1/1 (100.0 %) 1/1 (100.0 %) visuals that 1/1 (100.0 %) 1/1 (100.0 %) - when i 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/fstalign_Test.cc b/test/fstalign_Test.cc index e8ef9f1..7c30f46 100644 --- a/test/fstalign_Test.cc +++ b/test/fstalign_Test.cc @@ -680,8 +680,9 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { } SECTION("NLP Hypothesis: wer with case and punctuation(nlp output)") { + const auto wer_sidecar_path = TEST_DATA + "short_punc.wer_tag.json"; const auto result = - exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case --wer-sidecar short_punc.wer_tag.json"); + exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case --wer-sidecar " + wer_sidecar_path); const auto testFile = std::string{TEST_DATA} + "short.aligned.punc_case.nlp"; const auto testSbsFile = std::string{TEST_DATA} + "short.sbs.txt"; From 27779ffaeff81009489615be482d220d337b81c8 Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Thu, 18 Apr 2024 14:05:55 +0000 Subject: [PATCH 6/9] Remove debug log --- src/Nlp.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Nlp.cpp b/src/Nlp.cpp index 6c4c133..0bf287f 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -28,7 +28,6 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma std::string last_label; bool firstTk = true; - auto logger = logger::GetOrCreateLogger("NlpFstLoader"); // fuse multiple rows that have the same id/label into one entry only for (auto &row : records) { auto curr_tk = row.token; @@ -41,7 +40,6 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma for (auto &tag : curr_row_tags) { if (mWerSidecar != Json::nullValue) { tag.entity_type = mWerSidecar[tag.tag_id]["entity_type"].asString(); - logger->info(tag.entity_type); } } row.wer_tags = curr_row_tags; From 374b2c500296ce26eadac396c477b5d755236a1e Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Thu, 18 Apr 2024 14:10:48 +0000 Subject: [PATCH 7/9] remove unigram and bigram info from sbs output --- src/wer.cpp | 20 ---------- test/data/short.sbs.txt | 79 ------------------------------------- test/data/syn_1.hyp.sbs | 53 ------------------------- test/data/twenty.hyp-a2.sbs | 22 ----------- test/data/twenty.hyp.sbs | 21 ---------- 5 files changed, 195 deletions(-) diff --git a/src/wer.cpp b/src/wer.cpp index 5f8f5f2..94b1b3c 100644 --- a/src/wer.cpp +++ b/src/wer.cpp @@ -614,18 +614,6 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st jsonLogger::JsonLogger::getLogger().root["wer"]["unigrams"][word]["precision"] = u.precision; jsonLogger::JsonLogger::getLogger().root["wer"]["unigrams"][word]["recall"] = u.recall; } - // output error unigrams - myfile << string(60, '-') << endl << fmt::format("{0:>20}\t{1:10}\t{2:10}", "Unigram", "Prec.", "Recall") << endl; - for (const auto &a : topAlignment.unigram_stats) { - string word = a.first; - gram_error_counter u = a.second; - myfile << fmt::format("{0:>20}\t{1}/{2} ({3:.1f} %)\t{4}/{5} ({6:.1f} %)", word, u.correct, - (u.correct + u.ins + u.subst_fp), (float)u.precision, u.correct, (u.correct + u.del + u.subst_fn), - (float)u.recall) - << endl; - } - - myfile << string(60, '-') << endl << fmt::format("{0:>20}\t{1:20}\t{2:20}", "Bigram", "Precision", "Recall") << endl; for (const auto &a : topAlignment.bigrams_stats) { string word = a.first; @@ -638,14 +626,6 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st jsonLogger::JsonLogger::getLogger().root["wer"]["bigrams"][word]["precision"] = u.precision; jsonLogger::JsonLogger::getLogger().root["wer"]["bigrams"][word]["recall"] = u.recall; } - for (const auto &a : topAlignment.bigrams_stats) { - string word = a.first; - gram_error_counter u = a.second; - myfile << fmt::format("{0:>20}\t{1}/{2} ({3:.1f} %)\t{4}/{5} ({6:.1f} %)", word, u.correct, - (u.correct + u.ins + u.subst_fp), (float)u.precision, u.correct, (u.correct + u.del + u.subst_fn), - (float)u.recall) - << endl; - } myfile.close(); } diff --git a/test/data/short.sbs.txt b/test/data/short.sbs.txt index 532b2db..1406b34 100644 --- a/test/data/short.sbs.txt +++ b/test/data/short.sbs.txt @@ -50,82 +50,3 @@ 17 . <-> ? 27 *** <-> ? 43 foo <-> Foobar , ------------------------------------------------------------- - Unigram Prec. Recall - ? 0/2 (0.0 %) 0/0 (0.0 %) - I'll 0/1 (0.0 %) 0/0 (0.0 %) - all 0/0 (0.0 %) 0/1 (0.0 %) - do 0/1 (0.0 %) 0/0 (0.0 %) - foo 0/0 (0.0 %) 0/1 (0.0 %) - probably 0/0 (0.0 %) 0/1 (0.0 %) - yeah 0/0 (0.0 %) 0/1 (0.0 %) - Foobar 1/2 (50.0 %) 1/1 (100.0 %) - , 3/4 (75.0 %) 3/6 (50.0 %) - . 1/1 (100.0 %) 1/3 (33.3 %) - right 1/1 (100.0 %) 1/2 (50.0 %) - Yeah 2/2 (100.0 %) 2/3 (66.7 %) - : 1/1 (100.0 %) 1/1 (100.0 %) - 1/1 (100.0 %) 1/1 (100.0 %) - Are 1/1 (100.0 %) 1/1 (100.0 %) - I 2/2 (100.0 %) 2/2 (100.0 %) - When 1/1 (100.0 %) 1/1 (100.0 %) - a 1/1 (100.0 %) 1/1 (100.0 %) - about 1/1 (100.0 %) 1/1 (100.0 %) - any 1/1 (100.0 %) 1/1 (100.0 %) - come 1/1 (100.0 %) 1/1 (100.0 %) - hear 1/1 (100.0 %) 1/1 (100.0 %) - just 2/2 (100.0 %) 2/2 (100.0 %) - mind 1/1 (100.0 %) 1/1 (100.0 %) - or 1/1 (100.0 %) 1/1 (100.0 %) - sure 1/1 (100.0 %) 1/1 (100.0 %) - that 3/3 (100.0 %) 3/3 (100.0 %) - there 1/1 (100.0 %) 1/1 (100.0 %) - think 1/1 (100.0 %) 1/1 (100.0 %) - to 1/1 (100.0 %) 1/1 (100.0 %) - visuals 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------- - Bigram Precision Recall - , a 0/1 (0.0 %) 0/0 (0.0 %) - , all 0/0 (0.0 %) 0/1 (0.0 %) - , probably 0/0 (0.0 %) 0/1 (0.0 %) - , right 0/0 (0.0 %) 0/1 (0.0 %) - , yeah 0/0 (0.0 %) 0/1 (0.0 %) - . Are 0/0 (0.0 %) 0/1 (0.0 %) - . Yeah 0/0 (0.0 %) 0/1 (0.0 %) - : Foobar 0/1 (0.0 %) 0/0 (0.0 %) - ? Are 0/1 (0.0 %) 0/0 (0.0 %) - ? Yeah 0/1 (0.0 %) 0/0 (0.0 %) - I'll do 0/1 (0.0 %) 0/0 (0.0 %) - all right 0/0 (0.0 %) 0/1 (0.0 %) - do just 0/1 (0.0 %) 0/0 (0.0 %) - foo a 0/0 (0.0 %) 0/1 (0.0 %) - or ? 0/1 (0.0 %) 0/0 (0.0 %) - probably just 0/0 (0.0 %) 0/1 (0.0 %) - right , 0/0 (0.0 %) 0/1 (0.0 %) - right . 0/0 (0.0 %) 0/1 (0.0 %) - that . 0/0 (0.0 %) 0/1 (0.0 %) - that ? 0/1 (0.0 %) 0/0 (0.0 %) - yeah , 0/0 (0.0 %) 0/1 (0.0 %) - Foobar , 1/2 (50.0 %) 1/1 (100.0 %) - Yeah , 2/2 (100.0 %) 2/3 (66.7 %) - , I 1/1 (100.0 %) 1/1 (100.0 %) - , sure 1/1 (100.0 %) 1/1 (100.0 %) - . When 1/1 (100.0 %) 1/1 (100.0 %) - Yeah 1/1 (100.0 %) 1/1 (100.0 %) - Are there 1/1 (100.0 %) 1/1 (100.0 %) - I hear 1/1 (100.0 %) 1/1 (100.0 %) - I think 1/1 (100.0 %) 1/1 (100.0 %) - When I 1/1 (100.0 %) 1/1 (100.0 %) - about just 1/1 (100.0 %) 1/1 (100.0 %) - any visuals 1/1 (100.0 %) 1/1 (100.0 %) - come to 1/1 (100.0 %) 1/1 (100.0 %) - hear Foobar 1/1 (100.0 %) 1/1 (100.0 %) - just that 2/2 (100.0 %) 2/2 (100.0 %) - mind or 1/1 (100.0 %) 1/1 (100.0 %) - sure . 1/1 (100.0 %) 1/1 (100.0 %) - that : 1/1 (100.0 %) 1/1 (100.0 %) - that come 1/1 (100.0 %) 1/1 (100.0 %) - there any 1/1 (100.0 %) 1/1 (100.0 %) - think about 1/1 (100.0 %) 1/1 (100.0 %) - to mind 1/1 (100.0 %) 1/1 (100.0 %) - visuals that 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/data/syn_1.hyp.sbs b/test/data/syn_1.hyp.sbs index 3e71f78..69df6df 100644 --- a/test/data/syn_1.hyp.sbs +++ b/test/data/syn_1.hyp.sbs @@ -30,56 +30,3 @@ 14 um it's <-> it is uh 21 do <-> *** 24 *** <-> uh ------------------------------------------------------------- - Unigram Prec. Recall - do 0/0 (0.0 %) 0/1 (0.0 %) - is 0/1 (0.0 %) 0/0 (0.0 %) - it 0/1 (0.0 %) 0/0 (0.0 %) - it's 0/0 (0.0 %) 0/1 (0.0 %) - uh 0/2 (0.0 %) 0/0 (0.0 %) - um 0/1 (0.0 %) 0/1 (0.0 %) - we 0/0 (0.0 %) 0/1 (0.0 %) - we'll 0/1 (0.0 %) 0/0 (0.0 %) - will 1/1 (100.0 %) 1/2 (50.0 %) - a 2/2 (100.0 %) 2/2 (100.0 %) - evening 1/1 (100.0 %) 1/1 (100.0 %) - good 1/1 (100.0 %) 1/1 (100.0 %) - happen 1/1 (100.0 %) 1/1 (100.0 %) - have 1/1 (100.0 %) 1/1 (100.0 %) - matter 1/1 (100.0 %) 1/1 (100.0 %) - nice 1/1 (100.0 %) 1/1 (100.0 %) - no 1/1 (100.0 %) 1/1 (100.0 %) - opportunity 1/1 (100.0 %) 1/1 (100.0 %) - see 1/1 (100.0 %) 1/1 (100.0 %) - this 1/1 (100.0 %) 1/1 (100.0 %) - to 1/1 (100.0 %) 1/1 (100.0 %) - what 1/1 (100.0 %) 1/1 (100.0 %) - you'll 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------- - Bigram Precision Recall - do this 0/0 (0.0 %) 0/1 (0.0 %) - evening um 0/1 (0.0 %) 0/0 (0.0 %) - happen it 0/1 (0.0 %) 0/0 (0.0 %) - is uh 0/1 (0.0 %) 0/0 (0.0 %) - it is 0/1 (0.0 %) 0/0 (0.0 %) - it's a 0/0 (0.0 %) 0/1 (0.0 %) - to do 0/0 (0.0 %) 0/1 (0.0 %) - uh a 0/1 (0.0 %) 0/0 (0.0 %) - uh see 0/1 (0.0 %) 0/0 (0.0 %) - um it's 0/0 (0.0 %) 0/1 (0.0 %) - um no 0/1 (0.0 %) 0/0 (0.0 %) - we will 0/0 (0.0 %) 0/1 (0.0 %) - we'll have 0/1 (0.0 %) 0/0 (0.0 %) - will have 0/0 (0.0 %) 0/1 (0.0 %) - you'll uh 0/1 (0.0 %) 0/0 (0.0 %) - a good 1/1 (100.0 %) 1/1 (100.0 %) - a nice 1/1 (100.0 %) 1/1 (100.0 %) - good opportunity 1/1 (100.0 %) 1/1 (100.0 %) - have a 1/1 (100.0 %) 1/1 (100.0 %) - matter what 1/1 (100.0 %) 1/1 (100.0 %) - nice evening 1/1 (100.0 %) 1/1 (100.0 %) - no matter 1/1 (100.0 %) 1/1 (100.0 %) - opportunity to 1/1 (100.0 %) 1/1 (100.0 %) - this you'll 1/1 (100.0 %) 1/1 (100.0 %) - what will 1/1 (100.0 %) 1/1 (100.0 %) - will happen 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/data/twenty.hyp-a2.sbs b/test/data/twenty.hyp-a2.sbs index 7d49c95..ccaf44f 100644 --- a/test/data/twenty.hyp-a2.sbs +++ b/test/data/twenty.hyp-a2.sbs @@ -13,25 +13,3 @@ 2 20 <-> *** 5 twenty <-> thirty 9 *** <-> two ------------------------------------------------------------- - Unigram Prec. Recall - 20 0/0 (0.0 %) 0/1 (0.0 %) - thirty 0/1 (0.0 %) 0/0 (0.0 %) - two 0/1 (0.0 %) 0/0 (0.0 %) - twenty 2/2 (100.0 %) 2/3 (66.7 %) - in 1/1 (100.0 %) 1/1 (100.0 %) - is 1/1 (100.0 %) 1/1 (100.0 %) - one 1/1 (100.0 %) 1/1 (100.0 %) - three 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------- - Bigram Precision Recall - 20 in 0/0 (0.0 %) 0/1 (0.0 %) - thirty is 0/1 (0.0 %) 0/0 (0.0 %) - twenty is 0/0 (0.0 %) 0/1 (0.0 %) - twenty thirty 0/1 (0.0 %) 0/0 (0.0 %) - twenty twenty 0/0 (0.0 %) 0/1 (0.0 %) - twenty two 0/1 (0.0 %) 0/0 (0.0 %) - two three 0/1 (0.0 %) 0/0 (0.0 %) - in twenty 1/1 (100.0 %) 1/1 (100.0 %) - is one 1/1 (100.0 %) 1/1 (100.0 %) - one twenty 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/data/twenty.hyp.sbs b/test/data/twenty.hyp.sbs index e75a474..9dccead 100644 --- a/test/data/twenty.hyp.sbs +++ b/test/data/twenty.hyp.sbs @@ -13,24 +13,3 @@ 2 twenty <-> *** 5 twenty <-> thirty 9 *** <-> two ------------------------------------------------------------- - Unigram Prec. Recall - thirty 0/1 (0.0 %) 0/0 (0.0 %) - two 0/1 (0.0 %) 0/0 (0.0 %) - twenty 2/2 (100.0 %) 2/4 (50.0 %) - in 1/1 (100.0 %) 1/1 (100.0 %) - is 1/1 (100.0 %) 1/1 (100.0 %) - one 1/1 (100.0 %) 1/1 (100.0 %) - three 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------- - Bigram Precision Recall - thirty is 0/1 (0.0 %) 0/0 (0.0 %) - twenty in 0/0 (0.0 %) 0/1 (0.0 %) - twenty is 0/0 (0.0 %) 0/1 (0.0 %) - twenty thirty 0/1 (0.0 %) 0/0 (0.0 %) - twenty twenty 0/0 (0.0 %) 0/1 (0.0 %) - twenty two 0/1 (0.0 %) 0/0 (0.0 %) - two three 0/1 (0.0 %) 0/0 (0.0 %) - in twenty 1/1 (100.0 %) 1/1 (100.0 %) - is one 1/1 (100.0 %) 1/1 (100.0 %) - one twenty 1/1 (100.0 %) 1/1 (100.0 %) From 2c766562e2e11044783ea83ee7d2fdc6a8405ca0 Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Thu, 18 Apr 2024 14:15:52 +0000 Subject: [PATCH 8/9] fix log json missing unigram bigram info if output sbs not set --- src/fstalign.cpp | 1 + src/wer.cpp | 6 ++++-- src/wer.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/fstalign.cpp b/src/fstalign.cpp index 96fb33f..37ee686 100644 --- a/src/fstalign.cpp +++ b/src/fstalign.cpp @@ -695,6 +695,7 @@ void HandleWer(FstLoader& refLoader, FstLoader& hypLoader, SynonymEngine &engine } } + JsonLogUnigramBigramStats(topAlignment); if (!output_sbs.empty()) { logger->info("output_sbs = {}", output_sbs); WriteSbs(topAlignment, stitches, output_sbs); diff --git a/src/wer.cpp b/src/wer.cpp index 94b1b3c..1b2f066 100644 --- a/src/wer.cpp +++ b/src/wer.cpp @@ -603,6 +603,10 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st myfile << fmt::format("{0:>20}\t{1}", group.first, group.second) << endl; } + myfile.close(); +} + +void JsonLogUnigramBigramStats(wer_alignment &topAlignment) { for (const auto &a : topAlignment.unigram_stats) { string word = a.first; gram_error_counter u = a.second; @@ -626,6 +630,4 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st jsonLogger::JsonLogger::getLogger().root["wer"]["bigrams"][word]["precision"] = u.precision; jsonLogger::JsonLogger::getLogger().root["wer"]["bigrams"][word]["recall"] = u.recall; } - - myfile.close(); } diff --git a/src/wer.h b/src/wer.h index 8ccf674..f0e9f35 100644 --- a/src/wer.h +++ b/src/wer.h @@ -50,3 +50,4 @@ typedef vector> ErrorGroups; void AddErrorGroup(ErrorGroups &groups, size_t &line, string &ref, string &hyp); void WriteSbs(wer_alignment &topAlignment, const vector& stitches, string sbs_filename); +void JsonLogUnigramBigramStats(wer_alignment &topAlignment); From 3be9d070d5a501bbfafc294be0f9ac7f7753c8d9 Mon Sep 17 00:00:00 2001 From: Nishchal Bhandari Date: Thu, 18 Apr 2024 19:15:16 +0000 Subject: [PATCH 9/9] version bump --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 5f7e767..01137a8 100644 --- a/src/version.h +++ b/src/version.h @@ -1,5 +1,5 @@ #pragma once #define FSTALIGNER_VERSION_MAJOR 1 -#define FSTALIGNER_VERSION_MINOR 12 +#define FSTALIGNER_VERSION_MINOR 13 #define FSTALIGNER_VERSION_PATCH 0