diff --git a/src/Nlp.cpp b/src/Nlp.cpp index 06cdfe8..0bf287f 100644 --- a/src/Nlp.cpp +++ b/src/Nlp.cpp @@ -30,7 +30,6 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma // fuse multiple rows that have the same id/label into one entry only for (auto &row : records) { - mNlpRows.push_back(row); auto curr_tk = row.token; auto curr_label = row.best_label; auto curr_label_id = row.best_label_id; @@ -38,16 +37,14 @@ NlpFstLoader::NlpFstLoader(std::vector &records, Json::Value norma auto curr_row_tags = row.wer_tags; // Update wer tags in records to real string labels - vector real_wer_tags; for (auto &tag : curr_row_tags) { - auto real_tag = tag; if (mWerSidecar != Json::nullValue) { - real_tag = "###" + real_tag + "_" + mWerSidecar[real_tag]["entity_type"].asString() + "###"; + tag.entity_type = mWerSidecar[tag.tag_id]["entity_type"].asString(); } - real_wer_tags.push_back(real_tag); } - row.wer_tags = real_wer_tags; + row.wer_tags = curr_row_tags; std::string speaker = row.speakerId; + mNlpRows.push_back(row); if (processLabels && curr_label != "") { if (firstTk || curr_label != last_label) { @@ -411,8 +408,8 @@ std::string NlpReader::GetBestLabel(std::string &labels) { return labels; } -std::vector NlpReader::GetWerTags(std::string &wer_tags_str) { - std::vector wer_tags; +std::vector NlpReader::GetWerTags(std::string &wer_tags_str) { + std::vector wer_tags; if (wer_tags_str == "[]") { return wer_tags; } @@ -420,8 +417,9 @@ std::vector NlpReader::GetWerTags(std::string &wer_tags_str) { int current_pos = 2; auto pos = wer_tags_str.find("'", current_pos); while (pos != -1) { - std::string wer_tag = wer_tags_str.substr(current_pos, pos - current_pos); - wer_tags.push_back(wer_tag); + WerTagEntry entry; + entry.tag_id = wer_tags_str.substr(current_pos, pos - current_pos); + wer_tags.push_back(entry); current_pos = wer_tags_str.find("'", pos + 1) + 1; if (current_pos == 0) { break; diff --git a/src/Nlp.h b/src/Nlp.h index a9757f9..13c246d 100644 --- a/src/Nlp.h +++ b/src/Nlp.h @@ -16,6 +16,11 @@ using namespace std; using namespace fst; +struct WerTagEntry { + string tag_id; + string entity_type; +}; + struct RawNlpRecord { string token; string speakerId; @@ -27,7 +32,7 @@ struct RawNlpRecord { string labels; string best_label; string best_label_id; - vector wer_tags; + vector wer_tags; string confidence; }; @@ -37,7 +42,7 @@ class NlpReader { virtual ~NlpReader(); vector read_from_disk(const std::string &filename); string GetBestLabel(std::string &labels); - vector GetWerTags(std::string &wer_tags_str); + vector GetWerTags(std::string &wer_tags_str); string GetLabelId(std::string &label); }; diff --git a/src/fstalign.cpp b/src/fstalign.cpp index e7cc2fb..37ee686 100644 --- a/src/fstalign.cpp +++ b/src/fstalign.cpp @@ -619,7 +619,7 @@ void write_stitches_to_nlp(vector& stitches, ofstream &output_nlp_fil << "["; /* for (auto wer_tag : nlpRow.wer_tags) { */ for (auto it = stitch.nlpRow.wer_tags.begin(); it != stitch.nlpRow.wer_tags.end(); ++it) { - output_nlp_file << "'" << *it << "'"; + output_nlp_file << "'" << it->tag_id << "'"; if (std::next(it) != stitch.nlpRow.wer_tags.end()) { output_nlp_file << ", "; } @@ -695,6 +695,7 @@ void HandleWer(FstLoader& refLoader, FstLoader& hypLoader, SynonymEngine &engine } } + JsonLogUnigramBigramStats(topAlignment); if (!output_sbs.empty()) { logger->info("output_sbs = {}", output_sbs); WriteSbs(topAlignment, stitches, output_sbs); diff --git a/src/version.h b/src/version.h index 5f7e767..01137a8 100644 --- a/src/version.h +++ b/src/version.h @@ -1,5 +1,5 @@ #pragma once #define FSTALIGNER_VERSION_MAJOR 1 -#define FSTALIGNER_VERSION_MINOR 12 +#define FSTALIGNER_VERSION_MINOR 13 #define FSTALIGNER_VERSION_PATCH 0 diff --git a/src/wer.cpp b/src/wer.cpp index beca178..1b2f066 100644 --- a/src/wer.cpp +++ b/src/wer.cpp @@ -350,19 +350,16 @@ void RecordTagWer(const vector& stitches) { for (const auto &stitch : stitches) { if (!stitch.nlpRow.wer_tags.empty()) { for (auto wer_tag : stitch.nlpRow.wer_tags) { - int tag_start = wer_tag.find_first_not_of('#'); - int tag_end = wer_tag.find('_'); - string wer_tag_id = wer_tag.substr(tag_start, tag_end - tag_start); - wer_results.insert(std::pair(wer_tag_id, {0, 0, 0, 0, 0})); + wer_results.insert(std::pair(wer_tag.tag_id, {0, 0, 0, 0, 0})); // Check with rfind since other comments can be there bool del = stitch.comment.rfind("del", 0) == 0; bool ins = stitch.comment.rfind("ins", 0) == 0; bool sub = stitch.comment.rfind("sub", 0) == 0; - wer_results[wer_tag_id].insertions += ins; - wer_results[wer_tag_id].deletions += del; - wer_results[wer_tag_id].substitutions += sub; + wer_results[wer_tag.tag_id].insertions += ins; + wer_results[wer_tag.tag_id].deletions += del; + wer_results[wer_tag.tag_id].substitutions += sub; if (!ins) { - wer_results[wer_tag_id].numWordsInReference += 1; + wer_results[wer_tag.tag_id].numWordsInReference += 1; } } } @@ -555,7 +552,7 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st string tk_wer_tags = ""; auto wer_tags = p_stitch.nlpRow.wer_tags; for (auto wer_tag: wer_tags) { - tk_wer_tags = tk_wer_tags + wer_tag + "|"; + tk_wer_tags = tk_wer_tags + "###" + wer_tag.tag_id + "_" + wer_tag.entity_type + "###|"; } string ref_tk = p_stitch.reftk; string hyp_tk = p_stitch.hyptk; @@ -606,6 +603,10 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st myfile << fmt::format("{0:>20}\t{1}", group.first, group.second) << endl; } + myfile.close(); +} + +void JsonLogUnigramBigramStats(wer_alignment &topAlignment) { for (const auto &a : topAlignment.unigram_stats) { string word = a.first; gram_error_counter u = a.second; @@ -617,18 +618,6 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st jsonLogger::JsonLogger::getLogger().root["wer"]["unigrams"][word]["precision"] = u.precision; jsonLogger::JsonLogger::getLogger().root["wer"]["unigrams"][word]["recall"] = u.recall; } - // output error unigrams - myfile << string(60, '-') << endl << fmt::format("{0:>20}\t{1:10}\t{2:10}", "Unigram", "Prec.", "Recall") << endl; - for (const auto &a : topAlignment.unigram_stats) { - string word = a.first; - gram_error_counter u = a.second; - myfile << fmt::format("{0:>20}\t{1}/{2} ({3:.1f} %)\t{4}/{5} ({6:.1f} %)", word, u.correct, - (u.correct + u.ins + u.subst_fp), (float)u.precision, u.correct, (u.correct + u.del + u.subst_fn), - (float)u.recall) - << endl; - } - - myfile << string(60, '-') << endl << fmt::format("{0:>20}\t{1:20}\t{2:20}", "Bigram", "Precision", "Recall") << endl; for (const auto &a : topAlignment.bigrams_stats) { string word = a.first; @@ -641,14 +630,4 @@ void WriteSbs(wer_alignment &topAlignment, const vector& stitches, st jsonLogger::JsonLogger::getLogger().root["wer"]["bigrams"][word]["precision"] = u.precision; jsonLogger::JsonLogger::getLogger().root["wer"]["bigrams"][word]["recall"] = u.recall; } - for (const auto &a : topAlignment.bigrams_stats) { - string word = a.first; - gram_error_counter u = a.second; - myfile << fmt::format("{0:>20}\t{1}/{2} ({3:.1f} %)\t{4}/{5} ({6:.1f} %)", word, u.correct, - (u.correct + u.ins + u.subst_fp), (float)u.precision, u.correct, (u.correct + u.del + u.subst_fn), - (float)u.recall) - << endl; - } - - myfile.close(); } diff --git a/src/wer.h b/src/wer.h index 8ccf674..f0e9f35 100644 --- a/src/wer.h +++ b/src/wer.h @@ -50,3 +50,4 @@ typedef vector> ErrorGroups; void AddErrorGroup(ErrorGroups &groups, size_t &line, string &ref, string &hyp); void WriteSbs(wer_alignment &topAlignment, const vector& stitches, string sbs_filename); +void JsonLogUnigramBigramStats(wer_alignment &topAlignment); diff --git a/test/data/short.aligned.case.nlp b/test/data/short.aligned.case.nlp index d2fd08c..1d10606 100644 --- a/test/data/short.aligned.case.nlp +++ b/test/data/short.aligned.case.nlp @@ -23,7 +23,7 @@ sure|1|0.0000|0.0000|.||LC|[]|[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| -Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| about|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short.aligned.punc.nlp b/test/data/short.aligned.punc.nlp index 421d1ac..8c4cf3c 100644 --- a/test/data/short.aligned.punc.nlp +++ b/test/data/short.aligned.punc.nlp @@ -31,7 +31,7 @@ sure|1|0.0000|0.0000|.||LC|[]|[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| -Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| ,|1|0.0000|0.0000|||||[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short.aligned.punc_case.nlp b/test/data/short.aligned.punc_case.nlp index affb08c..07615c3 100644 --- a/test/data/short.aligned.punc_case.nlp +++ b/test/data/short.aligned.punc_case.nlp @@ -31,7 +31,7 @@ sure|1|0.0000|0.0000|.||LC|[]|[]|||| When|1|0.0000|0.0000|||UC|[]|[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| hear|1|0.0000|0.0000|||LC|[]|[]|||| -Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| +Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| ,|1|0.0000|0.0000|||||[]|||| I|1|0.0000|0.0000|||CA|[]|[]|||| think|1|0.0000|0.0000|||LC|[]|[]|||| diff --git a/test/data/short.sbs.txt b/test/data/short.sbs.txt new file mode 100644 index 0000000..1406b34 --- /dev/null +++ b/test/data/short.sbs.txt @@ -0,0 +1,52 @@ + ref_token hyp_token IsErr Class Wer_Tag_Entities + + Yeah Yeah + , , + yeah ERR + , ERR + right right + . ERR + Yeah ERR + , ERR + all ERR + right ERR + , I'll ERR + probably do ERR + just just + that that + . ? ERR + Are Are + there there + any any + visuals visuals + that that + come come + to to + mind mind + or or ___100002_SYN_1-1___ + ? ERR + Yeah Yeah + , , + sure sure + . . + When When + I I + hear hear + Foobar Foobar ###1_PROPER_NOUN###|###2_SPACY>ORG###| + , , + I I + think think + about about + just just + that that + : : + Foobar ERR + foo , ERR + a a +------------------------------------------------------------ + Line Group + 5 yeah , <-> *** + 8 . Yeah , all right , probably <-> I'll do + 17 . <-> ? + 27 *** <-> ? + 43 foo <-> Foobar , diff --git a/test/data/short_punc.ref.nlp b/test/data/short_punc.ref.nlp index 35e5ec5..1710724 100644 --- a/test/data/short_punc.ref.nlp +++ b/test/data/short_punc.ref.nlp @@ -1,33 +1,33 @@ -token|speaker|ts|endTs|punctuation|case|tags -|2||||LC|[] -Yeah|1|||,|UC|[] -yeah|1|||,|LC|[] -right|1|||.|LC|[] -Yeah|1|||,|UC|[] -all|1||||LC|[] -right|1|||,|LC|[] -probably|1||||LC|[] -just|1||||LC|[] -that|1|||.|LC|[] -Are|3||||UC|[] -there|3||||LC|[] -any|3||||LC|[] -visuals|3||||LC|[] -that|3||||LC|[] -come|3||||LC|[] -to|3||||LC|[] -mind|3||||LC|[] -or-|3||||LC|[] -Yeah|1|||,|UC|[] -sure|1|||.|LC|[] -When|1||||UC|[] -I|1||||CA|[] -hear|1||||LC|[] -Foobar|1|||,|UC|[] -I|1||||CA|[] -think|1||||LC|[] -about|1||||LC|[] -just|1||||LC|[] -that|1|||:|LC|[] -foo|1||||LC|[] -a|1||||LC|[] +token|speaker|ts|endTs|punctuation|case|tags|wer_tags +|2||||LC|[]|[] +Yeah|1|||,|UC|[]|[] +yeah|1|||,|LC|[]|[] +right|1|||.|LC|[]|[] +Yeah|1|||,|UC|[]|[] +all|1||||LC|[]|[] +right|1|||,|LC|[]|[] +probably|1||||LC|[]|[] +just|1||||LC|[]|[] +that|1|||.|LC|[]|[] +Are|3||||UC|[]|[] +there|3||||LC|[]|[] +any|3||||LC|[]|[] +visuals|3||||LC|[]|[] +that|3||||LC|[]|[] +come|3||||LC|[]|[] +to|3||||LC|[]|[] +mind|3||||LC|[]|[] +or-|3||||LC|[]|[] +Yeah|1|||,|UC|[]|[] +sure|1|||.|LC|[]|[] +When|1||||UC|[]|[] +I|1||||CA|[]|[] +hear|1||||LC|[]|[] +Foobar|1|||,|UC|[]|['1', '2'] +I|1||||CA|[]|[] +think|1||||LC|[]|[] +about|1||||LC|[]|[] +just|1||||LC|[]|[] +that|1|||:|LC|[]|[] +foo|1||||LC|[]|[] +a|1||||LC|[]|[] diff --git a/test/data/short_punc.wer_tag.json b/test/data/short_punc.wer_tag.json new file mode 100644 index 0000000..24f8ba3 --- /dev/null +++ b/test/data/short_punc.wer_tag.json @@ -0,0 +1,8 @@ +{ + "1": { + "entity_type": "PROPER_NOUN" + }, + "2": { + "entity_type": "SPACY>ORG" + } +} diff --git a/test/data/syn_1.hyp.sbs b/test/data/syn_1.hyp.sbs index 3e71f78..69df6df 100644 --- a/test/data/syn_1.hyp.sbs +++ b/test/data/syn_1.hyp.sbs @@ -30,56 +30,3 @@ 14 um it's <-> it is uh 21 do <-> *** 24 *** <-> uh ------------------------------------------------------------- - Unigram Prec. Recall - do 0/0 (0.0 %) 0/1 (0.0 %) - is 0/1 (0.0 %) 0/0 (0.0 %) - it 0/1 (0.0 %) 0/0 (0.0 %) - it's 0/0 (0.0 %) 0/1 (0.0 %) - uh 0/2 (0.0 %) 0/0 (0.0 %) - um 0/1 (0.0 %) 0/1 (0.0 %) - we 0/0 (0.0 %) 0/1 (0.0 %) - we'll 0/1 (0.0 %) 0/0 (0.0 %) - will 1/1 (100.0 %) 1/2 (50.0 %) - a 2/2 (100.0 %) 2/2 (100.0 %) - evening 1/1 (100.0 %) 1/1 (100.0 %) - good 1/1 (100.0 %) 1/1 (100.0 %) - happen 1/1 (100.0 %) 1/1 (100.0 %) - have 1/1 (100.0 %) 1/1 (100.0 %) - matter 1/1 (100.0 %) 1/1 (100.0 %) - nice 1/1 (100.0 %) 1/1 (100.0 %) - no 1/1 (100.0 %) 1/1 (100.0 %) - opportunity 1/1 (100.0 %) 1/1 (100.0 %) - see 1/1 (100.0 %) 1/1 (100.0 %) - this 1/1 (100.0 %) 1/1 (100.0 %) - to 1/1 (100.0 %) 1/1 (100.0 %) - what 1/1 (100.0 %) 1/1 (100.0 %) - you'll 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------- - Bigram Precision Recall - do this 0/0 (0.0 %) 0/1 (0.0 %) - evening um 0/1 (0.0 %) 0/0 (0.0 %) - happen it 0/1 (0.0 %) 0/0 (0.0 %) - is uh 0/1 (0.0 %) 0/0 (0.0 %) - it is 0/1 (0.0 %) 0/0 (0.0 %) - it's a 0/0 (0.0 %) 0/1 (0.0 %) - to do 0/0 (0.0 %) 0/1 (0.0 %) - uh a 0/1 (0.0 %) 0/0 (0.0 %) - uh see 0/1 (0.0 %) 0/0 (0.0 %) - um it's 0/0 (0.0 %) 0/1 (0.0 %) - um no 0/1 (0.0 %) 0/0 (0.0 %) - we will 0/0 (0.0 %) 0/1 (0.0 %) - we'll have 0/1 (0.0 %) 0/0 (0.0 %) - will have 0/0 (0.0 %) 0/1 (0.0 %) - you'll uh 0/1 (0.0 %) 0/0 (0.0 %) - a good 1/1 (100.0 %) 1/1 (100.0 %) - a nice 1/1 (100.0 %) 1/1 (100.0 %) - good opportunity 1/1 (100.0 %) 1/1 (100.0 %) - have a 1/1 (100.0 %) 1/1 (100.0 %) - matter what 1/1 (100.0 %) 1/1 (100.0 %) - nice evening 1/1 (100.0 %) 1/1 (100.0 %) - no matter 1/1 (100.0 %) 1/1 (100.0 %) - opportunity to 1/1 (100.0 %) 1/1 (100.0 %) - this you'll 1/1 (100.0 %) 1/1 (100.0 %) - what will 1/1 (100.0 %) 1/1 (100.0 %) - will happen 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/data/twenty.hyp-a2.sbs b/test/data/twenty.hyp-a2.sbs index 7d49c95..ccaf44f 100644 --- a/test/data/twenty.hyp-a2.sbs +++ b/test/data/twenty.hyp-a2.sbs @@ -13,25 +13,3 @@ 2 20 <-> *** 5 twenty <-> thirty 9 *** <-> two ------------------------------------------------------------- - Unigram Prec. Recall - 20 0/0 (0.0 %) 0/1 (0.0 %) - thirty 0/1 (0.0 %) 0/0 (0.0 %) - two 0/1 (0.0 %) 0/0 (0.0 %) - twenty 2/2 (100.0 %) 2/3 (66.7 %) - in 1/1 (100.0 %) 1/1 (100.0 %) - is 1/1 (100.0 %) 1/1 (100.0 %) - one 1/1 (100.0 %) 1/1 (100.0 %) - three 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------- - Bigram Precision Recall - 20 in 0/0 (0.0 %) 0/1 (0.0 %) - thirty is 0/1 (0.0 %) 0/0 (0.0 %) - twenty is 0/0 (0.0 %) 0/1 (0.0 %) - twenty thirty 0/1 (0.0 %) 0/0 (0.0 %) - twenty twenty 0/0 (0.0 %) 0/1 (0.0 %) - twenty two 0/1 (0.0 %) 0/0 (0.0 %) - two three 0/1 (0.0 %) 0/0 (0.0 %) - in twenty 1/1 (100.0 %) 1/1 (100.0 %) - is one 1/1 (100.0 %) 1/1 (100.0 %) - one twenty 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/data/twenty.hyp.sbs b/test/data/twenty.hyp.sbs index e75a474..9dccead 100644 --- a/test/data/twenty.hyp.sbs +++ b/test/data/twenty.hyp.sbs @@ -13,24 +13,3 @@ 2 twenty <-> *** 5 twenty <-> thirty 9 *** <-> two ------------------------------------------------------------- - Unigram Prec. Recall - thirty 0/1 (0.0 %) 0/0 (0.0 %) - two 0/1 (0.0 %) 0/0 (0.0 %) - twenty 2/2 (100.0 %) 2/4 (50.0 %) - in 1/1 (100.0 %) 1/1 (100.0 %) - is 1/1 (100.0 %) 1/1 (100.0 %) - one 1/1 (100.0 %) 1/1 (100.0 %) - three 1/1 (100.0 %) 1/1 (100.0 %) ------------------------------------------------------------- - Bigram Precision Recall - thirty is 0/1 (0.0 %) 0/0 (0.0 %) - twenty in 0/0 (0.0 %) 0/1 (0.0 %) - twenty is 0/0 (0.0 %) 0/1 (0.0 %) - twenty thirty 0/1 (0.0 %) 0/0 (0.0 %) - twenty twenty 0/0 (0.0 %) 0/1 (0.0 %) - twenty two 0/1 (0.0 %) 0/0 (0.0 %) - two three 0/1 (0.0 %) 0/0 (0.0 %) - in twenty 1/1 (100.0 %) 1/1 (100.0 %) - is one 1/1 (100.0 %) 1/1 (100.0 %) - one twenty 1/1 (100.0 %) 1/1 (100.0 %) diff --git a/test/fstalign_Test.cc b/test/fstalign_Test.cc index 4b23e63..7c30f46 100644 --- a/test/fstalign_Test.cc +++ b/test/fstalign_Test.cc @@ -680,11 +680,14 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-adapted-composition()") { } SECTION("NLP Hypothesis: wer with case and punctuation(nlp output)") { + const auto wer_sidecar_path = TEST_DATA + "short_punc.wer_tag.json"; const auto result = - exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case"); + exec(command("wer", approach, "short_punc.ref.nlp", "short_punc.hyp.nlp", sbs_output, nlp_output, TEST_SYNONYMS)+" --use-punctuation --use-case --wer-sidecar " + wer_sidecar_path); const auto testFile = std::string{TEST_DATA} + "short.aligned.punc_case.nlp"; + const auto testSbsFile = std::string{TEST_DATA} + "short.sbs.txt"; REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str())); + REQUIRE(compareFiles(sbs_output.c_str(), testSbsFile.c_str())); REQUIRE_THAT(result, Contains("WER: 13/42 = 0.3095")); REQUIRE_THAT(result, Contains("WER: INS:2 DEL:7 SUB:4")); }