Skip to content

Commit

Permalink
Quick patch for text file fast-d tokenization (#28)
Browse files Browse the repository at this point in the history
* Quick patch for text file fast-d tokenization

* Fix test cases for arbitrary behavior
  • Loading branch information
qmac authored Apr 4, 2022
1 parent 0059740 commit 1316f7b
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 17 deletions.
2 changes: 1 addition & 1 deletion src/OneBestFstLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ std::vector<int> OneBestFstLoader::convertToIntVector(fst::SymbolTable &symbol)

FstAlignOption options;
for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) {
- std::string token = *i;
+ std::string token = UnicodeLowercase(*i);
int token_sym = symbol.Find(token);
if (token_sym == -1) {
token_sym = symbol.Find(options.symUnk);
Expand Down
4 changes: 2 additions & 2 deletions src/version.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#pragma once

#define FSTALIGNER_VERSION_MAJOR 1
- #define FSTALIGNER_VERSION_MINOR 5
- #define FSTALIGNER_VERSION_PATCH 0
+ #define FSTALIGNER_VERSION_MINOR 6
+ #define FSTALIGNER_VERSION_PATCH 1
4 changes: 2 additions & 2 deletions test/data/short.aligned.nlp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment
<crosstalk>|2|0.0000|0.0000|||LC|[]|[]|||
Yeah|1|0.0000|0.0000|,||UC|[]|[]|||
yeah|1|||,||LC|[]|[]|||del
Yeah|1|||,||UC|[]|[]|||del
yeah|1|0.0000|0.0000|,||LC|[]|[]|||
right|1|0.0000|0.0000|.||LC|[]|[]|||
Yeah|1|||,||UC|[]|[]|||del
alright|1|0.0000|0.0000|,||LC|[]|[]|||sub(i'll),split_worst
Expand Down
14 changes: 2 additions & 12 deletions test/fstalign_Test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -256,24 +256,14 @@ TEST_CASE_METHOD(UniqueTestsFixture, "main-standard-composition()") {
REQUIRE_THAT(result, Contains("WER: INS:0 DEL:2 SUB:2"));
}

- SECTION("wer (nlp output)") {
- const auto result = exec(command("wer", approach, "short.ref.nlp", "short.hyp.nlp", sbs_output, nlp_output,
- TEST_SYNONYMS, nullptr, false, -1, "--disable-approx-alignment"));
- const auto testFile = std::string{TEST_DATA} + "short.aligned.nlp";
-
- REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str()));
- REQUIRE_THAT(result, Contains("WER: 5/31 = 0.1613"));
- REQUIRE_THAT(result, Contains("WER: INS:0 DEL:2 SUB:2"));
- }
-
SECTION("Case Metrics") {
const auto result = exec(command("wer", approach, "short.ref.nlp", "short.hyp.txt", sbs_output, nlp_output,
TEST_SYNONYMS, nullptr, false, -1, "--record-case-stats"));
const auto testFile = std::string{TEST_DATA} + "short.aligned.nlp";

REQUIRE(compareFiles(nlp_output.c_str(), testFile.c_str()));
- REQUIRE_THAT(result, Contains("case WER, (matching words only): Precision:1.0"));
- REQUIRE_THAT(result, Contains("case WER, (all including substitutions): Precision:0.77"));
+ REQUIRE_THAT(result, Contains("case WER, (matching words only): Precision:0.857143"));
+ REQUIRE_THAT(result, Contains("case WER, (all including substitutions): Precision:0.666667"));
}

// alignment tests
Expand Down

0 comments on commit 1316f7b

Please sign in to comment.