Skip to content

Commit

Permalink
fix: sgml parsing edge case when word list delimiter in transcript
Browse files Browse the repository at this point in the history
  - SCTK unfortunately doesn't escape or quote the entire field in the
    sgml file, which causes encoding/csv to split at the wrong place for
    a string like `S,"hello:","hello":C,"how","how"` with delimiter = `:`.

  - This commit applies a fix where we change the delimiter to an unlikely
    string using a carefully constructed regex, and then split on the new
    unlikely string.
  • Loading branch information
shahruk10 committed Jul 23, 2022
1 parent eb58394 commit a3b7d43
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 1 deletion.
23 changes: 22 additions & 1 deletion internal/sctk/sgml.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"bufio"
"fmt"
"os"
"regexp"
"strconv"
"strings"

Expand Down Expand Up @@ -254,7 +255,7 @@ func parsePathTag(tokenizer *html.Tokenizer, t html.Token, speakerID string) (*A
// Splitting word list into tuples of (label, ref word, hyp word).
listStr := strings.TrimSpace(tokenizer.Token().Data)

wordList := textutils.FieldsWithQuoted(listStr, wordListDelimiter)
wordList := splitAlignedTuples(listStr)
if len(wordList) != sent.WordCount {
logrus.WithFields(logrus.Fields{
"speaker": speakerID,
Expand Down Expand Up @@ -294,3 +295,23 @@ func parsePathTag(tokenizer *html.Tokenizer, t html.Token, speakerID string) (*A

return &sent, nil
}

// alignedTupleBoundary matches the ":" delimiter only when it is immediately
// followed by a tuple label (one of C, S, I or D; case sensitive) and a comma,
// i.e. when it sits right before the start of the next aligned word tuple.
// It is compiled once at package scope so that splitAlignedTuples does not pay
// for regex compilation on every call.
var alignedTupleBoundary = regexp.MustCompile(":([CSID]),")

// splitAlignedTuples splits the tuples containing (label, ref word, hyp word)
// in the sgml file. We don't use strings.Fields or textutils.FieldsWithQuoted
// here because sgml doesn't quote the entire tuple; this causes issues when the
// ref or hyp word contains the delimiter used by SCTK (":").
//
// Instead of splitting on every ":", the list is cut only at a ":" that is
// directly followed by a label and a comma (see alignedTupleBoundary). The
// delimiter itself is dropped; the label and comma stay at the start of the
// following tuple. The cuts are made at the regex match offsets, so no
// placeholder string is injected into the input and no word content can
// collide with one.
//
// The result always has at least one element: an empty list yields
// []string{""}, and a list without any boundary is returned as a single tuple.
//
// Known limitation: a ref or hyp word that itself contains a ":" followed by
// one of the labels and a comma (e.g. `a:C,b`) is still treated as a boundary.
func splitAlignedTuples(list string) []string {
	bounds := alignedTupleBoundary.FindAllStringIndex(list, -1)
	tuples := make([]string, 0, len(bounds)+1)

	start := 0

	for _, b := range bounds {
		tuples = append(tuples, list[start:b[0]])

		// Skip only the ":" so the next tuple begins at its label.
		start = b[0] + 1
	}

	return append(tuples, list[start:])
}
45 changes: 45 additions & 0 deletions internal/sctk/sgml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,51 @@ import (
"github.com/google/go-cmp/cmp"
)

// TestSplitAlignedTuples runs splitAlignedTuples over a table of word lists
// and compares the returned tuples against the expected slices. The table
// covers a plain list, words that contain the ":" delimiter, a list with a
// single tuple, and the empty string.
func TestSplitAlignedTuples(t *testing.T) {
	t.Parallel()

	type splitCase struct {
		name    string
		listStr string
		want    []string
	}

	cases := []splitCase{
		{
			name:    "normal",
			listStr: `C,"a","a":S,"a","b":I,,"a":D,"b",`,
			want:    []string{`C,"a","a"`, `S,"a","b"`, `I,,"a"`, `D,"b",`},
		},
		{
			name:    "unquoted_fields",
			listStr: `C,"a:","a:"` + ":" + `S,"a","b:"` + ":" + `I,,"a:"` + ":" + `D,"b:",`,
			want:    []string{`C,"a:","a:"`, `S,"a","b:"`, `I,,"a:"`, `D,"b:",`},
		},
		{
			name:    "single_part",
			listStr: `C,"a","a"`,
			want:    []string{`C,"a","a"`},
		},
		{
			name:    "empty_string",
			listStr: ``,
			want:    []string{""},
		},
	}

	for i := range cases {
		// Per-iteration copy so each parallel subtest sees its own case.
		current := cases[i]

		t.Run(current.name, func(subT *testing.T) {
			subT.Parallel()

			got := splitAlignedTuples(current.listStr)

			diff := cmp.Diff(current.want, got)
			if diff == "" {
				return
			}

			subT.Errorf("unexpected split tuples (-want, +got):\n%s", diff)
		})
	}
}

//nolint: funlen // table tests can be long.
func TestReadAlignmentSgml(t *testing.T) {
t.Parallel()
Expand Down

0 comments on commit a3b7d43

Please sign in to comment.