fix: sgml parsing edge case when word list delimiter in transcript

- SCTK unfortunately doesn't escape or quote the entire field in the sgml file, which causes encoding/csv to split at the wrong place for a string like `S,"hello:","hello":C,"how","how"` with delimiter = `:`. - This commit applies a fix where we change the delimiter to an unlikely string using a carefully constructed regex, and then split on the new unlikely string.
shahruk10 · Jul 23, 2022 · a3b7d43 · a3b7d43
1 parent eb58394
commit a3b7d43
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 1 deletion.
diff --git a/internal/sctk/sgml.go b/internal/sctk/sgml.go
@@ -22,6 +22,7 @@ import (
 	"bufio"
 	"fmt"
 	"os"
+	"regexp"
 	"strconv"
 	"strings"
 
@@ -254,7 +255,7 @@ func parsePathTag(tokenizer *html.Tokenizer, t html.Token, speakerID string) (*A
 	// Splitting word list into tuples of (label, ref word, hyp word).
 	listStr := strings.TrimSpace(tokenizer.Token().Data)
 
-	wordList := textutils.FieldsWithQuoted(listStr, wordListDelimiter)
+	wordList := splitAlignedTuples(listStr)
 	if len(wordList) != sent.WordCount {
 		logrus.WithFields(logrus.Fields{
 			"speaker":    speakerID,
@@ -294,3 +295,23 @@ func parsePathTag(tokenizer *html.Tokenizer, t html.Token, speakerID string) (*A
 
 	return &sent, nil
 }
+
+// tupleBoundary matches a ":" that is followed by a tuple label (C, S, I or D)
+// and a ","; it is compiled once at package scope rather than on every call.
+var tupleBoundary = regexp.MustCompile(":([CSID]),")
+
+// splitAlignedTuples splits the sgml word list into (label, ref word, hyp word)
+// tuples. SCTK doesn't quote whole tuples, so words may contain the ":" used as
+// delimiter; the list is therefore cut only where ":" starts a new tuple, using
+// the match offsets directly. An empty list yields a single empty tuple.
+func splitAlignedTuples(list string) []string {
+	bounds := tupleBoundary.FindAllStringIndex(list, -1)
+	tuples, start := make([]string, 0, len(bounds)+1), 0
+
+	for _, b := range bounds {
+		tuples = append(tuples, list[start:b[0]])
+		start = b[0] + 1 // Drop only the ":"; the label opens the next tuple.
+	}
+
+	return append(tuples, list[start:])
+}
diff --git a/internal/sctk/sgml_test.go b/internal/sctk/sgml_test.go
@@ -22,6 +22,51 @@ import (
 	"github.com/google/go-cmp/cmp"
 )
 
+// TestSplitAlignedTuples runs splitAlignedTuples over a table of word lists
+// and compares the returned tuples with the expected ones, including lists
+// whose ref or hyp words contain the ":" delimiter.
+func TestSplitAlignedTuples(t *testing.T) {
+	t.Parallel()
+
+	for _, tt := range []struct {
+		name  string
+		input string
+		want  []string
+	}{
+		{
+			name:  "normal",
+			input: `C,"a","a":S,"a","b":I,,"a":D,"b",`,
+			want:  []string{`C,"a","a"`, `S,"a","b"`, `I,,"a"`, `D,"b",`},
+		},
+		{
+			name:  "unquoted_fields",
+			input: `C,"a:","a:"` + ":" + `S,"a","b:"` + ":" + `I,,"a:"` + ":" + `D,"b:",`,
+			want:  []string{`C,"a:","a:"`, `S,"a","b:"`, `I,,"a:"`, `D,"b:",`},
+		},
+		{
+			name:  "single_part",
+			input: `C,"a","a"`,
+			want:  []string{`C,"a","a"`},
+		},
+		{
+			name:  "empty_string",
+			input: ``,
+			want:  []string{""},
+		},
+	} {
+		tt := tt // Per-iteration copy for the parallel closure below.
+
+		t.Run(tt.name, func(st *testing.T) {
+			st.Parallel()
+
+			diff := cmp.Diff(tt.want, splitAlignedTuples(tt.input))
+			if diff != "" {
+				st.Errorf("unexpected split tuples (-want, +got):\n%s", diff)
+			}
+		})
+	}
+}
+
 //nolint: funlen // table tests can be long.
 func TestReadAlignmentSgml(t *testing.T) {
 	t.Parallel()