Skip to content

Commit c93ae10

Browse files
authored
Merge pull request #226 from ikawaha/feature/add-index-to-token
2 parents 3bf895b + 4e2dfea commit c93ae10

File tree

4 files changed

+34
-49
lines changed

4 files changed

+34
-49
lines changed

tokenizer/lattice/lattice.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -258,12 +258,13 @@ func (la *Lattice) Backward(m TokenizeMode) {
258258
runeLen := utf8.RuneCountInString(p.Surface)
259259
stack := make([]*node, 0, runeLen)
260260
i := 0
261-
for _, r := range p.Surface {
261+
for k, r := range p.Surface {
262262
stack = append(stack, &node{
263-
ID: p.ID,
264-
Start: p.Start + i,
265-
Class: DUMMY,
266-
Surface: string(r),
263+
ID: p.ID,
264+
Start: p.Start + i,
265+
Class: DUMMY,
266+
Surface: string(r),
267+
Position: p.Position + k,
267268
})
268269
i++
269270
}

tokenizer/token.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ func (c TokenClass) String() string {
4040

4141
// Token represents a morph of a sentence.
4242
type Token struct {
43+
Index int
4344
ID int
4445
Class TokenClass
4546
Position int // byte position
@@ -209,7 +210,7 @@ func (t Token) String() string {
209210
return fmt.Sprintf("%q (%d: %d, %d) %v [%d]", t.Surface, t.Position, t.Start, t.End, t.Class, t.ID)
210211
}
211212

212-
// Equal returns true if tokens are equal.
213+
// Equal returns true if tokens are equal. The `Index` field is excluded from the comparison.
213214
func (t Token) Equal(v Token) bool {
214215
return t.ID == v.ID &&
215216
t.Class == v.Class &&

tokenizer/tokenizer.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ func (t Tokenizer) Analyze(input string, mode TokenizeMode) []Token {
103103
continue
104104
}
105105
tok := Token{
106+
Index: len(tokens),
106107
ID: n.ID,
107108
Class: TokenClass(n.Class),
108109
Position: n.Position,

tokenizer/tokenizer_test.go

Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -225,21 +225,18 @@ func Test_AnalyzeWithSearchMode(t *testing.T) {
225225
}
226226
tokens := tnz.Analyze("関西国際空港", Search)
227227
expected := []Token{
228-
{ID: -1, Surface: "BOS"},
229-
{ID: 372968, Surface: "関西", Start: 0, End: 2, Class: TokenClass(lattice.KNOWN)},
230-
{ID: 168541, Surface: "国際", Start: 2, End: 4, Class: TokenClass(lattice.KNOWN)},
231-
{ID: 307133, Surface: "空港", Start: 4, End: 6, Class: TokenClass(lattice.KNOWN)},
232-
{ID: -1, Surface: "EOS", Start: 6, End: 6},
228+
{Index: 0, ID: -1, Surface: "BOS"},
229+
{Index: 1, ID: 372967, Surface: "関西", Start: 0, End: 2, Position: 0, Class: TokenClass(lattice.KNOWN)},
230+
{Index: 2, ID: 168542, Surface: "国際", Start: 2, End: 4, Position: 6, Class: TokenClass(lattice.KNOWN)},
231+
{Index: 3, ID: 307134, Surface: "空港", Start: 4, End: 6, Position: 12, Class: TokenClass(lattice.KNOWN)},
232+
{Index: 4, ID: -1, Surface: "EOS", Start: 6, End: 6, Position: 18},
233233
}
234234

235235
if len(tokens) != len(expected) {
236236
t.Fatalf("got %v, expected %v", tokens, expected)
237237
}
238238
for i, tok := range tokens {
239-
if tok.Class != expected[i].Class ||
240-
tok.Start != expected[i].Start ||
241-
tok.End != expected[i].End ||
242-
tok.Surface != expected[i].Surface {
239+
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
243240
t.Errorf("got %v, expected %v", tok, expected[i])
244241
}
245242
}
@@ -257,19 +254,15 @@ func Test_AnalyzeWithSearchModeUnknown(t *testing.T) {
257254

258255
tokens := tnz.Analyze("ポポピ", Search)
259256
expected := []Token{
260-
{ID: -1, Surface: "BOS"},
261-
{ID: 34, Surface: "ポポピ", Start: 0, End: 3, Class: TokenClass(lattice.UNKNOWN)},
262-
{ID: -1, Surface: "EOS", Start: 3, End: 3},
257+
{Index: 0, ID: -1, Surface: "BOS"},
258+
{Index: 1, ID: 34, Surface: "ポポピ", Start: 0, End: 3, Class: TokenClass(lattice.UNKNOWN)},
259+
{Index: 2, ID: -1, Surface: "EOS", Start: 3, End: 3, Position: 9},
263260
}
264261
if len(tokens) != len(expected) {
265262
t.Fatalf("got %v, expected %v", tokens, expected)
266263
}
267264
for i, tok := range tokens {
268-
if tok.ID != expected[i].ID ||
269-
tok.Class != expected[i].Class ||
270-
tok.Start != expected[i].Start ||
271-
tok.End != expected[i].End ||
272-
tok.Surface != expected[i].Surface {
265+
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
273266
t.Errorf("got %v, expected %v", tok, expected[i])
274267
}
275268
}
@@ -287,18 +280,14 @@ func Test_AnalyzeWithExtendedModeEmpty(t *testing.T) {
287280

288281
tokens := tnz.Analyze("", Extended)
289282
expected := []Token{
290-
{ID: -1, Surface: "BOS"},
291-
{ID: -1, Surface: "EOS"},
283+
{Index: 0, ID: -1, Surface: "BOS"},
284+
{Index: 1, ID: -1, Surface: "EOS"},
292285
}
293286
if len(tokens) != len(expected) {
294287
t.Fatalf("got %v, expected %v", tokens, expected)
295288
}
296289
for i, tok := range tokens {
297-
if tok.ID != expected[i].ID ||
298-
tok.Class != expected[i].Class ||
299-
tok.Start != expected[i].Start ||
300-
tok.End != expected[i].End ||
301-
tok.Surface != expected[i].Surface {
290+
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
302291
t.Errorf("got %v, expected %v", tok, expected[i])
303292
}
304293
}
@@ -316,20 +305,17 @@ func Test_AnalyzeWithExtendedMode(t *testing.T) {
316305

317306
tokens := tnz.Analyze("関西国際空港", Extended)
318307
expected := []Token{
319-
{ID: -1, Surface: "BOS"},
320-
{ID: 372968, Surface: "関西", Start: 0, End: 2, Class: TokenClass(lattice.KNOWN)},
321-
{ID: 168541, Surface: "国際", Start: 2, End: 4, Class: TokenClass(lattice.KNOWN)},
322-
{ID: 307133, Surface: "空港", Start: 4, End: 6, Class: TokenClass(lattice.KNOWN)},
323-
{ID: -1, Surface: "EOS", Start: 6, End: 6},
308+
{Index: 0, ID: -1, Surface: "BOS"},
309+
{Index: 1, ID: 372967, Surface: "関西", Start: 0, End: 2, Position: 0, Class: TokenClass(lattice.KNOWN)},
310+
{Index: 2, ID: 168542, Surface: "国際", Start: 2, End: 4, Position: 6, Class: TokenClass(lattice.KNOWN)},
311+
{Index: 3, ID: 307134, Surface: "空港", Start: 4, End: 6, Position: 12, Class: TokenClass(lattice.KNOWN)},
312+
{Index: 4, ID: -1, Surface: "EOS", Start: 6, End: 6, Position: 18},
324313
}
325314
if len(tokens) != len(expected) {
326315
t.Fatalf("got %v, expected %v", tokens, expected)
327316
}
328317
for i, tok := range tokens {
329-
if tok.Class != expected[i].Class ||
330-
tok.Start != expected[i].Start ||
331-
tok.End != expected[i].End ||
332-
tok.Surface != expected[i].Surface {
318+
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
333319
t.Errorf("got %v, expected %v", tok, expected[i])
334320
}
335321
}
@@ -347,21 +333,17 @@ func Test_AnalyzeWithExtendedModeUnknown(t *testing.T) {
347333

348334
tokens := tnz.Analyze("ポポピ", Extended)
349335
expected := []Token{
350-
{ID: -1, Surface: "BOS"},
351-
{ID: 34, Surface: "ポ", Start: 0, End: 1, Class: TokenClass(lattice.DUMMY)},
352-
{ID: 34, Surface: "ポ", Start: 1, End: 2, Class: TokenClass(lattice.DUMMY)},
353-
{ID: 34, Surface: "ピ", Start: 2, End: 3, Class: TokenClass(lattice.DUMMY)},
354-
{ID: -1, Surface: "EOS", Start: 3, End: 3},
336+
{Index: 0, ID: -1, Surface: "BOS"},
337+
{Index: 1, ID: 34, Surface: "ポ", Start: 0, End: 1, Position: 0, Class: TokenClass(lattice.DUMMY)},
338+
{Index: 2, ID: 34, Surface: "ポ", Start: 1, End: 2, Position: 3, Class: TokenClass(lattice.DUMMY)},
339+
{Index: 3, ID: 34, Surface: "ピ", Start: 2, End: 3, Position: 6, Class: TokenClass(lattice.DUMMY)},
340+
{Index: 4, ID: -1, Surface: "EOS", Start: 3, End: 3, Position: 9},
355341
}
356342
if len(tokens) != len(expected) {
357343
t.Fatalf("got %v, expected %v", tokens, expected)
358344
}
359345
for i, tok := range tokens {
360-
if tok.ID != expected[i].ID ||
361-
tok.Class != expected[i].Class ||
362-
tok.Start != expected[i].Start ||
363-
tok.End != expected[i].End ||
364-
tok.Surface != expected[i].Surface {
346+
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
365347
t.Errorf("got %v, expected %v", tok, expected[i])
366348
}
367349
}

0 commit comments

Comments
 (0)