Skip to content

Commit fac3b06

Browse files
authored
Add functions to get custom segmentations/readings for tokens that come from a user dict (#294)
* Add functions to get custom segmentations/readings for tokens that come from a user dict * Rename Yomi to Reading
1 parent 4d47dba commit fac3b06

File tree

2 files changed

+50
-0
lines changed

2 files changed

+50
-0
lines changed

tokenizer/token.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,27 @@ func (t Token) FeatureAt(i int) (string, bool) {
131131
return "", false
132132
}
133133

134+
// UserExtra holds the custom segmentation and the custom readings attached to
// a user dictionary entry.
type UserExtra struct {
	// Tokens is the custom segmentation of the entry.
	Tokens []string
	// Readings is the list of custom readings of the entry.
	Readings []string
}
139+
140+
// UserExtra returns extra data if token comes from a user dict.
141+
func (t Token) UserExtra() *UserExtra {
142+
if t.Class != USER {
143+
return nil
144+
}
145+
tokens := make([]string, len(t.udict.Contents[t.ID].Tokens))
146+
copy(tokens, t.udict.Contents[t.ID].Tokens)
147+
yomi := make([]string, len(t.udict.Contents[t.ID].Yomi))
148+
copy(yomi, t.udict.Contents[t.ID].Yomi)
149+
return &UserExtra{
150+
Tokens: tokens,
151+
Readings: yomi,
152+
}
153+
}
154+
134155
// POS returns POS elements of features.
135156
func (t Token) POS() []string {
136157
switch t.Class {

tokenizer/token_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,35 @@ func Test_FeaturesAndPosUserDict(t *testing.T) {
346346
}
347347
}
348348

349+
func Test_FeaturesUserExtra(t *testing.T) {
350+
d, err := dict.LoadDictFile(testDictPath)
351+
if err != nil {
352+
t.Fatalf("unexpected error, %v", err)
353+
}
354+
tok := Token{
355+
ID: 0,
356+
Class: USER,
357+
Start: 0,
358+
End: 1,
359+
Surface: "",
360+
}
361+
tok.dict = d
362+
if udic, err := dict.NewUserDict(userDictSample); err != nil {
363+
t.Fatalf("build user dict error: %v", err)
364+
} else {
365+
tok.udict = udic
366+
}
367+
368+
got := tok.UserExtra()
369+
want := &UserExtra{
370+
Tokens: []string{"日本", "経済", "新聞"},
371+
Readings: []string{"ニホン", "ケイザイ", "シンブン"},
372+
}
373+
if !reflect.DeepEqual(want, got) {
374+
t.Errorf("want %v, got %v", want, got)
375+
}
376+
}
377+
349378
func Test_TokenString(t *testing.T) {
350379
tok := Token{
351380
ID: 123,

0 commit comments

Comments
 (0)