-
Notifications
You must be signed in to change notification settings - Fork 7
/
reverse_maximum_matching.go
92 lines (68 loc) · 1.88 KB
/
reverse_maximum_matching.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package gotokenizer
import (
"strings"
)
// ReverseMaxMatch holds the dictionary, its source path, and the filtering
// options that Get consults while segmenting text.
type ReverseMaxMatch struct {
// dict is the dictionary created and loaded by LoadDict; Get reads its
// maxLen and Records.
dict *Dict
// dictPath is the path handed to NewDict when LoadDict runs.
dictPath string
// WordFilter is asked about candidate words inside Get; when Filter returns
// true, Get stops shortening the candidate and keeps it as the token.
WordFilter WordFilter
// EnabledFilterStopToken, when true, makes Get leave out every token for
// which StopTokens.IsStopToken returns true.
EnabledFilterStopToken bool
// StopTokens is consulted by Get only when EnabledFilterStopToken is true.
StopTokens *StopTokens
}
// NewReverseMaxMatch builds a ReverseMaxMatch that remembers dictPath as the
// location of its dictionary and uses a NumAndLetterWordFilter as its word
// filter. The dictionary is not loaded here; call LoadDict for that.
func NewReverseMaxMatch(dictPath string) *ReverseMaxMatch {
	return &ReverseMaxMatch{
		dictPath:   dictPath,
		WordFilter: &NumAndLetterWordFilter{},
	}
}
// LoadDict creates a dictionary for the configured path, stores it on the
// receiver, and returns the error reported by its Load method. It is part of
// the Tokenizer interface.
func (rmm *ReverseMaxMatch) LoadDict() error {
	d := NewDict(rmm.dictPath)
	rmm.dict = d
	return d.Load()
}
// Get segments text using reverse maximum matching and returns the collected
// tokens after passing them through Reverse (tokens are gathered from the end
// of the text towards the start, so Reverse is expected to restore reading
// order). It is part of the Tokenizer interface.
//
// Leading and trailing spaces are trimmed first. Matching then works on the
// tail of the remaining text: a window of at most dict.maxLen runes is taken
// and shortened from the left, one rune at a time, until the candidate is
// found in dict.Records, WordFilter.Filter accepts it, or only a single rune
// is left. That candidate becomes a token and is removed from the tail. When
// EnabledFilterStopToken is true, tokens for which StopTokens.IsStopToken
// returns true are left out of the result.
//
// CheckDictIsLoaded is invoked on the dictionary before any work is done. The
// returned error is always nil.
func (rmm *ReverseMaxMatch) Get(text string) ([]string, error) {
	CheckDictIsLoaded(rmm.dict)

	// Decode once. Every position below is a rune index, so multi-byte
	// characters are never split and the text is not re-decoded per step.
	runes := []rune(strings.Trim(text, " "))

	// A window shorter than one rune can never consume input (and would index
	// out of range), so fall back to single-rune tokens in that case.
	window := rmm.dict.maxLen
	if window < 1 {
		window = 1
	}

	var result []string
	for end := len(runes); end > 0; {
		if end < window {
			window = end
		}
		cand := runes[end-window : end]
		word := string(cand)
		for found := false; !found && len(cand) > 1; {
			if _, ok := rmm.dict.Records[word]; ok {
				found = true
			} else {
				// Not a dictionary entry: drop the leftmost rune and retry.
				cand = cand[1:]
				word = string(cand)
			}
			// The filter is consulted after the lookup, i.e. on the already
			// shortened candidate when the lookup failed.
			if rmm.WordFilter.Filter(word) {
				found = true
			}
		}
		// StopTokens is only touched when stop-token filtering is enabled.
		if !rmm.EnabledFilterStopToken || !rmm.StopTokens.IsStopToken(word) {
			result = append(result, word)
		}
		end -= len(cand)
	}
	return Reverse(result), nil
}
// GetFrequency segments text with Get and hands the resulting tokens to the
// package-level GetFrequency helper, returning its map. If Get reports an
// error, that error is returned together with a nil map. It is part of the
// Tokenizer interface.
func (rmm *ReverseMaxMatch) GetFrequency(text string) (map[string]int, error) {
	tokens, err := rmm.Get(text)
	if err != nil {
		return nil, err
	}
	freq := GetFrequency(tokens)
	return freq, nil
}