-
Notifications
You must be signed in to change notification settings - Fork 3
/
rustemmer.go
219 lines (189 loc) · 6.03 KB
/
rustemmer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
// Package rustemmer implements the Porter stemmer for the Russian language.
package rustemmer
import (
"regexp"
"strings"
"sync"
)
// instance is the package-level stemmer used by the GetWordBase and
// NormalizeText convenience functions, which lock instance.mu around each call.
var instance = New()
// VOWEL lists the lowercase Russian vowel letters that isVowel tests a
// character against. Uppercase letters are not included.
const VOWEL = "аеёиоуыэюя"
// Ending tables consulted by GetWordBase through removeEndings.
//
// trimFirstSuffix walks a table from its first entry and strips the first
// entry that matches, so the position of an entry inside its table is
// significant: a longer ending has to appear before any shorter ending that
// is itself a suffix of it.
//
// Tables declared as [][]string hold two groups. removeEndings tries group 0
// first and accepts an entry only when it is directly preceded by "а" or "я";
// if nothing in group 0 matches, group 1 is tried without that condition.

// suffixNN is the doubled "н" that step 4 of GetWordBase reduces to a single "н".
var suffixNN = []string{"нн"}
// suffixPerfectiveGerunds holds the perfective gerund endings removed in step 1.
// Group 0 lists its three endings twice; the repetition does not change which
// ending is matched.
var suffixPerfectiveGerunds = [][]string{
{"в", "вши", "вшись", "в", "вши", "вшись"},
{"ив", "ивши", "ившись", "ыв", "ывши", "ывшись"},
}
// suffixReflexives holds the reflexive endings removed in step 1 when no
// perfective gerund ending was found.
var suffixReflexives = []string{"ся", "сь"}
// suffixAdjective holds the adjective endings. It is used on its own in step 1
// and as the base from which suffixParticiple is built.
var suffixAdjective = []string{
"ее", "ие", "ые", "ое", "ими", "ыми", "ей", "ий", "ый", "ой", "ем", "им", "ым", "ом", "его", "ого", "ему",
"ому", "их", "ых", "ую", "юю", "ая", "яя", "ою", "ею",
}
// suffixVerb holds the verb endings, tried in step 1 after the participle and
// adjective endings have both failed to match.
var suffixVerb = [][]string{
{"ла", "на", "ете", "йте", "ли", "й", "л", "ем", "н", "ло", "но", "ет", "ют", "ны", "ть", "ешь", "нно"},
{
"ила", "ыла", "ена", "ейте", "уйте", "ите", "или", "ыли", "ей", "уй", "ил", "ыл", "им", "ым", "ен",
"ило", "ыло", "ено", "ят", "ует", "уют", "ит", "ыт", "ены", "ить", "ыть", "ишь", "ую", "ю",
},
}
// suffixNoun holds the noun endings, the last alternative tried in step 1.
var suffixNoun = []string{
"а", "ев", "ов", "ие", "ье", "е", "иями", "ями", "ами", "еи", "ии", "и", "ией", "ей", "ой", "ий", "й", "иям",
"ям", "ием", "ем", "ам", "ом", "о", "у", "ах", "иях", "ях", "ы", "ь", "ию", "ью", "ю", "ия", "ья", "я",
}
// suffixSuperlative holds the superlative endings removed in step 4.
var suffixSuperlative = []string{"ейш", "ейше"}
// suffixSoftSign is the trailing soft sign removed in step 4.
var suffixSoftSign = []string{"ь"}
// suffixI is the trailing "и" removed in step 2.
var suffixI = []string{"и"}
// suffixDerivational holds the derivational endings removed in step 3; they
// are searched for from the R2 offset rather than from RV.
var suffixDerivational = []string{"ост", "ость"}
// suffixParticiple holds the combined participle+adjective endings: every
// adjective ending with each listed participle marker placed in front of it
// (see appendPrefix). Group 0 uses the markers that need a preceding "а" or
// "я"; group 1 uses the markers that do not.
var suffixParticiple = [][]string{
appendPrefix(suffixAdjective, []string{"ем", "нн", "вш", "ющ", "щ"}),
appendPrefix(suffixAdjective, []string{"ивш", "ывш", "ующ"}),
}
// RuStemmer holds the working state used while stemming one word.
//
// GetWordBase overwrites word, RV and R2 on every call and the methods do not
// take mu themselves, so a RuStemmer value should not be used from several
// goroutines at once without external locking.
type RuStemmer struct {
// mu is locked by the package-level GetWordBase and NormalizeText functions
// around their use of the shared instance.
mu sync.Mutex
// word is the word currently being stemmed, stored as runes so that offsets
// count characters rather than bytes.
word []rune
// RV is the rune offset at which the RV region of word starts. It is
// computed by findRegions and passed to removeEndings as the region start.
RV int
// R2 is the rune offset at which the R2 region of word starts. It is
// computed by findRegions and used when removing derivational endings.
R2 int
}
// New returns a RuStemmer that is ready for use: its word is empty and both
// region offsets are zero.
func New() *RuStemmer {
	stemmer := new(RuStemmer)
	stemmer.word = []rune{}
	return stemmer
}
// GetWordBase stems word with the shared package-level stemmer and returns the
// resulting base form. The shared stemmer's mutex is held for the whole call.
func GetWordBase(word string) string {
	shared := instance
	shared.mu.Lock()
	defer shared.mu.Unlock()
	return shared.GetWordBase(word)
}
// NormalizeText normalizes text with the shared package-level stemmer.
// Every word of text is replaced by its base form and the base forms are
// joined with single spaces; all special characters except "_" are dropped.
// The shared stemmer's mutex is held for the whole call.
func NormalizeText(text string) string {
	shared := instance
	shared.mu.Lock()
	defer shared.mu.Unlock()
	return shared.NormalizeText(text)
}
// GetWordBase returns the base form (stem) of word.
//
// The word is split into its RV and R2 regions by findRegions and is then
// reduced in four steps:
//
//  1. Remove a perfective gerund ending. If there is none, remove a reflexive
//     ending (if present) and then the first of participle/adjective, verb or
//     noun ending that matches.
//  2. Remove a trailing "и".
//  3. Remove a derivational ending that lies in R2.
//  4. Apply exactly one tidy-up: reduce a trailing "нн" to "н", or remove a
//     superlative ending and then reduce a trailing "нн" to "н", or remove a
//     trailing "ь".
//
// Endings are only looked for inside RV (inside R2 for step 3). The method
// overwrites the receiver's word, RV and R2 fields and takes no lock itself.
func (r *RuStemmer) GetWordBase(word string) string {
	r.word = []rune(word)
	r.RV = 0
	r.R2 = 0
	r.findRegions()

	// Step 1: a perfective gerund ending finishes the step on its own.
	if !r.removeEndings(r.RV, suffixPerfectiveGerunds[0], suffixPerfectiveGerunds[1]) {
		// Otherwise drop a reflexive ending, if any ...
		r.removeEndings(r.RV, suffixReflexives)

		// ... and then try ADJECTIVAL (participle+adjective, or a bare
		// adjective), VERB and NOUN in that order, stopping at the first hit.
		adjectival := r.removeEndings(r.RV, suffixParticiple[0], suffixParticiple[1]) ||
			r.removeEndings(r.RV, suffixAdjective)
		if !adjectival && !r.removeEndings(r.RV, suffixVerb[0], suffixVerb[1]) {
			r.removeEndings(r.RV, suffixNoun)
		}
	}

	// Step 2: remove a trailing "и".
	r.removeEndings(r.RV, suffixI)

	// Step 3: remove a derivational ending located in R2.
	r.removeEndings(r.R2, suffixDerivational)

	// Step 4: only one of the three variants is applied.
	switch {
	case r.undoubleN():
		// A trailing "нн" was reduced to "н".
	case r.removeEndings(r.RV, suffixSuperlative):
		// The superlative ending may have uncovered a doubled "н".
		r.undoubleN()
	default:
		r.removeEndings(r.RV, suffixSoftSign)
	}

	return string(r.word)
}

// undoubleN reduces a trailing "нн" inside RV to a single "н" and reports
// whether it did so.
func (r *RuStemmer) undoubleN() bool {
	if !r.removeEndings(r.RV, suffixNN) {
		return false
	}
	// removeEndings stripped both letters; put one of them back.
	r.word = append(r.word, 'н')
	return true
}
// wordPattern matches one word of input text: a maximal run of Unicode
// letters, ASCII digits and underscores. It is compiled once at package
// initialisation so that NormalizeText does not recompile it on every call.
var wordPattern = regexp.MustCompile(`[\p{L}\d_]+`)

// NormalizeText returns normalized text.
//
// Every word found in text (a run of letters, digits and "_") is replaced by
// its base form as computed by GetWordBase, and the base forms are joined
// with a single space. All other characters are dropped. Text that contains
// no words yields the empty string.
func (r *RuStemmer) NormalizeText(text string) string {
	words := wordPattern.FindAllString(text, -1)
	for i, word := range words {
		words[i] = r.GetWordBase(word)
	}
	return strings.Join(words, " ")
}
// removeEndings strips one ending from the part of r.word that starts at rune
// offset region and reports whether anything was removed. An offset beyond
// the end of the word is treated as the end of the word.
//
// When exactly two packs are given, the first pack is tried with the
// requirement that the ending is directly preceded by "а" or "я", and the
// second pack is tried without it if the first did not match. In every other
// case only the first pack is tried, without that requirement.
func (r *RuStemmer) removeEndings(region int, suffixesPacks ...[]string) bool {
	start := region
	if n := len(r.word); start > n {
		start = n
	}
	tail := string(r.word[start:])

	// commit stores the shortened word when rest differs from tail and
	// reports whether it did.
	commit := func(rest string) bool {
		if rest == tail {
			return false
		}
		r.word = []rune(string(r.word[:start]) + rest)
		return true
	}

	if len(suffixesPacks) == 2 {
		return commit(trimFirstSuffix(tail, suffixesPacks[0], true)) ||
			commit(trimFirstSuffix(tail, suffixesPacks[1], false))
	}
	return commit(trimFirstSuffix(tail, suffixesPacks[0], false))
}
// appendPrefix returns every combination prefix+str, where str is taken from
// strs and prefix from the given packs. The result is ordered by str first
// and, for each str, by the prefixes in the order of their packs. The
// returned slice is never nil.
func appendPrefix(strs []string, prefixesPacks ...[]string) []string {
	// Flatten the packs once; their concatenation is the per-str prefix order.
	var heads []string
	for _, pack := range prefixesPacks {
		heads = append(heads, pack...)
	}

	combined := make([]string, 0, len(strs)*len(heads))
	for _, base := range strs {
		for _, head := range heads {
			combined = append(combined, head+base)
		}
	}
	return combined
}
// trimFirstSuffix returns word with the first matching entry of suffixes
// removed from its end, or word unchanged when no entry matches. Entries are
// tried in slice order and empty entries never match. When isAYA is true an
// entry only matches if the letter directly before it is "а" or "я".
func trimFirstSuffix(word string, suffixes []string, isAYA bool) string {
	for _, ending := range suffixes {
		if ending == "" || !strings.HasSuffix(word, ending) {
			continue
		}
		stem := word[:len(word)-len(ending)]
		if isAYA && !strings.HasSuffix(stem, "а") && !strings.HasSuffix(stem, "я") {
			continue
		}
		return stem
	}
	return word
}
// findRegions computes the RV and R2 offsets of r.word.
//
// RV starts right after the first vowel of the word. R2 starts right after
// the second place where a vowel is directly followed by a non-vowel, counted
// from the first vowel onwards (the first such place ends R1). A region that
// does not exist is empty: its offset is the length of the word, so no ending
// can be removed from it.
//
// The scan starts at the first letter, so a word-initial vowel counts as the
// first vowel.
func (r *RuStemmer) findRegions() {
	n := len(r.word)
	// Until proven otherwise both regions are empty.
	r.RV, r.R2 = n, n

	const (
		seekFirstVowel = iota // looking for the first vowel (start of RV)
		seekR1                // looking for the vowel→non-vowel pair that ends R1's prefix
		seekR2                // looking for the next vowel→non-vowel pair (start of R2)
	)

	state := seekFirstVowel
	prevVowel := false
	for i := 0; i < n; i++ {
		vowel := r.isVowel(string(r.word[i]))
		switch state {
		case seekFirstVowel:
			if vowel {
				r.RV = i + 1
				state = seekR1
			}
		case seekR1:
			if prevVowel && !vowel {
				state = seekR2
			}
		case seekR2:
			if prevVowel && !vowel {
				r.R2 = i + 1
				return
			}
		}
		prevVowel = vowel
	}
}
// isVowel reports whether char, expected to be a single character, is one of
// the letters in VOWEL. The empty string is not a vowel; the explicit check
// is needed because strings.Contains reports true for an empty substring.
func (r *RuStemmer) isVowel(char string) bool {
	return char != "" && strings.Contains(VOWEL, char)
}