-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenize.go
71 lines (60 loc) · 1.64 KB
/
tokenize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package main
import (
"strings"
"fmt"
"regexp"
"unicode"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"golang.org/x/text/runes"
)
// unicodeTokenizer breaks free text into a deduplicated list of
// normalized tokens: diacritics stripped, lower-cased, and split on
// separator characters. When wildcards are enabled, '*' and '?' are
// rewritten to their SQLite LIKE equivalents.
type unicodeTokenizer struct {
// Stripper removes diacritics (Unicode combining marks) via NFD -> remove Mn -> NFC.
Stripper transform.Transformer
// Splitter matches runs of characters that separate tokens.
Splitter *regexp.Regexp
// Converter rewrites '?' -> '_' and '*' -> '%'; nil when wildcards are disabled.
Converter *strings.Replacer
}
// newUnicodeTokenizer constructs a tokenizer that strips diacritics,
// lower-cases its input (in Tokenize) and splits it on runs of
// characters that are not letters, numbers, private-use code points,
// or hyphens.
//
// If allowWildcards is true, '*' and '?' are additionally preserved
// by the splitter, and a replacer is configured to convert them to
// the SQLite LIKE wildcards '%' and '_'.
//
// It returns an error only if the splitting regex fails to compile.
func newUnicodeTokenizer(allowWildcards bool) (*unicodeTokenizer, error) {
	extra := ""
	if allowWildcards {
		extra = "*?"
	}

	// Split on anything that is not a letter (\p{L}), number (\p{N}),
	// private-use character (\p{Co}), hyphen, or (optionally) wildcard.
	comp, err := regexp.Compile(`[^\p{L}\p{N}\p{Co}` + extra + `-]+`)
	if err != nil {
		return nil, fmt.Errorf("compiling token splitter: %w", err)
	}

	var replacer *strings.Replacer
	if allowWildcards {
		// Convert the usual wildcards to SQLite LIKE wildcards.
		replacer = strings.NewReplacer(
			"?", "_",
			"*", "%",
		)
	}

	return &unicodeTokenizer{
		// NFD-decompose, drop combining marks (diacritics), recompose.
		Stripper:  transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC),
		Splitter:  comp,
		Converter: replacer,
	}, nil
}
// Tokenize normalizes x and returns its unique tokens in order of
// first appearance. Normalization strips diacritics, lower-cases the
// text and splits on the tokenizer's separator pattern; when a
// Converter is configured, wildcard characters are rewritten to their
// SQLite LIKE equivalents before deduplication.
func (u *unicodeTokenizer) Tokenize(x string) ([]string, error) {
	stripped, _, err := transform.String(u.Stripper, x)
	if err != nil {
		return nil, fmt.Errorf("failed to strip diacritics; %w", err)
	}

	parts := u.Splitter.Split(strings.ToLower(stripped), -1)

	tokens := []string{}
	seen := map[string]bool{}
	for _, token := range parts {
		if len(token) == 0 {
			continue
		}
		if u.Converter != nil {
			token = u.Converter.Replace(token)
		}
		if seen[token] {
			continue
		}
		seen[token] = true
		tokens = append(tokens, token)
	}
	return tokens, nil
}