-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path: normalize_text.py
80 lines (66 loc) · 3.73 KB
/
normalize_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
import unicodedata
def unicode_normalize(cls, s):
pt = re.compile('([{}]+)'.format(cls))
def norm(c):
return unicodedata.normalize('NFKC', c) if pt.match(c) else c
s = ''.join(norm(x) for x in re.split(pt, s))
return s
def remove_extra_spaces(s):
    """Collapse runs of whitespace to single spaces, then delete the
    spaces that separate CJK text from adjacent CJK or basic-Latin text.

    Spaces between two basic-Latin characters are kept, so ordinary
    English phrases survive unchanged.
    """
    s = re.sub('[ \n\t]+', ' ', s)

    # Character ranges treated as "CJK" for space-removal purposes.
    cjk_blocks = ''.join((
        '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F',  # HIRAGANA
        '\u30A0-\u30FF',  # KATAKANA
        '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
        '\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
    ))
    latin = '\u0000-\u007F'

    def squeeze(left, right, text):
        # Drop each space flanked by a *left*-class and a *right*-class
        # character.  A single sub() pass skips every other gap in runs
        # like "あ い う" (matches cannot overlap), so loop until no
        # candidate remains.
        gap = re.compile('([{}]) ([{}])'.format(left, right))
        while gap.search(text):
            text = gap.sub(r'\1\2', text)
        return text

    s = squeeze(cjk_blocks, cjk_blocks, s)
    s = squeeze(cjk_blocks, latin, s)
    s = squeeze(latin, cjk_blocks, s)
    return s
def normalize_text(s):
    """Normalize mixed Japanese/ASCII text (neologd-style convention).

    Folds fullwidth alphanumerics to halfwidth, unifies punctuation,
    hyphen, choonpu and tilde variants, removes redundant spaces around
    CJK text, and strips URLs.
    """
    s = s.strip()
    # Fold fullwidth alphanumerics and halfwidth kana to canonical forms.
    s = unicode_normalize('0-9A-Za-z。-゚', s)

    def _trans(src, dst):
        # Codepoint-to-codepoint translation table for str.translate.
        return dict(zip(map(ord, src), map(ord, dst)))

    # Map ASCII punctuation to the fullwidth variants handled below.
    s = s.translate(_trans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~。、・「」',
                           '!”#$%&’()*+,−./:;<=>?@[¥]^_`{|}〜。、・「」'))

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # unify hyphen look-alikes
    s = re.sub('[﹣-ー—―─━ー]+', 'ー', s)  # unify prolonged-sound marks (choonpu)
    s = re.sub('[~∼∾〜〰~]', '', s)  # drop tilde variants
    s = remove_extra_spaces(s)
    # Fold fullwidth punctuation back to halfwidth, keeping =,・,「,」.
    s = unicode_normalize('!”#$%&’()*+,−./:;<>?@[¥]^_`{|}〜', s)
    s = re.sub('[’]', '\'', s)
    s = re.sub('[”]', '"', s)
    # Strip URLs last, once spacing has been normalized.
    s = re.sub(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)", "", s)
    return s
if __name__ == "__main__":
    # Self-test: (expected, raw) pairs, one per normalization rule.
    cases = [
        ("0123456789", "0123456789"),
        ("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
        ("abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"),
        ("!\"#$%&'()*+,-./:;<>?@[¥]^_`{|}", "!”#$%&’()*+,−./:;<>?@[¥]^_`{|}"),
        ("=。、・「」", "=。、・「」"),
        ("ハンカク", "ハンカク"),
        ("o-o", "o₋o"),
        ("majikaー", "majika━"),
        ("わい", "わ〰い"),
        ("スーパー", "スーパーーーー"),
        ("!#", "!#"),
        ("ゼンカクスペース", "ゼンカク スペース"),
        ("おお", "お お"),
        ("おお", " おお"),
        ("おお", "おお "),
        ("検索エンジン自作入門を買いました!!!",
         "検索 エンジン 自作 入門 を 買い ました!!!"),
        ("アルゴリズムC", "アルゴリズム C"),
        ("PRML副読本", " PRML 副 読 本 "),
        ("Coding the Matrix", "Coding the Matrix"),
        ("南アルプスの天然水Sparking Lemonレモン一絞り",
         "南アルプスの 天然水 Sparking Lemon レモン一絞り"),
    ]
    for expected, raw in cases:
        assert expected == normalize_text(raw)