forked from apertium/apertium-jpn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenize.py
32 lines (27 loc) · 813 Bytes
/
tokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import MeCab
import sys
def process_text(sin, sout, tagger=None):
    """Tokenize the text read from *sin* and write the result to *sout*.

    Text outside square brackets is passed to the tagger and written with
    the tagger's trailing whitespace removed. Text between ``[`` and ``]``
    is also passed through the tagger, but every whitespace character is
    then removed from the result, so a bracketed span is emitted as one
    unbroken unit (presumably these are Apertium superblanks -- confirm
    against the pipeline that feeds this script).

    Args:
        sin: Readable text stream; it is consumed in full with ``read()``.
        sout: Writable text stream; receives a single ``write()`` call.
        tagger: Optional object with a ``parse(str) -> str`` method. When
            omitted, ``MeCab.Tagger("-Owakati")`` is created, which is what
            this function always used before the parameter existed.
    """
    if tagger is None:
        tagger = MeCab.Tagger("-Owakati")

    # Collect fragments in lists and join once; repeated ``str +=`` inside
    # the per-character loop is quadratic in the worst case.
    pieces = []
    buffer = []
    in_bracket = False

    def flush():
        """Return the stripped text buffered so far and empty the buffer."""
        chunk = "".join(buffer).strip()
        buffer.clear()
        return chunk

    for char in sin.read():
        if char == "[":
            chunk = flush()
            if chunk:
                # A single space separates the preceding tokens from "[".
                pieces.append(tagger.parse(chunk).rstrip() + " ")
            pieces.append("[")
            in_bracket = True
        elif char == "]" and in_bracket:
            # split() + join drops every whitespace character, including
            # the separators the tagger inserted between tokens.
            pieces.append("".join(tagger.parse(flush()).split()) + "]")
            in_bracket = False
        else:
            # Includes a "]" that has no opening "[": it is kept as plain
            # text rather than being silently removed from the input.
            buffer.append(char)

    # Whatever follows the last bracket (or an unterminated "[" span).
    chunk = flush()
    if chunk:
        pieces.append(tagger.parse(chunk).rstrip())
    sout.write("".join(pieces))
# Script entry point: run as a filter from standard input to standard output.
if __name__ == "__main__":
    process_text(sin=sys.stdin, sout=sys.stdout)