-
Notifications
You must be signed in to change notification settings - Fork 0
/
ro_pretokenizer.py
148 lines (118 loc) · 4.62 KB
/
ro_pretokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import sys
from tokenizers import PreTokenizedString
from tokenizers import NormalizedString
from rodna.tokenizer import RoTokenizer
class RomanianPreTokenizer(object):
def __init__(self) -> None:
self._romanian_tokenizer = RoTokenizer()
@property
def maxwordlen(self) -> int:
"""The length of the longest word/MWE/ABBR for Romanian."""
return self._romanian_tokenizer._maxwordlen
def _romanian_split(self, index: int, normstr: NormalizedString) -> list[NormalizedString]:
norm_string = normstr.normalized
result = []
if not norm_string:
result.append(norm_string)
return result
# end if
ro_tokens = self._romanian_tokenizer.tokenize(
input_string=norm_string)
loff = 0
roff = 0
crt_token = ro_tokens.pop(0)
out_of_sync = False
while True:
for i in range(len(crt_token)):
if crt_token[i] == norm_string[roff] or \
(crt_token[i] == '_' and norm_string[roff] == ' '):
roff += 1
else:
print(f'Current [{crt_token}] token out of sync (i = {i}, roff = {roff}), in normalized string [{norm_string}]',
file=sys.stderr, flush=True)
out_of_sync = True
break
# end if
# end for
result.append(normstr.slice(range=(loff, roff)))
if roff == len(norm_string):
# Reached the EOS
break
elif out_of_sync:
# There is an offset synchronization problem
# Add the rest of the string as a single slice
# and bail out
if roff < len(norm_string):
result.append(normstr.slice(range=(roff, len(norm_string))))
# end if
break
# end if
# We can have only one space!
if norm_string[roff] == ' ':
roff += 1
# end if
loff = roff
crt_token = ro_tokens.pop(0)
# end while
return result
def pre_tokenize(self, pretok: PreTokenizedString):
pretok.split(func=self._romanian_split)
def pre_tokenize_str(self, sequence: str) -> list[tuple[str, tuple[int, int]]]:
result = []
if not sequence:
result.append((sequence, (0, len(sequence))))
return result
# end if
ro_tokens = self._romanian_tokenizer.tokenize(
input_string=sequence)
loff = 0
roff = 0
crt_token = ro_tokens.pop(0)
out_of_sync = False
while True:
for i in range(len(crt_token)):
if crt_token[i] == sequence[roff] or \
(crt_token[i] == '_' and sequence[roff] == ' '):
roff += 1
else:
print(f'Current [{crt_token}] token out of sync (i = {i}, roff = {roff}), in normalized string [{sequence}]',
file=sys.stderr, flush=True)
out_of_sync = True
break
# end if
# end for
result.append((sequence[loff:roff], (loff, roff)))
if roff == len(sequence):
# Reached the EOS
break
elif out_of_sync:
# There is an offset synchronization problem
# Add the rest of the string as a single slice
# and bail out
if roff < len(sequence):
result.append((sequence[roff:], (roff, len(sequence))))
# end if
break
# end if
# We can have only one space!
if sequence[roff] == ' ':
roff += 1
# end if
loff = roff
crt_token = ro_tokens.pop(0)
# end while
return result
class TrainingPreTokenizer(object):
"""Only used when training on pre-tokenized data.
Just split at '_tk_' boundary and remove it."""
delimiter = '_tk_'
def _train_split(self, index: int, normstr: NormalizedString) -> list[NormalizedString]:
normstr.strip()
if TrainingPreTokenizer.delimiter in normstr.normalized:
return normstr.split(pattern=TrainingPreTokenizer.delimiter,
behavior='removed')
else:
return [normstr]
# end if
def pre_tokenize(self, pretok: PreTokenizedString):
pretok.split(func=self._train_split)