-
Notifications
You must be signed in to change notification settings - Fork 23
/
asciify.py
47 lines (38 loc) · 1.77 KB
/
asciify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from common import pairwise
from unicode2ascii import read_u2a_data, convert_u2a
def document_to_ascii(document):
    """Rewrite the texts and offsets of `document` in place using
    convert_u2a().

    For each sentence, the head (text before the first token) and
    the tail of each token (text between the token and the next
    token, or the sentence end for the last token) are kept
    unchanged, while every token text is passed through
    convert_u2a(). Token start/end offsets, sentence base offsets,
    sentence texts and finally the document text are then rebuilt
    from these pieces.

    The pre-conversion text is stored in the `unicode_text`
    attribute of the document, of each sentence and of each token,
    and each token additionally gets a `tail` attribute.

    Heads and tails are asserted to be empty or whitespace-only,
    and is_valid() is asserted for each sentence and for the
    document after the rewrite. Returns None.
    """
    # NOTE(review): presumably loads the mapping data that
    # convert_u2a() relies on -- confirm in unicode2ascii.
    read_u2a_data()

    document.unicode_text = document.text

    # Running position in the converted document text.
    cursor = 0
    for sentence in document.sentences:
        sentence.unicode_text = sentence.text
        # End of the sentence in the original (unconverted) text;
        # must be computed before base_offset is overwritten below.
        original_end = sentence.base_offset + len(sentence.text)
        new_base_offset = cursor

        if not sentence.tokens:
            # No tokens: the whole sentence counts as head.
            head = sentence.text
        else:
            first_start = sentence.tokens[0].start
            head = document.text[sentence.base_offset:first_start]
        assert not head or head.isspace()
        cursor += len(head)

        for token, following in pairwise(sentence.tokens,
                                         include_last=True):
            token.unicode_text = token.text
            # The tail runs up to the next token, or up to the end
            # of the sentence for the last token. Offsets of the
            # following token are still the original ones here.
            if following is None:
                tail_limit = original_end
            else:
                tail_limit = following.start
            token.tail = document.text[token.end:tail_limit]
            assert not token.tail or token.tail.isspace()

            token.text = convert_u2a(token.text)
            converted_end = cursor + len(token.text)
            token.start = cursor
            token.end = converted_end
            cursor = converted_end + len(token.tail)

        sentence.base_offset = new_base_offset
        sentence.text = head + ''.join(tok.text + tok.tail
                                       for tok in sentence.tokens)
        assert sentence.is_valid()

    document.text = ''.join(s.text for s in document.sentences)
    assert document.is_valid()