-
Notifications
You must be signed in to change notification settings - Fork 5
/
conllu-text.py
103 lines (91 loc) · 1.84 KB
/
conllu-text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import sys;
def proc_tekst(blokk):
    """Rebuild a plain-text sentence from space-separated tokens.

    blokk is a string of tokens separated by single spaces. The tokens are
    glued back together with a space between them, except that no space is
    put in front of a closing punctuation token, and then a fixed series of
    textual substitutions tidies up brackets, quotation marks and dashes.
    Returns the resulting string.
    """
    # Tokens that attach directly to the preceding token.
    attach_left = (',', '.', ':', ';', '!', '?', '...')

    pieces = []
    for position, token in enumerate(blokk.split(' ')):
        # A separator goes before every token except the very first one
        # and the punctuation tokens listed above.
        if position > 0 and token not in attach_left:
            pieces.append(' ')
        pieces.append(token)
    text = ''.join(pieces)

    # Hacks: applied strictly in this order, because the dash rules rely on
    # ' - ' having been collapsed to '-' first.
    substitutions = (
        ('( ', '('),
        (' )', ')'),
        ('« ', '«'),
        (' »', '»'),
        ('“ ', '“'),
        (' ”', '”'),
        (' - ', '-'),
        ('!-', '! - '),
        (',-', ', - '),
        (':-', ': - '),
        ('?-', '? - '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Straight double quotes: drop the spaces that directly follow an
    # opening quote (an odd-numbered '"'), e.g. '" Stop! "' -> '"Stop! "'.
    kept = []
    quotes_seen = 0
    previous = ''
    for char in text:
        if char == '"':
            quotes_seen += 1
        elif char == ' ' and previous == '"' and quotes_seen % 2 == 1:
            # 'previous' is deliberately left alone so that a run of
            # spaces after the opening quote is removed entirely.
            continue
        kept.append(char)
        previous = char
    text = ''.join(kept)

    # Remove the space in front of a quote that is followed by a space or
    # a comma (the closing-quote cases).
    text = text.replace(' " ', '" ')
    text = text.replace(' ",', '",')
    return text
# Script body: read blank-line-separated blocks from stdin and echo each one
# to stdout, preceded by a '# sent_id' line and a '# text = ...' line that is
# reconstructed from the second tab-separated column of the token rows.
#
# Usage: conllu-text.py [PREFIX] < input > output

# Running block counter; used to build a sent_id when the block has none.
idx = 0
# Prefix for generated sentence ids, overridable by the first argument.
prefiks = "undefined"
# sys.argv[0] is the script name, so an argument is present only when there
# are at least two entries. (The previous '>= 1' test was always true and
# raised IndexError when the script was run without arguments.)
if len(sys.argv) >= 2:
    prefiks = sys.argv[1]

for blokk in sys.stdin.read().split('\n\n'):
    sent_id = ''
    b = ''
    for line in blokk.split('\n'):
        # An empty line ends the block.
        if line.strip() == '':
            break
        if line[0] == '#':
            # Remember a comment line mentioning 'sent_id' (the last one wins);
            # comment lines never contribute to the text.
            if line.count('sent_id') > 0:
                sent_id = line.strip()
            continue
        row = line.split('\t')
        # Skip rows whose first column contains '.' or '-'
        # (presumably CoNLL-U empty nodes and multiword-token ranges).
        if '.' in row[0] or '-' in row[0]:
            continue
        # Accumulate the second column, space-separated.
        if b == '':
            b = row[1]
        else:
            b = b + ' ' + row[1]

    tekst = proc_tekst(b)

    if blokk.strip() != '':
        if sent_id == '':
            print('# sent_id = %s::%d' % (prefiks, idx))
        else:
            # NOTE(review): the same comment line is printed again by the
            # echo loop below, so an existing sent_id appears twice in the
            # output -- confirm whether that is intended.
            print(sent_id)
        print('# text = %s' % (tekst))

    # Echo the original block unchanged, up to its first empty line.
    for line in blokk.split('\n'):
        if line.strip() == '':
            break
        print(line)

    # Blank separator line after every non-empty block.
    if blokk.strip() != '':
        print('')

    # The counter advances for every block, including empty ones.
    idx += 1