-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhdt_sentences.py
40 lines (35 loc) · 926 Bytes
/
hdt_sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env python3
#
#
import re
import getopt, sys, os
# Set to True to enable diagnostic output from DBG().
debug = False


def DBG(*strs):
    """Write a debug line to stderr when the module-level ``debug`` flag is set.

    Every positional argument is converted with ``str()`` and the results are
    concatenated without a separator, prefixed with ``DBG:`` and terminated
    by a newline. Nothing is written while ``debug`` is false.
    """
    if debug:
        # Convert each argument on its own; calling str() on the whole tuple
        # would emit the tuple's repr instead of the joined arguments.
        sys.stderr.write("DBG:" + "".join(str(part) for part in strs) + "\n")
# Default input file, used when no -f option is given.
filename = "hdt_Books_forFrog.col"


def _sentences(lines):
    """Yield one space-joined sentence string per utterance found in *lines*.

    Each line is stripped and has every ``’`` character removed. A line that
    then starts with ``<utt>`` closes the current sentence: the first fields
    collected so far are joined with single spaces and yielded (an empty
    string if nothing was collected). Any other line contributes its first
    whitespace-separated field, but only when it has exactly three fields;
    all other lines are ignored. Fields still pending when the input ends
    are yielded as a final sentence instead of being dropped.
    """
    tokens = []
    for raw in lines:
        # this ’ causes problems for CLTK tagger
        line = raw.strip().replace("’", "")
        if line.startswith("<utt>"):
            yield " ".join(tokens)
            tokens = []
            continue
        fields = line.split()
        if len(fields) != 3:
            continue
        tokens.append(fields[0])
    if tokens:
        # Input did not end with an <utt> marker: flush the last sentence.
        yield " ".join(tokens)


def main(argv=None):
    """Convert a column file into a one-sentence-per-line ``.s`` file.

    *argv* is the list of command-line arguments without the program name;
    it defaults to ``sys.argv[1:]``. The only option is ``-f FILE``, which
    selects the input file (default: the module-level ``filename``). The
    output is written next to the input, to the input path plus ``.s``.

    On an option-parsing error the message is printed and the process exits
    with status 1.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, _args = getopt.getopt(argv, "f:", [])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)

    path = filename
    for opt, value in opts:
        # Exact comparison: `opt in ("-f")` would be a substring test on a str.
        if opt == "-f":
            path = value
        else:
            print("unhandled option " + opt)
            sys.exit(1)

    out_path = path + ".s"
    # Explicit UTF-8: the filter above matches a non-ASCII character, so the
    # result must not depend on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as src, \
            open(out_path, "w", encoding="utf-8") as dst:
        for sentence in _sentences(src):
            print(sentence, file=dst)


if __name__ == "__main__":
    main()