-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathidentify_prenucleus.py
265 lines (230 loc) · 11.3 KB
/
identify_prenucleus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
[*conll2praat*] For each *prenucleus* of the transcription stored in the
*tier* **tx_new**, identify an ordered set of *labelled time intervals*, one
per *illocutionary component (IC)*.
"""
# NOTE(review): neither flag is read anywhere in this file. Presumably they
# are meant to switch warning/error printing on or off -- confirm against
# exporter_lib, whose star-import further down may rebind both names.
WARNING_EN = False
ERR_EN = False
# TODO: solve issues with embedded prenucleus
def core_routine(sents, srcCol, pauseSign, dest, ref, num_sent_to_read=-1):
    """Locate each sentence on a reference tier and copy it to a destination tier.

    Every sentence is split on single spaces and handed to ``findTimes``,
    which returns ``(begin, end, cursor_out, best_dist)``.  A local search
    (``lowerbound=cursor``, ``upperbound=cursor + 50``, ``thld=0.10``) is
    tried first.  When it does not return a cursor at or beyond the current
    one, a second search over the whole tier (``lowerbound=0``,
    ``upperbound=-1``) is made with the stricter ``thld=0.05``.

    Args:
        sents: iterable of sentence strings.
        srcCol: unused; kept so existing callers keep working.
        pauseSign: pause marker forwarded to ``findTimes``.
        dest: destination tier; ``dest.add_interval(begin=, end=, value=,
            check=)`` is called once per located sentence.
        ref: reference tier forwarded to ``findTimes``.
        num_sent_to_read: maximum number of sentences to process; a value
            <= 0 means "no limit".

    Returns:
        ``(err_num, dist_tot)`` where ``err_num`` counts the sentences that
        could not be located or whose interval could not be added, and
        ``dist_tot`` is the sum of the ``best_dist`` returned by the last
        search made for each processed sentence.
    """
    cursor = 0
    err_num = 0
    dist_tot = 0
    for n, sent in enumerate(sents):
        # Stop *before* processing once the requested number of sentences has
        # been handled, so exactly ``num_sent_to_read`` sentences contribute
        # to both the error count and the accumulated distance.
        if 0 < num_sent_to_read <= n:
            break
        tokens = sent.split(' ')
        # local search, starting from where the previous sentence ended
        begin, end, cursor_out, best_dist = findTimes(tokens,
                                                      ref,
                                                      lowerbound=cursor,
                                                      upperbound=cursor + 50,
                                                      thld=0.10,
                                                      pauseSign=pauseSign)
        if cursor_out >= cursor:
            cursor = cursor_out
            deb_print("L{} local (begin,end) = ({:8.3f},{:8.3f})".format(
                n, begin, end))
            # write the sentence into the destination tier
            try:
                dest.add_interval(begin=begin, end=end, value=sent, check=True)
            except Exception as e:
                # best effort: report the failure and carry on with the next sentence
                err_print(u"Line {} @ CoNLL : {}".format(n, e))
                err_num += 1
        else:
            # global search, with a stricter distance threshold
            begin, end, cursor_out, best_dist = findTimes(tokens,
                                                          ref,
                                                          lowerbound=0,
                                                          upperbound=-1,
                                                          thld=0.05,
                                                          pauseSign=pauseSign)
            if cursor_out >= 0:
                # write the sentence into the destination tier
                # (the running cursor is deliberately left as it was)
                try:
                    dest.add_interval(begin=begin,
                                      end=end,
                                      value=sent,
                                      check=True)
                    deb_print("Line {} global (begin,end) = ({:8.3f},{:8.3f})".
                              format(n, begin, end))
                except Exception as e:
                    err_print(u"Line {} @ CoNLL : {}".format(n, e))
                    err_num += 1
            else:
                err_print("Search fails @ Line {} of the CoNLL".format(n))
                err_num += 1
        dist_tot += best_dist
    return err_num, dist_tot
def core_routine_with_known_ref_tier(tg,
                                     sents,
                                     srcCol,
                                     pauseSign,
                                     valideRefTierName,
                                     num_sent_to_read=-1):
    """Score one candidate reference tier by a trial run of ``core_routine``.

    The sentences are written into a temporary tier named "test", which is
    removed again before returning, so the TextGrid is left without it.

    Args:
        tg: TextGrid object providing ``get_tier``, ``add_tier`` and
            ``remove_tier``.
        sents: sentences forwarded to ``core_routine``.
        srcCol: forwarded to ``core_routine``.
        pauseSign: forwarded to ``core_routine``.
        valideRefTierName: name of the tier to use as time reference.
        num_sent_to_read: forwarded to ``core_routine``.

    Returns:
        The ``(err_num, dist)`` pair returned by ``core_routine``.
    """
    # read the reference tier
    refTier = tg.get_tier(valideRefTierName)
    # create the scratch destination tier
    testDestTier = tg.add_tier("test")
    try:
        return core_routine(sents, srcCol, pauseSign, testDestTier, refTier,
                            num_sent_to_read)
    finally:
        # always drop the scratch tier, even when the trial run raises, so a
        # failed attempt cannot leave a stray "test" tier behind
        tg.remove_tier("test")
def detect_ref_tier(tg,
                    sents,
                    srcCol,
                    pauseSign,
                    destTierName,
                    avaliableTierNames,
                    num_sent_to_read=10):
    """Pick the tier that works best as time reference for ``sents``.

    Each candidate tier is tried in turn through
    ``core_routine_with_known_ref_tier`` and the one with the smallest
    accumulated distance is returned.

    Args:
        tg: TextGrid object forwarded to ``core_routine_with_known_ref_tier``.
        sents: sentences used for the trial runs.
        srcCol: forwarded to the trial runs.
        pauseSign: forwarded to the trial runs.
        destTierName: unused; kept so existing callers keep working.
        avaliableTierNames: names of the candidate tiers.
        num_sent_to_read: number of sentences tried per tier.

    Returns:
        ``(best_ref_name, best_dist)``: the selected tier name and its
        accumulated distance.

    Raises:
        ValueError: if there is no candidate tier to try.
    """
    warning_print(
        'Registered time reference tiers do not exist in TextGrid, launch auto-detection !'
    )
    dist_by_tier = collections.Counter()
    # try every tier as time reference, one by one
    for tierName in avaliableTierNames:
        info_print(u'try {}'.format(tierName))
        # the error count of the trial run is not used for the selection
        _, dist_by_tier[tierName] = core_routine_with_known_ref_tier(
            tg, sents, srcCol, pauseSign, tierName, num_sent_to_read)
    if not dist_by_tier:
        # explicit failure instead of an opaque IndexError further down
        raise ValueError(
            'no candidate tier available for time reference auto-detection')
    # most_common() sorts by decreasing value, so the last entry is the tier
    # with the smallest accumulated distance
    best_ref_name, best_dist = dist_by_tier.most_common()[-1]
    return best_ref_name, best_dist
"""
[begin, end, cursor_out, best_dist] = findTimes(tokens,
ref,
lowerbound=cursor,
upperbound=cursor + 50,
thld=0.10,
pauseSign=pauseSign)
"""
from exporter_lib import *
class subRefTier:
    """View of a tier restricted to the intervals lying inside a time window.

    Only ``get_all_intervals`` is provided; instances are passed to
    ``findTimes`` as its ``refTier`` argument.
    """

    def __init__(self, tier, tmin, tmax):
        # keep the intervals that start at or after tmin and end at or before tmax
        self.intervals = [
            interval for interval in tier.get_all_intervals()
            if interval[0] >= tmin and interval[1] <= tmax
        ]

    def get_all_intervals(self):
        """Return the intervals retained at construction time."""
        return self.intervals


def split_illocutionary_units(sent):
    """Split a sentence into its non-blank illocutionary units (IUs).

    Delimiters:
        '//'        typical boundary between IUs
        '//+'       boundary of parallel IUs
        '//='       also treated as a boundary (meaning not documented in
                    the original source)
        '[' , ']'   boundaries of reported speech

    Returns:
        The pieces between delimiters, unstripped, with blank pieces removed.
    """
    # raw string: '\+', '\=', '\[' and '\]' are regex escapes, not string escapes
    return [IU for IU in re.split(r'//\+|//\=|//|\[|\]', sent) if IU.strip()]


def collect_ic_intervals(tx, refTier):
    """Locate the illocutionary components (ICs) of every prenucleus in ``tx``.

    An IU is considered only when it contains ' < '.  Its ICs are the
    '<'-separated segments except the last one (presumably the last segment
    is the nucleus itself -- confirm).  The IU is first located inside the
    time span of its sentence, then each IC is located inside the time span
    found for the IU.

    Args:
        tx: tier whose intervals unpack as ``(tmin, tmax, sentence)``.
        refTier: reference tier used for the time alignment.

    Returns:
        A list with one entry per IU that has at least one non-empty IC; each
        entry is a list of ``(tmin, tmax, text)`` tuples.
    """
    all_IC_intervals = []
    for tmin_sent, tmax_sent, sent in tx.get_all_intervals():
        if not sent:
            continue
        for IU in split_illocutionary_units(sent):
            if ' < ' not in IU:
                continue
            # temporal limits of the IU, searched inside those of the sentence
            ref = subRefTier(refTier, tmin_sent, tmax_sent)
            tmin_IU, tmax_IU, _, _ = findTimes(tokens=IU.split(' '),
                                               refTier=ref,
                                               lowerbound=0,
                                               upperbound=-1,
                                               thld=1000,
                                               pauseSign="#")
            ICs = [IC.strip() for IC in IU.split('<')[:-1]]
            ICs = [IC for IC in ICs if IC]
            if not ICs:
                continue
            # the window is the same for every IC of this IU: build it once
            ref = subRefTier(refTier, tmin_IU, tmax_IU)
            IC_intervals = []
            cursor = 0
            for IC in ICs:
                # each search resumes from the cursor returned by the previous one
                tmin_IC, tmax_IC, cursor, _ = findTimes(tokens=IC.split(' '),
                                                        refTier=ref,
                                                        lowerbound=cursor,
                                                        upperbound=-1,
                                                        thld=1000,
                                                        pauseSign="#")
                IC_intervals.append((tmin_IC, tmax_IC, IC))
            all_IC_intervals.append(IC_intervals)
    return all_IC_intervals


def process_textgrid(infile_path, out_dir):
    """Add the prenucleus tiers to one TextGrid file and save it in ``out_dir``.

    Two tiers are added: 'prenucleus_ic_key', labelled '<k>:<n>' (k = index
    of the IU among those having a prenucleus, n = index of the IC inside
    that IU), and 'prenucleus_ic_value', labelled with the IC text.  The
    result is written under the input base name modified by
    ``insert_to_basename(infile_path, '_ADDED_PRENUCLEUS', 'TextGrid')``.
    """
    outfile_path = insert_to_basename(infile_path, '_ADDED_PRENUCLEUS',
                                      'TextGrid')
    outfile_path = os.path.join(out_dir, os.path.basename(outfile_path))
    encoding = get_encoding(infile_path)
    txTierName = 'tx_new'
    tg = TextGridPlus(file_path=infile_path,
                      codec=encoding,
                      analorFileEn=javaobj_installed)
    tx = tg.get_tier(txTierName)
    # candidate reference tiers: everything except the transcription tiers;
    # collected before the two output tiers are added
    avaliableTierNames = [
        t.name for t in tg.get_tiers()
        if t.name != 'tx' and t.name != txTierName
    ]
    tg.add_tier('prenucleus_ic_key')
    tg.add_tier('prenucleus_ic_value')
    prenucleus_ic_id = tg.get_tier('prenucleus_ic_key')
    prenucleus_ic_value = tg.get_tier('prenucleus_ic_value')
    sents = [interval[-1] for interval in tx.get_all_intervals()]
    best_ref_name, best_dist = detect_ref_tier(
        tg,
        sents,
        srcCol=2,
        pauseSign="#",
        destTierName=txTierName,
        avaliableTierNames=avaliableTierNames,
        num_sent_to_read=10)
    info_print('Set \'{}\' as time reference tier'.format(best_ref_name))
    refTier = tg.get_tier(best_ref_name)

    all_IC_intervals = collect_ic_intervals(tx, refTier)
    for k, ICs_of_IU in enumerate(all_IC_intervals):
        for n, (tmin, tmax, IC) in enumerate(ICs_of_IU):
            print("{}.{}, ({},{}), '{}".format(k, n, tmin, tmax, IC))
            prenucleus_ic_id.add_interval(begin=tmin,
                                          end=tmax,
                                          value='{}:{}'.format(k, n),
                                          check=True)
            prenucleus_ic_value.add_interval(begin=tmin,
                                             end=tmax,
                                             value=IC,
                                             check=True)
    print('{} -> {}'.format(infile_path, outfile_path))
    tg.to_file(filepath=outfile_path, codec='utf-8', mode='binary')


def main():
    """Command-line entry point.

    praat_in:  input file path or folder path of praat TextGrid file(s)
    praat_out: output folder path for the output praat TextGrid file(s)
    """
    parser = argparse.ArgumentParser(description='identifier un ensemble ordonné des intervalles temporelles étiquetées aux composants illocutoires (CI) à chacun des *prénoyeaux* la transcription reportée en tant que la tire tx_new.')
    parser.add_argument('praat_in', help='path to a input folder or a single input file')
    parser.add_argument('praat_out', help='path to output folder')
    args = parser.parse_args()
    # output folder to create if not exists
    if not os.path.exists(args.praat_out):
        os.makedirs(args.praat_out)
    # input file paths
    infile_paths = []
    if os.path.isfile(args.praat_in):
        infile_paths = [args.praat_in]
    elif os.path.isdir(args.praat_in):
        infile_paths = [os.path.join(args.praat_in, infile) for infile in os.listdir(args.praat_in)]
    else:
        # say so instead of silently doing nothing
        print('Input path not found: {}'.format(args.praat_in))
    for infile_path in infile_paths:
        # best effort per file: one unreadable file must not stop the batch
        try:
            process_textgrid(infile_path, args.praat_out)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    main()