@@ -50,6 +50,10 @@ tgt_case=tc
remove_nonverbal=true # remove non-verbal labels such as "( Applaus )"
# NOTE: IWSLT community accepts this setting and therefore we use this by default

# iwslt segmentation related
# Both values are forwarded to local/merge_short_segments.py in stage 1
# (max_interval as --min_interval, max_duration as --max_duration) and are
# also embedded in the name of the merged data directories.
# NOTE(review): units are defined by merge_short_segments.py - confirm there.
max_interval=200
max_duration=1500

# bpemode (unigram or bpe)
nbpe=16000
bpemode=bpe
@@ -69,16 +73,17 @@ set -o pipefail
# corpus locations
mustc_dir=../../must_c
mustc_v2_dir=../../must_c_v2
stted_dir=../../iwslt18

# test data directory
# (handed to local/score_bleu_reseg.sh when scoring in stage 5)
iwslt_test_data_dir=/n/rd8/iwslt18

train_set=train.de
train_dev=dev.de
# sets decoded in stage 5
trans_subset="et_mustc_dev_org.de et_mustc_tst-COMMON.de et_mustc_tst-HE.de"
# evaluation sets whose JSON is built from data/<set>/text.<case>
trans_set="et_mustc_dev_org.de et_mustc_tst-COMMON.de et_mustc_tst-HE.de \
 et_mustcv2_dev_org.de et_mustcv2_tst-COMMON.de et_mustcv2_tst-HE.de"
# IWSLT sets whose short segments are merged in stage 1; kept separate from
# trans_set because their JSON is built without text (--no_text true)
iwslt_test_set="et_stted_dev2010.de et_stted_tst2010.de et_stted_tst2013.de et_stted_tst2014.de et_stted_tst2015.de \
 et_stted_tst2018.de et_stted_tst2019.de"
82
87
83
88
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
84
89
# ## Task dependent. You have to make data the following preparation part by yourself.
@@ -146,9 +151,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
146
151
steps/make_fbank_pitch.sh --cmd " $train_cmd " --nj 32 --write_utt2num_frames true \
147
152
data/${x} exp/make_fbank/${x} ${fbankdir}
148
153
utils/fix_data_dir.sh --utt_extra_files " text.tc text.lc text.lc.rm" data/${x}
154
+ rm data/${x} /segments
155
+ rm data/${x} /wav.scp
149
156
done
150
- rm data/* /segments
151
- rm data/* /wav.scp
152
157
153
158
for lang in en de; do
154
159
utils/combine_data.sh --extra_files " text.tc text.lc text.lc.rm" data/train.${lang} data/tr_mustc.${lang} data/tr_mustcv2.${lang} data/tr_stted.${lang}
@@ -184,6 +189,32 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
184
189
dump.sh --cmd " $train_cmd " --nj 32 --do_delta $do_delta \
185
190
data/${x} /feats.scp data/${train_set} /cmvn.ark exp/dump_feats/trans/${x} ${feat_trans_dir}
186
191
done

# concatenate short segments
# For every set in iwslt_test_set, build a re-segmented copy named
# <set>_merge<max_interval>_duration<max_duration>, then extract fbank+pitch
# features for it and dump them using the training set's cmvn.ark.
for x in ${iwslt_test_set}; do
    output_dir=${x}_merge${max_interval}_duration${max_duration}
    rm -rf "data/${output_dir}"
    cp -rf "data/${x}" "data/${output_dir}"
    # drop the copied frame counts; make_fbank_pitch.sh below is invoked with
    # --write_utt2num_frames true
    rm "data/${output_dir}/utt2num_frames"

    # reads the original segments file and writes segments/utt2spk/spk2utt
    # into the copy
    local/merge_short_segments.py \
        "data/${x}/segments" \
        "data/${output_dir}/segments" \
        "data/${output_dir}/utt2spk" \
        "data/${output_dir}/spk2utt" \
        --min_interval ${max_interval} \
        --max_duration ${max_duration} \
        --delimiter "_" || exit 1;

    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
        data/${output_dir} exp/make_fbank/${output_dir} ${fbankdir}
    utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${output_dir}

    feat_trans_dir=${dumpdir}/${output_dir}/delta${do_delta}; mkdir -p ${feat_trans_dir}
    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
        data/${output_dir}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/trans/${output_dir} ${feat_trans_dir}
done
187
218
fi
188
219

# unit dictionary file; its name encodes the training set, the BPE mode and
# size, and the target case. It is passed to data2json.sh when building JSON.
dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
@@ -212,22 +243,20 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "make json files"
# training set: JSON with target-side text
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang "de" \
    data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
for x in ${train_dev} ${trans_set} ${iwslt_test_set}; do
    if [[ ${x} = *tst20* ]] || [[ ${x} = *dev20* ]]; then
        # IWSLT dev20xx/tst20xx sets: use the merged-segment copy produced in
        # stage 1 and build the JSON without text (--no_text true)
        merged_set=${x}_merge${max_interval}_duration${max_duration}
        feat_trans_dir=${dumpdir}/${merged_set}/delta${do_delta}
        local/data2json.sh --feat ${feat_trans_dir}/feats.scp --no_text true \
            data/${merged_set} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
    else
        feat_trans_dir=${dumpdir}/${x}/delta${do_delta}
        data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${x}/text.${tgt_case} --bpecode ${bpemodel}.model --lang "de" \
            data/${x} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
    fi
done
225
257
226
258
# update json (add source references)
227
259
for x in ${train_set} ${train_dev} ${trans_set} ; do
228
- if [[ ${x} = * tst20* ]] || [[ ${x} = * dev20* ]]; then
229
- continue
230
- fi
231
260
feat_dir=${dumpdir} /${x} /delta${do_delta}
232
261
data_dir=data/$( echo ${x} | cut -f 1 -d " ." ) .en
233
262
update_json.sh --text ${data_dir} /text.${src_case} --bpecode ${bpemodel} .model \
@@ -297,6 +326,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
297
326
pids=() # initialize pids
298
327
for x in ${trans_subset} ; do
299
328
(
329
+ if [[ ${x} = * tst20* ]] || [[ ${x} = * dev20* ]]; then
330
+ x=${x} _merge${max_interval} _duration${max_duration}
331
+ fi
300
332
decode_dir=decode_${x} _$( basename ${decode_config% .* } )
301
333
feat_trans_dir=${dumpdir} /${x} /delta${do_delta}
302
334
@@ -317,7 +349,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
317
349
set=$( echo ${x} | cut -f 1 -d " ." | cut -f 3 -d " _" )
318
350
local/score_bleu_reseg.sh --case ${tgt_case} --bpe ${nbpe} --bpemodel ${bpemodel} .model \
319
351
--remove_nonverbal ${remove_nonverbal} \
320
- ${expdir} /${decode_dir} ${dict} ${iwslt_test_data } ${set}
352
+ ${expdir} /${decode_dir} ${dict} ${iwslt_test_data_dir } ${set}
321
353
else
322
354
score_bleu.sh --case ${tgt_case} --bpe ${nbpe} --bpemodel ${bpemodel} .model \
323
355
--remove_nonverbal ${remove_nonverbal} \
0 commit comments