Skip to content

Commit 827c90b

Browse files
committed Apr 22, 2021
Concatenate short segments
1 parent f67a9f6 commit 827c90b

File tree

2 files changed

+166
-13
lines changed

2 files changed

+166
-13
lines changed
 
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#! /usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
# Copyright 2021 Kyoto University (Hirofumi Inaguma)
5+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6+
7+
"""Merge adjacent utterances."""
8+
9+
10+
import argparse
11+
import codecs
12+
from collections import deque
13+
14+
# Command-line interface. ``args`` is parsed at import time and read as a
# module-level global by the functions below.
parser = argparse.ArgumentParser(description="Merge adjacent utterances.")
parser.add_argument("segments", type=str, help="path to segment file")

parser.add_argument("output_segments", type=str, help="path to output segment file")
parser.add_argument("output_utt2spk", type=str, help="path to output utt2spk file")
parser.add_argument("output_spk2utt", type=str, help="path to output spk2utt file")

# NOTE: despite its name, this value acts as an upper bound on the gap:
# two adjacent segments are merged only when the gap is strictly smaller.
parser.add_argument(
    "--min_interval",
    type=int,
    default=200,
    help="merge adjacent segments only if the gap between them is shorter "
    "than this value [frame]",
)
parser.add_argument(
    "--max_duration",
    type=int,
    default=1500,
    help="maximum duration of a merged segment [frame]",
)
parser.add_argument(
    "--delimiter", type=str, default="_", help="delimiter on utt_id start_time end_time"
)
args = parser.parse_args()
29+
30+
31+
def merge(segments, segments_dict, min_interval=None, max_duration=None, delimiter=None):
    """Merge adjacent segments of one speaker and re-key ``segments_dict``.

    Two neighbouring segments are joined when the gap between them is strictly
    smaller than ``min_interval`` and the joined span is strictly shorter than
    ``max_duration``. Passes are repeated until a pass performs no merge.

    Args:
        segments: iterable (the script passes a deque) of
            ``(utt_ids, start, end)`` tuples in time order, where ``utt_ids``
            is a list of utterance IDs and ``start``/``end`` are numbers in
            the same unit as the thresholds (the script uses 10 ms frames).
            The iterable and its ID lists are not modified.
        segments_dict: dict mapping utterance ID to a ``(start, end)`` pair.
            It is updated in place: the entries of merged utterances are
            replaced by a single entry keyed
            ``<spk><delimiter><first start><delimiter><last end>``, built from
            the ID fields of the first and the last merged utterance.
        min_interval: gap threshold; defaults to ``args.min_interval``.
        max_duration: duration threshold; defaults to ``args.max_duration``.
        delimiter: field separator inside utterance IDs; defaults to
            ``args.delimiter``.

    Returns:
        The same ``segments_dict`` object, after the update.
    """
    # Fall back to the command-line options for anything not given explicitly.
    if min_interval is None:
        min_interval = args.min_interval
    if max_duration is None:
        max_duration = args.max_duration
    if delimiter is None:
        delimiter = args.delimiter

    # Work on copies so the caller's container and ID lists stay untouched.
    current = [(list(utt_ids), start, end) for utt_ids, start, end in segments]
    if not current:
        # Nothing to merge for an empty group.
        return segments_dict

    while True:
        num_merge = 0
        merged = []
        ids_prev, start_prev, end_prev = current[0]
        for utt_ids, start, end in current[1:]:
            interval = start - end_prev
            duration = end - start_prev
            if interval < min_interval and duration < max_duration:
                # Absorb this segment into the running group.
                end_prev = end
                ids_prev.extend(utt_ids)
                num_merge += 1
            else:
                # Close the running group and start a new one here.
                merged.append((ids_prev, start_prev, end_prev))
                ids_prev, start_prev, end_prev = utt_ids, start, end

        # Close the final group; ``end_prev`` is valid even when the loop
        # above never ran (single-segment input).
        merged.append((ids_prev, start_prev, end_prev))
        current = merged

        if num_merge == 0:
            break

    for utt_ids, _, _ in current:
        if len(utt_ids) == 1:
            # Nothing was merged into this segment; keep its entry as is.
            continue

        first, last = utt_ids[0], utt_ids[-1]
        first_fields = first.split(delimiter)
        new_utt_id = delimiter.join(
            (first_fields[0], first_fields[1], last.split(delimiter)[2])
        )
        span = (segments_dict[first][0], segments_dict[last][1])

        # Drop the constituents before inserting the merged entry, so that a
        # merged ID equal to one of the constituent IDs is not deleted again.
        for utt_id in utt_ids:
            del segments_dict[utt_id]
        segments_dict[new_utt_id] = span

    return segments_dict
77+
78+
79+
def main():
    """Read the segment file, merge short adjacent segments, write the outputs.

    Each non-empty line of ``args.segments`` must hold four whitespace-separated
    fields: utterance ID, speaker, start and end. Consecutive lines with the
    same speaker form one run; every run is merged independently. The result
    is written to ``args.output_segments``, ``args.output_utt2spk`` and
    ``args.output_spk2utt``, sorted by utterance ID (speaker ID for spk2utt).
    The speaker written to the outputs is the first ``args.delimiter``-separated
    field of the utterance ID.
    """
    delimiter = args.delimiter

    segments_dict = {}
    runs = []  # one deque of ([utt_id], start, end) per consecutive speaker run
    with codecs.open(args.segments, "r", encoding="utf-8") as f:
        spk_prev = None
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            utt_id, spk, start, end = line.split()
            # Keep the original time strings for the output file.
            segments_dict[utt_id] = (start, end)
            if spk_prev is None or spk != spk_prev:
                runs.append(deque([]))
            # Times are compared in units of 10 ms.
            runs[-1].append(([utt_id], float(start) * 100, float(end) * 100))
            spk_prev = spk

    # Every run is processed, including the one that ends at end-of-file.
    for segments_spk in runs:
        # A run with a single segment has no neighbour to merge with,
        # so its entry is kept unchanged.
        if len(segments_spk) > 1:
            segments_dict = merge(segments_spk, segments_dict)

    sorted_utt_ids = sorted(segments_dict)

    with codecs.open(args.output_segments, "w", encoding="utf-8") as f:
        for utt_id in sorted_utt_ids:
            start, end = segments_dict[utt_id]
            spk = utt_id.split(delimiter)[0]
            f.write("%s %s %s %s\n" % (utt_id, spk, start, end))

    spk2utt_dict = {}
    with codecs.open(args.output_utt2spk, "w", encoding="utf-8") as f:
        for utt_id in sorted_utt_ids:
            # Use the configured delimiter, as for the segments file.
            spk = utt_id.split(delimiter)[0]
            f.write("%s %s\n" % (utt_id, spk))
            spk2utt_dict.setdefault(spk, []).append(utt_id)

    with codecs.open(args.output_spk2utt, "w", encoding="utf-8") as f:
        for spk in sorted(spk2utt_dict):
            f.write("%s %s\n" % (spk, " ".join(spk2utt_dict[spk])))


if __name__ == "__main__":
    main()

‎egs/iwslt21/st1/run.sh

+45-13
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ tgt_case=tc
5050
remove_nonverbal=true # remove non-verbal labels such as "( Applaus )"
5151
# NOTE: IWSLT community accepts this setting and therefore we use this by default
5252

53+
# iwslt segmentation related
54+
max_interval=200
55+
max_duration=1500
56+
5357
# bpemode (unigram or bpe)
5458
nbpe=16000
5559
bpemode=bpe
@@ -69,16 +73,17 @@ set -o pipefail
6973
mustc_dir=../../must_c
7074
mustc_v2_dir=../../must_c_v2
7175
stted_dir=../../iwslt18
76+
7277
# test data directory
73-
iwslt_test_data=/n/rd8/iwslt18
78+
iwslt_test_data_dir=/n/rd8/iwslt18
7479

7580
train_set=train.de
7681
train_dev=dev.de
7782
trans_subset="et_mustc_dev_org.de et_mustc_tst-COMMON.de et_mustc_tst-HE.de"
7883
trans_set="et_mustc_dev_org.de et_mustc_tst-COMMON.de et_mustc_tst-HE.de \
79-
et_mustcv2_dev_org.de et_mustcv2_tst-COMMON.de et_mustcv2_tst-HE.de \
80-
et_stted_dev2010.de et_stted_tst2010.de et_stted_tst2013.de et_stted_tst2014.de et_stted_tst2015.de \
81-
et_stted_tst2018.de et_stted_tst2019.de"
84+
et_mustcv2_dev_org.de et_mustcv2_tst-COMMON.de et_mustcv2_tst-HE.de"
85+
iwslt_test_set="et_stted_dev2010.de et_stted_tst2010.de et_stted_tst2013.de et_stted_tst2014.de et_stted_tst2015.de \
86+
et_stted_tst2018.de et_stted_tst2019.de"
8287

8388
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
8489
### Task dependent. You have to make data the following preparation part by yourself.
@@ -146,9 +151,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
146151
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
147152
data/${x} exp/make_fbank/${x} ${fbankdir}
148153
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}
154+
rm data/${x}/segments
155+
rm data/${x}/wav.scp
149156
done
150-
rm data/*/segments
151-
rm data/*/wav.scp
152157

153158
for lang in en de; do
154159
utils/combine_data.sh --extra_files "text.tc text.lc text.lc.rm" data/train.${lang} data/tr_mustc.${lang} data/tr_mustcv2.${lang} data/tr_stted.${lang}
@@ -184,6 +189,32 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
184189
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
185190
data/${x}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/trans/${x} ${feat_trans_dir}
186191
done
192+
193+
# concatenate short segments
194+
for x in ${iwslt_test_set}; do
195+
output_dir=${x}_merge${max_interval}_duration${max_duration}
196+
rm -rf data/${output_dir}
197+
cp -rf data/${x} data/${output_dir}
198+
rm data/${output_dir}/utt2num_frames
199+
200+
local/merge_short_segments.py \
201+
data/${x}/segments \
202+
data/${output_dir}/segments \
203+
data/${output_dir}/utt2spk \
204+
data/${output_dir}/spk2utt \
205+
--min_interval ${max_interval} \
206+
--max_duration ${max_duration} \
207+
--delimiter "_" || exit 1;
208+
209+
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
210+
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
211+
data/${output_dir} exp/make_fbank/${output_dir} ${fbankdir}
212+
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${output_dir}
213+
214+
feat_trans_dir=${dumpdir}/${output_dir}/delta${do_delta}; mkdir -p ${feat_trans_dir}
215+
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
216+
data/${output_dir}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/trans/${output_dir} ${feat_trans_dir}
217+
done
187218
fi
188219

189220
dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
@@ -212,22 +243,20 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
212243
echo "make json files"
213244
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang "de" \
214245
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
215-
for x in ${train_dev} ${trans_set}; do
216-
feat_trans_dir=${dumpdir}/${x}/delta${do_delta}
246+
for x in ${train_dev} ${trans_set} ${iwslt_test_set}; do
217247
if [[ ${x} = *tst20* ]] || [[ ${x} = *dev20* ]]; then
248+
feat_trans_dir=${dumpdir}/${x}_merge${max_interval}_duration${max_duration}/delta${do_delta}
218249
local/data2json.sh --feat ${feat_trans_dir}/feats.scp --no_text true \
219-
data/${x} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
250+
data/${x}_merge${max_interval}_duration${max_duration} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
220251
else
252+
feat_trans_dir=${dumpdir}/${x}/delta${do_delta}
221253
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${x}/text.${tgt_case} --bpecode ${bpemodel}.model --lang "de" \
222254
data/${x} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
223255
fi
224256
done
225257

226258
# update json (add source references)
227259
for x in ${train_set} ${train_dev} ${trans_set}; do
228-
if [[ ${x} = *tst20* ]] || [[ ${x} = *dev20* ]]; then
229-
continue
230-
fi
231260
feat_dir=${dumpdir}/${x}/delta${do_delta}
232261
data_dir=data/$(echo ${x} | cut -f 1 -d ".").en
233262
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
@@ -297,6 +326,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
297326
pids=() # initialize pids
298327
for x in ${trans_subset}; do
299328
(
329+
if [[ ${x} = *tst20* ]] || [[ ${x} = *dev20* ]]; then
330+
x=${x}_merge${max_interval}_duration${max_duration}
331+
fi
300332
decode_dir=decode_${x}_$(basename ${decode_config%.*})
301333
feat_trans_dir=${dumpdir}/${x}/delta${do_delta}
302334

@@ -317,7 +349,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
317349
set=$(echo ${x} | cut -f 1 -d "." | cut -f 3 -d "_")
318350
local/score_bleu_reseg.sh --case ${tgt_case} --bpe ${nbpe} --bpemodel ${bpemodel}.model \
319351
--remove_nonverbal ${remove_nonverbal} \
320-
${expdir}/${decode_dir} ${dict} ${iwslt_test_data} ${set}
352+
${expdir}/${decode_dir} ${dict} ${iwslt_test_data_dir} ${set}
321353
else
322354
score_bleu.sh --case ${tgt_case} --bpe ${nbpe} --bpemodel ${bpemodel}.model \
323355
--remove_nonverbal ${remove_nonverbal} \

0 commit comments

Comments
 (0)
Please sign in to comment.