Skip to content

Commit 0d7857e

Browse files
authoredApr 26, 2021
Merge pull request espnet#3174 from yuekaizhang/swbd
fix recipe bug for swbd
2 parents a5abe11 + 1ea1697 commit 0d7857e

File tree

1 file changed

+9
-6
lines changed

1 file changed

+9
-6
lines changed
 

‎egs/swbd/asr1/run.sh

+9-6
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
115115
# remove utt having > 2000 frames or < 10 frames or
116116
# remove utt having > 400 characters or 0 characters
117117
remove_longshortdata.sh --maxchars 400 data/train_nodup data/train_nodup_trim
118-
remove_longshortdata.sh --maxchars 400 data/dev data/${train_dev}
118+
remove_longshortdata.sh --maxchars 400 data/train_dev data/${train_dev}
119119

120120
# speed-perturbed
121121
utils/perturb_data_dir_speed.sh 0.9 data/train_nodup_trim data/temp1
@@ -164,10 +164,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
164164
echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
165165

166166
# map acronyms such as p._h._d. to p h d for train_set & dev_set
167-
cp data/${train_set}/text data/${train_set}/text.backup
168-
cp data/${train_dev}/text data/${train_dev}/text.backup
169-
sed -i 's/\._/ /g; s/\.//g; s/them_1/them/g' data/${train_set}/text
170-
sed -i 's/\._/ /g; s/\.//g; s/them_1/them/g' data/${train_dev}/text
167+
cp data/${train_set}/text data/${train_set}/text.tmp
168+
cp data/${train_dev}/text data/${train_dev}/text.tmp
169+
sed -i 's/\._/ /g; s/them_1/them/g' data/${train_set}/text.tmp
170+
sed -i 's/\._/ /g; s/them_1/them/g' data/${train_dev}/text.tmp
171+
# remove . from the second column onward, skipping the first column, which includes sp0.9, sp1.1 etc.
172+
awk -F " " '{for(i=2;i<=NF;++i) gsub(/\._|\./,"",$i)}1' data/${train_set}/text.tmp > data/${train_set}/text
173+
awk -F " " '{for(i=2;i<=NF;++i) gsub(/\._|\./,"",$i)}1' data/${train_dev}/text.tmp > data/${train_dev}/text
171174
if [ -n "${fisher_dir}" ]; then
172175
cp data/train_fisher/text data/train_fisher/text.backup
173176
sed -i 's/\._/ /g; s/\.//g; s/them_1/them/g' data/train_fisher/text
@@ -193,7 +196,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
193196
wc -l ${dict}
194197

195198
echo "make json files"
196-
data2json.sh --feat ${feat_tr_dir}/feats.scp --bpecode ${bpemodel}.model \
199+
data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --bpecode ${bpemodel}.model \
197200
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
198201
data2json.sh --feat ${feat_dt_dir}/feats.scp --bpecode ${bpemodel}.model \
199202
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json

0 commit comments

Comments
 (0)
Please sign in to comment.