@@ -115,7 +115,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
115
115
# remove utt having > 2000 frames or < 10 frames or
116
116
# remove utt having > 400 characters or 0 characters
117
117
remove_longshortdata.sh --maxchars 400 data/train_nodup data/train_nodup_trim
118
- remove_longshortdata.sh --maxchars 400 data/dev data/${train_dev}
118
+ remove_longshortdata.sh --maxchars 400 data/train_dev data/${train_dev}
119
119
120
120
# speed-perturbed
121
121
utils/perturb_data_dir_speed.sh 0.9 data/train_nodup_trim data/temp1
@@ -164,10 +164,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
164
164
echo " <unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
165
165
166
166
# map acronyms such as p._h._d. to p h d for train_set & dev_set
167
- cp data/${train_set} /text data/${train_set} /text.backup
168
- cp data/${train_dev} /text data/${train_dev} /text.backup
169
- sed -i ' s/\._/ /g; s/\.//g; s/them_1/them/g' data/${train_set} /text
170
- sed -i ' s/\._/ /g; s/\.//g; s/them_1/them/g' data/${train_dev} /text
167
+ cp data/${train_set} /text data/${train_set} /text.tmp
168
+ cp data/${train_dev} /text data/${train_dev} /text.tmp
169
+ sed -i ' s/\._/ /g; s/them_1/them/g' data/${train_set} /text.tmp
170
+ sed -i ' s/\._/ /g; s/them_1/them/g' data/${train_dev} /text.tmp
171
+ # remove . from the second column onward, skipping the first column, which includes sp0.9, sp1.1 etc.
172
+ awk -F " " ' {for(i=2;i<=NF;++i) gsub(/\._|\./,"",$i)}1' data/${train_set} /text.tmp > data/${train_set} /text
173
+ awk -F " " ' {for(i=2;i<=NF;++i) gsub(/\._|\./,"",$i)}1' data/${train_dev} /text.tmp > data/${train_dev} /text
171
174
if [ -n " ${fisher_dir} " ]; then
172
175
cp data/train_fisher/text data/train_fisher/text.backup
173
176
sed -i ' s/\._/ /g; s/\.//g; s/them_1/them/g' data/train_fisher/text
@@ -193,7 +196,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
193
196
wc -l ${dict}
194
197
195
198
echo " make json files"
196
- data2json.sh --feat ${feat_tr_dir} /feats.scp --bpecode ${bpemodel} .model \
199
+ data2json.sh --nj ${nj} -- feat ${feat_tr_dir} /feats.scp --bpecode ${bpemodel} .model \
197
200
data/${train_set} ${dict} > ${feat_tr_dir} /data_${bpemode}${nbpe} .json
198
201
data2json.sh --feat ${feat_dt_dir} /feats.scp --bpecode ${bpemodel} .model \
199
202
data/${train_dev} ${dict} > ${feat_dt_dir} /data_${bpemode}${nbpe} .json
0 commit comments