-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.sh
executable file
·220 lines (175 loc) · 7.7 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/bin/bash
# Copyright (c) 2024 Seyed Ali Farokh
# Validate that a stage argument is an integer in the range 1-7.
# Arguments:
#   $1 - value to validate
#   $2 - argument name used in the error message
# Exits the script with status 1 (message on stderr) when validation fails.
# (Bug fix: the original used a bare `exit`, which exits with status 0, so
# callers and CI could not detect the failure.)
verify_stage() {
  if [[ ! "$1" =~ ^[0-9]+$ || $1 -lt 1 || $1 -gt 7 ]]; then
    echo "Error: The $2 argument should be an integer in range 1-7." >&2
    exit 1
  fi
}
# Positional arguments: the first and last pipeline stage to run (1-7).
# Validate both before capturing them into named variables.
verify_stage "$1" "stage"
verify_stage "$2" "stop_stage"
stage="$1"
stop_stage="$2"
# =============================================
# Configuration
# =============================================
# Kaldi params
kaldi_root="../../" # path to the Kaldi source tree root — presumably this script lives under egs/<recipe>/; TODO confirm
kaldi_cmd=utils/run.pl # job dispatcher passed as --cmd to Kaldi steps (run.pl = local execution)
n_jobs=10 # number of parallel jobs for feature extraction / alignment / decoding
n_dnn_threads=4 # CPU threads per DNN training job
# Dataset
train_list=dataset/train.txt # tab-separated train metadata
test_list=dataset/test.txt # tab-separated test metadata
lexicon_path=dataset/lexicon.txt # pronunciation lexicon: "word phone1 phone2 ..."
lm_arpa_path=dataset/fa-lm.arpa # n-gram language model in ARPA format
# Script artifacts
raw_lang_dir=data/raw/lang # initial language files generated by this script
lang_dir=data/lang # language files generated by Kaldi
train_dir=data/train # train metadata will be stored here
test_dir=data/test # test metadata will be stored here
exps_dir=exp # models, logs, etc. will be stored here
feats_mfcc_dir=mfcc # extracted MFCC features will be stored here
feats_fbank_dir=fbank # extracted mel features will be stored here
# Context-independent (Monophone) HMM
mono_gaussians=200 # number of gaussians
mono_iters=40 # number of training iterations
# Context-dependent (Triphone) HMM
tri_states=1000 # number of context-dependent HMM states
tri_gaussians=5000 # total number of gaussians
# Hybrid DNN-HMM
dnn_hidden_layers=3 # number of hidden layers
dnn_hidden_dim=300 # hidden dimension
dnn_epochs=15 # training epochs
dnn_batch_size=128 # minibatch size
# =============================================
# [STAGE 1] Prepare Language Files
# =============================================
# Builds the Kaldi lang directory (lexicon, phone sets, L.fst, words.txt)
# and compiles the ARPA LM into G.fst.
# (Fix: quote all expansions — SC2086.)
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  echo "[STAGE 1] Preparing language files ..."
  mkdir -p "$raw_lang_dir"
  # Prepend the special silence and OOV symbols to the lexicon
  echo "<SIL> SIL" > "$raw_lang_dir/lexicon.txt"
  echo "<OOV> OOV" >> "$raw_lang_dir/lexicon.txt"
  cat "$lexicon_path" >> "$raw_lang_dir/lexicon.txt"
  # Create nonsilence_phones.txt, silence_phones.txt, and optional_silence.txt:
  # drop the word column, emit one phone per line, deduplicate.
  awk '{$1=""; sub(/^ /, ""); for (i=1; i<=NF; i++) print $i}' "$lexicon_path" \
    | sort -u \
    > "$raw_lang_dir/nonsilence_phones.txt"
  printf '%s\n' "SIL" "OOV" > "$raw_lang_dir/silence_phones.txt"
  echo "SIL" > "$raw_lang_dir/optional_silence.txt"
  # Generate Kaldi's language files
  utils/prepare_lang.sh "$raw_lang_dir" "<OOV>" "$raw_lang_dir/tmp" "$lang_dir"
  # Generate language model G.fst file
  "$kaldi_root/src/lmbin/arpa2fst" --disambig-symbol=#0 --max-arpa-warnings=0 \
    --read-symbol-table="$lang_dir/words.txt" "$lm_arpa_path" "$lang_dir/G.fst"
fi
# =============================================
# [STAGE 2] Prepare Dataset
# =============================================
# Converts the tab-separated metadata lists into Kaldi data directories
# (text, utt2spk, wav.scp, segments, spk2utt).
# (Fixes: the second section was mislabeled "Train subset"; quote all
# expansions — SC2086.)
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  echo "[STAGE 2] Preparing dataset subsets ..."
  mkdir -p "$train_dir" "$test_dir"
  # Train subset
  cut -f 2,7 "$train_list" | tr "\t" " " > "$train_dir/text" # utt_id transcript
  cut -f 2,3 "$train_list" | tr "\t" " " > "$train_dir/utt2spk" # utt_id speaker_id
  cut -f 1,6 "$train_list" | tr "\t" " " > "$train_dir/wav.scp" # file_id wav_path
  # Swap the first two columns so each line reads: utt_id file_id start end
  cut -f 1,2,4,5 "$train_list" | awk '{temp=$1; $1=$2; $2=temp; print}' > "$train_dir/segments" # utt_id file_id start end
  utils/utt2spk_to_spk2utt.pl "$train_dir/utt2spk" > "$train_dir/spk2utt"
  utils/fix_data_dir.sh "$train_dir"
  # Test subset
  cut -f 2,7 "$test_list" | tr "\t" " " > "$test_dir/text" # utt_id transcript
  cut -f 2,3 "$test_list" | tr "\t" " " > "$test_dir/utt2spk" # utt_id speaker_id
  cut -f 1,6 "$test_list" | tr "\t" " " > "$test_dir/wav.scp" # file_id wav_path
  cut -f 1,2,4,5 "$test_list" | awk '{temp=$1; $1=$2; $2=temp; print}' > "$test_dir/segments" # utt_id file_id start end
  utils/utt2spk_to_spk2utt.pl "$test_dir/utt2spk" > "$test_dir/spk2utt"
  utils/fix_data_dir.sh "$test_dir"
fi
# =============================================
# [STAGE 3] Feature Extraction (MFCC)
# =============================================
# Extracts MFCC features and per-speaker CMVN stats for both subsets.
# (Fix: quote all expansions — SC2086.)
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "[STAGE 3] Extracting MFCC features ..."
  mkdir -p "$exps_dir" "$feats_mfcc_dir/train" "$feats_mfcc_dir/test"
  steps/make_mfcc.sh --cmd "$kaldi_cmd" --nj "$n_jobs" "$train_dir" "$exps_dir/make_mfcc/data/train" "$feats_mfcc_dir"
  steps/make_mfcc.sh --cmd "$kaldi_cmd" --nj "$n_jobs" "$test_dir" "$exps_dir/make_mfcc/data/test" "$feats_mfcc_dir"
  steps/compute_cmvn_stats.sh "$train_dir" "$exps_dir/make_mfcc/data/train" "$feats_mfcc_dir"
  steps/compute_cmvn_stats.sh "$test_dir" "$exps_dir/make_mfcc/data/test" "$feats_mfcc_dir"
fi
# =============================================
# [STAGE 4] Train Context-independent (Monophone) HMM
# =============================================
mono_dir=$exps_dir/mono
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  echo "[STAGE 4] Training mono HMM ..."
  # (Fix: mkdir -p so a re-run of this stage does not error out when the
  # directory already exists; quote all expansions — SC2086.)
  mkdir -p "$mono_dir"
  # Train the model
  steps/train_mono.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    --totgauss "$mono_gaussians" --num_iters "$mono_iters" \
    "$train_dir" "$lang_dir" "$mono_dir"
  # Create the decoding graph (HCLG.fst)
  utils/mkgraph.sh "$lang_dir" "$mono_dir" "$mono_dir/graph"
  # Decode & evaluate
  steps/decode.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$mono_dir/graph" "$test_dir" "$mono_dir/decode_test"
fi
# =============================================
# [STAGE 5] Train Context-dependent (Triphone) HMM
# =============================================
tri_dir=$exps_dir/tri
mono_dir_align=$exps_dir/mono_ali
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  echo "[STAGE 5] Training triphone HMM ..."
  # (Fix: mkdir -p so a re-run of this stage does not error out when the
  # directories already exist; quote all expansions — SC2086.)
  mkdir -p "$tri_dir" "$mono_dir_align"
  # Align monophone states to training samples
  steps/align_si.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$train_dir" "$lang_dir" "$mono_dir" "$mono_dir_align"
  # Train the model on deltas over the monophone alignments
  steps/train_deltas.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$tri_states" "$tri_gaussians" \
    "$train_dir" "$lang_dir" "$mono_dir_align" "$tri_dir"
  # Create the decoding graph
  utils/mkgraph.sh "$lang_dir" "$tri_dir" "$tri_dir/graph"
  # Decode & evaluate
  steps/decode.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$tri_dir/graph" "$test_dir" "$tri_dir/decode_test"
fi
# =============================================
# [STAGE 6] Extract FBank Features
# =============================================
# Replaces the data dirs' MFCC features with filterbank features for the
# DNN stage. (Fix: quote all expansions — SC2086.)
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
  echo "[STAGE 6] Extracting mel features ..."
  mkdir -p "$feats_fbank_dir/train" "$feats_fbank_dir/test"
  # Train subset
  steps/make_fbank.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$train_dir" "$exps_dir/make_fbank/train" "$feats_fbank_dir/train"
  steps/compute_cmvn_stats.sh "$train_dir" \
    "$exps_dir/make_fbank/train" "$feats_fbank_dir/train"
  # Test subset
  steps/make_fbank.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$test_dir" "$exps_dir/make_fbank/test" "$feats_fbank_dir/test"
  steps/compute_cmvn_stats.sh "$test_dir" \
    "$exps_dir/make_fbank/test" "$feats_fbank_dir/test"
fi
# =============================================
# [STAGE 7] Train DNN-HMM
# =============================================
dnn_dir=$exps_dir/dnn
tri_dir_align=$exps_dir/tri_ali
if [ "${stage}" -le 7 ] && [ "${stop_stage}" -ge 7 ]; then
  echo "[STAGE 7] Training DNN-HMM ..."
  mkdir -p "$dnn_dir" "$tri_dir_align"
  # Align triphone states to training samples
  # (bug fix: --nj was given the undefined variable "$nj"; use $n_jobs)
  steps/align_si.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$train_dir" "$lang_dir" "$tri_dir" "$tri_dir_align"
  # Train the model
  # (bug fix: --hidden-layer-dim was mistakenly given $dnn_hidden_layers,
  # which would build 300-unit-intended layers with a width of 3)
  steps/nnet2/train_tanh_fast.sh --num-jobs-nnet "$n_jobs" --num-threads "$n_dnn_threads" \
    --num-hidden-layers "$dnn_hidden_layers" \
    --hidden-layer-dim "$dnn_hidden_dim" \
    --add-layers-period 2 \
    --num-epochs "$dnn_epochs" \
    --minibatch-size "$dnn_batch_size" \
    "$train_dir" "$lang_dir" "$tri_dir_align" "$dnn_dir"
  # Decode & evaluate
  # (bug fix: the line continuation "\" after the decode.sh options was
  # missing, so the graph/data/output arguments ran as a separate command)
  steps/nnet2/decode.sh --nj "$n_jobs" --cmd "$kaldi_cmd" \
    "$tri_dir/graph" "$test_dir" "$dnn_dir/decode_test"
fi