#!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1
set -e # exit on error
# Begin configuration section
stage=1
chain_stage=1
train_stage=-10
nj=40
# Declare where your DAMP and DALI data are located.
datadir_damp=wav/damp
datadir_dali=wav/dali
datadir_dali_talt=wav/dali_talt
datadir_jamendo=wav/jamendo
# Declare the path to a pretrained model: should be something like 'models/ijcnn'.
pretrained_model=
help=false
echo "Using steps and utils from WSJ recipe"
[[ ! -L "steps" ]] && ln -s $KALDI_ROOT/egs/wsj/s5/steps
[[ ! -L "utils" ]] && ln -s $KALDI_ROOT/egs/wsj/s5/utils
[[ ! -L "rnnlm" ]] && ln -s $KALDI_ROOT/egs/wsj/s5/rnnlm
# End configuration section
. ./utils/parse_options.sh
. ./path.sh
. ./cmd.sh
if [ $help = true ]; then
echo "Usage: ./run_mstrenet.sh "
echo "This is the main script for the training of the MStreNet"
echo "automatic lyrics transcription model (ISMIR2021). "
echo "You just have to specify where the datasets are located."
echo ""
echo "main options (for others, see top of script file)"
echo " --stage # stage of the main running script"
echo " --chain_stage # stage for the DNN training pipeline (chain recipe at stage 13)"
echo " --train_stage # DNN training stage. Should be -10 to initialize the training"
echo " --datadir_damp # path to DAMP dataset"
echo " --datadir_dali # path to DALI dataset"
echo " --datadir_dali_talt # path to DALI-TALT dataset"
echo " --datadir_jamendo # path to jamendo dataset"
echo " --pretrained_model <model> # directory to a pretrained model (if specificed, i.e. models/ijcnn)."
echo " # If this is non-empty, the script will skip training and directly go to stage 14."
echo " --nj <nj> # number of parallel jobs"
exit 1;
fi
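# Example invocation (a sketch only; the dataset paths below are placeholders,
# not paths shipped with this recipe):
#   ./run_mstrenet.sh --stage 1 --nj 40 \
#     --datadir_damp /data/DAMP --datadir_dali /data/DALI \
#     --datadir_dali_talt /data/DALI_TALT --datadir_jamendo /data/jamendo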
trainset=train_damp_music # At first, we train the GMM-HMM model on the DAMP dataset only (until stage 9).
# Once we generate alignments using the lexicon with estimated pronunciation
# probabilities, we retrain another GMM-HMM model on both 'train_damp' and 'train_dali'.
# The LFMMI training is done on the combination of these train sets.
# The '_music' suffix stands for the dataset version where music-informed
# silence modeling is applied.
devsets="dev_damp dev_dali"
test_sets="test_damp dali_talt jamendo"
# This script also needs Phonetisaurus (G2P), SRILM and sox.
#./local/check_tools.sh || exit 1
chain_affix=_mstrenet
lang_affix=_music
echo "Linking data to local directories"
mkdir -p wav
[[ ! -L "wav/damp" ]] && ln -s $datadir_damp
[[ ! -L "wav/dali" ]] && ln -s $datadir_dali
[[ ! -L "wav/dali_talt" ]] && ln -s $datadir_dali_talt
[[ ! -L "wav/jamendo" ]] && ln -s $datadir_jamendo
echo; echo "===== Starting at $(date +"%D_%T") ====="; echo
mfccdir=mfcc
affix=_music # Label for music-informed silence tagging
if [ $stage -le 1 ]; then
echo
echo "============================="
echo "---- DATA PREPROCESSING ----"
echo "===== $(date +"%D_%T") ====="
mkdir -p data/local/dict${affix}
cp conf/corpus_v2.txt data/local/corpus.txt # Corpus for the language model: includes the lyrics from conf/corpus_v1.txt plus train_dali.
# Here, we add the <music> phone to the phone set and the corresponding entry to the pronunciation dictionary.
local/prepare_dict_music.sh --words 30000 --affix ${affix}
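# For illustration only (the exact output is produced by local/prepare_dict_music.sh,
# not shown here): the resulting lexicon is expected to contain an entry mapping
# the <music> word to the <music> phone, roughly:
#   <music>   <music>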
# Prepare necessary files for creating language FST.
utils/prepare_lang.sh --share-silence-phones true data/local/dict${affix} "<UNK>" data/local/lang${affix} data/lang${affix}
fi
if [ $stage -le 2 ]; then
echo
echo "============================="
echo "---- BUILDING THE LANGUAGE MODEL ----"
echo "===== $(date +"%D_%T") ====="
# Constructing the 4-gram MaxEnt language model
local/train_lms_srilm.sh \
--train-text data/local/corpus_en.txt \
--oov-symbol "<UNK>" --words-file data/lang$affix/words.txt \
data/ data/srilm$affix
# Compile G.fst for the 4-gram LM.
utils/format_lm.sh data/lang$affix data/srilm$affix/best_4gram.gz data/local/dict${affix}/lexicon.txt data/lang_4G$affix
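# Optional sanity check (not part of the original recipe; assumes SRILM's 'ngram'
# binary is on PATH and that you have a held-out text file, here a placeholder):
#   ngram -order 4 -unk -lm data/srilm$affix/best_4gram.gz -ppl <held-out-text>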
fi
if [[ $stage -le 3 ]]; then
echo
echo "============================="
echo "---- MFCC FEATURES EXTRACTION ----"
echo "===== $(date +"%D_%T") ====="
for datadir in train_damp_music train_dali_music $test_sets; do
echo; echo "---- ${datadir}"
utils/fix_data_dir.sh data/$datadir
steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/$datadir exp$affix/make_mfcc/$datadir $mfccdir
steps/compute_cmvn_stats.sh data/${datadir}
utils/fix_data_dir.sh data/$datadir
done
fi
if [[ $stage -le 4 ]]; then
echo
echo "============================="
echo "TRAIN GMM-HMM : Mono - MONOPHONE"
echo
echo "===== $(date +"%D_%T") ====="
steps/train_mono.sh --nj $nj --cmd "$train_cmd" --totgauss 2000 --boost-silence 1.25 \
--num_iters 40 data/${trainset} data/lang$lang_affix exp$affix/mono
steps/align_si.sh --nj $nj --cmd "$train_cmd" --beam 50 --retry_beam 700 \
data/${trainset} data/lang$lang_affix exp$affix/mono exp$affix/mono_ali
utils/mkgraph.sh data/lang_4G$lang_affix exp$affix/mono exp$affix/mono/graph
fi
if [[ $stage -le 5 ]];then
echo
echo "============================="
echo "TRAIN GMM-HMM : Tri 1 - DELTA-BASED TRIPHONES"
echo "===== $(date +"%D_%T") ====="
steps/train_deltas.sh --cmd "$train_cmd" --boost_silence 1.25 --beam 50 --retry_beam 700 4000 24000 \
data/${trainset} data/lang$lang_affix exp$affix/mono_ali exp$affix/tri1
steps/align_si.sh --nj $nj --cmd "$train_cmd" --beam 50 --retry_beam 700 \
data/${trainset} data/lang$lang_affix exp$affix/tri1 exp$affix/tri1_ali
utils/mkgraph.sh data/lang_4G$lang_affix exp$affix/tri1 exp$affix/tri1/graph
fi
if [[ $stage -le 6 ]];then
echo
echo "============================="
echo "TRAIN GMM-HMM : Tri 2 - LDA-MLLT TRIPHONES"
echo "===== $(date +"%D_%T") ====="
steps/train_lda_mllt.sh --cmd "$train_cmd" --beam 50 --retry_beam 700 5000 40000 \
data/${trainset} data/lang$lang_affix exp$affix/tri1_ali exp$affix/tri2b
steps/align_si.sh --nj $nj --cmd "$train_cmd" --beam 50 --retry_beam 700 \
data/${trainset} data/lang$lang_affix exp$affix/tri2b exp$affix/tri2b_ali
fi
if [[ $stage -le 8 ]];then
echo
echo "TRAIN GMM-HMM : Tri 3 - SAT TRIPHONES"
echo "===== $(date +"%D_%T") ====="
steps/train_sat.sh --cmd "$train_cmd" --beam 40 --retry_beam 100 6000 70000 \
data/${trainset} data/lang$lang_affix exp$affix/tri2b_ali exp$affix/tri3b
utils/mkgraph.sh data/lang_4G$lang_affix exp$affix/tri3b exp$affix/tri3b/graph
fi
if [[ $stage -le 9 ]]; then
echo
echo "============================="
echo "------- DECODE USING TRIPHONE + SAT (TRI3B) MODEL --------"
echo "===== $(date +"%D_%T") ====="
echo
for datadir in $test_sets ; do
steps/decode_fmllr.sh --config conf/decode.config --nj 9 --cmd "$decode_cmd" \
--scoring-opts "--min-lmwt 10 --max-lmwt 20" --num-threads 4 --beam 40 \
exp$affix/tri3b/graph data/${datadir} exp$affix/tri3b/decode_${datadir}
done
fi
if [[ $stage -le 10 ]]; then
echo
echo "============================="
echo "------- COMPUTING PRONUNCIATION PROBABILITIES --------"
echo "===== $(date +"%D_%T") ====="
# Estimate pronunciation and silence probabilities.
# Silence probability for normal lexicon.
steps/get_prons.sh --cmd "$train_cmd" \
data/${trainset} data/lang_4G$affix exp$affix/tri3b || exit 1;
utils/dict_dir_add_pronprobs.sh --max-normalize true data/local/dict${affix} \
exp${affix}/tri3b/pron_counts_nowb.txt exp${affix}/tri3b/sil_counts_nowb.txt \
exp${affix}/tri3b/pron_bigram_counts_nowb.txt data/local/dict${affix}_prons || exit 1
echo
echo "============================="
echo "------- CREATING THE LANGUAGE MODEL WITH PRONUNCIATION PROBABILITIES --------"
echo "===== $(date +"%D_%T") ====="
utils/prepare_lang.sh data/local/dict${affix}_prons \
"<UNK>" data/local/lang${affix} data/lang${affix}_prons || exit 1;
mkdir -p data/lang_4G${affix}_prons
cp -r data/lang${affix}_prons/* data/lang_4G${affix}_prons/ || exit 1;
rm -rf data/local/lang_tmp
cp data/lang_4G${affix}/G.* data/lang_4G${affix}_prons/
fi
if [ $stage -le 11 ]; then
echo
echo "TRAIN (ANOTHER) SAT TRIPHONES GMM-HMM WITH PRONUNCIATION PROBABILITIES: Tri3b - PRONS "
echo "===== $(date +"%D_%T") ====="
# Combine the DAMP and DALI data and retrain the final GMM-HMM model.
trainset=train_damp_dali_music
./utils/combine_data.sh data/$trainset data/train_damp_music data/train_dali_music
# Generate alignments for the combined train set.
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" --beam 40 --retry_beam 300 \
data/${trainset} data/lang${affix}_prons exp$affix/tri3b exp$affix/tri3b_ali_prons
steps/train_sat.sh --cmd "$train_cmd" --beam 20 --retry_beam 100 6000 70000 \
data/${trainset} data/lang${lang_affix}_prons exp$affix/tri3b_ali_prons exp$affix/tri3b_prons
utils/mkgraph.sh data/lang_4G${affix}_prons exp$affix/tri3b_prons exp$affix/tri3b_prons/graph
echo
echo "------ END OF GMM-HMM TRAINING --------"
echo "===== $(date +"%D_%T") ====="
fi
if [[ $stage -le 12 ]]; then
echo
echo "============================="
echo "------- DECODE USING TRIPHONE + SAT (TRI3B) MODEL WITH PRON. PROBS.--------"
echo "===== $(date +"%D_%T") ====="
echo
for datadir in $test_sets ; do
steps/decode_fmllr.sh --config conf/decode.config --nj 9 --cmd "$decode_cmd" \
--scoring-opts "--min-lmwt 10 --max-lmwt 20" --num-threads 4 --beam 30 \
exp$affix/tri3b_prons/graph data/${datadir} exp$affix/tri3b_prons/decode_${datadir}
done
fi
trainset=train_damp_dali_music # Re-declared here so that later stages also work when resuming past stage 11.
if [[ $stage -le 13 ]]; then
echo
echo "=================="
echo "----- MSTRE-NET: TRAINING DNN-HMM BASED ON LFMMI OBJECTIVE -----"
echo "===== $(date +"%D_%T") ====="
echo
local/chain/run_multistream.sh --affix ${affix} --nnet3_affix ${chain_affix} --chain_affix ${chain_affix} \
--stage ${chain_stage} --train_stage ${train_stage} --train_set ${trainset} --test_sets ${test_sets}
fi
if [[ $stage -le 14 ]]; then
echo
echo "=================="
echo "----- INFERENCE WITH A PRETRAINED MODEL (MONOPHONIC - SINGLE STREAM A.M.) -----"
echo "===== $(date +"%D_%T") ====="
echo
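# The commands for decoding with the released pretrained model are not part of
# this script. The lines below are a minimal, commented-out sketch only: they
# assume a standard Kaldi chain-model layout ($pretrained_model/graph next to the
# model files), which is an assumption, not the recipe's released inference pipeline.
# if [ -n "$pretrained_model" ]; then
#   for datadir in $test_sets; do
#     steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj 9 --cmd "$decode_cmd" \
#       $pretrained_model/graph data/${datadir} $pretrained_model/decode_${datadir}
#   done
# fi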
fi
echo
echo "===== $(date +"%D_%T") ====="
echo "===== PROCESS ENDED ====="
echo
exit 0