@@ -79,11 +79,13 @@ struct whisper_params {
79
79
bool use_gpu = true ;
80
80
bool flash_attn = false ;
81
81
bool suppress_nst = false ;
82
+ bool vad = false ;
82
83
83
84
std::string language = "en";
84
85
std::string prompt;
85
86
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
86
87
std::string model = "models/ggml-base.en.bin";
88
+ std::string vad_model = "";
87
89
std::string grammar;
88
90
std::string grammar_rule;
89
91
@@ -176,8 +178,10 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
176
178
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
177
179
else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
178
180
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
181
+ else if (arg == "-v" || arg == "--vad") { params.vad = true; }
179
182
else if (arg == "--prompt") { params.prompt = ARGV_NEXT; }
180
183
else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; }
184
+ else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
181
185
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); }
182
186
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = ARGV_NEXT; }
183
187
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = ARGV_NEXT; }
@@ -245,8 +249,10 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
245
249
fprintf (stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n " , params.no_timestamps ? " true" : " false" );
246
250
fprintf (stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n " , params.language .c_str ());
247
251
fprintf (stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n " , params.detect_language ? " true" : " false" );
252
+ fprintf (stderr, " -v, --vad [%-7s] enable Voice Activity Detection (VAD)\n " , params.vad ? " true" : " false" );
248
253
fprintf (stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n " , params.prompt .c_str ());
249
254
fprintf (stderr, " -m FNAME, --model FNAME [%-7s] model path\n " , params.model .c_str ());
255
+ fprintf (stderr, " -vm FNAME, --vad-model FNAME [%-7s] Voice Activity Detection (VAD) model path\n " , params.vad_model .c_str ());
250
256
fprintf (stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n " , " " );
251
257
fprintf (stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n " , params.openvino_encode_device .c_str ());
252
258
fprintf (stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n " , params.dtw .c_str ());
@@ -1154,8 +1160,12 @@ int main(int argc, char ** argv) {
1154
1160
1155
1161
wparams.suppress_nst = params.suppress_nst ;
1156
1162
1163
+ wparams.vad = params.vad ;
1164
+ wparams.vad_model_path = params.vad_model .c_str ();
1165
+
1157
1166
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
1158
1167
1168
+
1159
1169
const auto & grammar_parsed = params.grammar_parsed ;
1160
1170
auto grammar_rules = grammar_parsed.c_rules ();
1161
1171
0 commit comments