Skip to content

Commit 9f0ed3d

Browse files
committed
examples : add VAD support to whisper-cli [no ci]
1 parent 7a1b991 commit 9f0ed3d

File tree

3 files changed: +17 -0 lines changed

3 files changed: +17 -0 lines changed

examples/cli/cli.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,11 +79,13 @@ struct whisper_params {
7979
bool use_gpu = true;
8080
bool flash_attn = false;
8181
bool suppress_nst = false;
82+
bool vad = false;
8283

8384
std::string language = "en";
8485
std::string prompt;
8586
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
8687
std::string model = "models/ggml-base.en.bin";
88+
std::string vad_model = "";
8789
std::string grammar;
8890
std::string grammar_rule;
8991

@@ -176,8 +178,10 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
176178
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
177179
else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
178180
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
181+
else if (arg == "-v" || arg == "--vad") { params.vad = true; }
179182
else if ( arg == "--prompt") { params.prompt = ARGV_NEXT; }
180183
else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; }
184+
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
181185
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); }
182186
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = ARGV_NEXT; }
183187
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = ARGV_NEXT; }
@@ -245,8 +249,10 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
245249
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
246250
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
247251
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
252+
fprintf(stderr, " -v, --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
248253
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str());
249254
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
255+
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] Voice Activity Detection (VAD) model path\n", params.vad_model.c_str());
250256
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", "");
251257
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
252258
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
@@ -1154,8 +1160,12 @@ int main(int argc, char ** argv) {
11541160

11551161
wparams.suppress_nst = params.suppress_nst;
11561162

1163+
wparams.vad = params.vad;
1164+
wparams.vad_model_path = params.vad_model.c_str();
1165+
11571166
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
11581167

1168+
11591169
const auto & grammar_parsed = params.grammar_parsed;
11601170
auto grammar_rules = grammar_parsed.c_rules();
11611171

examples/common.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
8080
}
8181
} else if (arg == "-tt" || arg == "--token_test") {
8282
params.token_test = get_next_arg(i, argc, argv, arg, params);
83+
} else if (arg == "--vad") {
84+
params.vad = true;
85+
} else if (arg == "-vm" || arg == "--vad-model") {
86+
params.vad_model = get_next_arg(i, argc, argv, arg, params);
8387
}
8488
else {
8589
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());

examples/common.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,9 @@ struct gpt_params {
3939

4040
bool interactive = false;
4141
int32_t interactive_port = -1;
42+
43+
bool vad = false; // Enable VAD
44+
std::string vad_model = ""; // VAD model path
4245
};
4346

4447
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

0 commit comments

Comments
 (0)