|
11 | 11 | #include <thread>
|
12 | 12 | #include <vector>
|
13 | 13 | #include <cstring>
|
| 14 | +#include <cfloat> |
14 | 15 |
|
15 | 16 | #if defined(_WIN32)
|
16 | 17 | #ifndef NOMINMAX
|
@@ -97,6 +98,16 @@ struct whisper_params {
|
97 | 98 | std::vector<std::string> fname_out = {};
|
98 | 99 |
|
99 | 100 | grammar_parser::parse_state grammar_parsed;
|
| 101 | + |
| 102 | + // Voice Activity Detection (VAD) parameters |
| 103 | + bool vad = false; |
| 104 | + std::string vad_model = ""; |
| 105 | + float vad_threshold = 0.5f; |
| 106 | + int vad_min_speech_duration_ms = 250; |
| 107 | + int vad_min_silence_duration_ms = 100; |
| 108 | + float vad_max_speech_duration_s = FLT_MAX; |
| 109 | + int vad_speech_pad_ms = 30; |
| 110 | + float vad_samples_overlap = 0.1f; |
100 | 111 | };
|
101 | 112 |
|
102 | 113 | static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
@@ -185,6 +196,15 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
185 | 196 | else if ( arg == "--grammar") { params.grammar = ARGV_NEXT; }
|
186 | 197 | else if ( arg == "--grammar-rule") { params.grammar_rule = ARGV_NEXT; }
|
187 | 198 | else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); }
|
| 199 | + // Voice Activity Detection (VAD) options; short flags match the usage text (-vspd = min speech, -vsd = min silence) |
| 200 | + else if (arg == "-v" || arg == "--vad") { params.vad = true; } |
| 201 | + else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; } |
| 202 | + else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); } |
| 203 | + else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); } |
| 204 | + else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(ARGV_NEXT); } |
| 205 | + else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); } |
| 206 | + else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); } |
| 207 | + else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); } |
188 | 208 | else {
|
189 | 209 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
190 | 210 | whisper_print_usage(argc, argv, params);
|
@@ -254,6 +274,18 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
|
254 | 274 | fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
|
255 | 275 | fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str());
|
256 | 276 | fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
|
| 277 | + // Voice Activity Detection (VAD) options; each line prints the current value of the matching params field |
| 278 | + fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n"); |
| 279 | + fprintf(stderr, " -v, --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false"); |
| 280 | + fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str()); |
| 281 | + fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition (0.0-1.0)\n", params.vad_threshold); |
| 282 | + fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration in milliseconds\n", params.vad_min_speech_duration_ms); |
| 283 | + fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms); |
| 284 | + fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ? |
| 285 | + std::string("FLT_MAX").c_str() : |
| 286 | + std::to_string(params.vad_max_speech_duration_s).c_str()); |
| 287 | + fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms); |
| 288 | + fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap); |
257 | 289 | fprintf(stderr, "\n");
|
258 | 290 | }
|
259 | 291 |
|
@@ -1134,6 +1166,16 @@ int main(int argc, char ** argv) {
|
1134 | 1166 |
|
1135 | 1167 | wparams.suppress_nst = params.suppress_nst;
|
1136 | 1168 |
|
| 1169 | + wparams.vad = params.vad; |
| 1170 | + wparams.vad_model_path = params.vad_model.c_str(); |
| 1171 | + |
| 1172 | + wparams.vad_params.threshold = params.vad_threshold; |
| 1173 | + wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms; |
| 1174 | + wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms; |
| 1175 | + wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s; |
| 1176 | + wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms; |
| 1177 | + wparams.vad_params.samples_overlap = params.vad_samples_overlap; |
| 1178 | + |
1137 | 1179 | whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
|
1138 | 1180 |
|
1139 | 1181 | const auto & grammar_parsed = params.grammar_parsed;
|
|
0 commit comments