@@ -170,7 +170,6 @@ static bool ggml_graph_compute_helper(
         int n_threads,
         ggml_abort_callback abort_callback,
         void * abort_callback_data) {
-
     ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
 
     auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
@@ -191,8 +190,8 @@ static bool ggml_graph_compute_helper(
 static bool ggml_graph_compute_helper(
         ggml_backend_sched_t sched,
         struct ggml_cgraph * graph,
-        int n_threads) {
-
+        int n_threads,
+        bool sched_reset = true) {
     for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
         ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
         ggml_backend_dev_t dev = ggml_backend_get_device(backend);
@@ -204,8 +203,12 @@ static bool ggml_graph_compute_helper(
         }
     }
 
-    bool t = ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS;
-    ggml_backend_sched_reset(sched);
+    const bool t = (ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS);
+
+    if (!t || sched_reset) {
+        ggml_backend_sched_reset(sched);
+    }
+
     return t;
 }
 
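A note on the new `sched_reset` flag: `ggml_backend_sched_reset()` discards the scheduler's graph allocation, so a caller that wants to re-run an already-allocated graph must defer the reset until after the last compute; on failure the helper still resets, leaving the scheduler in a clean state. A minimal sketch of the intended call pattern (the function name and loop body are illustrative, only the ggml calls are real API):

// Sketch: run one allocated graph over several inputs, resetting the
// scheduler once at the end (mirrors whisper_vad_detect_speech below).
static bool run_all_chunks(ggml_backend_sched_t sched, ggml_cgraph * gf, int n_chunks, int n_threads) {
    if (!ggml_backend_sched_alloc_graph(sched, gf)) {
        return false;
    }
    for (int i = 0; i < n_chunks; i++) {
        // ... upload inputs for chunk i with ggml_backend_tensor_set() ...
        // sched_reset = false keeps the graph allocation alive for the next chunk
        if (!ggml_graph_compute_helper(sched, gf, n_threads, false)) {
            return false; // the helper has already reset the scheduler on failure
        }
        // ... read outputs for chunk i with ggml_backend_tensor_get() ...
    }
    ggml_backend_sched_reset(sched); // release the allocation once all chunks are done
    return true;
}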
@@ -4421,6 +4424,10 @@ struct whisper_vad_state {
     struct ggml_tensor * h_state;
     struct ggml_tensor * c_state;
 
+    ggml_backend_buffer_t buffer = nullptr;
+
+    std::vector<uint8_t> ctx_buf;
+
     whisper_sched sched;
 };
 
@@ -4443,9 +4450,7 @@ struct whisper_vad_context {
 struct whisper_vad_context_params whisper_vad_default_context_params(void) {
     whisper_vad_context_params result = {
         /*.n_thread   = */ 4,
-        // TODO(danbev) Default to true when CUDA GPU support is working:
-        // https://github.com/ggml-org/whisper.cpp/pull/3065#issuecomment-2858583911
-        /*.use_gpu    = */ false,
+        /*.use_gpu    = */ true,
         /*.gpu_device = */ 0,
     };
     return result;
@@ -4601,6 +4606,7 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
 
     // Create add operation to get preactivations for all gates.
     struct ggml_tensor * out_gate = ggml_add(ctx0, inp_gate, hid_gate);
+
     const size_t hdim_size = ggml_row_size(out_gate->type, hdim);
 
     // Create sigmoid for input gate (using the first 128 bytes from the preactivations).
@@ -4623,12 +4629,13 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
 
     // Update hidden state
     struct ggml_tensor * out = ggml_mul(ctx0, o_t, ggml_tanh(ctx0, c_out));
-    ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, vctx.state->h_state));
+    ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, vctx.state->h_state));
+
     return out;
 }
 
 static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx) {
-    const auto & model = vctx.model;
+    const auto & model = vctx.model;
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ vctx.state->sched.meta.size(),
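For orientation, the cell built by whisper_vad_build_lstm_layer is a standard LSTM: the fused ggml_add above produces all gate preactivations in one row, which is then sliced at hdim_size offsets (the "first 128 bytes" comment refers to the first hdim-sized slice). A sketch of the recurrence under those assumptions; the ggml_view_1d slicing, the i/f/g/o gate order, and the c_in input are inferred here, not shown in these hunks:

// Sketch of the LSTM step around the hunks above (assumed gate layout).
struct ggml_tensor * i_t = ggml_sigmoid(ctx0, ggml_view_1d(ctx0, out_gate, hdim, 0*hdim_size)); // input gate
struct ggml_tensor * f_t = ggml_sigmoid(ctx0, ggml_view_1d(ctx0, out_gate, hdim, 1*hdim_size)); // forget gate
struct ggml_tensor * g_t = ggml_tanh   (ctx0, ggml_view_1d(ctx0, out_gate, hdim, 2*hdim_size)); // cell candidate
struct ggml_tensor * o_t = ggml_sigmoid(ctx0, ggml_view_1d(ctx0, out_gate, hdim, 3*hdim_size)); // output gate

// c_t = f_t * c_{t-1} + i_t * g_t   (c_in is the previous cell state)
struct ggml_tensor * c_out = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_in), ggml_mul(ctx0, i_t, g_t));
// h_t = o_t * tanh(c_t)             (matches the "Update hidden state" lines)
struct ggml_tensor * out   = ggml_mul(ctx0, o_t, ggml_tanh(ctx0, c_out));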
@@ -4677,22 +4684,28 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
     auto whisper_context_params = whisper_context_default_params();
     whisper_context_params.use_gpu    = vctx->params.use_gpu;
     whisper_context_params.gpu_device = vctx->params.gpu_device;
+
     state->backends = whisper_backend_init(whisper_context_params);
     if (state->backends.empty()) {
         WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__);
         whisper_vad_free_state(state);
         return nullptr;
     }
 
-    int32_t lstm_hidden_size = vctx->model.hparams.lstm_hidden_size;
+    const int32_t lstm_hidden_size = vctx->model.hparams.lstm_hidden_size;
+
+    state->ctx_buf.resize(2u*ggml_tensor_overhead());
+
     struct ggml_init_params params = {
-        /*.mem_size   =*/ size_t(2u*lstm_hidden_size*ggml_tensor_overhead()),
-        /*.mem_buffer =*/ NULL,
+        /*.mem_size   =*/ state->ctx_buf.size(),
+        /*.mem_buffer =*/ state->ctx_buf.data(),
         /*.no_alloc   =*/ true,
     };
+
     ggml_context * ctx = ggml_init(params);
     if (!ctx) {
         WHISPER_LOG_ERROR("%s: failed to init LSTM state ggml context\n", __func__);
+        whisper_vad_free_state(state);
         return nullptr;
     }
 
@@ -4704,6 +4717,13 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
     state->c_state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, lstm_hidden_size);
     ggml_set_name(state->c_state, "c_state");
 
+    state->buffer = ggml_backend_alloc_ctx_tensors(ctx, state->backends[0]);
+    if (!state->buffer) {
+        WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
+        whisper_vad_free_state(state);
+        return nullptr;
+    }
+
     {
         bool ok = whisper_sched_graph_init(state->sched, state->backends,
             [&]() {
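The pattern above is worth calling out: a no_alloc ggml context sized for tensor metadata only (hence 2u*ggml_tensor_overhead() for the two state tensors, replacing the old size that needlessly scaled with lstm_hidden_size), with the tensor data placed in a backend buffer so it can live in device memory when a GPU backend is used. In isolation, with `backend` assumed to be an initialized ggml_backend_t and error handling trimmed:

// Metadata-only context + backend buffer (requires ggml.h, ggml-alloc.h, <vector>).
std::vector<uint8_t> ctx_buf(2u*ggml_tensor_overhead());

struct ggml_init_params params = {
    /*.mem_size   =*/ ctx_buf.size(),
    /*.mem_buffer =*/ ctx_buf.data(),
    /*.no_alloc   =*/ true, // tensors get metadata only, no data
};
ggml_context * ctx = ggml_init(params);

ggml_tensor * h_state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, lstm_hidden_size);
ggml_tensor * c_state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, lstm_hidden_size);

// Allocate the actual tensor data in a buffer owned by the backend;
// for a GPU backend this is device memory.
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);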
@@ -5106,11 +5126,20 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
     if (n_samples % vctx->n_window != 0) {
         n_chunks += 1; // Add one more chunk for remaining samples.
     }
-    auto & sched = vctx->state->sched.sched;
-
     WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
     WHISPER_LOG_INFO("%s: n_chunks: %d\n", __func__, n_chunks);
 
+    // Reset LSTM hidden/cell states
+    ggml_backend_buffer_clear(vctx->state->buffer, 0);
+
+    // TODO: move to vad state and change to std::vector<float>
+    float * probs = new float[n_chunks];
+    WHISPER_LOG_INFO("%s: probs size: %u\n", __func__, n_chunks);
+
+    std::vector<float> window(vctx->n_window, 0.0f);
+
+    auto & sched = vctx->state->sched.sched;
+
     ggml_cgraph * gf = whisper_vad_build_graph(*vctx);
 
     if (!ggml_backend_sched_alloc_graph(sched, gf)) {
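Also in this hunk: the state reset moves from per-tensor ggml_set_zero() calls (removed in the next hunk) to ggml_backend_buffer_clear(). ggml_set_zero() writes through the host-side tensor->data pointer, which is not valid once the tensors live in a device buffer; clearing the owning backend buffer zeroes every tensor allocated in it and works for any backend. A minimal sketch, with a per-tensor alternative for comparison (names follow the diff; the alternative assumes F32 data):

// Backend-agnostic reset of the LSTM state (zeroes h_state and c_state at once):
ggml_backend_buffer_clear(vctx->state->buffer, 0);

// Per-tensor, backend-safe alternative:
std::vector<float> zeros(ggml_nelements(vctx->state->h_state), 0.0f);
ggml_backend_tensor_set(vctx->state->h_state, zeros.data(), 0, ggml_nbytes(vctx->state->h_state));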
@@ -5120,19 +5149,13 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
 
     struct ggml_tensor * frame = ggml_graph_get_tensor(gf, "frame");
     struct ggml_tensor * prob  = ggml_graph_get_tensor(gf, "prob");
-    ggml_set_zero(prob);
 
-    // Reset LSTM hidden/cell states
-    ggml_set_zero(vctx->state->h_state);
-    ggml_set_zero(vctx->state->c_state);
-
-    float * probs= new float[n_chunks];
-    WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
-
-    std::vector<float> window(vctx->n_window, 0.0f);
+    // we are going to reuse the graph multiple times for each chunk
+    // TODO: measure time and print timing information for this step
     for (int i = 0; i < n_chunks; i++) {
         int start_idx = i * vctx->n_window;
         int end_idx   = std::min(start_idx + vctx->n_window, n_samples);
+
         int chunk_len = end_idx - start_idx;
 
         if (chunk_len < vctx->n_window) {
@@ -5150,28 +5173,33 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
         } else {
             // Copy current frame samples to the window.
             int samples_to_copy = std::min(end_idx - start_idx, vctx->n_window);
-            std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
-                      window.begin());
+            std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy, window.begin());
         }
 
         // Set the frame tensor data with the samples.
         ggml_backend_tensor_set(frame, window.data(), 0, ggml_nelements(frame) * sizeof(float));
 
-        if (!ggml_graph_compute_helper(sched, gf, vctx->n_threads)) {
+        // do not reset the scheduler - we will reuse the graph in the next chunk
+        if (!ggml_graph_compute_helper(sched, gf, vctx->n_threads, false)) {
             WHISPER_LOG_ERROR("%s: failed to compute VAD graph\n", __func__);
             break;
         }
 
         // Get the probability for this chunk.
         ggml_backend_tensor_get(prob, &probs[i], 0, sizeof(float));
 
+        // WHISPER_LOG_DEBUG("chunk %d: p = %7.3f\n", i, probs[i]);
     }
+
+    ggml_backend_sched_reset(sched);
+
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
 
     struct whisper_vad_speech speech = {
         /* n_probs = */ n_chunks,
         /* probs   = */ probs,
     };
+
     return speech;
 }
 
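Finally, a usage note: probs is allocated with new[] and ownership passes to the caller through whisper_vad_speech (the TODO above flags moving it into the VAD state later). A sketch of consuming the result, assuming the parameter list suggested by the hunk headers; the 0.5 threshold is illustrative:

// Caller-side sketch (the exact signature is assumed from the hunks above).
whisper_vad_speech speech = whisper_vad_detect_speech(vctx, pcmf32, n_samples);
for (int i = 0; i < speech.n_probs; i++) {
    if (speech.probs[i] > 0.5f) {
        // chunk i classified as speech
    }
}
delete [] speech.probs; // allocated with new[] in whisper_vad_detect_speech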