
Commit 5a6236e

vad : fix buffers and enable GPU support by default
1 parent 4e6271f commit 5a6236e

1 file changed: +57 -27 lines changed


src/whisper.cpp

Lines changed: 57 additions & 27 deletions
@@ -170,7 +170,6 @@ static bool ggml_graph_compute_helper(
                      int   n_threads,
      ggml_abort_callback   abort_callback,
                     void * abort_callback_data) {
-
     ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
 
     auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));

@@ -191,8 +190,8 @@ static bool ggml_graph_compute_helper(
 static bool ggml_graph_compute_helper(
       ggml_backend_sched_t   sched,
         struct ggml_cgraph * graph,
-                       int   n_threads) {
-
+                       int   n_threads,
+                      bool   sched_reset = true) {
     for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
         ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
         ggml_backend_dev_t dev = ggml_backend_get_device(backend);
@@ -204,8 +203,12 @@ static bool ggml_graph_compute_helper(
         }
     }
 
-    bool t = ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS;
-    ggml_backend_sched_reset(sched);
+    const bool t = (ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS);
+
+    if (!t || sched_reset) {
+        ggml_backend_sched_reset(sched);
+    }
+
     return t;
 }
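
The new sched_reset parameter lets a caller keep the scheduler's graph allocation alive across repeated computations, while a failed compute still resets the scheduler so it is left in a clean state. A minimal sketch of the intended call pattern (the loop and the tensor I/O comments are illustrative, not part of this commit):

    // Reuse one allocated graph for several compute calls, then reset once.
    for (int i = 0; i < n_chunks; i++) {
        // ... upload this chunk's input tensor ...
        if (!ggml_graph_compute_helper(sched, gf, n_threads, /*sched_reset =*/ false)) {
            break; // on failure the helper has already reset the scheduler
        }
        // ... read back this chunk's output tensor ...
    }
    ggml_backend_sched_reset(sched); // release the allocation once all chunks are done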

@@ -4421,6 +4424,10 @@ struct whisper_vad_state {
     struct ggml_tensor * h_state;
     struct ggml_tensor * c_state;
 
+    ggml_backend_buffer_t buffer = nullptr;
+
+    std::vector<uint8_t> ctx_buf;
+
     whisper_sched sched;
 };

@@ -4443,9 +4450,7 @@ struct whisper_vad_context {
 struct whisper_vad_context_params whisper_vad_default_context_params(void) {
     whisper_vad_context_params result = {
         /*.n_thread   = */ 4,
-        // TODO(danbev) Default to true when CUDA GPU support is working:
-        // https://github.com/ggml-org/whisper.cpp/pull/3065#issuecomment-2858583911
-        /*.use_gpu    = */ false,
+        /*.use_gpu    = */ true,
         /*.gpu_device = */ 0,
     };
     return result;
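
With GPU support now on by default, a caller that wants the previous CPU-only behaviour has to opt out explicitly. A minimal sketch, assuming the whisper_vad_init_from_file_with_params entry point from the same VAD API series and an illustrative model path:

    struct whisper_vad_context_params vad_params = whisper_vad_default_context_params();
    vad_params.use_gpu = false; // opt out of the new GPU default

    struct whisper_vad_context * vctx =
        whisper_vad_init_from_file_with_params("models/silero-vad.bin", vad_params);
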
@@ -4601,6 +4606,9 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
 
     // Create add operation to get preactivations for all gates.
     struct ggml_tensor * out_gate = ggml_add(ctx0, inp_gate, hid_gate);
+
+    ggml_build_forward_expand(gf, out_gate);
+
     const size_t hdim_size = ggml_row_size(out_gate->type, hdim);
 
     // Create sigmoid for input gate (using the first 128 bytes from the preactivations).
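
Expanding out_gate into the graph makes the shared preactivation tensor an explicit node that the scheduler allocates and computes before the per-gate activations read from it; the gates are views into out_gate's data at multiples of hdim_size. A sketch of that read pattern, inferred from the surrounding code rather than shown in this hunk:

    // Each gate is a view into out_gate at a byte offset, so out_gate
    // itself must be materialized in the graph before the views are used.
    struct ggml_tensor * i_t = ggml_sigmoid(ctx0,
            ggml_view_1d(ctx0, out_gate, hdim, 0*hdim_size)); // input gate
    struct ggml_tensor * f_t = ggml_sigmoid(ctx0,
            ggml_view_1d(ctx0, out_gate, hdim, 1*hdim_size)); // forget gate
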
@@ -4623,12 +4631,13 @@
 
     // Update hidden state
     struct ggml_tensor * out = ggml_mul(ctx0, o_t, ggml_tanh(ctx0, c_out));
-    ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, vctx.state->h_state));
+    ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, vctx.state->h_state));
+
     return out;
 }
 
 static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx) {
-    const auto & model = vctx.model;
+    const auto & model = vctx.model;
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ vctx.state->sched.meta.size(),
@@ -4677,22 +4686,28 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
     auto whisper_context_params = whisper_context_default_params();
     whisper_context_params.use_gpu    = vctx->params.use_gpu;
     whisper_context_params.gpu_device = vctx->params.gpu_device;
+
     state->backends = whisper_backend_init(whisper_context_params);
     if (state->backends.empty()) {
         WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__);
         whisper_vad_free_state(state);
         return nullptr;
     }
 
-    int32_t lstm_hidden_size = vctx->model.hparams.lstm_hidden_size;
+    const int32_t lstm_hidden_size = vctx->model.hparams.lstm_hidden_size;
+
+    state->ctx_buf.resize(2u*ggml_tensor_overhead());
+
     struct ggml_init_params params = {
-        /*.mem_size   =*/ size_t(2u*lstm_hidden_size*ggml_tensor_overhead()),
-        /*.mem_buffer =*/ NULL,
+        /*.mem_size   =*/ state->ctx_buf.size(),
+        /*.mem_buffer =*/ state->ctx_buf.data(),
         /*.no_alloc   =*/ true,
     };
+
     ggml_context * ctx = ggml_init(params);
     if (!ctx) {
         WHISPER_LOG_ERROR("%s: failed to init LSTM state ggml context\n", __func__);
+        whisper_vad_free_state(state);
         return nullptr;
     }
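
The old mem_size multiplied the per-tensor overhead by lstm_hidden_size, reserving far more metadata space than needed: with no_alloc = true the context only ever stores metadata for the two state tensors. Sizing it as 2u*ggml_tensor_overhead() and backing it with the state-owned ctx_buf keeps both the size and the lifetime explicit. The general pattern, as a sketch:

    // A metadata-only ggml context: with no_alloc = true, ggml never
    // allocates tensor data here, so per-tensor overhead is all that is
    // needed -- two tensors (h_state and c_state) in this case.
    std::vector<uint8_t> ctx_buf(2u*ggml_tensor_overhead());

    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx_buf.size(),
        /*.mem_buffer =*/ ctx_buf.data(), // caller-owned, no malloc inside ggml
        /*.no_alloc   =*/ true,           // tensor data lives in a backend buffer
    };
    ggml_context * ctx = ggml_init(params);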

@@ -4704,6 +4719,13 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
     state->c_state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, lstm_hidden_size);
     ggml_set_name(state->c_state, "c_state");
 
+    state->buffer = ggml_backend_alloc_ctx_tensors(ctx, state->backends[0]);
+    if (!state->buffer) {
+        WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
+        whisper_vad_free_state(state);
+        return nullptr;
+    }
+
     {
         bool ok = whisper_sched_graph_init(state->sched, state->backends,
             [&]() {
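
ggml_backend_alloc_ctx_tensors allocates a single backend buffer sized for every tensor created in the context and binds the tensors to it, so h_state and c_state now have explicit backend-resident backing memory (device memory when a GPU backend is first in the list). A condensed sketch of the allocate-and-zero pattern used here:

    // Give all tensors in `ctx` real backing memory on the primary backend.
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backends[0]);
    if (buffer) {
        ggml_backend_buffer_clear(buffer, 0); // zero-initialize on any backend
    }
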
@@ -5106,11 +5128,20 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
     if (n_samples % vctx->n_window != 0) {
         n_chunks += 1; // Add one more chunk for remaining samples.
     }
-    auto & sched = vctx->state->sched.sched;
-
     WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
     WHISPER_LOG_INFO("%s: n_chunks: %d\n", __func__, n_chunks);
 
+    // Reset LSTM hidden/cell states
+    ggml_backend_buffer_clear(vctx->state->buffer, 0);
+
+    // TODO: move to vad state and change to std::vector<float>
+    float * probs = new float[n_chunks];
+    WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
+
+    std::vector<float> window(vctx->n_window, 0.0f);
+
+    auto & sched = vctx->state->sched.sched;
+
     ggml_cgraph * gf = whisper_vad_build_graph(*vctx);
 
     if (!ggml_backend_sched_alloc_graph(sched, gf)) {
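
The state reset moves here and becomes a single buffer clear: together with the ggml_backend_alloc_ctx_tensors call in whisper_vad_init_state, the hidden and cell states now live in one properly allocated backend buffer, and ggml_backend_buffer_clear zeroes both in one call on whichever backend (CPU or GPU) owns it, replacing the per-tensor ggml_set_zero calls removed in the next hunk:

    // One call zeroes every tensor allocated in the state buffer
    // (here: h_state and c_state), regardless of backend:
    ggml_backend_buffer_clear(vctx->state->buffer, 0);
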
@@ -5120,19 +5151,13 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
 
     struct ggml_tensor * frame = ggml_graph_get_tensor(gf, "frame");
     struct ggml_tensor * prob  = ggml_graph_get_tensor(gf, "prob");
-    ggml_set_zero(prob);
 
-    // Reset LSTM hidden/cell states
-    ggml_set_zero(vctx->state->h_state);
-    ggml_set_zero(vctx->state->c_state);
-
-    float * probs= new float[n_chunks];
-    WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
-
-    std::vector<float> window(vctx->n_window, 0.0f);
+    // we are going to reuse the graph multiple times for each chunk
+    // TODO: measure time and print timing information for this step
     for (int i = 0; i < n_chunks; i++) {
         int start_idx = i * vctx->n_window;
         int end_idx   = std::min(start_idx + vctx->n_window, n_samples);
+
         int chunk_len = end_idx - start_idx;
 
         if (chunk_len < vctx->n_window) {

@@ -5150,28 +5175,33 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
         } else {
             // Copy current frame samples to the window.
             int samples_to_copy = std::min(end_idx - start_idx, vctx->n_window);
-            std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
-                      window.begin());
+            std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy, window.begin());
         }
 
         // Set the frame tensor data with the samples.
         ggml_backend_tensor_set(frame, window.data(), 0, ggml_nelements(frame) * sizeof(float));
 
-        if (!ggml_graph_compute_helper(sched, gf, vctx->n_threads)) {
+        // do not reset the scheduler - we will reuse the graph in the next chunk
+        if (!ggml_graph_compute_helper(sched, gf, vctx->n_threads, false)) {
             WHISPER_LOG_ERROR("%s: failed to compute VAD graph\n", __func__);
             break;
         }
 
         // Get the probability for this chunk.
         ggml_backend_tensor_get(prob, &probs[i], 0, sizeof(float));
 
+        //WHISPER_LOG_DEBUG("chunk %d: p = %7.3f\n", i, probs[i]);
     }
+
+    ggml_backend_sched_reset(sched);
+
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
 
     struct whisper_vad_speech speech = {
         /* n_probs = */ n_chunks,
         /* probs   = */ probs,
     };
+
     return speech;
 }
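
At this commit, whisper_vad_detect_speech returns raw per-chunk probabilities, with probs allocated via new[] (see the TODO above about moving it into the VAD state). A caller-side sketch, assuming the signature implied by the diff (context plus 16 kHz mono samples) and an illustrative 0.5 threshold; the direct delete [] matches the plain new[] seen in the diff, though a dedicated free helper, if the API provides one, would be preferable:

    struct whisper_vad_speech speech = whisper_vad_detect_speech(vctx, pcmf32, n_samples);

    for (int i = 0; i < speech.n_probs; i++) {
        if (speech.probs[i] > 0.5f) {
            printf("chunk %d: speech (p = %.3f)\n", i, speech.probs[i]);
        }
    }

    delete [] speech.probs;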

0 commit comments
