Commit 60d561b

vad : fix buffers and enable GPU support by default
1 parent 4e6271f commit 60d561b

File tree

1 file changed (+55, -27 lines changed)

src/whisper.cpp

Lines changed: 55 additions & 27 deletions
@@ -170,7 +170,6 @@ static bool ggml_graph_compute_helper(
                          int   n_threads,
          ggml_abort_callback   abort_callback,
                         void * abort_callback_data) {
-
     ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
 
     auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
@@ -191,8 +190,8 @@ static bool ggml_graph_compute_helper(
 static bool ggml_graph_compute_helper(
        ggml_backend_sched_t   sched,
          struct ggml_cgraph * graph,
-                        int   n_threads) {
-
+                        int   n_threads,
+                       bool   sched_reset = true) {
     for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
         ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
         ggml_backend_dev_t dev = ggml_backend_get_device(backend);
@@ -204,8 +203,12 @@ static bool ggml_graph_compute_helper(
         }
     }
 
-    bool t = ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS;
-    ggml_backend_sched_reset(sched);
+    const bool t = (ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS);
+
+    if (!t || sched_reset) {
+        ggml_backend_sched_reset(sched);
+    }
+
     return t;
 }
 
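The new sched_reset parameter defaults to true, so existing callers keep the old compute-then-reset behavior, while passing false keeps the scheduler's graph allocation alive between calls; on failure the helper still resets unconditionally. A minimal sketch of the calling pattern this enables (hypothetical caller, not part of this commit):

    // reuse one graph allocation across many compute calls
    for (int i = 0; i < n_chunks; ++i) {
        // ... upload the inputs for chunk i ...
        if (!ggml_graph_compute_helper(sched, gf, n_threads, /*sched_reset =*/ false)) {
            break; // on failure the helper has already reset the scheduler
        }
        // ... read back the outputs for chunk i ...
    }
    ggml_backend_sched_reset(sched); // release the allocation once all chunks are done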
@@ -4421,6 +4424,10 @@ struct whisper_vad_state {
     struct ggml_tensor * h_state;
     struct ggml_tensor * c_state;
 
+    ggml_backend_buffer_t buffer = nullptr;
+
+    std::vector<uint8_t> ctx_buf;
+
     whisper_sched sched;
 };
 
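The two new members work as a pair: ctx_buf provides host memory for a metadata-only ggml_context, while buffer is the backend allocation (host or GPU) that holds the actual h_state/c_state data. A sketch of the matching cleanup, assuming it lives in whisper_vad_free_state, which this diff does not show:

    // hypothetical cleanup - whisper_vad_free_state is not shown in this commit
    ggml_backend_buffer_free(state->buffer); // no-op if buffer is still nullptr
    // ctx_buf is a std::vector<uint8_t> and is released with the state object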
@@ -4443,9 +4450,7 @@ struct whisper_vad_context {
 struct whisper_vad_context_params whisper_vad_default_context_params(void) {
     whisper_vad_context_params result = {
         /*.n_thread = */ 4,
-        // TODO(danbev) Default to true when CUDA GPU support is working:
-        // https://github.com/ggml-org/whisper.cpp/pull/3065#issuecomment-2858583911
-        /*.use_gpu = */ false,
+        /*.use_gpu = */ true,
         /*.gpu_device = */ 0,
     };
     return result;
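With use_gpu now defaulting to true, callers that want the previous CPU-only behavior must opt out explicitly; a minimal sketch using the function above:

    whisper_vad_context_params vad_params = whisper_vad_default_context_params();
    vad_params.use_gpu = false; // restore the old CPU-only behavior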
@@ -4601,6 +4606,7 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
 
     // Create add operation to get preactivations for all gates.
     struct ggml_tensor * out_gate = ggml_add(ctx0, inp_gate, hid_gate);
+
     const size_t hdim_size = ggml_row_size(out_gate->type, hdim);
 
     // Create sigmoid for input gate (using the first 128 bytes from the preactivations).
@@ -4623,12 +4629,13 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
 
     // Update hidden state
     struct ggml_tensor * out = ggml_mul(ctx0, o_t, ggml_tanh(ctx0, c_out));
-    ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, vctx.state->h_state));
+    ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, vctx.state->h_state));
+
     return out;
 }
 
 static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx) {
-    const auto & model = vctx.model;
+    const auto & model = vctx.model;
 
     struct ggml_init_params params = {
         /*.mem_size =*/ vctx.state->sched.meta.size(),
@@ -4677,22 +4684,28 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
     auto whisper_context_params = whisper_context_default_params();
     whisper_context_params.use_gpu = vctx->params.use_gpu;
     whisper_context_params.gpu_device = vctx->params.gpu_device;
+
     state->backends = whisper_backend_init(whisper_context_params);
     if (state->backends.empty()) {
         WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__);
         whisper_vad_free_state(state);
         return nullptr;
     }
 
-    int32_t lstm_hidden_size = vctx->model.hparams.lstm_hidden_size;
+    const int32_t lstm_hidden_size = vctx->model.hparams.lstm_hidden_size;
+
+    state->ctx_buf.resize(2u*ggml_tensor_overhead());
+
     struct ggml_init_params params = {
-        /*.mem_size =*/ size_t(2u*lstm_hidden_size*ggml_tensor_overhead()),
-        /*.mem_buffer =*/ NULL,
+        /*.mem_size =*/ state->ctx_buf.size(),
+        /*.mem_buffer =*/ state->ctx_buf.data(),
         /*.no_alloc =*/ true,
     };
+
     ggml_context * ctx = ggml_init(params);
     if (!ctx) {
         WHISPER_LOG_ERROR("%s: failed to init LSTM state ggml context\n", __func__);
+        whisper_vad_free_state(state);
         return nullptr;
     }
 
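With /*.no_alloc =*/ true the context allocates no tensor data, so its buffer only needs to hold metadata: ggml_tensor_overhead() bytes per tensor, and exactly two tensors (h_state and c_state) are created below, hence 2u*ggml_tensor_overhead(). The old size also multiplied by lstm_hidden_size, over-sizing the metadata pool by that factor, and the NULL mem_buffer made ggml_init heap-allocate it internally; backing the context with the state-owned ctx_buf fixes both. The pattern in isolation (a sketch using the same calls as above):

    std::vector<uint8_t> ctx_buf(2u*ggml_tensor_overhead()); // room for 2 tensor headers
    struct ggml_init_params params = {
        /*.mem_size =*/ ctx_buf.size(),
        /*.mem_buffer =*/ ctx_buf.data(), // caller-owned, nothing allocated by ggml
        /*.no_alloc =*/ true, // tensor data will be placed in a backend buffer later
    };
    ggml_context * ctx = ggml_init(params);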
@@ -4704,6 +4717,13 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
     state->c_state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, lstm_hidden_size);
     ggml_set_name(state->c_state, "c_state");
 
+    state->buffer = ggml_backend_alloc_ctx_tensors(ctx, state->backends[0]);
+    if (!state->buffer) {
+        WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
+        whisper_vad_free_state(state);
+        return nullptr;
+    }
+
     {
         bool ok = whisper_sched_graph_init(state->sched, state->backends,
             [&]() {
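ggml_backend_alloc_ctx_tensors allocates a single backend buffer sized for every tensor created in ctx and binds each tensor to it, so h_state and c_state now live on backends[0] (the GPU when use_gpu is enabled) rather than in host memory. The create-then-allocate sequence in isolation (a sketch under the same assumptions as the code above):

    // no_alloc context: these calls create tensor metadata only
    ggml_tensor * h_state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, lstm_hidden_size);
    ggml_tensor * c_state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, lstm_hidden_size);

    // one buffer on the chosen backend now holds the data of both tensors
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);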
@@ -5106,11 +5126,20 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
     if (n_samples % vctx->n_window != 0) {
         n_chunks += 1; // Add one more chunk for remaining samples.
     }
-    auto & sched = vctx->state->sched.sched;
-
     WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
     WHISPER_LOG_INFO("%s: n_chunks: %d\n", __func__, n_chunks);
 
+    // Reset LSTM hidden/cell states
+    ggml_backend_buffer_clear(vctx->state->buffer, 0);
+
+    // TODO: move to vad state and change to std::vector<float>
+    float * probs = new float[n_chunks];
+    WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
+
+    std::vector<float> window(vctx->n_window, 0.0f);
+
+    auto & sched = vctx->state->sched.sched;
+
     ggml_cgraph * gf = whisper_vad_build_graph(*vctx);
 
     if (!ggml_backend_sched_alloc_graph(sched, gf)) {
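ggml_backend_buffer_clear fills the entire backend buffer - and with it both state tensors - with the given byte value on whatever device owns it. That is why it can replace the ggml_set_zero calls removed in the next hunk: ggml_set_zero writes through the host-side data pointer, which is not valid once the tensors live in a GPU buffer. A backend-agnostic way to zero a single tensor, for comparison (a sketch, not code from this commit):

    // upload host zeros into one backend-resident tensor
    std::vector<float> zeros(ggml_nelements(h_state), 0.0f);
    ggml_backend_tensor_set(h_state, zeros.data(), 0, ggml_nbytes(h_state));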
@@ -5120,19 +5149,13 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
 
     struct ggml_tensor * frame = ggml_graph_get_tensor(gf, "frame");
     struct ggml_tensor * prob = ggml_graph_get_tensor(gf, "prob");
-    ggml_set_zero(prob);
 
-    // Reset LSTM hidden/cell states
-    ggml_set_zero(vctx->state->h_state);
-    ggml_set_zero(vctx->state->c_state);
-
-    float * probs= new float[n_chunks];
-    WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
-
-    std::vector<float> window(vctx->n_window, 0.0f);
+    // we are going to reuse the graph multiple times for each chunk
+    // TODO: measure time and print timing information for this step
     for (int i = 0; i < n_chunks; i++) {
         int start_idx = i * vctx->n_window;
         int end_idx = std::min(start_idx + vctx->n_window, n_samples);
+
         int chunk_len = end_idx - start_idx;
 
         if (chunk_len < vctx->n_window) {
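A worked example of the chunking arithmetic in this loop, assuming vctx->n_window = 512 (the Silero VAD window at 16 kHz; the actual value is set outside this diff) and n_samples = 1600: the integer division yields 3 full chunks, the 64-sample remainder triggers the n_chunks += 1 above, and the final chunk gets chunk_len = 1600 - 1536 = 64 < 512, taking the short-chunk branch whose body falls between these two hunks.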
@@ -5150,28 +5173,33 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
         } else {
             // Copy current frame samples to the window.
             int samples_to_copy = std::min(end_idx - start_idx, vctx->n_window);
-            std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
-                      window.begin());
+            std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy, window.begin());
         }
 
         // Set the frame tensor data with the samples.
         ggml_backend_tensor_set(frame, window.data(), 0, ggml_nelements(frame) * sizeof(float));
 
-        if (!ggml_graph_compute_helper(sched, gf, vctx->n_threads)) {
+        // do not reset the scheduler - we will reuse the graph in the next chunk
+        if (!ggml_graph_compute_helper(sched, gf, vctx->n_threads, false)) {
             WHISPER_LOG_ERROR("%s: failed to compute VAD graph\n", __func__);
             break;
         }
 
         // Get the probability for this chunk.
         ggml_backend_tensor_get(prob, &probs[i], 0, sizeof(float));
 
+        //WHISPER_LOG_DEBUG("chunk %d: p = %7.3f\n", i, probs[i]);
     }
+
+    ggml_backend_sched_reset(sched);
+
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
 
     struct whisper_vad_speech speech = {
         /* n_probs = */ n_chunks,
         /* probs = */ probs,
     };
+
     return speech;
 }
 
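A minimal sketch of consuming the returned struct from the caller's side (hypothetical usage, not part of this commit; the exact signature of whisper_vad_detect_speech and the 0.5f threshold are assumptions):

    whisper_vad_speech speech = whisper_vad_detect_speech(vctx, pcmf32, n_samples);
    for (int i = 0; i < speech.n_probs; ++i) {
        if (speech.probs[i] > 0.5f) {
            // chunk i is classified as speech
        }
    }
    delete[] speech.probs; // allocated with new[] above; per the TODO it may move into the VAD state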