Skip to content

Commit 6b56b7d

Browse files
committed
vad : map timestamps to original audio
This commit adds a mapping of the original audio timestamps to the timestamps of the segments in the VAD (Voice Activity Detection) process. The motivation for this change is that when we process the original audio signal and only pass the speech segments to whisper_full, the timestamps that whisper returns when calling functions like whisper_full_get_segment_t0 are the timestamps for the "VAD" segments and not the original audio. The values are not identical to the timestamps processed without VAD enabled but they are close, and hopefully close enough.
1 parent 8c43841 commit 6b56b7d

File tree

2 files changed

+147
-4
lines changed

2 files changed

+147
-4
lines changed

src/whisper.cpp

Lines changed: 143 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,15 @@ struct whisper_state {
956956

957957
// [EXPERIMENTAL] speed-up techniques
958958
int32_t exp_n_audio_ctx = 0; // 0 - use default
959+
960+
// Describes one speech segment kept by VAD on two time axes: its position in
// the original audio and its position in the filtered (speech-only) buffer
// that is passed on for transcription.
//
// All members default to 0 so that a default-constructed instance never
// carries indeterminate values.
struct vad_segment_info {
    // Start/end of the segment in the original audio, copied from the VAD
    // timestamps (presumably seconds - the mapping code multiplies them by
    // 100 to produce segment timestamps; confirm against whisper_vad_*).
    float orig_start = 0.0f;
    float orig_end   = 0.0f;

    // Start/end of the same segment inside the filtered buffer, in seconds
    // (sample offset divided by WHISPER_SAMPLE_RATE).
    float vad_start  = 0.0f;
    float vad_end    = 0.0f;
};
966+
std::vector<vad_segment_info> vad_segments;
967+
bool has_vad_segments = false;
959968
};
960969

961970
struct whisper_context {
@@ -6703,6 +6712,10 @@ int whisper_full_with_state(
67036712
struct whisper_vad_timestamps timestamps = whisper_vad_detect_speech_timestamps(vctx, vad_params, samples, n_samples);
67046713

67056714
if (timestamps.n_segments > 0) {
6715+
state->has_vad_segments = true;
6716+
ctx->state->vad_segments.clear();
6717+
ctx->state->vad_segments.reserve(timestamps.n_segments);
6718+
67066719
WHISPER_LOG_INFO("%s: detected %d speech segments\n", __func__, timestamps.n_segments);
67076720
float overlap_seconds = params.vad_samples_overlap;
67086721
int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;
@@ -6752,6 +6765,19 @@ int whisper_full_with_state(
67526765
int segment_length = segment_end_samples - segment_start_samples;
67536766

67546767
if (segment_length > 0) {
6768+
whisper_state::vad_segment_info segment;
6769+
6770+
segment.orig_start = timestamps.segments[i].start;
6771+
segment.orig_end = timestamps.segments[i].end;
6772+
6773+
segment.vad_start = offset / (float)WHISPER_SAMPLE_RATE;
6774+
segment.vad_end = (offset + segment_length) / (float)WHISPER_SAMPLE_RATE;
6775+
6776+
6777+
WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
6778+
__func__, segment.orig_start, segment.orig_end, segment.vad_start, segment.vad_end);
6779+
ctx->state->vad_segments.push_back(segment);
6780+
67556781
// Copy this speech segment
67566782
memcpy(filtered_samples + offset, samples + segment_start_samples, segment_length * sizeof(float));
67576783
offset += segment_length;
@@ -7826,19 +7852,132 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
78267852
}
78277853

78287854
// Returns the start timestamp of result segment `i_segment`, in the same
// 1/100 s units that are stored in state->result_all[].t0.
//
// Without VAD the stored timestamp is returned unchanged. With VAD the stored
// timestamp refers to the filtered (speech-only) audio, so it is mapped back
// onto the original audio using state->vad_segments:
//   - inside a VAD segment: linear interpolation between orig_start/orig_end
//   - between two VAD segments: linear interpolation across the original gap
//   - after the last VAD segment: the overshoot is added to the last orig_end
// If none of these apply, a warning is logged and the unmapped stored
// timestamp is returned.
//
// `i_segment` is not range-checked; the caller must pass a valid index.
int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
    // If VAD wasn't used, return the original timestamp
    if (!state->has_vad_segments || state->vad_segments.empty()) {
        return state->result_all[i_segment].t0;
    }

    // For the first segment, always start at 0
    if (i_segment == 0) {
        return 0;
    }

    // Timestamp produced by whisper_full. It is relative to the filtered
    // audio, so it has to be mapped back to the original audio. The VAD
    // segment bounds are in seconds, hence the conversion.
    const int64_t t0_raw = state->result_all[i_segment].t0;
    const float   t0     = t0_raw / 100.0f;

    const auto & segments = state->vad_segments;

    // Find which VAD segment this timestamp belongs to.
    for (size_t i = 0; i < segments.size(); i++) {
        const auto & segment = segments[i];

        // Check if the timestamp falls within this segment.
        if (t0 >= segment.vad_start && t0 <= segment.vad_end) {
            // A zero-length segment maps to its original start.
            float proportion = 0.0f;
            if (segment.vad_end > segment.vad_start) {
                proportion = (t0 - segment.vad_start) / (segment.vad_end - segment.vad_start);
            }
            const float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
            return (int64_t)(orig_t0 * 100);
        }
    }

    // Check if the timestamp falls between two segments.
    // `i + 1 < size()` avoids relying on the emptiness check above to
    // protect an unsigned `size() - 1`.
    for (size_t i = 0; i + 1 < segments.size(); i++) {
        const auto & curr = segments[i];
        const auto & next = segments[i + 1];

        if (t0 > curr.vad_end && t0 < next.vad_start) {
            // Calculate how far we are through the gap as a proportion
            float gap_proportion = 0.0f;
            if (next.vad_start > curr.vad_end) {
                gap_proportion = (t0 - curr.vad_end) / (next.vad_start - curr.vad_end);
            }
            const float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
            return (int64_t)(orig_t0 * 100);
        }
    }

    // Handle the case where the timestamp is after the last segment:
    // add the time beyond the last segment to its original end time.
    const auto & last = segments.back();
    if (t0 > last.vad_end) {
        const float extra_time = t0 - last.vad_end;
        const float orig_t0    = last.orig_end + extra_time;
        return (int64_t)(orig_t0 * 100);
    }

    WHISPER_LOG_WARN("%s: Could not map t0 = %f to a VAD segment\n", __func__, t0);
    // Return the stored timestamp, not `t0`: `t0` is in seconds while every
    // other return path of this function is in 1/100 s units.
    return t0_raw;
}
78317915

78327916
int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
7833-
return ctx->state->result_all[i_segment].t0;
7917+
return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
78347918
}
78357919

78367920
// Returns the end timestamp of result segment `i_segment`, in the same
// 1/100 s units that are stored in state->result_all[].t1.
//
// Without VAD the stored timestamp is returned unchanged. With VAD the stored
// timestamp refers to the filtered (speech-only) audio, so it is mapped back
// onto the original audio using state->vad_segments:
//   - inside a VAD segment: linear interpolation between orig_start/orig_end
//   - between two VAD segments: linear interpolation across the original gap
//   - after the last VAD segment: the overshoot is added to the last orig_end
// If none of these apply, a warning is logged and the unmapped stored
// timestamp is returned.
//
// `i_segment` is not range-checked; the caller must pass a valid index.
int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
    // If VAD wasn't used, return the original timestamp
    if (!state->has_vad_segments || state->vad_segments.empty()) {
        return state->result_all[i_segment].t1;
    }

    // Timestamp produced by whisper_full. It is relative to the filtered
    // audio, so it has to be mapped back to the original audio. The VAD
    // segment bounds are in seconds, hence the conversion.
    const int64_t t1_raw = state->result_all[i_segment].t1;
    const float   t1     = t1_raw / 100.0f;

    const auto & segments = state->vad_segments;

    // Find which VAD segment this timestamp belongs to.
    for (size_t i = 0; i < segments.size(); i++) {
        const auto & segment = segments[i];

        // Check if the timestamp falls within this segment.
        if (t1 >= segment.vad_start && t1 <= segment.vad_end) {
            // Calculate the proportion through the filtered segment.
            // A zero-length segment maps to its original start.
            float proportion = 0.0f;
            if (segment.vad_end > segment.vad_start) {
                proportion = (t1 - segment.vad_start) / (segment.vad_end - segment.vad_start);
            }
            const float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
            return (int64_t)(orig_t1 * 100);
        }
    }

    // Check if the timestamp falls between two segments.
    // `i + 1 < size()` avoids relying on the emptiness check above to
    // protect an unsigned `size() - 1`.
    for (size_t i = 0; i + 1 < segments.size(); i++) {
        const auto & curr = segments[i];
        const auto & next = segments[i + 1];

        if (t1 > curr.vad_end && t1 < next.vad_start) {
            // Calculate how far we are through the gap as a proportion
            float gap_proportion = 0.0f;
            if (next.vad_start > curr.vad_end) {
                gap_proportion = (t1 - curr.vad_end) / (next.vad_start - curr.vad_end);
            }
            // Map to the corresponding position in the original gap
            const float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
            return (int64_t)(orig_t1 * 100);
        }
    }

    // Handle the case where the timestamp is after the last segment:
    // add the time beyond the last segment to its original end time.
    const auto & last = segments.back();
    if (t1 > last.vad_end) {
        const float extra_time = t1 - last.vad_end;
        const float orig_t1    = last.orig_end + extra_time;
        return (int64_t)(orig_t1 * 100);
    }

    WHISPER_LOG_WARN("%s: Could not map t1 = %f to a VAD segment\n", __func__, t1);
    // Return the stored timestamp, not `t1`: `t1` is in seconds while every
    // other return path of this function is in 1/100 s units.
    return t1_raw;
}
78397978

78407979
int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
7841-
return ctx->state->result_all[i_segment].t1;
7980+
return whisper_full_get_segment_t1_from_state(ctx->state, i_segment);
78427981
}
78437982

78447983
bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment) {

tests/test-vad-full.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,15 @@ int main() {
4141

4242
const int n_segments = whisper_full_n_segments(wctx);
4343
assert(n_segments == 2);
44+
4445
assert(strcmp("And so my fellow Americans ask not what you country can do for you.",
4546
whisper_full_get_segment_text(wctx, 0)));
4647
assert(strcmp("Ask what you can do for your country.",
4748
whisper_full_get_segment_text(wctx, 1)));
4849

50+
assert(whisper_full_get_segment_t0(wctx, 0) == 0);
51+
assert(whisper_full_get_segment_t1(wctx, 1) == 1047);
52+
4953
whisper_free(wctx);
5054

5155
return 0;

0 commit comments

Comments
 (0)