@@ -956,6 +956,15 @@ struct whisper_state {

     // [EXPERIMENTAL] speed-up techniques
     int32_t exp_n_audio_ctx = 0; // 0 - use default
+
+    struct vad_segment_info {
+        float orig_start;
+        float orig_end;
+        float vad_start;
+        float vad_end;
+    };
+    std::vector<vad_segment_info> vad_segments;
+    bool has_vad_segments = false;
 };

 struct whisper_context {
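A note on units, since it matters for the hunks below: the four floats in vad_segment_info hold positions in seconds (orig_* in the original audio, vad_* in the VAD-filtered audio that is actually transcribed), while the per-segment timestamps in result_all (t0/t1) are in centiseconds. The following throwaway snippet, with made-up values, only illustrates that conversion; it is not part of the patch.

    // Illustration only: whisper segment timestamps are centiseconds,
    // vad_segment_info fields are seconds, hence the / 100.0f and * 100
    // in the mapping functions further down in this patch.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t t0_cs = 350;             // hypothetical t0 as stored in result_all
        const float   t0_s  = t0_cs / 100.0f;  // seconds, comparable to vad_start/vad_end
        std::printf("t0 = %lld cs = %.2f s\n", (long long)t0_cs, t0_s);
        return 0;
    }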
@@ -6703,6 +6712,10 @@ int whisper_full_with_state(
         struct whisper_vad_timestamps timestamps = whisper_vad_detect_speech_timestamps(vctx, vad_params, samples, n_samples);

         if (timestamps.n_segments > 0) {
+            state->has_vad_segments = true;
+            ctx->state->vad_segments.clear();
+            ctx->state->vad_segments.reserve(timestamps.n_segments);
+
             WHISPER_LOG_INFO("%s: detected %d speech segments\n", __func__, timestamps.n_segments);
             float overlap_seconds = params.vad_samples_overlap;
             int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;
@@ -6752,6 +6765,19 @@ int whisper_full_with_state(
                 int segment_length = segment_end_samples - segment_start_samples;

                 if (segment_length > 0) {
+                    whisper_state::vad_segment_info segment;
+
+                    segment.orig_start = timestamps.segments[i].start;
+                    segment.orig_end = timestamps.segments[i].end;
+
+                    segment.vad_start = offset / (float)WHISPER_SAMPLE_RATE;
+                    segment.vad_end = (offset + segment_length) / (float)WHISPER_SAMPLE_RATE;
+
+
+                    WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
+                                     __func__, segment.orig_start, segment.orig_end, segment.vad_start, segment.vad_end);
+                    ctx->state->vad_segments.push_back(segment);
+
                     // Copy this speech segment
                     memcpy(filtered_samples + offset, samples + segment_start_samples, segment_length * sizeof(float));
                     offset += segment_length;
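To make the bookkeeping above concrete, here is a minimal standalone sketch (not part of the patch; the names speech, mapping and build_vad_segments are made up) of how the running sample offset in the filtered buffer becomes vad_start/vad_end. It ignores the overlap handling that the real loop applies and assumes the usual 16 kHz sample rate.

    #include <cstdio>
    #include <vector>

    struct speech  { float start, end; };                               // seconds in the original audio
    struct mapping { float orig_start, orig_end, vad_start, vad_end; }; // seconds

    static std::vector<mapping> build_vad_segments(const std::vector<speech> & segs, int sample_rate) {
        std::vector<mapping> out;
        int offset = 0; // write position (in samples) inside the filtered buffer
        for (const auto & s : segs) {
            const int len = (int)((s.end - s.start) * sample_rate); // samples copied for this segment
            out.push_back({ s.start, s.end, offset / (float)sample_rate, (offset + len) / (float)sample_rate });
            offset += len;
        }
        return out;
    }

    int main() {
        // Speech at 5-7 s and 12-15 s of the original audio ends up at 0-2 s and 2-5 s of the filtered audio.
        for (const auto & m : build_vad_segments({ {5.0f, 7.0f}, {12.0f, 15.0f} }, 16000)) {
            std::printf("orig %.2f-%.2f -> vad %.2f-%.2f\n", m.orig_start, m.orig_end, m.vad_start, m.vad_end);
        }
        return 0;
    }

With contiguous copying, the vad_* ranges are back-to-back while the orig_* ranges keep the silence gaps, which is exactly what the mapping functions below rely on.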
@@ -7826,19 +7852,132 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
 }

 int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].t0;
+    // If VAD wasn't used, return the original timestamp
+    if (!state->has_vad_segments || state->vad_segments.empty()) {
+        return state->result_all[i_segment].t0;
+    }
+
+    // For the first segment, always start at 0
+    if (i_segment == 0) {
+        return 0;
+    }
+
+    // Get the start timestamp produced by whisper_full. whisper_full processes
+    // only the speech segments in this case, so we need to map these timestamps
+    // back to the original audio.
+    float t0 = state->result_all[i_segment].t0 / 100.0f;
+
+    // Find the VAD segment that this timestamp belongs to.
+    for (size_t i = 0; i < state->vad_segments.size(); i++) {
+        const auto & segment = state->vad_segments[i];
+
+        // Check if the timestamp falls within this segment.
+        if (t0 >= segment.vad_start && t0 <= segment.vad_end) {
+            float proportion = 0.0f;
+            if (segment.vad_end > segment.vad_start) {
+                proportion = (t0 - segment.vad_start) / (segment.vad_end - segment.vad_start);
+            }
+            float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
+            return (int64_t)(orig_t0 * 100);
+        }
+    }
+
+    // Check if the timestamp falls between two segments.
+    for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
+        const auto & curr = state->vad_segments[i];
+        const auto & next = state->vad_segments[i + 1];
+
+        if (t0 > curr.vad_end && t0 < next.vad_start) {
+            // Calculate how far we are through the gap as a proportion
+            float gap_proportion = 0.0f;
+            if (next.vad_start > curr.vad_end) {
+                gap_proportion = (t0 - curr.vad_end) / (next.vad_start - curr.vad_end);
+            }
+            float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
+            return (int64_t)(orig_t0 * 100);
+        }
+    }
+
+    // Handle the case where the timestamp is after the last segment.
+    if (t0 > state->vad_segments.back().vad_end) {
+        // For timestamps after the last segment, add the extra time to the end of the last segment
+        const auto & last = state->vad_segments.back();
+        // Calculate how far beyond the last segment
+        float extra_time = t0 - last.vad_end;
+        // Add this extra time to the original end time
+        float orig_t0 = last.orig_end + extra_time;
+        return (int64_t)(orig_t0 * 100);
+    }
+
+    WHISPER_LOG_WARN("%s: Could not map t0 = %f to a VAD segment\n", __func__, t0);
+    return t0;
 }

 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].t0;
+    return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
 }

 int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].t1;
+    // If VAD wasn't used, return the original timestamp
+    if (!state->has_vad_segments || state->vad_segments.empty()) {
+        return state->result_all[i_segment].t1;
+    }
+
+    // Get the end timestamp produced by whisper_full. whisper_full processes
+    // only the speech segments in this case, so we need to map these timestamps
+    // back to the original audio.
+    float t1 = state->result_all[i_segment].t1 / 100.0f;
+
+    // Find the VAD segment that this timestamp belongs to.
+    for (size_t i = 0; i < state->vad_segments.size(); i++) {
+        const auto & segment = state->vad_segments[i];
+
+        // Check if the timestamp falls within this segment.
+        if (t1 >= segment.vad_start && t1 <= segment.vad_end) {
+            // Calculate the proportion through the filtered segment.
+            float proportion = 0.0f;
+            if (segment.vad_end > segment.vad_start) {
+                proportion = (t1 - segment.vad_start) / (segment.vad_end - segment.vad_start);
+            }
+            float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
+            return (int64_t)(orig_t1 * 100);
+        }
+    }
+
+    // Check if the timestamp falls between two segments.
+    for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
+        const auto & curr = state->vad_segments[i];
+        const auto & next = state->vad_segments[i + 1];
+
+        if (t1 > curr.vad_end && t1 < next.vad_start) {
+            // Calculate how far we are through the gap as a proportion
+            float gap_proportion = 0.0f;
+            if (next.vad_start > curr.vad_end) {
+                gap_proportion = (t1 - curr.vad_end) / (next.vad_start - curr.vad_end);
+            }
+            // Map to the corresponding position in the original gap
+            float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
+            return (int64_t)(orig_t1 * 100);
+        }
+    }
+
+    // Handle the case where the timestamp is after the last segment
+    if (t1 > state->vad_segments.back().vad_end) {
+        // For timestamps after the last segment, add the extra time to the end of the last segment
+        const auto & last = state->vad_segments.back();
+        // Calculate how far beyond the last segment
+        float extra_time = t1 - last.vad_end;
+        // Add this extra time to the original end time
+        float orig_t1 = last.orig_end + extra_time;
+        return (int64_t)(orig_t1 * 100);
+    }
+
+    WHISPER_LOG_WARN("%s: Could not map t1 = %f to a VAD segment\n", __func__, t1);
+    return t1;
 }

 int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].t1;
+    return whisper_full_get_segment_t1_from_state(ctx->state, i_segment);
 }

 bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment) {
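The two *_from_state functions share one idea: take the timestamp measured on the filtered audio, find the VAD segment (or gap, or tail) it lands in, and interpolate back into the original timeline. A self-contained sketch of that logic, working in seconds and with made-up names (seg, map_to_original), is shown here as an aside; it mirrors the three cases of whisper_full_get_segment_t0_from_state but is not the library code itself.

    #include <cstdio>
    #include <vector>

    struct seg { float orig_start, orig_end, vad_start, vad_end; }; // all in seconds

    // t is a time on the filtered audio; the result is a time on the original audio.
    static float map_to_original(const std::vector<seg> & segs, float t) {
        // Case 1: t falls inside a speech segment -> proportional mapping.
        for (const auto & s : segs) {
            if (t >= s.vad_start && t <= s.vad_end) {
                const float p = (s.vad_end > s.vad_start) ? (t - s.vad_start) / (s.vad_end - s.vad_start) : 0.0f;
                return s.orig_start + p * (s.orig_end - s.orig_start);
            }
        }
        // Case 2: t falls in a gap between two consecutive segments -> interpolate across the gap.
        for (size_t i = 0; i + 1 < segs.size(); i++) {
            const auto & c = segs[i];
            const auto & n = segs[i + 1];
            if (t > c.vad_end && t < n.vad_start) {
                const float p = (n.vad_start > c.vad_end) ? (t - c.vad_end) / (n.vad_start - c.vad_end) : 0.0f;
                return c.orig_end + p * (n.orig_start - c.orig_end);
            }
        }
        // Case 3: t lies past the last segment -> extend past its original end by the same amount.
        if (!segs.empty() && t > segs.back().vad_end) {
            return segs.back().orig_end + (t - segs.back().vad_end);
        }
        return t; // nothing to remap
    }

    int main() {
        // Same layout as the earlier sketch: speech at 5-7 s and 12-15 s, filtered to 0-2 s and 2-5 s.
        const std::vector<seg> segs = { {5.0f, 7.0f, 0.0f, 2.0f}, {12.0f, 15.0f, 2.0f, 5.0f} };
        std::printf("%.2f\n", map_to_original(segs, 1.0f)); // inside the first segment  -> 6.00
        std::printf("%.2f\n", map_to_original(segs, 3.5f)); // inside the second segment -> 13.50
        std::printf("%.2f\n", map_to_original(segs, 6.0f)); // past the last segment     -> 16.00
        return 0;
    }

The real functions additionally convert result_all's centisecond timestamps to seconds before the lookup and back to centiseconds afterwards, and fall back to the unmapped values when no VAD segments were recorded.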