@@ -869,8 +869,8 @@ struct whisper_aheads_masks {
869
869
};
870
870
871
871
struct vad_time_mapping {
872
- double processed_time; // Time in processed (VAD) audio
873
- double original_time; // Corresponding time in original audio
872
+ uint64_t processed_time; // Time in processed (VAD) audio, in 1/100 s units (seconds * 100, rounded to nearest)
873
+ uint64_t original_time; // Corresponding time in original audio, same 1/100 s units as processed_time
874
874
};
875
875
876
876
struct whisper_state {
@@ -6716,8 +6716,8 @@ static bool whisper_vad(
6716
6716
segment.vad_end = (offset + segment_length) / (double )WHISPER_SAMPLE_RATE;
6717
6717
6718
6718
// Add segment boundaries to mapping table
6719
- vad_time_mapping start_mapping = {segment.vad_start , segment.orig_start };
6720
- vad_time_mapping end_mapping = {segment.vad_end , segment.orig_end };
6719
+ vad_time_mapping start_mapping = {( uint64_t )( segment.vad_start * 100.0 + 0.5 ), ( uint64_t )( segment.orig_start * 100.0 + 0.5 ) };
6720
+ vad_time_mapping end_mapping = {( uint64_t )( segment.vad_end * 100.0 + 0.5 ), ( uint64_t )( segment.orig_end * 100.0 + 0.5 ) };
6721
6721
6722
6722
state->vad_mapping_table .push_back (start_mapping);
6723
6723
state->vad_mapping_table .push_back (end_mapping);
@@ -6738,7 +6738,7 @@ static bool whisper_vad(
6738
6738
double proportion = (vad_time - segment.vad_start ) / (segment.vad_end - segment.vad_start );
6739
6739
double orig_time = segment.orig_start + proportion * (segment.orig_end - segment.orig_start );
6740
6740
6741
- vad_time_mapping intermediate_mapping = {vad_time, orig_time};
6741
+ vad_time_mapping intermediate_mapping = {( uint64_t )( vad_time * 100.0 + 0.5 ), ( uint64_t )( orig_time * 100.0 + 0.5 ) };
6742
6742
state->vad_mapping_table .push_back (intermediate_mapping);
6743
6743
}
6744
6744
}
@@ -6762,8 +6762,8 @@ static bool whisper_vad(
6762
6762
double orig_silence_end = vad_segments->data [i+1 ].start ;
6763
6763
6764
6764
// Add mapping points for silence boundaries
6765
- state->vad_mapping_table .push_back ({silence_start_vad, orig_silence_start});
6766
- state->vad_mapping_table .push_back ({silence_end_vad, orig_silence_end});
6765
+ state->vad_mapping_table .push_back ({( uint64_t )( silence_start_vad * 100.0 + 0.5 ),( uint64_t )( orig_silence_start * 100.0 + 0.5 ) });
6766
+ state->vad_mapping_table .push_back ({( uint64_t )( silence_end_vad * 100.0 + 0.5 ),( uint64_t )( orig_silence_end * 100.0 + 0.5 ) });
6767
6767
6768
6768
// Fill with zeros (silence)
6769
6769
memset (filtered_samples.data () + offset, 0 , silence_samples * sizeof (float ));
@@ -6783,7 +6783,7 @@ static bool whisper_vad(
6783
6783
if (!state->vad_mapping_table .empty ()) {
6784
6784
auto last = std::unique (state->vad_mapping_table .begin (), state->vad_mapping_table .end (),
6785
6785
[](const vad_time_mapping& a, const vad_time_mapping& b) {
6786
- return std::abs ( a.processed_time - b.processed_time ) < 1e-9 ;
6786
+ return a.processed_time == b.processed_time ;
6787
6787
});
6788
6788
state->vad_mapping_table .erase (last, state->vad_mapping_table .end ());
6789
6789
}
@@ -7873,7 +7873,7 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
7873
7873
return ctx->state ->lang_id ;
7874
7874
}
7875
7875
7876
- static double map_processed_to_original_time (double processed_time, const std::vector<vad_time_mapping>& mapping_table) {
7876
+ static uint64_t map_processed_to_original_time (uint64_t processed_time, const std::vector<vad_time_mapping>& mapping_table) {
7877
7877
if (mapping_table.empty ()) {
7878
7878
return processed_time;
7879
7879
}
@@ -7889,29 +7889,29 @@ static double map_processed_to_original_time(double processed_time, const std::v
7889
7889
// Binary search over the time map that finds the first entry that has a
7890
7890
// processed time greater than or equal to the current processed time.
7891
7891
auto upper = std::lower_bound (mapping_table.begin (), mapping_table.end (), processed_time,
7892
- [](const vad_time_mapping& entry, double time ) {
7892
+ [](const vad_time_mapping& entry, uint64_t time ) {
7893
7893
return entry.processed_time < time ;
7894
7894
}
7895
7895
);
7896
7896
7897
7897
// If exact match found
7898
- if (std::abs ( upper->processed_time - processed_time) < 1e-9 ) {
7898
+ if (upper->processed_time == processed_time) {
7899
7899
return upper->original_time ;
7900
7900
}
7901
7901
7902
7902
// Need to interpolate between two points
7903
7903
auto lower = upper - 1 ;
7904
7904
7905
- // Calculate the proportion
7906
- double proportion = 0.0 ;
7907
- double denominator = upper-> processed_time - lower->processed_time ;
7905
+ uint64_t processed_diff = upper-> processed_time - lower-> processed_time ;
7906
+ uint64_t original_diff = upper-> original_time - lower-> original_time ;
7907
+ uint64_t offset = processed_time - lower->processed_time ;
7908
7908
7909
- if (denominator > 1e-9 ) { // Avoid division by very small numbers
7910
- proportion = (processed_time - lower->processed_time ) / denominator ;
7909
+ if (processed_diff == 0 ) {
7910
+ return lower->original_time ;
7911
7911
}
7912
7912
7913
7913
// Perform linear interpolation
7914
- return lower->original_time + proportion * (upper-> original_time - lower-> original_time ) ;
7914
+ return lower->original_time + (offset * original_diff) / processed_diff ;
7915
7915
}
7916
7916
7917
7917
// Function to get the starting timestamp of a segment
@@ -7923,12 +7923,10 @@ int64_t whisper_full_get_segment_t0_from_state(struct whisper_state* state, int
7923
7923
}
7924
7924
7925
7925
// Get the processed timestamp
7926
- double t0 = state->result_all [i_segment].t0 / 100.0 ;
7926
+ uint64_t t0 = state->result_all [i_segment].t0 ;
7927
7927
7928
7928
// Map to original time using the mapping table
7929
- double orig_t0 = map_processed_to_original_time (t0, state->vad_mapping_table );
7930
-
7931
- return (int64_t )(orig_t0 * 100 + 0.5 ); // Round to nearest
7929
+ return map_processed_to_original_time (t0, state->vad_mapping_table );
7932
7930
}
7933
7931
7934
7932
// Function to get the ending timestamp of a segment
@@ -7940,21 +7938,21 @@ int64_t whisper_full_get_segment_t1_from_state(struct whisper_state* state, int
7940
7938
}
7941
7939
7942
7940
// Get the processed timestamp
7943
- double t1 = state->result_all [i_segment].t1 / 100.0 ;
7941
+ int64_t t1 = state->result_all [i_segment].t1 ; // kept signed, in 1/100 s units (no longer divided by 100)
7944
7942
7945
7943
// Map to original time using the mapping table
7946
- double orig_t1 = map_processed_to_original_time (t1, state->vad_mapping_table );
7944
+ int64_t orig_t1 = ( int64_t ) map_processed_to_original_time (t1, state->vad_mapping_table ); // signed so the duration check below cannot wrap around
7947
7945
7948
7946
// Get the corresponding t0 for this segment
7949
- double orig_t0 = whisper_full_get_segment_t0_from_state (state, i_segment) / 100.0 ;
7947
+ int64_t orig_t0 = whisper_full_get_segment_t0_from_state (state, i_segment);
7950
7948
7951
7949
// Ensure minimum duration to prevent zero-length segments
7952
- const double min_duration = 0.01 ; // 10ms minimum
7950
+ const int64_t min_duration = 1 ; // 1 unit of 1/100 s = 10ms minimum (same as the former 0.01 s)
7953
7951
if (orig_t1 - orig_t0 < min_duration) {
7954
7952
orig_t1 = orig_t0 + min_duration;
7955
7953
}
7956
7954
7957
- return ( int64_t )( orig_t1 * 100 + 0.5 ); // Round to nearest
7955
+ return orig_t1;
7958
7956
}
7959
7957
7960
7958
0 commit comments