@@ -7,21 +7,20 @@ use std::{
7
7
8
8
use kalosm_sound:: AsyncSource ;
9
9
use rodio:: buffer:: SamplesBuffer ;
10
+ use voice_activity_detector:: { IteratorExt , VoiceActivityDetector } ;
10
11
11
- use crate :: Predictor ;
12
-
13
- pub struct ChunkStream < S : AsyncSource + Unpin , P : Predictor + Unpin > {
12
+ pub struct ChunkStream < S : AsyncSource + Unpin > {
14
13
source : S ,
15
- predictor : P ,
14
+ vad : VoiceActivityDetector ,
16
15
buffer : Vec < f32 > ,
17
16
max_duration : Duration ,
18
17
}
19
18
20
- impl < S : AsyncSource + Unpin , P : Predictor + Unpin > ChunkStream < S , P > {
21
- pub fn new ( source : S , predictor : P , max_duration : Duration ) -> Self {
19
+ impl < S : AsyncSource + Unpin > ChunkStream < S > {
20
+ pub fn new ( source : S , vad : VoiceActivityDetector , max_duration : Duration ) -> Self {
22
21
Self {
23
22
source,
24
- predictor ,
23
+ vad ,
25
24
buffer : Vec :: new ( ) ,
26
25
max_duration,
27
26
}
@@ -34,26 +33,9 @@ impl<S: AsyncSource + Unpin, P: Predictor + Unpin> ChunkStream<S, P> {
34
33
fn samples_for_duration ( & self , duration : Duration ) -> usize {
35
34
( self . source . sample_rate ( ) as f64 * duration. as_secs_f64 ( ) ) as usize
36
35
}
37
-
38
- fn trim_silence ( predictor : & P , data : & mut Vec < f32 > ) {
39
- const WINDOW_SIZE : usize = 100 ;
40
-
41
- let mut trim_index = 0 ;
42
- for start_idx in ( 0 ..data. len ( ) ) . step_by ( WINDOW_SIZE ) {
43
- let end_idx = ( start_idx + WINDOW_SIZE ) . min ( data. len ( ) ) ;
44
- let window = & data[ start_idx..end_idx] ;
45
-
46
- if let Ok ( false ) = predictor. predict ( window) {
47
- trim_index = start_idx;
48
- break ;
49
- }
50
- }
51
-
52
- data. drain ( 0 ..trim_index) ;
53
- }
54
36
}
55
37
56
- impl < S : AsyncSource + Unpin , P : Predictor + Unpin > Stream for ChunkStream < S , P > {
38
+ impl < S : AsyncSource + Unpin > Stream for ChunkStream < S > {
57
39
type Item = SamplesBuffer < f32 > ;
58
40
59
41
fn poll_next ( self : Pin < & mut Self > , cx : & mut Context < ' _ > ) -> Poll < Option < Self :: Item > > {
@@ -62,8 +44,6 @@ impl<S: AsyncSource + Unpin, P: Predictor + Unpin> Stream for ChunkStream<S, P>
62
44
let sample_rate = this. source . sample_rate ( ) ;
63
45
64
46
let min_buffer_samples = this. samples_for_duration ( Duration :: from_secs ( 6 ) ) ;
65
- let silence_window_samples = this. samples_for_duration ( Duration :: from_millis ( 500 ) ) ;
66
-
67
47
let stream = this. source . as_stream ( ) ;
68
48
let mut stream = std:: pin:: pin!( stream) ;
69
49
@@ -73,32 +53,41 @@ impl<S: AsyncSource + Unpin, P: Predictor + Unpin> Stream for ChunkStream<S, P>
73
53
this. buffer . push ( sample) ;
74
54
75
55
if this. buffer . len ( ) >= min_buffer_samples {
76
- let buffer_len = this. buffer . len ( ) ;
77
- let silence_start = buffer_len. saturating_sub ( silence_window_samples) ;
78
- let last_samples = & this. buffer [ silence_start..buffer_len] ;
79
-
80
- if let Ok ( false ) = this. predictor . predict ( last_samples) {
81
- let mut data = std:: mem:: take ( & mut this. buffer ) ;
82
- Self :: trim_silence ( & this. predictor , & mut data) ;
83
-
84
- return Poll :: Ready ( Some ( SamplesBuffer :: new ( 1 , sample_rate, data) ) ) ;
85
- }
56
+ let data = std:: mem:: take ( & mut this. buffer ) ;
57
+ let speech = filter_speech_chunks ( & mut this. vad , data) ;
58
+ return Poll :: Ready ( Some ( SamplesBuffer :: new ( 1 , sample_rate, speech) ) ) ;
86
59
}
87
60
}
88
61
Poll :: Ready ( None ) if !this. buffer . is_empty ( ) => {
89
- let mut data = std:: mem:: take ( & mut this. buffer ) ;
90
- Self :: trim_silence ( & this. predictor , & mut data) ;
91
-
92
- return Poll :: Ready ( Some ( SamplesBuffer :: new ( 1 , sample_rate, data) ) ) ;
62
+ let data = std:: mem:: take ( & mut this. buffer ) ;
63
+ let speech = filter_speech_chunks ( & mut this. vad , data) ;
64
+ return Poll :: Ready ( Some ( SamplesBuffer :: new ( 1 , sample_rate, speech) ) ) ;
93
65
}
94
66
Poll :: Ready ( None ) => return Poll :: Ready ( None ) ,
95
67
Poll :: Pending => return Poll :: Pending ,
96
68
}
97
69
}
98
70
99
- let mut chunk: Vec < _ > = this. buffer . drain ( 0 ..max_samples) . collect ( ) ;
100
- Self :: trim_silence ( & this. predictor , & mut chunk) ;
101
-
102
- Poll :: Ready ( Some ( SamplesBuffer :: new ( 1 , sample_rate, chunk) ) )
71
+ let data = this. buffer . drain ( 0 ..max_samples) ;
72
+ let speech = filter_speech_chunks ( & mut this. vad , data) ;
73
+ Poll :: Ready ( Some ( SamplesBuffer :: new ( 1 , sample_rate, speech) ) )
103
74
}
104
75
}
76
+
77
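+ // Helper: labels the incoming samples with the voice activity detector, drops every chunk not labelled as speech, and returns the remaining speech samples concatenated into a single Vec<f32>.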
78
+ fn filter_speech_chunks < D : IntoIterator < Item = f32 > > (
79
+ vad : & mut VoiceActivityDetector ,
80
+ data : D ,
81
+ ) -> Vec < f32 > {
82
+ data. into_iter ( )
83
+ . label ( vad, 0.75 , 3 )
84
+ . filter_map ( |label| {
85
+ if label. is_speech ( ) {
86
+ Some ( label. into_iter ( ) )
87
+ } else {
88
+ None
89
+ }
90
+ } )
91
+ . flatten ( )
92
+ . collect ( )
93
+ }
0 commit comments