88import sounddevice as sd
99
1010from agents import function_tool
11- from agents .realtime import RealtimeAgent , RealtimeRunner , RealtimeSession , RealtimeSessionEvent
11+ from agents .realtime import (
12+ RealtimeAgent ,
13+ RealtimePlaybackTracker ,
14+ RealtimeRunner ,
15+ RealtimeSession ,
16+ RealtimeSessionEvent ,
17+ )
18+ from agents .realtime .model import RealtimeModelConfig
1219
1320# Audio configuration
14- CHUNK_LENGTH_S = 0.05 # 50ms
21+ CHUNK_LENGTH_S = 0.04 # 40ms aligns with realtime defaults
1522SAMPLE_RATE = 24000
1623FORMAT = np .int16
1724CHANNELS = 1
@@ -49,11 +56,16 @@ def __init__(self) -> None:
4956 self .audio_player : sd .OutputStream | None = None
5057 self .recording = False
5158
59+ # Playback tracker lets the model know our real playback progress
60+ self .playback_tracker = RealtimePlaybackTracker ()
61+
5262 # Audio output state for callback system
53- self .output_queue : queue .Queue [Any ] = queue .Queue (maxsize = 10 ) # Buffer more chunks
63+ # Store tuples: (samples_np, item_id, content_index)
64+ self .output_queue : queue .Queue [Any ] = queue .Queue (maxsize = 100 )
5465 self .interrupt_event = threading .Event ()
55- self .current_audio_chunk : np .ndarray [Any , np .dtype [Any ]] | None = None
66+ self .current_audio_chunk : tuple [ np .ndarray [Any , np .dtype [Any ]], str , int ] | None = None
5667 self .chunk_position = 0
68+ self .bytes_per_sample = np .dtype (FORMAT ).itemsize
5769
5870 def _output_callback (self , outdata , frames : int , time , status ) -> None :
5971 """Callback for audio output - handles continuous audio stream from server."""
@@ -92,20 +104,29 @@ def _output_callback(self, outdata, frames: int, time, status) -> None:
92104
93105 # Copy data from current chunk to output buffer
94106 remaining_output = len (outdata ) - samples_filled
95- remaining_chunk = len (self .current_audio_chunk ) - self .chunk_position
107+ samples , item_id , content_index = self .current_audio_chunk
108+ remaining_chunk = len (samples ) - self .chunk_position
96109 samples_to_copy = min (remaining_output , remaining_chunk )
97110
98111 if samples_to_copy > 0 :
99- chunk_data = self .current_audio_chunk [
100- self .chunk_position : self .chunk_position + samples_to_copy
101- ]
112+ chunk_data = samples [self .chunk_position : self .chunk_position + samples_to_copy ]
102113 # More efficient: direct assignment for mono audio instead of reshape
103114 outdata [samples_filled : samples_filled + samples_to_copy , 0 ] = chunk_data
104115 samples_filled += samples_to_copy
105116 self .chunk_position += samples_to_copy
106117
118+ # Inform playback tracker about played bytes
119+ try :
120+ self .playback_tracker .on_play_bytes (
121+ item_id = item_id ,
122+ item_content_index = content_index ,
123+ bytes = chunk_data .tobytes (),
124+ )
125+ except Exception :
126+ pass
127+
107128 # If we've used up the entire chunk, reset for next iteration
108- if self .chunk_position >= len (self . current_audio_chunk ):
129+ if self .chunk_position >= len (samples ):
109130 self .current_audio_chunk = None
110131 self .chunk_position = 0
111132
@@ -125,7 +146,15 @@ async def run(self) -> None:
125146
126147 try :
127148 runner = RealtimeRunner (agent )
128- async with await runner .run () as session :
149+ # Attach playback tracker and disable server-side response interruption,
150+ # which can truncate assistant audio when mic picks up speaker output.
151+ model_config : RealtimeModelConfig = {
152+ "playback_tracker" : self .playback_tracker ,
153+ "initial_model_settings" : {
154+ "turn_detection" : {"type" : "semantic_vad" , "interrupt_response" : False },
155+ },
156+ }
157+ async with await runner .run (model_config = model_config ) as session :
129158 self .session = session
130159 print ("Connected. Starting audio recording..." )
131160
@@ -170,6 +199,14 @@ async def capture_audio(self) -> None:
170199 read_size = int (SAMPLE_RATE * CHUNK_LENGTH_S )
171200
172201 try :
202+ # Simple energy-based barge-in: if user speaks while audio is playing, interrupt.
def rms_energy(samples: np.ndarray[Any, np.dtype[Any]]) -> float:
    """Return the root-mean-square level of ``samples`` on a unit scale.

    Samples are converted to float32 and divided by 32768.0 (so int16
    input lands in [-1, 1]) before the RMS is taken.

    Args:
        samples: Array of audio samples.

    Returns:
        The RMS value as a Python float, or 0.0 for an empty array.
    """
    # Guard first: the mean of an empty array is undefined.
    if not samples.size:
        return 0.0

    # Scale to the unit range, staying in float32.
    scaled = samples.astype(np.float32) / 32768.0
    mean_square = np.square(scaled).mean()
    return float(np.sqrt(mean_square))
209+
173210 while self .recording :
174211 # Check if there's enough data to read
175212 if self .audio_stream .read_available < read_size :
@@ -182,8 +219,12 @@ async def capture_audio(self) -> None:
182219 # Convert numpy array to bytes
183220 audio_bytes = data .tobytes ()
184221
185- # Send audio to session
186- await self .session .send_audio (audio_bytes )
222+ # Half-duplex gating: do not send mic while assistant audio is playing
223+ assistant_playing = (
224+ self .current_audio_chunk is not None or not self .output_queue .empty ()
225+ )
226+ if not assistant_playing :
227+ await self .session .send_audio (audio_bytes )
187228
188229 # Yield control back to event loop
189230 await asyncio .sleep (0 )
@@ -212,17 +253,19 @@ async def _on_event(self, event: RealtimeSessionEvent) -> None:
212253 elif event .type == "audio_end" :
213254 print ("Audio ended" )
214255 elif event .type == "audio" :
215- # Enqueue audio for callback-based playback
256+ # Enqueue audio for callback-based playback with metadata
216257 np_audio = np .frombuffer (event .audio .data , dtype = np .int16 )
217258 try :
218- self .output_queue .put_nowait (np_audio )
259+ self .output_queue .put_nowait (( np_audio , event . item_id , event . content_index ) )
219260 except queue .Full :
220261 # Queue is full - only drop if we have significant backlog
221262 # This prevents aggressive dropping that could cause choppiness
222263 if self .output_queue .qsize () > 8 : # Keep some buffer
223264 try :
224265 self .output_queue .get_nowait ()
225- self .output_queue .put_nowait (np_audio )
266+ self .output_queue .put_nowait (
267+ (np_audio , event .item_id , event .content_index )
268+ )
226269 except queue .Empty :
227270 pass
228271 # If queue isn't too full, just skip this chunk to avoid blocking
0 commit comments