feat: Add transcription support for single-agent live scenarios

jinnigu · jinnigu · commit 50d7733705e3 · 2025-10-29T22:19:58.000-07:00
diff --git a/src/google/adk/runners.py b/src/google/adk/runners.py
@@ -1104,12 +1104,12 @@ def _new_invocation_context_for_live(
       live_request_queue: Optional[LiveRequestQueue] = None,
       run_config: Optional[RunConfig] = None,
   ) -> InvocationContext:
-    """Creates a new invocation context for live multi-agent."""
+    """Creates a new invocation context for single- or multi-agent live runs."""
     run_config = run_config or RunConfig()
 
     # For live multi-agent, we need model's text transcription as context for
-    # next agent.
-    if self.agent.sub_agents and live_request_queue:
+    # next agent. For single-agent, we need general transcription support.
+    if live_request_queue:
       if not run_config.response_modalities:
         # default
         run_config.response_modalities = ['AUDIO']
@@ -1123,7 +1123,8 @@ def _new_invocation_context_for_live(
               types.AudioTranscriptionConfig()
           )
       if not run_config.input_audio_transcription:
-        # need this input transcription for agent transferring in live mode.
+        # Input transcription is needed for agent transfer in multi-agent live
+        # mode and for general transcription support in single-agent live mode.
         run_config.input_audio_transcription = types.AudioTranscriptionConfig()
     return self._new_invocation_context(
         session,
diff --git a/tests/unittests/streaming/test_live_streaming_configs.py b/tests/unittests/streaming/test_live_streaming_configs.py
@@ -642,3 +642,103 @@ def test_streaming_with_context_window_compression_config():
       llm_request_sent_to_mock.live_connect_config.context_window_compression.sliding_window.target_tokens
       == 500
   )
+
+
+def test_single_agent_live_streaming_with_transcription():
+  """Test single-agent streaming adds transcription configs when not provided."""
+  response1 = LlmResponse(
+      turn_complete=True,
+  )
+
+  mock_model = testing_utils.MockModel.create([response1])
+
+  root_agent = Agent(
+      name='single_agent',
+      model=mock_model,
+      tools=[],
+  )
+
+  runner = testing_utils.InMemoryRunner(root_agent=root_agent)
+
+  # Test without passing any run_config to verify default behavior
+  # The logic in _new_invocation_context_for_live should automatically add
+  # transcription configs for live streaming
+  live_request_queue = LiveRequestQueue()
+  live_request_queue.send_realtime(
+      blob=types.Blob(data=b'\x00\xFF', mime_type='audio/pcm')
+  )
+
+  res_events = runner.run_live(live_request_queue)
+
+  assert res_events is not None, 'Expected a list of events, got None.'
+  assert (
+      len(res_events) > 0
+  ), 'Expected at least one response, but got an empty list.'
+  assert len(mock_model.requests) == 1
+
+  # Get the request that was captured
+  llm_request_sent_to_mock = mock_model.requests[0]
+
+  # Assert that transcription configs were added
+  assert llm_request_sent_to_mock.live_connect_config is not None
+  assert (
+      llm_request_sent_to_mock.live_connect_config.output_audio_transcription
+      is not None
+  )
+  assert (
+      llm_request_sent_to_mock.live_connect_config.input_audio_transcription
+      is not None
+  )
+
+
+def test_single_agent_live_streaming_respects_explicit_transcription():
+  """Test single-agent live streaming keeps explicit transcription configs."""
+  response1 = LlmResponse(
+      turn_complete=True,
+  )
+
+  mock_model = testing_utils.MockModel.create([response1])
+
+  # Create a single agent (no sub_agents)
+  root_agent = Agent(
+      name='single_agent',
+      model=mock_model,
+      tools=[],
+  )
+
+  runner = testing_utils.InMemoryRunner(root_agent=root_agent)
+
+  # Create run config with input and output audio transcription
+  explicit_output_config = types.AudioTranscriptionConfig()
+  explicit_input_config = types.AudioTranscriptionConfig()
+  run_config = RunConfig(
+      output_audio_transcription=explicit_output_config,
+      input_audio_transcription=explicit_input_config,
+  )
+
+  live_request_queue = LiveRequestQueue()
+  live_request_queue.send_realtime(
+      blob=types.Blob(data=b'\x00\xFF', mime_type='audio/pcm')
+  )
+
+  res_events = runner.run_live(live_request_queue, run_config)
+
+  assert res_events is not None, 'Expected a list of events, got None.'
+  assert (
+      len(res_events) > 0
+  ), 'Expected at least one response, but got an empty list.'
+  assert len(mock_model.requests) == 1
+
+  # Get the request that was captured
+  llm_request_sent_to_mock = mock_model.requests[0]
+
+  # Assert that the explicit configs were used
+  assert llm_request_sent_to_mock.live_connect_config is not None
+  assert (
+      llm_request_sent_to_mock.live_connect_config.output_audio_transcription
+      is explicit_output_config
+  )
+  assert (
+      llm_request_sent_to_mock.live_connect_config.input_audio_transcription
+      is explicit_input_config
+  )