Commit 1b83c5f

structured output - multi-modal input (#405)
1 parent c412292 commit 1b83c5f

7 files changed: +156 -23 lines changed


src/strands/agent/agent.py

Lines changed: 8 additions & 5 deletions
@@ -380,13 +380,13 @@ async def invoke_async(self, prompt: Union[str, list[ContentBlock]], **kwargs: A

         return cast(AgentResult, event["result"])

-    def structured_output(self, output_model: Type[T], prompt: Optional[str] = None) -> T:
+    def structured_output(self, output_model: Type[T], prompt: Optional[Union[str, list[ContentBlock]]] = None) -> T:
         """This method allows you to get structured output from the agent.

         If you pass in a prompt, it will be added to the conversation history and the agent will respond to it.
         If you don't pass in a prompt, it will use only the conversation history to respond.

-        For smaller models, you may want to use the optional prompt string to add additional instructions to explicitly
+        For smaller models, you may want to use the optional prompt to add additional instructions to explicitly
         instruct the model to output the structured data.

         Args:
@@ -405,13 +405,15 @@ def execute() -> T:
             future = executor.submit(execute)
             return future.result()

-    async def structured_output_async(self, output_model: Type[T], prompt: Optional[str] = None) -> T:
+    async def structured_output_async(
+        self, output_model: Type[T], prompt: Optional[Union[str, list[ContentBlock]]] = None
+    ) -> T:
         """This method allows you to get structured output from the agent.

         If you pass in a prompt, it will be added to the conversation history and the agent will respond to it.
         If you don't pass in a prompt, it will use only the conversation history to respond.

-        For smaller models, you may want to use the optional prompt string to add additional instructions to explicitly
+        For smaller models, you may want to use the optional prompt to add additional instructions to explicitly
         instruct the model to output the structured data.

         Args:
@@ -430,7 +432,8 @@ async def structured_output_async(self, output_model: Type[T], prompt: Optional[

         # add the prompt as the last message
         if prompt:
-            self._append_message({"role": "user", "content": [{"text": prompt}]})
+            content: list[ContentBlock] = [{"text": prompt}] if isinstance(prompt, str) else prompt
+            self._append_message({"role": "user", "content": content})

         events = self.model.structured_output(output_model, self.messages)
         async for event in events:

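For reference, a minimal usage sketch of the new signature (not part of the commit): after this change, the prompt argument of structured_output can be either a plain string or a list of ContentBlock dicts, so image bytes can be sent alongside the extraction instructions. The PersonInfo model, the person.png path, and the bare Agent() construction below are illustrative assumptions.

from pydantic import BaseModel

from strands import Agent


class PersonInfo(BaseModel):
    """Fields to extract from the image."""  # hypothetical output model

    name: str
    hair_color: str


agent = Agent()  # assumes a default multi-modal-capable model is configured

# person.png is a placeholder path for any PNG image of a person
with open("person.png", "rb") as f:
    image_bytes = f.read()

# Pass a list of ContentBlock dicts instead of a plain string prompt
person = agent.structured_output(
    PersonInfo,
    [
        {"text": "Describe the person in this image"},
        {
            "image": {
                "format": "png",
                "source": {"bytes": image_bytes},
            }
        },
    ],
)
print(person.name, person.hair_color)
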
tests/strands/agent/test_agent.py

Lines changed: 22 additions & 0 deletions
@@ -959,6 +959,28 @@ def test_agent_structured_output(agent, user, agenerator):
     agent.model.structured_output.assert_called_once_with(type(user), [{"role": "user", "content": [{"text": prompt}]}])


+def test_agent_structured_output_multi_modal_input(agent, user, agenerator):
+    agent.model.structured_output = unittest.mock.Mock(return_value=agenerator([{"output": user}]))
+
+    prompt = [
+        {"text": "Please describe the user in this image"},
+        {
+            "image": {
+                "format": "png",
+                "source": {
+                    "bytes": b"\x89PNG\r\n\x1a\n",
+                },
+            }
+        },
+    ]
+
+    tru_result = agent.structured_output(type(user), prompt)
+    exp_result = user
+    assert tru_result == exp_result
+
+    agent.model.structured_output.assert_called_once_with(type(user), [{"role": "user", "content": prompt}])
+
+
 @pytest.mark.asyncio
 async def test_agent_structured_output_in_async_context(agent, user, agenerator):
     agent.model.structured_output = unittest.mock.Mock(return_value=agenerator([{"output": user}]))

tests_integ/models/test_model_anthropic.py

Lines changed: 33 additions & 6 deletions
@@ -12,7 +12,7 @@
 pytestmark = providers.anthropic.mark


-@pytest.fixture(scope="module")
+@pytest.fixture
 def model():
     return AnthropicModel(
         client_args={
@@ -23,7 +23,7 @@ def model():
     )


-@pytest.fixture(scope="module")
+@pytest.fixture
 def tools():
     @strands.tool
     def tool_time() -> str:
@@ -36,17 +36,17 @@ def tool_weather() -> str:
     return [tool_time, tool_weather]


-@pytest.fixture(scope="module")
+@pytest.fixture
 def system_prompt():
     return "You are an AI assistant."


-@pytest.fixture(scope="module")
+@pytest.fixture
 def agent(model, tools, system_prompt):
     return Agent(model=model, tools=tools, system_prompt=system_prompt)


-@pytest.fixture(scope="module")
+@pytest.fixture
 def weather():
     class Weather(BaseModel):
         """Extracts the time and weather from the user's message with the exact strings."""
@@ -57,6 +57,16 @@ class Weather(BaseModel):
     return Weather(time="12:00", weather="sunny")


+@pytest.fixture
+def yellow_color():
+    class Color(BaseModel):
+        """Describes a color."""
+
+        name: str
+
+    return Color(name="yellow")
+
+
 def test_agent_invoke(agent):
     result = agent("What is the time and weather in New York?")
     text = result.message["content"][0]["text"].lower()
@@ -97,7 +107,7 @@ async def test_agent_structured_output_async(agent, weather):
     assert tru_weather == exp_weather


-def test_multi_modal_input(agent, yellow_img):
+def test_invoke_multi_modal_input(agent, yellow_img):
     content = [
         {"text": "what is in this image"},
         {
@@ -113,3 +123,20 @@ def test_multi_modal_input(agent, yellow_img):
     text = result.message["content"][0]["text"].lower()

     assert "yellow" in text
+
+
+def test_structured_output_multi_modal_input(agent, yellow_img, yellow_color):
+    content = [
+        {"text": "Is this image red, blue, or yellow?"},
+        {
+            "image": {
+                "format": "png",
+                "source": {
+                    "bytes": yellow_img,
+                },
+            },
+        },
+    ]
+    tru_color = agent.structured_output(type(yellow_color), content)
+    exp_color = yellow_color
+    assert tru_color == exp_color

tests_integ/models/test_model_bedrock.py

Lines changed: 28 additions & 1 deletion
@@ -37,6 +37,16 @@ def non_streaming_agent(non_streaming_model, system_prompt):
     return Agent(model=non_streaming_model, system_prompt=system_prompt, load_tools_from_directory=False)


+@pytest.fixture
+def yellow_color():
+    class Color(BaseModel):
+        """Describes a color."""
+
+        name: str
+
+    return Color(name="yellow")
+
+
 def test_streaming_agent(streaming_agent):
     """Test agent with streaming model."""
     result = streaming_agent("Hello!")
@@ -153,7 +163,7 @@ class Weather(BaseModel):
     assert result.weather == "sunny"


-def test_multi_modal_input(streaming_agent, yellow_img):
+def test_invoke_multi_modal_input(streaming_agent, yellow_img):
     content = [
         {"text": "what is in this image"},
         {
@@ -169,3 +179,20 @@ def test_multi_modal_input(streaming_agent, yellow_img):
     text = result.message["content"][0]["text"].lower()

     assert "yellow" in text
+
+
+def test_structured_output_multi_modal_input(streaming_agent, yellow_img, yellow_color):
+    content = [
+        {"text": "Is this image red, blue, or yellow?"},
+        {
+            "image": {
+                "format": "png",
+                "source": {
+                    "bytes": yellow_img,
+                },
+            },
+        },
+    ]
+    tru_color = streaming_agent.structured_output(type(yellow_color), content)
+    exp_color = yellow_color
+    assert tru_color == exp_color

tests_integ/models/test_model_litellm.py

Lines changed: 29 additions & 2 deletions
@@ -29,6 +29,16 @@ def agent(model, tools):
     return Agent(model=model, tools=tools)


+@pytest.fixture
+def yellow_color():
+    class Color(BaseModel):
+        """Describes a color."""
+
+        name: str
+
+    return Color(name="yellow")
+
+
 def test_agent(agent):
     result = agent("What is the time and weather in New York?")
     text = result.message["content"][0]["text"].lower()
@@ -49,9 +59,9 @@ class Weather(BaseModel):
     assert result.weather == "sunny"


-def test_multi_modal_input(agent, yellow_img):
+def test_invoke_multi_modal_input(agent, yellow_img):
     content = [
-        {"text": "what is in this image"},
+        {"text": "Is this image red, blue, or yellow?"},
         {
             "image": {
                 "format": "png",
@@ -65,3 +75,20 @@ def test_multi_modal_input(agent, yellow_img):
     text = result.message["content"][0]["text"].lower()

     assert "yellow" in text
+
+
+def test_structured_output_multi_modal_input(agent, yellow_img, yellow_color):
+    content = [
+        {"text": "what is in this image"},
+        {
+            "image": {
+                "format": "png",
+                "source": {
+                    "bytes": yellow_img,
+                },
+            },
+        },
+    ]
+    tru_color = agent.structured_output(type(yellow_color), content)
+    exp_color = yellow_color
+    assert tru_color == exp_color

tests_integ/models/test_model_ollama.py

Lines changed: 4 additions & 4 deletions
@@ -10,12 +10,12 @@
 pytestmark = providers.ollama.mark


-@pytest.fixture(scope="module")
+@pytest.fixture
 def model():
     return OllamaModel(host="http://localhost:11434", model_id="llama3.3:70b")


-@pytest.fixture(scope="module")
+@pytest.fixture
 def tools():
     @strands.tool
     def tool_time() -> str:
@@ -28,12 +28,12 @@ def tool_weather() -> str:
     return [tool_time, tool_weather]


-@pytest.fixture(scope="module")
+@pytest.fixture
 def agent(model, tools):
     return Agent(model=model, tools=tools)


-@pytest.fixture(scope="module")
+@pytest.fixture
 def weather():
     class Weather(BaseModel):
         """Extracts the time and weather from the user's message with the exact strings."""

tests_integ/models/test_model_openai.py

Lines changed: 32 additions & 5 deletions
@@ -12,7 +12,7 @@
 pytestmark = providers.openai.mark


-@pytest.fixture(scope="module")
+@pytest.fixture
 def model():
     return OpenAIModel(
         model_id="gpt-4o",
@@ -22,7 +22,7 @@ def model():
     )


-@pytest.fixture(scope="module")
+@pytest.fixture
 def tools():
     @strands.tool
     def tool_time() -> str:
@@ -35,12 +35,12 @@ def tool_weather() -> str:
     return [tool_time, tool_weather]


-@pytest.fixture(scope="module")
+@pytest.fixture
 def agent(model, tools):
     return Agent(model=model, tools=tools)


-@pytest.fixture(scope="module")
+@pytest.fixture
 def weather():
     class Weather(BaseModel):
         """Extracts the time and weather from the user's message with the exact strings."""
@@ -51,6 +51,16 @@ class Weather(BaseModel):
     return Weather(time="12:00", weather="sunny")


+@pytest.fixture
+def yellow_color():
+    class Color(BaseModel):
+        """Describes a color."""
+
+        name: str
+
+    return Color(name="yellow")
+
+
 @pytest.fixture(scope="module")
 def test_image_path(request):
     return request.config.rootpath / "tests_integ" / "test_image.png"
@@ -96,7 +106,7 @@ async def test_agent_structured_output_async(agent, weather):
     assert tru_weather == exp_weather


-def test_multi_modal_input(agent, yellow_img):
+def test_invoke_multi_modal_input(agent, yellow_img):
     content = [
         {"text": "what is in this image"},
         {
@@ -114,6 +124,23 @@ def test_multi_modal_input(agent, yellow_img):
     assert "yellow" in text


+def test_structured_output_multi_modal_input(agent, yellow_img, yellow_color):
+    content = [
+        {"text": "Is this image red, blue, or yellow?"},
+        {
+            "image": {
+                "format": "png",
+                "source": {
+                    "bytes": yellow_img,
+                },
+            },
+        },
+    ]
+    tru_color = agent.structured_output(type(yellow_color), content)
+    exp_color = yellow_color
+    assert tru_color == exp_color
+
+
 @pytest.mark.skip("https://github.com/strands-agents/sdk-python/issues/320")
 def test_tool_returning_images(model, yellow_img):
     @tool
