From b4862a0a329af0cad6e4ecb5127d2f5a38fa8bc7 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Wed, 19 Nov 2025 11:29:54 +0800 Subject: [PATCH 01/14] feat: add cache all and limit cache point in AnthropicModel --- docs/models/anthropic.md | 49 +++++- .../pydantic_ai/models/anthropic.py | 104 +++++++++++- tests/models/test_anthropic.py | 160 ++++++++++++++++++ 3 files changed, 309 insertions(+), 4 deletions(-) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index 96aa6207c1..ce9fa7b67c 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -80,18 +80,29 @@ agent = Agent(model) ## Prompt Caching -Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides three ways to use prompt caching: +Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides four ways to use prompt caching: 1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it 2. **Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly 3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly +4. **Cache All (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_all`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_all] to `True` to automatically cache both system instructions and the last user message -You can combine all three strategies for maximum savings: +You can combine multiple strategies for maximum savings: ```python {test="skip"} from pydantic_ai import Agent, CachePoint, RunContext from pydantic_ai.models.anthropic import AnthropicModelSettings +# Option 1: Use anthropic_cache_all for convenience (caches system + last message) +agent = Agent( + 'anthropic:claude-sonnet-4-5', + system_prompt='Detailed instructions...', + model_settings=AnthropicModelSettings( + anthropic_cache_all=True, # Caches both system prompt and last message + ), +) + +# Option 2: Fine-grained control with individual settings agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Detailed instructions...', @@ -145,3 +156,37 @@ async def main(): print(f'Cache write tokens: {usage.cache_write_tokens}') print(f'Cache read tokens: {usage.cache_read_tokens}') ``` + +### Cache Point Limits + +Anthropic enforces a maximum of 4 cache points per request. Pydantic AI automatically manages this limit: + +- **`anthropic_cache_all`**: Uses 2 cache points (system instructions + last message) +- **`anthropic_cache_instructions`**: Uses 1 cache point +- **`anthropic_cache_tool_definitions`**: Uses 1 cache point +- **`CachePoint` markers**: Use remaining available cache points + +When the total exceeds 4 cache points, Pydantic AI automatically removes cache points from **older messages** (keeping the most recent ones), ensuring your requests always comply with Anthropic's limits without errors. 
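+
+Conceptually, the trimming pass walks the conversation from the newest content block to the oldest and drops the oldest cache markers first. A minimal sketch of that idea (the `trim_cache_points` helper below is hypothetical, shown for illustration only, with messages simplified to plain dicts):
+
+```python {test="skip"}
+def trim_cache_points(messages: list[dict], budget: int) -> None:
+    """Keep at most `budget` cache points, preferring the most recent ones."""
+    remaining = budget
+    for message in reversed(messages):  # newest message first
+        if isinstance(message['content'], str):
+            continue  # plain-text content carries no cache points
+        for block in reversed(message['content']):  # newest block first
+            if 'cache_control' in block:
+                if remaining > 0:
+                    remaining -= 1  # recent cache point: keep it
+                else:
+                    del block['cache_control']  # older cache point: drop it
+```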
+ +```python {test="skip"} +from pydantic_ai import Agent, CachePoint +from pydantic_ai.models.anthropic import AnthropicModelSettings + +agent = Agent( + 'anthropic:claude-sonnet-4-5', + system_prompt='Instructions...', + model_settings=AnthropicModelSettings( + anthropic_cache_all=True, # Uses 2 cache points + ), +) + +async def main(): + # Even with multiple CachePoint markers, only 2 more will be kept + # (4 total limit - 2 from cache_all = 2 available) + result = await agent.run([ + 'Context 1', CachePoint(), # Will be kept + 'Context 2', CachePoint(), # Will be kept + 'Context 3', CachePoint(), # Automatically removed (oldest) + 'Question' + ]) +``` diff --git a/pydantic_ai_slim/pydantic_ai/models/anthropic.py b/pydantic_ai_slim/pydantic_ai/models/anthropic.py index de33a08f7a..4c36c546dd 100644 --- a/pydantic_ai_slim/pydantic_ai/models/anthropic.py +++ b/pydantic_ai_slim/pydantic_ai/models/anthropic.py @@ -169,6 +169,22 @@ class AnthropicModelSettings(ModelSettings, total=False): See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information. """ + anthropic_cache_all: bool | Literal['5m', '1h'] + """Convenience setting to enable caching for both system instructions and the last user message. + + When enabled, this automatically adds cache points to: + 1. The last system prompt block (system instructions) + 2. The last content block in the final user message + + This is equivalent to setting both `anthropic_cache_instructions` and adding a cache point + to the last message, but more convenient for common use cases. + If `True`, uses TTL='5m'. You can also specify '5m' or '1h' directly. + + Note: Uses 2 of Anthropic's 4 available cache points per request. Any additional CachePoint + markers in messages will be automatically limited to respect the 4-cache-point maximum. + See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information. 
+ """ + @dataclass(init=False) class AnthropicModel(Model): @@ -478,7 +494,10 @@ def _get_tools( ] # Add cache_control to the last tool if enabled - if tools and (cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions')): + if tools and ( + cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions') + or model_settings.get('anthropic_cache_all') + ): # If True, use '5m'; otherwise use the specified ttl value ttl: Literal['5m', '1h'] = '5m' if cache_tool_defs is True else cache_tool_defs last_tool = tools[-1] @@ -747,8 +766,32 @@ async def _map_message( # noqa: C901 system_prompt_parts.insert(0, instructions) system_prompt = '\n\n'.join(system_prompt_parts) + # Add cache_control to the last message content if anthropic_cache_all is enabled + if anthropic_messages and (cache_all := model_settings.get('anthropic_cache_all')): + ttl: Literal['5m', '1h'] = '5m' if cache_all is True else cache_all + m = anthropic_messages[-1] + content = m['content'] + if isinstance(content, str): + # Convert string content to list format with cache_control + m['content'] = [ + { + 'text': content, + 'type': 'text', + 'cache_control': BetaCacheControlEphemeralParam(type='ephemeral', ttl=ttl), + } + ] + else: + # Add cache_control to the last content block + content = cast(list[BetaContentBlockParam], content) + self._add_cache_control_to_last_param(content, ttl) + + # Ensure total cache points don't exceed Anthropic's limit of 4 + self._limit_cache_points(anthropic_messages, model_settings) # If anthropic_cache_instructions is enabled, return system prompt as a list with cache_control - if system_prompt and (cache_instructions := model_settings.get('anthropic_cache_instructions')): + if system_prompt and ( + cache_instructions := model_settings.get('anthropic_cache_instructions') + or model_settings.get('anthropic_cache_all') + ): # If True, use '5m'; otherwise use the specified ttl value ttl: Literal['5m', '1h'] = '5m' if cache_instructions is True else cache_instructions system_prompt_blocks = [ @@ -762,6 +805,63 @@ async def _map_message( # noqa: C901 return system_prompt, anthropic_messages + @staticmethod + def _limit_cache_points(messages: list[BetaMessageParam], model_settings: AnthropicModelSettings) -> None: + """Limit the number of cache points in messages to comply with Anthropic's 4-cache-point maximum. + + Anthropic allows a maximum of 4 cache points per request. This method ensures compliance by: + 1. Calculating how many cache points are already used by system-level settings + (anthropic_cache_instructions, anthropic_cache_tool_definitions, anthropic_cache_all) + 2. Determining how many cache points remain available for message-level caching + 3. Traversing messages from newest to oldest, keeping only the allowed number of cache points + 4. Removing cache_control from older cache points that exceed the limit + + This prioritizes recent cache points, which are typically more valuable for conversation continuity. + + Args: + messages: List of message parameters to limit cache points in. + model_settings: Model settings containing cache configuration. 
+ """ + # Anthropic's maximum cache points per request + max_cache_points = 4 + used_cache_points = 0 + + # Calculate cache points used by system-level settings + if model_settings.get('anthropic_cache_all'): + # anthropic_cache_all adds cache points for both system instructions and last message + used_cache_points += 2 + else: + if model_settings.get('anthropic_cache_instructions'): + used_cache_points += 1 + if model_settings.get('anthropic_cache_tool_definitions'): + # Assume used one cache point for tool definitions + used_cache_points += 1 + + # Calculate remaining cache points available for message content + keep_cache_points = max_cache_points - used_cache_points + + # Traverse messages from back to front (newest to oldest) + remaining_cache_points = keep_cache_points + for message in reversed(messages): + content = message['content'] + # Skip if content is a string or None + if isinstance(content, str): + continue + content = cast(list[BetaContentBlockParam], content) + # Traverse content blocks from back to front within each message + for block in reversed(content): + # Cast to dict for TypedDict manipulation + block_dict = cast(dict[str, Any], block) + + # Check if this block has cache_control + if 'cache_control' in block_dict: + if remaining_cache_points > 0: + # Keep this cache point (within limit) + remaining_cache_points -= 1 + else: + # Remove cache_control as we've exceeded the limit + del block_dict['cache_control'] + @staticmethod def _add_cache_control_to_last_param(params: list[BetaContentBlockParam], ttl: Literal['5m', '1h'] = '5m') -> None: """Add cache control to the last content block param. diff --git a/tests/models/test_anthropic.py b/tests/models/test_anthropic.py index 86ba5a68d3..4d0daddd42 100644 --- a/tests/models/test_anthropic.py +++ b/tests/models/test_anthropic.py @@ -588,6 +588,166 @@ def my_tool(value: str) -> str: # pragma: no cover assert system[0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '5m'}) +async def test_anthropic_cache_all(allow_model_requests: None): + """Test that anthropic_cache_all caches both system instructions and last message.""" + c = completion_message( + [BetaTextBlock(text='Response', type='text')], + usage=BetaUsage(input_tokens=10, output_tokens=5), + ) + mock_client = MockAnthropic.create_mock(c) + m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client)) + agent = Agent( + m, + system_prompt='System instructions to cache.', + model_settings=AnthropicModelSettings( + anthropic_cache_all=True, + ), + ) + + await agent.run('User message') + + # Verify both system and last message have cache_control + completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0] + system = completion_kwargs['system'] + messages = completion_kwargs['messages'] + + # System should have cache_control + assert system == snapshot( + [{'type': 'text', 'text': 'System instructions to cache.', 'cache_control': {'type': 'ephemeral', 'ttl': '5m'}}] + ) + + # Last message content should have cache_control + assert messages[-1]['content'][-1] == snapshot( + {'type': 'text', 'text': 'User message', 'cache_control': {'type': 'ephemeral', 'ttl': '5m'}} + ) + + +async def test_anthropic_cache_all_with_custom_ttl(allow_model_requests: None): + """Test that anthropic_cache_all supports custom TTL values.""" + c = completion_message( + [BetaTextBlock(text='Response', type='text')], + usage=BetaUsage(input_tokens=10, output_tokens=5), + ) + mock_client = MockAnthropic.create_mock(c) + m = 
AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client)) + agent = Agent( + m, + system_prompt='System instructions.', + model_settings=AnthropicModelSettings( + anthropic_cache_all='1h', # Custom 1h TTL + ), + ) + + await agent.run('User message') + + # Verify both use 1h TTL + completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0] + system = completion_kwargs['system'] + messages = completion_kwargs['messages'] + + assert system[0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'}) + assert messages[-1]['content'][-1]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'}) + + +async def test_limit_cache_points_with_cache_all(allow_model_requests: None): + """Test that cache points are limited when using cache_all + CachePoint markers.""" + c = completion_message( + [BetaTextBlock(text='Response', type='text')], + usage=BetaUsage(input_tokens=10, output_tokens=5), + ) + mock_client = MockAnthropic.create_mock(c) + m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client)) + agent = Agent( + m, + system_prompt='System instructions.', + model_settings=AnthropicModelSettings( + anthropic_cache_all=True, # Uses 2 cache points + ), + ) + + # Add 3 CachePoint markers (total would be 5: 2 from cache_all + 3 from markers) + # Only 2 CachePoint markers should be kept (newest ones) + await agent.run( + [ + 'Context 1', + CachePoint(), # Oldest, should be removed + 'Context 2', + CachePoint(), # Should be kept + 'Context 3', + CachePoint(), # Should be kept + 'Question', + ] + ) + + completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0] + messages = completion_kwargs['messages'] + + # Count cache_control occurrences in messages + cache_count = 0 + for msg in messages: + for block in msg['content']: + if 'cache_control' in block: + cache_count += 1 + + # anthropic_cache_all uses 2 cache points (system + last message) + # With 3 CachePoint markers, we'd have 5 total + # Limit is 4, so 1 oldest CachePoint should be removed + # Result: 2 cache points in messages (from the 2 newest CachePoints) + # The cache_all's last message cache is applied after limiting + assert cache_count == 2 + + +async def test_limit_cache_points_all_settings(allow_model_requests: None): + """Test cache point limiting with all cache settings enabled.""" + c = completion_message( + [BetaTextBlock(text='Response', type='text')], + usage=BetaUsage(input_tokens=10, output_tokens=5), + ) + mock_client = MockAnthropic.create_mock(c) + m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client)) + + agent = Agent( + m, + system_prompt='System instructions.', + model_settings=AnthropicModelSettings( + anthropic_cache_instructions=True, # 1 cache point + anthropic_cache_tool_definitions=True, # 1 cache point + ), + ) + + @agent.tool_plain + def my_tool() -> str: # pragma: no cover + return 'result' + + # Add 3 CachePoint markers (total would be 5: 2 from settings + 3 from markers) + # Only 2 CachePoint markers should be kept + await agent.run( + [ + 'Context 1', + CachePoint(), # Oldest, should be removed + 'Context 2', + CachePoint(), # Should be kept + 'Context 3', + CachePoint(), # Should be kept + 'Question', + ] + ) + + completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0] + messages = completion_kwargs['messages'] + + # Count cache_control in messages (excluding system and tools) + cache_count = 0 + for msg in messages: + for block in msg['content']: + if 
'cache_control' in block: + cache_count += 1 + + # Should have exactly 2 cache points in messages + # (4 total - 1 system - 1 tool = 2 available for messages) + assert cache_count == 2 + + async def test_async_request_text_response(allow_model_requests: None): c = completion_message( [BetaTextBlock(text='world', type='text')], From 9bf3f6ed5b2faa9e2a4ec6abbc455ee091c79c06 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Wed, 19 Nov 2025 11:41:29 +0800 Subject: [PATCH 02/14] fix ci issues --- docs/models/anthropic.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index ce9fa7b67c..ffc8487698 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -189,4 +189,5 @@ async def main(): 'Context 3', CachePoint(), # Automatically removed (oldest) 'Question' ]) + print(result.output) ``` From 0f0dd763dc82ece53a216253f7014e409f052f6d Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Wed, 19 Nov 2025 12:00:06 +0800 Subject: [PATCH 03/14] use BetaTextBlockParam and add nocover --- pydantic_ai_slim/pydantic_ai/models/anthropic.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pydantic_ai_slim/pydantic_ai/models/anthropic.py b/pydantic_ai_slim/pydantic_ai/models/anthropic.py index 4c36c546dd..abfa4f1d14 100644 --- a/pydantic_ai_slim/pydantic_ai/models/anthropic.py +++ b/pydantic_ai_slim/pydantic_ai/models/anthropic.py @@ -773,12 +773,12 @@ async def _map_message( # noqa: C901 content = m['content'] if isinstance(content, str): # Convert string content to list format with cache_control - m['content'] = [ - { - 'text': content, - 'type': 'text', - 'cache_control': BetaCacheControlEphemeralParam(type='ephemeral', ttl=ttl), - } + m['content'] = [ # pragma: no cover + BetaTextBlockParam( + text=content, + type='text', + cache_control=BetaCacheControlEphemeralParam(type='ephemeral', ttl=ttl), + ) ] else: # Add cache_control to the last content block @@ -845,7 +845,7 @@ def _limit_cache_points(messages: list[BetaMessageParam], model_settings: Anthro for message in reversed(messages): content = message['content'] # Skip if content is a string or None - if isinstance(content, str): + if isinstance(content, str): # pragma: no cover continue content = cast(list[BetaContentBlockParam], content) # Traverse content blocks from back to front within each message From 8bf1d945c17da084b8e2c1f3b7a93d9f3f070af1 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 20 Nov 2025 11:26:04 +0800 Subject: [PATCH 04/14] use anthropic_cache_messages --- docs/models/anthropic.md | 74 ++++++++--- .../pydantic_ai/models/anthropic.py | 120 ++++++++---------- tests/models/test_anthropic.py | 45 +++---- 3 files changed, 132 insertions(+), 107 deletions(-) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index ffc8487698..b0fec0d324 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -85,7 +85,7 @@ Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-wit 1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it 2. **Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly 3. 
**Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly -4. **Cache All (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_all`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_all] to `True` to automatically cache both system instructions and the last user message +4. **Cache Last Message (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_messages`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_messages] to `True` to automatically cache the last user message You can combine multiple strategies for maximum savings: @@ -93,12 +93,12 @@ You can combine multiple strategies for maximum savings: from pydantic_ai import Agent, CachePoint, RunContext from pydantic_ai.models.anthropic import AnthropicModelSettings -# Option 1: Use anthropic_cache_all for convenience (caches system + last message) +# Option 1: Use anthropic_cache_messages for convenience (caches last message only) agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Detailed instructions...', model_settings=AnthropicModelSettings( - anthropic_cache_all=True, # Caches both system prompt and last message + anthropic_cache_messages=True, # Caches the last user message ), ) @@ -159,35 +159,77 @@ async def main(): ### Cache Point Limits -Anthropic enforces a maximum of 4 cache points per request. Pydantic AI automatically manages this limit: +Anthropic enforces a maximum of 4 cache points per request. Pydantic AI automatically manages this limit to ensure your requests always comply without errors. -- **`anthropic_cache_all`**: Uses 2 cache points (system instructions + last message) -- **`anthropic_cache_instructions`**: Uses 1 cache point -- **`anthropic_cache_tool_definitions`**: Uses 1 cache point -- **`CachePoint` markers**: Use remaining available cache points +#### How Cache Points Are Allocated -When the total exceeds 4 cache points, Pydantic AI automatically removes cache points from **older messages** (keeping the most recent ones), ensuring your requests always comply with Anthropic's limits without errors. +Cache points can be placed in three locations: + +1. **System Prompt**: Via `anthropic_cache_instructions` setting (adds cache point to last system prompt block) +2. **Tool Definitions**: Via `anthropic_cache_tool_definitions` setting (adds cache point to last tool definition) +3. 
**Messages**: Via `CachePoint` markers or `anthropic_cache_messages` setting (adds cache points to message content) + +Each setting uses **at most 1 cache point**, but you can combine them: ```python {test="skip"} from pydantic_ai import Agent, CachePoint from pydantic_ai.models.anthropic import AnthropicModelSettings +# Example: Using all 3 cache point sources +agent = Agent( + 'anthropic:claude-sonnet-4-5', + system_prompt='Detailed instructions...', + model_settings=AnthropicModelSettings( + anthropic_cache_instructions=True, # 1 cache point + anthropic_cache_tool_definitions=True, # 1 cache point + anthropic_cache_messages=True, # 1 cache point + ), +) + +@agent.tool_plain +def my_tool() -> str: + return 'result' + +async def main(): + # This uses 3 cache points (instructions + tools + last message) + # You can add 1 more CachePoint marker before hitting the limit + result = await agent.run([ + 'Context', CachePoint(), # 4th cache point - OK + 'Question' + ]) +``` + +#### Automatic Cache Point Limiting + +When cache points from all sources (settings + `CachePoint` markers) exceed 4, Pydantic AI automatically removes excess cache points from **older message content** (keeping the most recent ones): + +```python {test="skip"} agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Instructions...', model_settings=AnthropicModelSettings( - anthropic_cache_all=True, # Uses 2 cache points + anthropic_cache_instructions=True, # 1 cache point + anthropic_cache_tool_definitions=True, # 1 cache point ), ) +@agent.tool_plain +def search() -> str: + return 'data' + async def main(): - # Even with multiple CachePoint markers, only 2 more will be kept - # (4 total limit - 2 from cache_all = 2 available) + # Already using 2 cache points (instructions + tools) + # Can add 2 more CachePoint markers (4 total limit) result = await agent.run([ - 'Context 1', CachePoint(), # Will be kept - 'Context 2', CachePoint(), # Will be kept - 'Context 3', CachePoint(), # Automatically removed (oldest) + 'Context 1', CachePoint(), # Oldest - will be removed + 'Context 2', CachePoint(), # Will be kept (3rd point) + 'Context 3', CachePoint(), # Will be kept (4th point) 'Question' ]) - print(result.output) + # Final cache points: instructions + tools + Context 2 + Context 3 = 4 ``` + +**Key Points**: +- System and tool cache points are **always preserved** +- Message cache points are removed from oldest to newest when limit is exceeded +- This ensures critical caching (instructions/tools) is maintained while still benefiting from message-level caching diff --git a/pydantic_ai_slim/pydantic_ai/models/anthropic.py b/pydantic_ai_slim/pydantic_ai/models/anthropic.py index abfa4f1d14..964bb620de 100644 --- a/pydantic_ai_slim/pydantic_ai/models/anthropic.py +++ b/pydantic_ai_slim/pydantic_ai/models/anthropic.py @@ -169,18 +169,15 @@ class AnthropicModelSettings(ModelSettings, total=False): See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information. """ - anthropic_cache_all: bool | Literal['5m', '1h'] - """Convenience setting to enable caching for both system instructions and the last user message. + anthropic_cache_messages: bool | Literal['5m', '1h'] + """Convenience setting to enable caching for the last user message. - When enabled, this automatically adds cache points to: - 1. The last system prompt block (system instructions) - 2. 
The last content block in the final user message - - This is equivalent to setting both `anthropic_cache_instructions` and adding a cache point - to the last message, but more convenient for common use cases. + When enabled, this automatically adds a cache point to the last content block + in the final user message, which is useful for caching conversation history + or context in multi-turn conversations. If `True`, uses TTL='5m'. You can also specify '5m' or '1h' directly. - Note: Uses 2 of Anthropic's 4 available cache points per request. Any additional CachePoint + Note: Uses 1 of Anthropic's 4 available cache points per request. Any additional CachePoint markers in messages will be automatically limited to respect the 4-cache-point maximum. See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information. """ @@ -349,7 +346,7 @@ async def _messages_create( tool_choice = self._infer_tool_choice(tools, model_settings, model_request_parameters) system_prompt, anthropic_messages = await self._map_message(messages, model_request_parameters, model_settings) - + self._limit_cache_points(system_prompt, anthropic_messages, tools) try: extra_headers = self._map_extra_headers(beta_features, model_settings) @@ -392,7 +389,7 @@ async def _messages_count_tokens( tool_choice = self._infer_tool_choice(tools, model_settings, model_request_parameters) system_prompt, anthropic_messages = await self._map_message(messages, model_request_parameters, model_settings) - + self._limit_cache_points(system_prompt, anthropic_messages, tools) try: extra_headers = self._map_extra_headers(beta_features, model_settings) @@ -494,10 +491,7 @@ def _get_tools( ] # Add cache_control to the last tool if enabled - if tools and ( - cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions') - or model_settings.get('anthropic_cache_all') - ): + if tools and (cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions')): # If True, use '5m'; otherwise use the specified ttl value ttl: Literal['5m', '1h'] = '5m' if cache_tool_defs is True else cache_tool_defs last_tool = tools[-1] @@ -766,9 +760,9 @@ async def _map_message( # noqa: C901 system_prompt_parts.insert(0, instructions) system_prompt = '\n\n'.join(system_prompt_parts) - # Add cache_control to the last message content if anthropic_cache_all is enabled - if anthropic_messages and (cache_all := model_settings.get('anthropic_cache_all')): - ttl: Literal['5m', '1h'] = '5m' if cache_all is True else cache_all + # Add cache_control to the last message content if anthropic_cache_messages is enabled + if anthropic_messages and (cache_messages := model_settings.get('anthropic_cache_messages')): + ttl: Literal['5m', '1h'] = '5m' if cache_messages is True else cache_messages m = anthropic_messages[-1] content = m['content'] if isinstance(content, str): @@ -785,13 +779,8 @@ async def _map_message( # noqa: C901 content = cast(list[BetaContentBlockParam], content) self._add_cache_control_to_last_param(content, ttl) - # Ensure total cache points don't exceed Anthropic's limit of 4 - self._limit_cache_points(anthropic_messages, model_settings) # If anthropic_cache_instructions is enabled, return system prompt as a list with cache_control - if system_prompt and ( - cache_instructions := model_settings.get('anthropic_cache_instructions') - or model_settings.get('anthropic_cache_all') - ): + if system_prompt and (cache_instructions := model_settings.get('anthropic_cache_instructions')): # If True, use '5m'; otherwise use the specified 
ttl value ttl: Literal['5m', '1h'] = '5m' if cache_instructions is True else cache_instructions system_prompt_blocks = [ @@ -806,60 +795,57 @@ async def _map_message( # noqa: C901 return system_prompt, anthropic_messages @staticmethod - def _limit_cache_points(messages: list[BetaMessageParam], model_settings: AnthropicModelSettings) -> None: - """Limit the number of cache points in messages to comply with Anthropic's 4-cache-point maximum. - - Anthropic allows a maximum of 4 cache points per request. This method ensures compliance by: - 1. Calculating how many cache points are already used by system-level settings - (anthropic_cache_instructions, anthropic_cache_tool_definitions, anthropic_cache_all) - 2. Determining how many cache points remain available for message-level caching - 3. Traversing messages from newest to oldest, keeping only the allowed number of cache points - 4. Removing cache_control from older cache points that exceed the limit + def _limit_cache_points( + system_prompt: str | list[BetaTextBlockParam], + anthropic_messages: list[BetaMessageParam], + tools: list[BetaToolUnionParam], + ) -> None: + """Limit the number of cache points in the request to Anthropic's maximum. + + Strategy: + 1. Keep the last cache point in system_prompt and tools (if present) + 2. Count cache points already used in system_prompt and tools + 3. Traverse messages from newest to oldest, keeping the most recent cache points + until the maximum limit is reached + """ + MAX_CACHE_POINTS = 4 - This prioritizes recent cache points, which are typically more valuable for conversation continuity. + # Count existing cache points in system prompt + used_cache_points = ( + sum(1 for block in system_prompt if 'cache_control' in cast(dict[str, Any], block)) + if isinstance(system_prompt, list) + else 0 + ) - Args: - messages: List of message parameters to limit cache points in. - model_settings: Model settings containing cache configuration. - """ - # Anthropic's maximum cache points per request - max_cache_points = 4 - used_cache_points = 0 - - # Calculate cache points used by system-level settings - if model_settings.get('anthropic_cache_all'): - # anthropic_cache_all adds cache points for both system instructions and last message - used_cache_points += 2 - else: - if model_settings.get('anthropic_cache_instructions'): - used_cache_points += 1 - if model_settings.get('anthropic_cache_tool_definitions'): - # Assume used one cache point for tool definitions + # Count existing cache points in tools (any tool may have cache_control) + # Note: cache_control can be in the middle of tools list if builtin tools are added after + for tool in tools: + if 'cache_control' in tool: used_cache_points += 1 - # Calculate remaining cache points available for message content - keep_cache_points = max_cache_points - used_cache_points - - # Traverse messages from back to front (newest to oldest) - remaining_cache_points = keep_cache_points - for message in reversed(messages): + # Calculate remaining cache points budget for messages + remaining_budget = MAX_CACHE_POINTS - used_cache_points + if remaining_budget < 0: # pragma: no cover + raise UserError( + f'Too many cache points for Anthropic request. ' + f'System prompt and tool definitions already use {used_cache_points} cache points, ' + f'which exceeds the maximum of {MAX_CACHE_POINTS}.' 
+ ) + # Remove excess cache points from messages (newest to oldest) + for message in reversed(anthropic_messages): content = message['content'] - # Skip if content is a string or None if isinstance(content, str): # pragma: no cover continue - content = cast(list[BetaContentBlockParam], content) - # Traverse content blocks from back to front within each message - for block in reversed(content): - # Cast to dict for TypedDict manipulation + + # Process content blocks in reverse order (newest first) + for block in reversed(cast(list[BetaContentBlockParam], content)): block_dict = cast(dict[str, Any], block) - # Check if this block has cache_control if 'cache_control' in block_dict: - if remaining_cache_points > 0: - # Keep this cache point (within limit) - remaining_cache_points -= 1 + if remaining_budget > 0: + remaining_budget -= 1 else: - # Remove cache_control as we've exceeded the limit + # Exceeded limit, remove this cache point del block_dict['cache_control'] @staticmethod diff --git a/tests/models/test_anthropic.py b/tests/models/test_anthropic.py index 4d0daddd42..a033f9cad3 100644 --- a/tests/models/test_anthropic.py +++ b/tests/models/test_anthropic.py @@ -588,8 +588,8 @@ def my_tool(value: str) -> str: # pragma: no cover assert system[0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '5m'}) -async def test_anthropic_cache_all(allow_model_requests: None): - """Test that anthropic_cache_all caches both system instructions and last message.""" +async def test_anthropic_cache_messages(allow_model_requests: None): + """Test that anthropic_cache_messages caches only the last message.""" c = completion_message( [BetaTextBlock(text='Response', type='text')], usage=BetaUsage(input_tokens=10, output_tokens=5), @@ -600,21 +600,19 @@ async def test_anthropic_cache_all(allow_model_requests: None): m, system_prompt='System instructions to cache.', model_settings=AnthropicModelSettings( - anthropic_cache_all=True, + anthropic_cache_messages=True, ), ) await agent.run('User message') - # Verify both system and last message have cache_control + # Verify only last message has cache_control, not system completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0] system = completion_kwargs['system'] messages = completion_kwargs['messages'] - # System should have cache_control - assert system == snapshot( - [{'type': 'text', 'text': 'System instructions to cache.', 'cache_control': {'type': 'ephemeral', 'ttl': '5m'}}] - ) + # System should NOT have cache_control (should be a plain string) + assert system == snapshot('System instructions to cache.') # Last message content should have cache_control assert messages[-1]['content'][-1] == snapshot( @@ -622,8 +620,8 @@ async def test_anthropic_cache_all(allow_model_requests: None): ) -async def test_anthropic_cache_all_with_custom_ttl(allow_model_requests: None): - """Test that anthropic_cache_all supports custom TTL values.""" +async def test_anthropic_cache_messages_with_custom_ttl(allow_model_requests: None): + """Test that anthropic_cache_messages supports custom TTL values.""" c = completion_message( [BetaTextBlock(text='Response', type='text')], usage=BetaUsage(input_tokens=10, output_tokens=5), @@ -634,23 +632,21 @@ async def test_anthropic_cache_all_with_custom_ttl(allow_model_requests: None): m, system_prompt='System instructions.', model_settings=AnthropicModelSettings( - anthropic_cache_all='1h', # Custom 1h TTL + anthropic_cache_messages='1h', # Custom 1h TTL ), ) await agent.run('User message') - # Verify both use 1h TTL + # 
Verify use 1h TTL completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0] - system = completion_kwargs['system'] messages = completion_kwargs['messages'] - assert system[0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'}) assert messages[-1]['content'][-1]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'}) -async def test_limit_cache_points_with_cache_all(allow_model_requests: None): - """Test that cache points are limited when using cache_all + CachePoint markers.""" +async def test_limit_cache_points_with_cache_messages(allow_model_requests: None): + """Test that cache points are limited when using cache_messages + CachePoint markers.""" c = completion_message( [BetaTextBlock(text='Response', type='text')], usage=BetaUsage(input_tokens=10, output_tokens=5), @@ -661,12 +657,12 @@ async def test_limit_cache_points_with_cache_all(allow_model_requests: None): m, system_prompt='System instructions.', model_settings=AnthropicModelSettings( - anthropic_cache_all=True, # Uses 2 cache points + anthropic_cache_messages=True, # Uses 1 cache point ), ) - # Add 3 CachePoint markers (total would be 5: 2 from cache_all + 3 from markers) - # Only 2 CachePoint markers should be kept (newest ones) + # Add 4 CachePoint markers (total would be 5: 1 from cache_messages + 4 from markers) + # Only 3 CachePoint markers should be kept (newest ones) await agent.run( [ 'Context 1', @@ -675,6 +671,8 @@ async def test_limit_cache_points_with_cache_all(allow_model_requests: None): CachePoint(), # Should be kept 'Context 3', CachePoint(), # Should be kept + 'Context 4', + CachePoint(), # Should be kept 'Question', ] ) @@ -689,12 +687,11 @@ async def test_limit_cache_points_with_cache_all(allow_model_requests: None): if 'cache_control' in block: cache_count += 1 - # anthropic_cache_all uses 2 cache points (system + last message) - # With 3 CachePoint markers, we'd have 5 total + # anthropic_cache_messages uses 1 cache point (last message only) + # With 4 CachePoint markers, we'd have 5 total # Limit is 4, so 1 oldest CachePoint should be removed - # Result: 2 cache points in messages (from the 2 newest CachePoints) - # The cache_all's last message cache is applied after limiting - assert cache_count == 2 + # Result: 3 cache points from CachePoint markers + 1 from cache_messages = 4 total + assert cache_count == 4 async def test_limit_cache_points_all_settings(allow_model_requests: None): From 240f71c566bc4925e31cf830f2c29ce2ac31ee7a Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 20 Nov 2025 11:28:11 +0800 Subject: [PATCH 05/14] fix ci --- docs/models/anthropic.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index b0fec0d324..a69353ffc5 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -197,6 +197,10 @@ async def main(): 'Context', CachePoint(), # 4th cache point - OK 'Question' ]) + print(result.output) + usage = result.usage() + print(f'Cache write tokens: {usage.cache_write_tokens}') + print(f'Cache read tokens: {usage.cache_read_tokens}') ``` #### Automatic Cache Point Limiting @@ -227,6 +231,10 @@ async def main(): 'Question' ]) # Final cache points: instructions + tools + Context 2 + Context 3 = 4 + print(result.output) + usage = result.usage() + print(f'Cache write tokens: {usage.cache_write_tokens}') + print(f'Cache read tokens: {usage.cache_read_tokens}') ``` **Key Points**: From ae63b134c3b180b3ee969721a1a59d1f03ebc21e Mon Sep 17 00:00:00 2001 From: Wh1isper 
<9573586@qq.com>
Date: Thu, 20 Nov 2025 11:33:31 +0800
Subject: [PATCH 06/14] update docstring for _limit_cache_points

---
 .../pydantic_ai/models/anthropic.py           | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/pydantic_ai_slim/pydantic_ai/models/anthropic.py b/pydantic_ai_slim/pydantic_ai/models/anthropic.py
index 964bb620de..2e4546fa2f 100644
--- a/pydantic_ai_slim/pydantic_ai/models/anthropic.py
+++ b/pydantic_ai_slim/pydantic_ai/models/anthropic.py
@@ -802,11 +802,26 @@ def _limit_cache_points(
     ) -> None:
         """Limit the number of cache points in the request to Anthropic's maximum.
 
+        Anthropic enforces a maximum of 4 cache points per request. This method ensures
+        compliance by counting existing cache points and removing excess ones from messages.
+
         Strategy:
-        1. Keep the last cache point in system_prompt and tools (if present)
-        2. Count cache points already used in system_prompt and tools
-        3. Traverse messages from newest to oldest, keeping the most recent cache points
-            until the maximum limit is reached
+        1. Count cache points in system_prompt (there can be multiple if it is a list of blocks)
+        2. Count cache points in tools (they can be in any position, not just the last tool)
+        3. Raise UserError if system + tools already exceed MAX_CACHE_POINTS
+        4. Calculate remaining budget for message cache points
+        5. Traverse messages from newest to oldest, keeping the most recent cache points
+           within the remaining budget
+        6. Remove excess cache points from older messages to stay within limit
+
+        Cache point priority:
+        - System prompt cache points (always preserved)
+        - Tool definition cache points (always preserved)
+        - Message cache points (newest kept first, oldest removed if needed)
+
+        Raises:
+            UserError: If system_prompt and tools combined already exceed MAX_CACHE_POINTS (4).
+                This indicates a configuration error that cannot be auto-fixed. 
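+
+        Example:
+            With `anthropic_cache_instructions=True` (1 point) and
+            `anthropic_cache_tool_definitions=True` (1 point), the message budget
+            is 4 - 2 = 2, so only the 2 newest message cache points are kept and
+            any older ones have their `cache_control` removed.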
""" MAX_CACHE_POINTS = 4 From 6cceb59ccb9c001b098462770e7fc7d32c8acdfd Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 20 Nov 2025 11:40:49 +0800 Subject: [PATCH 07/14] fix doc example issue --- docs/models/anthropic.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index a69353ffc5..70727140f5 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -208,6 +208,8 @@ async def main(): When cache points from all sources (settings + `CachePoint` markers) exceed 4, Pydantic AI automatically removes excess cache points from **older message content** (keeping the most recent ones): ```python {test="skip"} +from pydantic_ai import Agent, CachePoint +from pydantic_ai.models.anthropic import AnthropicModelSettings agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Instructions...', From 0aa82ad995a1b64d0487e2edd59bf724dcd95574 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 20 Nov 2025 11:54:09 +0800 Subject: [PATCH 08/14] fix doc check --- docs/models/anthropic.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index 70727140f5..fe46435fb7 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -210,6 +210,7 @@ When cache points from all sources (settings + `CachePoint` markers) exceed 4, P ```python {test="skip"} from pydantic_ai import Agent, CachePoint from pydantic_ai.models.anthropic import AnthropicModelSettings + agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Instructions...', From 779bd40fb94a6cc86219e52d973a533c2fd07fb4 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Fri, 21 Nov 2025 11:04:13 +0800 Subject: [PATCH 09/14] update docs and add real case --- docs/models/anthropic.md | 53 +++++++++++++++++++++------------- tests/models/test_anthropic.py | 38 ++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index fe46435fb7..5e2cbae588 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -93,23 +93,32 @@ You can combine multiple strategies for maximum savings: from pydantic_ai import Agent, CachePoint, RunContext from pydantic_ai.models.anthropic import AnthropicModelSettings -# Option 1: Use anthropic_cache_messages for convenience (caches last message only) +# Example 1: Use anthropic_cache_messages for automatic last message caching agent = Agent( 'anthropic:claude-sonnet-4-5', - system_prompt='Detailed instructions...', + system_prompt='You are a helpful assistant.', model_settings=AnthropicModelSettings( - anthropic_cache_messages=True, # Caches the last user message + anthropic_cache_messages=True, # Automatically caches the last message ), ) -# Option 2: Fine-grained control with individual settings +async def main(): + # The last message is automatically cached - no need for manual CachePoint + result1 = await agent.run('What is the capital of France?') + + # Subsequent calls with similar conversation benefit from cache + result2 = await agent.run('What is the capital of Germany?') + print(f'Cache write: {result1.usage().cache_write_tokens}') + print(f'Cache read: {result2.usage().cache_read_tokens}') + +# Example 2: Combine with other cache settings for comprehensive caching agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Detailed instructions...', model_settings=AnthropicModelSettings( - # Use True for default 5m TTL, or specify '5m' / '1h' 
directly - anthropic_cache_instructions=True, - anthropic_cache_tool_definitions='1h', # Longer cache for tool definitions + anthropic_cache_instructions=True, # Cache system instructions + anthropic_cache_tool_definitions='1h', # Cache tool definitions with 1h TTL + anthropic_cache_messages=True, # Also cache the last message ), ) @@ -119,21 +128,24 @@ def search_docs(ctx: RunContext, query: str) -> str: return f'Results for {query}' async def main(): - # First call - writes to cache - result1 = await agent.run([ + # All three cache points are used: instructions, tools, and last message + result = await agent.run('Search for Python best practices') + print(result.output) + +# Example 3: Fine-grained control with manual CachePoint markers +agent = Agent( + 'anthropic:claude-sonnet-4-5', + system_prompt='Instructions...', +) + +async def main(): + # Manually control cache points for specific content blocks + result = await agent.run([ 'Long context from documentation...', - CachePoint(), + CachePoint(), # Cache everything up to this point 'First question' ]) - - # Subsequent calls - read from cache (90% cost reduction) - result2 = await agent.run([ - 'Long context from documentation...', # Same content - CachePoint(), - 'Second question' - ]) - print(f'First: {result1.output}') - print(f'Second: {result2.output}') + print(result.output) ``` Access cache usage statistics via `result.usage()`: @@ -242,5 +254,6 @@ async def main(): **Key Points**: - System and tool cache points are **always preserved** -- Message cache points are removed from oldest to newest when limit is exceeded +- The cache point created by `anthropic_cache_messages` is **always preserved** (as it's the newest message cache point) +- Additional `CachePoint` markers in messages are removed from oldest to newest when the limit is exceeded - This ensures critical caching (instructions/tools) is maintained while still benefiting from message-level caching diff --git a/tests/models/test_anthropic.py b/tests/models/test_anthropic.py index a033f9cad3..c6fa16d76f 100644 --- a/tests/models/test_anthropic.py +++ b/tests/models/test_anthropic.py @@ -6673,3 +6673,41 @@ async def test_anthropic_bedrock_count_tokens_not_supported(env: TestEnv): with pytest.raises(UserError, match='AsyncAnthropicBedrock client does not support `count_tokens` api.'): await agent.run('hello', usage_limits=UsageLimits(input_tokens_limit=20, count_tokens_before_request=True)) + + +@pytest.mark.vcr() +async def test_anthropic_cache_messages_real_api(allow_model_requests: None, anthropic_api_key: str): + """Test that anthropic_cache_messages setting adds cache_control and produces cache usage metrics. + + This test uses a cassette to verify the cache behavior without making real API calls in CI. + When run with real API credentials, it demonstrates that: + 1. The first call with a long context creates a cache (cache_write_tokens > 0) + 2. Follow-up messages in the same conversation can read from that cache (cache_read_tokens > 0) + """ + m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(api_key=anthropic_api_key)) + agent = Agent( + m, + system_prompt='You are a helpful assistant.', + model_settings=AnthropicModelSettings( + anthropic_cache_messages=True, + ), + ) + + # First call with a longer message - this will cache the message content + result1 = await agent.run('Please explain what Python is and its main use cases. 
' * 10) + usage1 = result1.usage() + + # With anthropic_cache_messages, the first call should write cache for the last message + # (Note: cache_write_tokens might be 0 if content is too short, but the setting is applied) + assert usage1.requests == 1 + assert usage1.output_tokens > 0 + + # Continue the conversation - this message appends to history + # The previous cached message should still be in the request + result2 = await agent.run('Can you summarize that in one sentence?', message_history=result1.all_messages()) + usage2 = result2.usage() + + # The second call should potentially read from cache if the previous message is still cached + # (cache_read_tokens > 0 when cache hit occurs) + assert usage2.requests == 1 + assert usage2.output_tokens > 0 From bf0dc8419a1956337131baacfd48a2a5d624a6db Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Fri, 21 Nov 2025 11:58:25 +0800 Subject: [PATCH 10/14] test via real api key --- ...est_anthropic_cache_messages_real_api.yaml | 327 ++++++++++++++++++ tests/models/test_anthropic.py | 10 +- 2 files changed, 334 insertions(+), 3 deletions(-) create mode 100644 tests/models/cassettes/test_anthropic/test_anthropic_cache_messages_real_api.yaml diff --git a/tests/models/cassettes/test_anthropic/test_anthropic_cache_messages_real_api.yaml b/tests/models/cassettes/test_anthropic/test_anthropic_cache_messages_real_api.yaml new file mode 100644 index 0000000000..a1711a107c --- /dev/null +++ b/tests/models/cassettes/test_anthropic/test_anthropic_cache_messages_real_api.yaml @@ -0,0 +1,327 @@ +interactions: +- request: + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '5617' + content-type: + - application/json + host: + - api.anthropic.com + method: POST + parsed_body: + max_tokens: 4096 + messages: + - content: + - cache_control: + ttl: 5m + type: ephemeral + text: 'Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. 
Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. 
Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. ' + type: text + role: user + model: claude-sonnet-4-5 + stream: false + system: You are a helpful assistant. + uri: https://api.anthropic.com/v1/messages?beta=true + response: + headers: + connection: + - keep-alive + content-length: + - '1986' + content-type: + - application/json + retry-after: + - '3' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + transfer-encoding: + - chunked + parsed_body: + content: + - text: |- + # What is Python? + + **Python** is a high-level, interpreted programming language created by Guido van Rossum and first released in 1991. It emphasizes code readability and simplicity, using clear syntax that often resembles plain English. + + ## Key Characteristics: + - **Easy to learn**: Simple, intuitive syntax ideal for beginners + - **Interpreted**: Code runs line-by-line without compilation + - **Dynamically typed**: No need to declare variable types + - **Versatile**: Supports multiple programming paradigms (procedural, object-oriented, functional) + - **Extensive libraries**: Vast ecosystem of packages and frameworks + + ## Main Use Cases: + + ### 1. **Web Development** + - Frameworks: Django, Flask, FastAPI + - Backend services and APIs + + ### 2. **Data Science & Analytics** + - Libraries: Pandas, NumPy, Matplotlib + - Data manipulation and visualization + + ### 3. **Machine Learning & AI** + - Frameworks: TensorFlow, PyTorch, scikit-learn + - Deep learning and predictive modeling + + ### 4. **Automation & Scripting** + - Task automation + - System administration + + ### 5. **Scientific Computing** + - Research and computational analysis + - Libraries: SciPy, SymPy + + ### 6. **Software Development** + - Application development + - Prototyping + + ### 7. **Cybersecurity** + - Penetration testing tools + - Security automation + + ### 8. **Game Development** + - Libraries: Pygame + - Prototyping game logic + + Python's versatility and ease of use make it one of the most popular programming languages worldwide, used by companies like Google, Netflix, NASA, and many others. + type: text + id: msg_01FdUT99HgS9cAzcv4ztTDYJ + model: claude-sonnet-4-5-20250929 + role: assistant + stop_reason: end_turn + stop_sequence: null + type: message + usage: + cache_creation: + ephemeral_1h_input_tokens: 0 + ephemeral_5m_input_tokens: 1111 + cache_creation_input_tokens: 1111 + cache_read_input_tokens: 0 + input_tokens: 3 + output_tokens: 391 + service_tier: standard + status: + code: 200 + message: OK +- request: + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '7343' + content-type: + - application/json + host: + - api.anthropic.com + method: POST + parsed_body: + max_tokens: 4096 + messages: + - content: + - text: 'Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. 
Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. 
Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its + main use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main + use cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use + cases. Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. + Please explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please + explain what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain + what Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what + Python is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python + is and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is + and its main use cases. Please explain what Python is and its main use cases. Please explain what Python is and + its main use cases. Please explain what Python is and its main use cases. ' + type: text + role: user + - content: + - text: |- + # What is Python? + + **Python** is a high-level, interpreted programming language created by Guido van Rossum and first released in 1991. It emphasizes code readability and simplicity, using clear syntax that often resembles plain English. + + ## Key Characteristics: + - **Easy to learn**: Simple, intuitive syntax ideal for beginners + - **Interpreted**: Code runs line-by-line without compilation + - **Dynamically typed**: No need to declare variable types + - **Versatile**: Supports multiple programming paradigms (procedural, object-oriented, functional) + - **Extensive libraries**: Vast ecosystem of packages and frameworks + + ## Main Use Cases: + + ### 1. **Web Development** + - Frameworks: Django, Flask, FastAPI + - Backend services and APIs + + ### 2. **Data Science & Analytics** + - Libraries: Pandas, NumPy, Matplotlib + - Data manipulation and visualization + + ### 3. **Machine Learning & AI** + - Frameworks: TensorFlow, PyTorch, scikit-learn + - Deep learning and predictive modeling + + ### 4. **Automation & Scripting** + - Task automation + - System administration + + ### 5. **Scientific Computing** + - Research and computational analysis + - Libraries: SciPy, SymPy + + ### 6. **Software Development** + - Application development + - Prototyping + + ### 7. **Cybersecurity** + - Penetration testing tools + - Security automation + + ### 8. **Game Development** + - Libraries: Pygame + - Prototyping game logic + + Python's versatility and ease of use make it one of the most popular programming languages worldwide, used by companies like Google, Netflix, NASA, and many others. + type: text + role: assistant + - content: + - cache_control: + ttl: 5m + type: ephemeral + text: Can you summarize that in one sentence? 
+ type: text + role: user + model: claude-sonnet-4-5 + stream: false + system: You are a helpful assistant. + uri: https://api.anthropic.com/v1/messages?beta=true + response: + headers: + connection: + - keep-alive + content-length: + - '576' + content-type: + - application/json + retry-after: + - '56' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + transfer-encoding: + - chunked + parsed_body: + content: + - text: Python is a beginner-friendly, versatile programming language widely used for web development, data science, + machine learning, automation, and scientific computing. + type: text + id: msg_01PYfyaNS7Rysss2xMNqQgy9 + model: claude-sonnet-4-5-20250929 + role: assistant + stop_reason: end_turn + stop_sequence: null + type: message + usage: + cache_creation: + ephemeral_1h_input_tokens: 0 + ephemeral_5m_input_tokens: 403 + cache_creation_input_tokens: 403 + cache_read_input_tokens: 1111 + input_tokens: 3 + output_tokens: 33 + service_tier: standard + status: + code: 200 + message: OK +version: 1 diff --git a/tests/models/test_anthropic.py b/tests/models/test_anthropic.py index c6fa16d76f..a5408258de 100644 --- a/tests/models/test_anthropic.py +++ b/tests/models/test_anthropic.py @@ -6684,7 +6684,7 @@ async def test_anthropic_cache_messages_real_api(allow_model_requests: None, ant 1. The first call with a long context creates a cache (cache_write_tokens > 0) 2. Follow-up messages in the same conversation can read from that cache (cache_read_tokens > 0) """ - m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(api_key=anthropic_api_key)) + m = AnthropicModel('claude-sonnet-4-5', provider=AnthropicProvider(api_key=anthropic_api_key)) agent = Agent( m, system_prompt='You are a helpful assistant.', @@ -6694,12 +6694,13 @@ async def test_anthropic_cache_messages_real_api(allow_model_requests: None, ant ) # First call with a longer message - this will cache the message content - result1 = await agent.run('Please explain what Python is and its main use cases. ' * 10) + result1 = await agent.run('Please explain what Python is and its main use cases. 
' * 100) usage1 = result1.usage() # With anthropic_cache_messages, the first call should write cache for the last message - # (Note: cache_write_tokens might be 0 if content is too short, but the setting is applied) + # (cache_write_tokens > 0 indicates that caching occurred) assert usage1.requests == 1 + assert usage1.cache_write_tokens > 0 assert usage1.output_tokens > 0 # Continue the conversation - this message appends to history @@ -6709,5 +6710,8 @@ async def test_anthropic_cache_messages_real_api(allow_model_requests: None, ant # The second call should potentially read from cache if the previous message is still cached # (cache_read_tokens > 0 when cache hit occurs) + # (cache_write_tokens > 0 as new message is added to cache) assert usage2.requests == 1 + assert usage2.cache_read_tokens > 0 + assert usage2.cache_write_tokens > 0 assert usage2.output_tokens > 0 From 7f317f0a34854ebb6a6da313b0823f2baf8b8337 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Fri, 21 Nov 2025 13:19:19 +0800 Subject: [PATCH 11/14] fix docs ruff issues --- docs/models/anthropic.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index 5e2cbae588..aa5460bd0e 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -127,7 +127,7 @@ def search_docs(ctx: RunContext, query: str) -> str: """Search documentation.""" return f'Results for {query}' -async def main(): +async def main(): # noqa: F811 # All three cache points are used: instructions, tools, and last message result = await agent.run('Search for Python best practices') print(result.output) @@ -138,7 +138,7 @@ agent = Agent( system_prompt='Instructions...', ) -async def main(): +async def main(): # noqa: F811 # Manually control cache points for specific content blocks result = await agent.run([ 'Long context from documentation...', @@ -162,7 +162,7 @@ agent = Agent( ), ) -async def main(): +async def main(): # noqa: F811 result = await agent.run('Your question') usage = result.usage() print(f'Cache write tokens: {usage.cache_write_tokens}') @@ -202,7 +202,7 @@ agent = Agent( def my_tool() -> str: return 'result' -async def main(): +async def main(): # noqa: F811 # This uses 3 cache points (instructions + tools + last message) # You can add 1 more CachePoint marker before hitting the limit result = await agent.run([ @@ -236,7 +236,7 @@ agent = Agent( def search() -> str: return 'data' -async def main(): +async def main(): # noqa: F811 # Already using 2 cache points (instructions + tools) # Can add 2 more CachePoint markers (4 total limit) result = await agent.run([ From 63500d77e60647594e9246096e91248be7cd7e2f Mon Sep 17 00:00:00 2001 From: Zhongsheng Ji <9573586@qq.com> Date: Sat, 22 Nov 2025 09:25:55 +0800 Subject: [PATCH 12/14] Update docs/models/anthropic.md Co-authored-by: Douwe Maan --- docs/models/anthropic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index aa5460bd0e..34417d6679 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -85,7 +85,7 @@ Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-wit 1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it 2. 
**Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
 3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
-4. **Cache Last Message (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_messages`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_messages] to `True` to automatically cache the last user message
+4. **Cache All Messages**: Set [`AnthropicModelSettings.anthropic_cache_messages`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_messages] to `True` to automatically cache all messages

 You can combine multiple strategies for maximum savings:

From 03dfa1986780b4cfef3191b93ff30df851bd83b9 Mon Sep 17 00:00:00 2001
From: Wh1isper <9573586@qq.com>
Date: Sat, 22 Nov 2025 09:26:37 +0800
Subject: [PATCH 13/14] use run_sync in docs

---
 docs/models/anthropic.md | 132 ++++++++++++++++++++++-----------------
 1 file changed, 75 insertions(+), 57 deletions(-)

diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md
index 34417d6679..fa4dce4a36 100644
--- a/docs/models/anthropic.md
+++ b/docs/models/anthropic.md
@@ -87,13 +87,14 @@ Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-wit
 3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
 4. 
**Cache All Messages**: Set [`AnthropicModelSettings.anthropic_cache_messages`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_messages] to `True` to automatically cache all messages -You can combine multiple strategies for maximum savings: +### Example 1: Automatic Last Message Caching + +Use `anthropic_cache_messages` to automatically cache the last user message: ```python {test="skip"} -from pydantic_ai import Agent, CachePoint, RunContext +from pydantic_ai import Agent from pydantic_ai.models.anthropic import AnthropicModelSettings -# Example 1: Use anthropic_cache_messages for automatic last message caching agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='You are a helpful assistant.', @@ -102,16 +103,23 @@ agent = Agent( ), ) -async def main(): - # The last message is automatically cached - no need for manual CachePoint - result1 = await agent.run('What is the capital of France?') +# The last message is automatically cached - no need for manual CachePoint +result1 = agent.run_sync('What is the capital of France?') + +# Subsequent calls with similar conversation benefit from cache +result2 = agent.run_sync('What is the capital of Germany?') +print(f'Cache write: {result1.usage().cache_write_tokens}') +print(f'Cache read: {result2.usage().cache_read_tokens}') +``` + +### Example 2: Comprehensive Caching Strategy - # Subsequent calls with similar conversation benefit from cache - result2 = await agent.run('What is the capital of Germany?') - print(f'Cache write: {result1.usage().cache_write_tokens}') - print(f'Cache read: {result2.usage().cache_read_tokens}') +Combine multiple cache settings for maximum savings: + +```python {test="skip"} +from pydantic_ai import Agent, RunContext +from pydantic_ai.models.anthropic import AnthropicModelSettings -# Example 2: Combine with other cache settings for comprehensive caching agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Detailed instructions...', @@ -127,27 +135,34 @@ def search_docs(ctx: RunContext, query: str) -> str: """Search documentation.""" return f'Results for {query}' -async def main(): # noqa: F811 - # All three cache points are used: instructions, tools, and last message - result = await agent.run('Search for Python best practices') - print(result.output) -# Example 3: Fine-grained control with manual CachePoint markers +result = agent.run_sync('Search for Python best practices') +print(result.output) +``` + +### Example 3: Fine-Grained Control with CachePoint + +Use manual `CachePoint` markers to control cache locations precisely: + +```python {test="skip"} +from pydantic_ai import Agent, CachePoint + agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Instructions...', ) -async def main(): # noqa: F811 - # Manually control cache points for specific content blocks - result = await agent.run([ - 'Long context from documentation...', - CachePoint(), # Cache everything up to this point - 'First question' - ]) - print(result.output) +# Manually control cache points for specific content blocks +result = agent.run_sync([ + 'Long context from documentation...', + CachePoint(), # Cache everything up to this point + 'First question' +]) +print(result.output) ``` +### Accessing Cache Usage Statistics + Access cache usage statistics via `result.usage()`: ```python {test="skip"} @@ -162,11 +177,10 @@ agent = Agent( ), ) -async def main(): # noqa: F811 - result = await agent.run('Your question') - usage = result.usage() - print(f'Cache write tokens: {usage.cache_write_tokens}') - 
print(f'Cache read tokens: {usage.cache_read_tokens}') +result = agent.run_sync('Your question') +usage = result.usage() +print(f'Cache write tokens: {usage.cache_write_tokens}') +print(f'Cache read tokens: {usage.cache_read_tokens}') ``` ### Cache Point Limits @@ -181,13 +195,16 @@ Cache points can be placed in three locations: 2. **Tool Definitions**: Via `anthropic_cache_tool_definitions` setting (adds cache point to last tool definition) 3. **Messages**: Via `CachePoint` markers or `anthropic_cache_messages` setting (adds cache points to message content) -Each setting uses **at most 1 cache point**, but you can combine them: +Each setting uses **at most 1 cache point**, but you can combine them. + +#### Example: Using All 3 Cache Point Sources + +Define an agent with all cache settings enabled: ```python {test="skip"} from pydantic_ai import Agent, CachePoint from pydantic_ai.models.anthropic import AnthropicModelSettings -# Example: Using all 3 cache point sources agent = Agent( 'anthropic:claude-sonnet-4-5', system_prompt='Detailed instructions...', @@ -202,22 +219,24 @@ agent = Agent( def my_tool() -> str: return 'result' -async def main(): # noqa: F811 - # This uses 3 cache points (instructions + tools + last message) - # You can add 1 more CachePoint marker before hitting the limit - result = await agent.run([ - 'Context', CachePoint(), # 4th cache point - OK - 'Question' - ]) - print(result.output) - usage = result.usage() - print(f'Cache write tokens: {usage.cache_write_tokens}') - print(f'Cache read tokens: {usage.cache_read_tokens}') + +# This uses 3 cache points (instructions + tools + last message) +# You can add 1 more CachePoint marker before hitting the limit +result = agent.run_sync([ + 'Context', CachePoint(), # 4th cache point - OK + 'Question' +]) +print(result.output) +usage = result.usage() +print(f'Cache write tokens: {usage.cache_write_tokens}') +print(f'Cache read tokens: {usage.cache_read_tokens}') ``` #### Automatic Cache Point Limiting -When cache points from all sources (settings + `CachePoint` markers) exceed 4, Pydantic AI automatically removes excess cache points from **older message content** (keeping the most recent ones): +When cache points from all sources (settings + `CachePoint` markers) exceed 4, Pydantic AI automatically removes excess cache points from **older message content** (keeping the most recent ones). 
+ +Define an agent with 2 cache points from settings: ```python {test="skip"} from pydantic_ai import Agent, CachePoint @@ -236,20 +255,19 @@ agent = Agent( def search() -> str: return 'data' -async def main(): # noqa: F811 - # Already using 2 cache points (instructions + tools) - # Can add 2 more CachePoint markers (4 total limit) - result = await agent.run([ - 'Context 1', CachePoint(), # Oldest - will be removed - 'Context 2', CachePoint(), # Will be kept (3rd point) - 'Context 3', CachePoint(), # Will be kept (4th point) - 'Question' - ]) - # Final cache points: instructions + tools + Context 2 + Context 3 = 4 - print(result.output) - usage = result.usage() - print(f'Cache write tokens: {usage.cache_write_tokens}') - print(f'Cache read tokens: {usage.cache_read_tokens}') +# Already using 2 cache points (instructions + tools) +# Can add 2 more CachePoint markers (4 total limit) +result = agent.run_sync([ + 'Context 1', CachePoint(), # Oldest - will be removed + 'Context 2', CachePoint(), # Will be kept (3rd point) + 'Context 3', CachePoint(), # Will be kept (4th point) + 'Question' +]) +# Final cache points: instructions + tools + Context 2 + Context 3 = 4 +print(result.output) +usage = result.usage() +print(f'Cache write tokens: {usage.cache_write_tokens}') +print(f'Cache read tokens: {usage.cache_read_tokens}') ``` **Key Points**: From 4f58aa8b0bdc8939c0301c67639446b812e70b26 Mon Sep 17 00:00:00 2001 From: Douwe Maan Date: Tue, 25 Nov 2025 17:38:41 -0600 Subject: [PATCH 14/14] Update docs/models/anthropic.md --- docs/models/anthropic.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/models/anthropic.md b/docs/models/anthropic.md index fa4dce4a36..b3ed1dc9ed 100644 --- a/docs/models/anthropic.md +++ b/docs/models/anthropic.md @@ -87,9 +87,9 @@ Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-wit 3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly 4. **Cache All Messages**: Set [`AnthropicModelSettings.anthropic_cache_messages`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_messages] to `True` to automatically cache all messages -### Example 1: Automatic Last Message Caching +### Example 1: Automatic Message Caching -Use `anthropic_cache_messages` to automatically cache the last user message: +Use `anthropic_cache_messages` to automatically cache all messages up to and including the newest user message: ```python {test="skip"} from pydantic_ai import Agent
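
Taken together, the final docs state and the updated `test_anthropic_cache_messages_real_api` describe one end-to-end flow, which the cassette above records (cache write of 1111 tokens on the first request, cache read of 1111 tokens on the second). The following is a minimal sketch of that flow, not part of the patch series: it assumes a valid `ANTHROPIC_API_KEY` in the environment, and that `message_history` / `all_messages()` (Pydantic AI's standard conversation-continuation API) are unchanged by these patches.

```python
from pydantic_ai import Agent
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='You are a helpful assistant.',
    model_settings=AnthropicModelSettings(
        # Caches all messages up to and including the newest user message
        anthropic_cache_messages=True,
    ),
)

# First turn: the prompt is long enough to be cacheable, so this request
# should write the cache (cache_write_tokens > 0, per the test's assertion).
result1 = agent.run_sync('Please explain what Python is and its main use cases. ' * 100)
print(f'Cache write tokens: {result1.usage().cache_write_tokens}')

# Second turn continues the same conversation, so the previously cached
# prefix should be read back (cache_read_tokens > 0), while the newly
# appended messages are written to the cache (cache_write_tokens > 0).
result2 = agent.run_sync(
    'Can you summarize that in one sentence?',
    message_history=result1.all_messages(),
)
print(f'Cache read tokens: {result2.usage().cache_read_tokens}')
```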