Skip to content

Commit c478996

Browse files
committed
chore: emit strands metrics
1 parent 4d7bb98 commit c478996

File tree

5 files changed

+159
-39
lines changed

5 files changed

+159
-39
lines changed

src/strands/event_loop/event_loop.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from functools import partial
1414
from typing import Any, Callable, Dict, List, Optional, Tuple, cast
1515

16+
from ..telemetry import MetricsClient
1617
from ..telemetry.metrics import EventLoopMetrics, Trace
1718
from ..telemetry.tracer import get_tracer
1819
from ..tools.executor import run_tools, validate_and_prepare_tools
@@ -105,10 +106,14 @@ def event_loop_cycle(
105106
kwargs["event_loop_cycle_id"] = uuid.uuid4()
106107

107108
event_loop_metrics: EventLoopMetrics = kwargs.get("event_loop_metrics", EventLoopMetrics())
108-
109+
metrics_client = MetricsClient()
109110
# Initialize state and get cycle trace
110111
kwargs = initialize_state(**kwargs)
111-
cycle_start_time, cycle_trace = event_loop_metrics.start_cycle()
112+
113+
attributes = {"event_loop_cycle_id": str(kwargs.get("event_loop_cycle_id"))}
114+
cycle_start_time, cycle_trace = event_loop_metrics.start_cycle(metrics_client)
115+
metrics_client.event_loop_cycle_count.add(1, attributes=attributes)
116+
metrics_client.event_loop_start_cycle.add(1, attributes=attributes)
112117
kwargs["event_loop_cycle_trace"] = cycle_trace
113118

114119
callback_handler(start=True)
@@ -191,8 +196,8 @@ def event_loop_cycle(
191196
callback_handler(message=message)
192197

193198
# Update metrics
194-
event_loop_metrics.update_usage(usage)
195-
event_loop_metrics.update_metrics(metrics)
199+
event_loop_metrics.update_usage(usage, metrics_client)
200+
event_loop_metrics.update_metrics(metrics, metrics_client)
196201

197202
# If the model is requesting to use tools
198203
if stop_reason == "tool_use":
@@ -227,7 +232,7 @@ def event_loop_cycle(
227232
)
228233

229234
# End the cycle and return results
230-
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace)
235+
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace, metrics_client)
231236
if cycle_span:
232237
tracer.end_event_loop_cycle_span(
233238
span=cycle_span,
@@ -380,7 +385,7 @@ def _handle_tool_execution(
380385

381386
if not tool_uses:
382387
return stop_reason, message, event_loop_metrics, kwargs["request_state"]
383-
388+
metrics_client = MetricsClient()
384389
tool_handler_process = partial(
385390
tool_handler.process,
386391
messages=messages,
@@ -418,7 +423,7 @@ def _handle_tool_execution(
418423
tracer.end_event_loop_cycle_span(span=cycle_span, message=message, tool_result_message=tool_result_message)
419424

420425
if kwargs["request_state"].get("stop_event_loop", False):
421-
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace)
426+
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace, metrics_client)
422427
return stop_reason, message, event_loop_metrics, kwargs["request_state"]
423428

424429
return recurse_event_loop(

src/strands/telemetry/metrics.py

Lines changed: 78 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
88

99
import opentelemetry.metrics as metrics_api
10-
from opentelemetry.metrics import Counter, Meter
10+
from opentelemetry.metrics import Counter, Histogram, Meter
1111

1212
from ..telemetry import metrics_constants as constants
1313
from ..types.content import Message
@@ -121,22 +121,34 @@ class ToolMetrics:
121121
error_count: int = 0
122122
total_time: float = 0.0
123123

124-
def add_call(self, tool: ToolUse, duration: float, success: bool) -> None:
124+
def add_call(
125+
self,
126+
tool: ToolUse,
127+
duration: float,
128+
success: bool,
129+
metrics_client: "MetricsClient",
130+
attributes: Optional[Dict[str, Any]] = None,
131+
) -> None:
125132
"""Record a new tool call with its outcome.
126133
127134
Args:
128135
tool: The tool that was called.
129136
duration: How long the call took in seconds.
130137
success: Whether the call was successful.
138+
metrics_client: The metrics client for recording the metrics.
139+
attributes: Optional attributes attached to every metric recorded for this tool call.
131140
"""
132141
self.tool = tool # Update with latest tool state
133142
self.call_count += 1
134143
self.total_time += duration
135-
144+
metrics_client.tool_call_count.add(1, attributes=attributes)
145+
metrics_client.tool_duration.record(duration, attributes=attributes)
136146
if success:
137147
self.success_count += 1
148+
metrics_client.tool_success_count.add(1, attributes=attributes)
138149
else:
139150
self.error_count += 1
151+
metrics_client.tool_error_count.add(1, attributes=attributes)
140152

141153

142154
@dataclass
@@ -159,32 +171,42 @@ class EventLoopMetrics:
159171
accumulated_usage: Usage = field(default_factory=lambda: Usage(inputTokens=0, outputTokens=0, totalTokens=0))
160172
accumulated_metrics: Metrics = field(default_factory=lambda: Metrics(latencyMs=0))
161173

162-
def start_cycle(self) -> Tuple[float, Trace]:
174+
def start_cycle(self, metrics_client: "MetricsClient") -> Tuple[float, Trace]:
163175
"""Start a new event loop cycle and create a trace for it.
164176
165177
Returns:
166178
A tuple containing the start time and the cycle trace object.
167179
"""
180+
metrics_client.event_loop_cycle_count.add(1)
168181
self.cycle_count += 1
169182
start_time = time.time()
170183
cycle_trace = Trace(f"Cycle {self.cycle_count}", start_time=start_time)
171184
self.traces.append(cycle_trace)
172185
return start_time, cycle_trace
173186

174-
def end_cycle(self, start_time: float, cycle_trace: Trace) -> None:
187+
def end_cycle(self, start_time: float, cycle_trace: Trace, metrics_client: "MetricsClient") -> None:
175188
"""End the current event loop cycle and record its duration.
176189
177190
Args:
178191
start_time: The timestamp when the cycle started.
179192
cycle_trace: The trace object for this cycle.
193+
metrics_client: The metrics client for recording the metrics.
180194
"""
195+
metrics_client.event_loop_end_cycle.add(1)
181196
end_time = time.time()
182197
duration = end_time - start_time
198+
metrics_client.event_loop_cycle_duration.record(duration)
183199
self.cycle_durations.append(duration)
184200
cycle_trace.end(end_time)
185201

186202
def add_tool_usage(
187-
self, tool: ToolUse, duration: float, tool_trace: Trace, success: bool, message: Message
203+
self,
204+
tool: ToolUse,
205+
duration: float,
206+
tool_trace: Trace,
207+
success: bool,
208+
message: Message,
209+
metrics_client: "MetricsClient",
188210
) -> None:
189211
"""Record metrics for a tool invocation.
190212
@@ -194,6 +216,7 @@ def add_tool_usage(
194216
tool_trace: The trace object for this tool call.
195217
success: Whether the tool call was successful.
196218
message: The message associated with the tool call.
219+
metrics_client: The metrics client for recording the metrics.
197220
"""
198221
tool_name = tool.get("name", "unknown_tool")
199222
tool_use_id = tool.get("toolUseId", "unknown")
@@ -207,26 +230,39 @@ def add_tool_usage(
207230
tool_trace.raw_name = f"{tool_name} - {tool_use_id}"
208231
tool_trace.add_message(message)
209232

210-
self.tool_metrics.setdefault(tool_name, ToolMetrics(tool)).add_call(tool, duration, success)
211-
233+
self.tool_metrics.setdefault(tool_name, ToolMetrics(tool)).add_call(
234+
tool,
235+
duration,
236+
success,
237+
metrics_client,
238+
attributes={
239+
"tool_name": tool_name,
240+
"tool_use_id": tool_use_id,
241+
},
242+
)
212243
tool_trace.end()
213244

214-
def update_usage(self, usage: Usage) -> None:
245+
def update_usage(self, usage: Usage, metrics_client: "MetricsClient") -> None:
215246
"""Update the accumulated token usage with new usage data.
216247
217248
Args:
218249
usage: The usage data to add to the accumulated totals.
250+
metrics_client: The metrics client for recording the metrics.
219251
"""
252+
metrics_client.event_loop_input_tokens.record(usage["inputTokens"])
253+
metrics_client.event_loop_output_tokens.record(usage["outputTokens"])
220254
self.accumulated_usage["inputTokens"] += usage["inputTokens"]
221255
self.accumulated_usage["outputTokens"] += usage["outputTokens"]
222256
self.accumulated_usage["totalTokens"] += usage["totalTokens"]
223257

224-
def update_metrics(self, metrics: Metrics) -> None:
258+
def update_metrics(self, metrics: Metrics, metrics_client: "MetricsClient") -> None:
225259
"""Update the accumulated performance metrics with new metrics data.
226260
227261
Args:
228262
metrics: The metrics data to add to the accumulated totals.
263+
metrics_client: The metrics client for recording the metrics.
229264
"""
265+
metrics_client.event_loop_latency.record(metrics["latencyMs"])
230266
self.accumulated_metrics["latencyMs"] += metrics["latencyMs"]
231267

232268
def get_summary(self) -> Dict[str, Any]:
@@ -370,7 +406,18 @@ class MetricsClient:
370406

371407
_instance: Optional["MetricsClient"] = None
372408
meter: Meter
373-
strands_agent_invocation_count: Counter
409+
event_loop_cycle_count: Counter
410+
event_loop_start_cycle: Counter
411+
event_loop_end_cycle: Counter
412+
event_loop_cycle_duration: Histogram
413+
event_loop_latency: Histogram
414+
event_loop_input_tokens: Histogram
415+
event_loop_output_tokens: Histogram
416+
417+
tool_call_count: Counter
418+
tool_success_count: Counter
419+
tool_error_count: Counter
420+
tool_duration: Histogram
374421

375422
def __new__(cls) -> "MetricsClient":
376423
"""Create or return the singleton instance of MetricsClient.
@@ -398,6 +445,24 @@ def __init__(self) -> None:
398445

399446
def create_instruments(self) -> None:
400447
"""Create and initialize all OpenTelemetry metric instruments."""
401-
self.strands_agent_invocation_count = self.meter.create_counter(
402-
name=constants.STRANDS_AGENT_INVOCATION_COUNT, unit="Count"
448+
self.event_loop_cycle_count = self.meter.create_counter(
449+
name=constants.STRANDS_EVENT_LOOP_CYCLE_COUNT, unit="Count"
450+
)
451+
self.event_loop_start_cycle = self.meter.create_counter(
452+
name=constants.STRANDS_EVENT_LOOP_START_CYCLE, unit="Count"
453+
)
454+
self.event_loop_end_cycle = self.meter.create_counter(name=constants.STRANDS_EVENT_LOOP_END_CYCLE, unit="Count")
455+
self.event_loop_cycle_duration = self.meter.create_histogram(
456+
name=constants.STRANDS_EVENT_LOOP_CYCLE_DURATION, unit="s"
457+
)
458+
self.event_loop_latency = self.meter.create_histogram(name=constants.STRANDS_EVENT_LOOP_LATENCY, unit="ms")
459+
self.tool_call_count = self.meter.create_counter(name=constants.STRANDS_TOOL_CALL_COUNT, unit="Count")
460+
self.tool_success_count = self.meter.create_counter(name=constants.STRANDS_TOOL_SUCCESS_COUNT, unit="Count")
461+
self.tool_error_count = self.meter.create_counter(name=constants.STRANDS_TOOL_ERROR_COUNT, unit="Count")
462+
self.tool_duration = self.meter.create_histogram(name=constants.STRANDS_TOOL_DURATION, unit="s")
463+
self.event_loop_input_tokens = self.meter.create_histogram(
464+
name=constants.STRANDS_EVENT_LOOP_INPUT_TOKENS, unit="token"
465+
)
466+
self.event_loop_output_tokens = self.meter.create_histogram(
467+
name=constants.STRANDS_EVENT_LOOP_OUTPUT_TOKENS, unit="token"
403468
)
src/strands/telemetry/metrics_constants.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1-
"""Metrics that are emitted in Strands-Agent."""
1+
"""Metrics that are emitted in Strands-Agents."""
22

3-
STRANDS_AGENT_INVOCATION_COUNT = "strands.agent.invocation_count"
3+
STRANDS_EVENT_LOOP_CYCLE_COUNT = "strands.event_loop.cycle_count"
4+
STRANDS_EVENT_LOOP_START_CYCLE = "strands.event_loop.start_cycle"
5+
STRANDS_EVENT_LOOP_END_CYCLE = "strands.event_loop.end_cycle"
6+
STRANDS_TOOL_CALL_COUNT = "strands.tool.call_count"
7+
STRANDS_TOOL_SUCCESS_COUNT = "strands.tool.success_count"
8+
STRANDS_TOOL_ERROR_COUNT = "strands.tool.error_count"
9+
10+
# Histograms
11+
STRANDS_EVENT_LOOP_LATENCY = "strands.event_loop.latency"
12+
STRANDS_TOOL_DURATION = "strands.tool.duration"
13+
STRANDS_EVENT_LOOP_CYCLE_DURATION = "strands.event_loop.cycle_duration"
14+
STRANDS_EVENT_LOOP_INPUT_TOKENS = "strands.event_loop.input.tokens"
15+
STRANDS_EVENT_LOOP_OUTPUT_TOKENS = "strands.event_loop.output.tokens"

src/strands/tools/executor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from opentelemetry import trace
99

10+
from ..telemetry import MetricsClient
1011
from ..telemetry.metrics import EventLoopMetrics, Trace
1112
from ..telemetry.tracer import get_tracer
1213
from ..tools.tools import InvalidToolUseNameException, validate_tool_use
@@ -51,6 +52,7 @@ def _handle_tool_execution(tool: ToolUse) -> Tuple[bool, Optional[ToolResult]]:
5152

5253
tracer = get_tracer()
5354
tool_call_span = tracer.start_tool_call_span(tool, parent_span)
55+
metrics_client = MetricsClient()
5456

5557
try:
5658
if "toolUseId" not in tool or tool["toolUseId"] not in invalid_tool_use_ids:
@@ -64,7 +66,9 @@ def _handle_tool_execution(tool: ToolUse) -> Tuple[bool, Optional[ToolResult]]:
6466

6567
tool_duration = time.time() - tool_start_time
6668
message = Message(role="user", content=[{"toolResult": result}])
67-
event_loop_metrics.add_tool_usage(tool, tool_duration, tool_trace, tool_success, message)
69+
event_loop_metrics.add_tool_usage(
70+
tool, tool_duration, tool_trace, tool_success, message, metrics_client
71+
)
6872
cycle_trace.add_child(tool_trace)
6973

7074
if tool_call_span:

0 commit comments

Comments
 (0)