1
1
# ---------------------------------------------------------
2
2
# Copyright (c) Microsoft Corporation. All rights reserved.
3
3
# ---------------------------------------------------------
4
- import json
4
+ from enum import Enum
5
5
from collections import Counter
6
+ import json
6
7
from typing import Dict , List , Union , Any , Tuple
7
8
from typing_extensions import overload , override
8
9
9
- from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
10
10
from azure .ai .evaluation ._constants import EVALUATION_PASS_FAIL_MAPPING
11
+ from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
12
+ from azure .ai .evaluation ._exceptions import (
13
+ ErrorCategory ,
14
+ ErrorTarget ,
15
+ EvaluationException ,
16
+ )
17
+
18
+
19
class TaskNavigationEfficiencyMatchingMode(str, Enum):
    """Matching techniques for judging an agent's tool-call sequence.

    Each member selects one strategy for comparing the agent's sequence of
    tool calls against a ground-truth path when evaluating navigation
    efficiency; exactly one mode is applied per evaluation.
    """

    # Strictest criterion: the agent's tool calls must be identical to the
    # expected sequence in both order and content — no extra steps, no
    # missing steps, correct order.
    EXACT_MATCH = "exact_match"

    # Moderate criterion: every ground-truth step must appear in the agent's
    # sequence in the correct relative order; additional interspersed steps
    # are tolerated.
    IN_ORDER_MATCH = "in_order_match"

    # Most lenient criterion: every ground-truth step must appear in the
    # agent's sequence with sufficient frequency, regardless of order; extra
    # steps are tolerated.
    ANY_ORDER_MATCH = "any_order_match"
11
50
12
51
13
- class PathEfficiencyEvaluator (EvaluatorBase ):
52
+ class TaskNavigationEfficiencyEvaluator (EvaluatorBase ):
14
53
"""
15
54
Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
16
55
17
- The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
18
- between the agent's tool usage trajectory and the ground truth expected steps. It also provides
19
- three binary match metrics: exact match, in-order match (allows extra steps) , and any-order match (allows extra steps and ignores order) .
56
+ The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps.
57
+ It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
58
+ It also returns precision, recall , and F1 scores in properties bag .
20
59
21
- :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
22
- :type precision_threshold: float
23
- :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
24
- :type recall_threshold: float
25
- :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
26
- :type f1_score_threshold: float
60
+ :param matching_mode: The matching mode to use. Default is "exact_match".
61
+ :type matching_mode: enum[str, TaskNavigationEfficiencyMatchingMode]
27
62
28
63
.. admonition:: Example:
29
64
30
65
.. code-block:: python
31
66
32
- from azure.ai.evaluation import PathEfficiencyEvaluator
67
+ from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator
33
68
34
- path_efficiency_eval = PathEfficiencyEvaluator(
35
- precision_threshold=0.7,
36
- recall_threshold=0.8,
37
- f1_score_threshold=0.75
69
+ task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator(
70
+ matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
38
71
)
39
72
40
73
# Example 1: Using simple tool names list
@@ -64,36 +97,36 @@ class PathEfficiencyEvaluator(EvaluatorBase):
64
97
)
65
98
"""
66
99
67
- _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5
68
-
69
- id = "azureai://built-in/evaluators/path_efficiency"
100
+ id = "azureai://built-in/evaluators/task_navigation_efficiency"
70
101
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
71
102
103
+ matching_mode : TaskNavigationEfficiencyMatchingMode
104
+ """The matching mode to use."""
105
+
72
106
@override
73
107
def __init__ (
74
108
self ,
75
109
* ,
76
- precision_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
77
- recall_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
78
- f1_score_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
110
+ matching_mode : Union [str , TaskNavigationEfficiencyMatchingMode ] = TaskNavigationEfficiencyMatchingMode .EXACT_MATCH ,
79
111
):
80
- self ._higher_is_better = True
112
+ # Type checking for metric parameter
113
+ if isinstance (matching_mode , str ):
114
+ try :
115
+ self .matching_mode = TaskNavigationEfficiencyMatchingMode (matching_mode )
116
+ except ValueError :
117
+ raise ValueError (f"matching_mode must be one of { [m .value for m in TaskNavigationEfficiencyMatchingMode ]} , got '{ matching_mode } '" )
118
+ elif isinstance (matching_mode , TaskNavigationEfficiencyMatchingMode ):
119
+ self .matching_mode = matching_mode
120
+ else :
121
+ raise EvaluationException (
122
+ f"matching_mode must be a string with one of { [m .value for m in TaskNavigationEfficiencyMatchingMode ]} or TaskNavigationEfficiencyMatchingMode enum, got { type (matching_mode )} " ,
123
+ internal_message = str (self .matching_mode ),
124
+ target = ErrorTarget .TASK_NAVIGATION_EFFICIENCY_EVALUATOR ,
125
+ category = ErrorCategory .INVALID_VALUE ,
126
+ )
127
+
81
128
super ().__init__ ()
82
129
83
- # Type checking for threshold parameters
84
- for name , value in [
85
- ("precision_threshold" , precision_threshold ),
86
- ("recall_threshold" , recall_threshold ),
87
- ("f1_score_threshold" , f1_score_threshold ),
88
- ]:
89
- if not isinstance (value , float ):
90
- raise TypeError (f"{ name } must be a float, got { type (value )} " )
91
-
92
- self ._threshold = {
93
- "path_efficiency_precision" : precision_threshold ,
94
- "path_efficiency_recall" : recall_threshold ,
95
- "path_efficiency_f1" : f1_score_threshold ,
96
- }
97
130
98
131
def _prepare_steps_for_comparison (
99
132
self ,
@@ -192,14 +225,20 @@ def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List
192
225
# Check if agent has at least as many occurrences of each ground truth step
193
226
return all (agent_counts [step ] >= ground_truth_counts [step ] for step in ground_truth_counts )
194
227
228
    # Dispatch table mapping each matching mode to its comparison function.
    # Because this dict is built in the class body, the values are plain
    # (unbound) functions, so call sites must pass the instance explicitly:
    #   self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[mode](self, agent_steps, ground_truth_steps)
    _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
        TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
        TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
        TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
    }
233
+
195
234
@override
196
- async def _do_eval (self , eval_input : Dict ) -> Dict [str , Union [float , str ]]:
235
+ async def _do_eval (self , eval_input : Dict ) -> Dict [str , Union [float , str , Dict [ str , float ] ]]:
197
236
"""Produce a path efficiency evaluation result.
198
237
199
238
:param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
200
239
:type eval_input: Dict
201
240
:return: The evaluation result.
202
- :rtype: Dict[str, Union[float, str]]
241
+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
203
242
"""
204
243
response = eval_input ["response" ]
205
244
ground_truth = eval_input ["ground_truth" ]
@@ -244,12 +283,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
244
283
ground_truth_names = [name .strip () for name in tool_names_list ]
245
284
ground_truth_params_dict = params_dict
246
285
use_parameter_matching = True
247
-
248
286
elif isinstance (ground_truth , list ) and all (isinstance (step , str ) for step in ground_truth ):
249
287
# List format: just tool names
250
288
ground_truth_names = [step .strip () for step in ground_truth ]
251
289
use_parameter_matching = False
252
-
253
290
else :
254
291
raise TypeError (
255
292
"ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
@@ -267,42 +304,45 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
267
304
)
268
305
269
306
# Calculate precision, recall, and F1 scores
270
- metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
271
-
272
- # Calculate binary match metrics
273
- exact_match = self ._calculate_exact_match (agent_steps , ground_truth_steps )
274
- in_order_match = self ._calculate_in_order_match (agent_steps , ground_truth_steps )
275
- any_order_match = self ._calculate_any_order_match (agent_steps , ground_truth_steps )
307
+ additional_properties_metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
276
308
309
+
277
310
# Convert metrics to floats, using nan for None or non-convertible values
278
- path_efficiency_precision = (
279
- float (metrics ["precision_score" ]) if metrics ["precision_score" ] is not None else float ("nan" )
280
- )
281
- path_efficiency_recall = float (metrics ["recall_score" ]) if metrics ["recall_score" ] is not None else float ("nan" )
282
- path_efficiency_f1_score = float (metrics ["f1_score" ]) if metrics ["f1_score" ] is not None else float ("nan" )
311
+ for metric , score in additional_properties_metrics .items ():
312
+ additional_properties_metrics [metric ] = (
313
+ float (score ) if score is not None else float ("nan" )
314
+ )
283
315
284
- return {
285
- "path_efficiency_precision_score" : path_efficiency_precision ,
286
- "path_efficiency_recall_score" : path_efficiency_recall ,
287
- "path_efficiency_f1_score" : path_efficiency_f1_score ,
288
- "path_efficiency_exact_match_result" : EVALUATION_PASS_FAIL_MAPPING [exact_match ],
289
- "path_efficiency_in_order_match_result" : EVALUATION_PASS_FAIL_MAPPING [in_order_match ],
290
- "path_efficiency_any_order_match_result" : EVALUATION_PASS_FAIL_MAPPING [any_order_match ],
291
- }
316
+
317
+ if self .matching_mode in self ._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS :
318
+ # Calculate binary match metrics
319
+ match_result = self ._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS [self .matching_mode ](self , agent_steps , ground_truth_steps )
320
+
321
+ return {
322
+ "task_navigation_efficiency_result" : EVALUATION_PASS_FAIL_MAPPING [match_result ],
323
+ "properties" : additional_properties_metrics
324
+ }
325
+ else :
326
+ raise EvaluationException (
327
+ f"Unsupported matching_mode '{ self .matching_mode } '" ,
328
+ internal_message = str (self .matching_mode ),
329
+ target = ErrorTarget .TASK_NAVIGATION_EFFICIENCY_EVALUATOR ,
330
+ category = ErrorCategory .INVALID_VALUE ,
331
+ )
292
332
293
333
    @overload
    def __call__(  # type: ignore
        self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
    ) -> Dict[str, Union[float, str, Dict[str, float]]]:
        """
        Evaluate the task navigation efficiency of an agent's action sequence.

        Overload for the simple ground-truth form: a list of expected tool
        names only (no per-tool parameter matching).

        :keyword response: The agent's response containing tool calls.
        :paramtype response: Union[str, List[Dict[str, Any]]]
        :keyword ground_truth: List of expected tool/action steps.
        :paramtype ground_truth: List[str]
        :return: The task navigation efficiency scores and results.
        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
        """
307
347
308
348
    @overload
    def __call__(  # type: ignore
        self,
        *,
        response: Union[str, List[Dict[str, Any]]],
        ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
    ) -> Dict[str, Union[float, str, Dict[str, float]]]:
        """
        Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.

        Overload for the parameterized ground-truth form: tool names plus a
        per-tool parameter mapping that must match exactly.

        :keyword response: The agent's response containing tool calls.
        :paramtype response: Union[str, List[Dict[str, Any]]]
        :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
        :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
        :return: The task navigation efficiency scores and results.
        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
        """
325
365
326
366
    @override
    def __call__(
        self,
        *args,
        **kwargs,
    ):
        """
        Evaluate task navigation efficiency.

        Concrete implementation behind the overloads above; delegates all
        argument handling to the EvaluatorBase machinery.

        :keyword response: The agent's response containing tool calls.
        :paramtype response: Union[str, List[Dict[str, Any]]]
        :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
        :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
        :return: The task navigation efficiency scores and results.
        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
        """
        return super().__call__(*args, **kwargs)
0 commit comments