Commit 9417e59

Rename Path Efficiency to Task Navigation Efficiency and Return Only One Metric

1 parent 05f1eae commit 9417e59

File tree

7 files changed: +308 −524 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._path_efficiency import PathEfficiencyEvaluator
+from .task_navigation_efficiency import TaskNavigationEfficiencyEvaluator, TaskNavigationEfficiencyMatchingMode
 
-__all__ = ["PathEfficiencyEvaluator"]
+__all__ = ["TaskNavigationEfficiencyEvaluator", "TaskNavigationEfficiencyMatchingMode"]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py renamed to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/task_navigation_efficiency/task_navigation_efficiency.py

Lines changed: 112 additions & 72 deletions
@@ -1,40 +1,73 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import json
+from enum import Enum
 from collections import Counter
+import json
 from typing import Dict, List, Union, Any, Tuple
 from typing_extensions import overload, override
 
-from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._exceptions import (
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
+
+
+class TaskNavigationEfficiencyMatchingMode(str, Enum):
+    """
+    Enumeration of task navigation efficiency matching modes.
+
+    This enum allows you to specify which single matching technique should be used when evaluating
+    the efficiency of an agent's tool call sequence against a ground truth path.
+    """
+
+    EXACT_MATCH = "exact_match"
+    """
+    Binary metric indicating whether the agent's tool calls exactly match the ground truth.
+
+    Returns True only if the agent's tool call sequence is identical to the expected sequence
+    in both order and content (no extra steps, no missing steps, correct order).
+    """
+
+    IN_ORDER_MATCH = "in_order_match"
+    """
+    Binary metric allowing extra steps but requiring correct order of required tool calls.
+
+    Returns True if all ground truth steps appear in the agent's sequence in the correct
+    order, even if additional steps are interspersed.
+    """
+
+    ANY_ORDER_MATCH = "any_order_match"
+    """
+    Binary metric allowing both extra steps and different ordering.
+
+    Returns True if all ground truth steps appear in the agent's sequence with sufficient
+    frequency, regardless of order. This is the most lenient matching criterion.
+    """
 
 
-class PathEfficiencyEvaluator(EvaluatorBase):
+class TaskNavigationEfficiencyEvaluator(EvaluatorBase):
     """
     Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
 
-    The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
-    between the agent's tool usage trajectory and the ground truth expected steps. It also provides
-    three binary match metrics: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
+    The Task Navigation Efficiency Evaluator returns a binary matching result between the agent's tool usage trajectory and the ground truth expected steps.
+    It supports three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
+    It also returns precision, recall, and F1 scores in the properties bag.
 
-    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
-    :type precision_threshold: float
-    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
-    :type recall_threshold: float
-    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
-    :type f1_score_threshold: float
+    :param matching_mode: The matching mode to use. Default is "exact_match".
+    :type matching_mode: Union[str, TaskNavigationEfficiencyMatchingMode]
 
    .. admonition:: Example:
 
        .. code-block:: python
 
-            from azure.ai.evaluation import PathEfficiencyEvaluator
+            from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator
 
-            path_efficiency_eval = PathEfficiencyEvaluator(
-                precision_threshold=0.7,
-                recall_threshold=0.8,
-                f1_score_threshold=0.75
+            task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator(
+                matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
             )
 
             # Example 1: Using simple tool names list
@@ -64,36 +97,36 @@ class PathEfficiencyEvaluator(EvaluatorBase):
         )
     """
 
-    _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5
-
-    id = "azureai://built-in/evaluators/path_efficiency"
+    id = "azureai://built-in/evaluators/task_navigation_efficiency"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
+    matching_mode: TaskNavigationEfficiencyMatchingMode
+    """The matching mode to use."""
+
     @override
     def __init__(
         self,
         *,
-        precision_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
-        recall_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
-        f1_score_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
+        matching_mode: Union[str, TaskNavigationEfficiencyMatchingMode] = TaskNavigationEfficiencyMatchingMode.EXACT_MATCH,
     ):
-        self._higher_is_better = True
+        # Validate the matching_mode parameter (string or enum member)
+        if isinstance(matching_mode, str):
+            try:
+                self.matching_mode = TaskNavigationEfficiencyMatchingMode(matching_mode)
+            except ValueError:
+                raise ValueError(f"matching_mode must be one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]}, got '{matching_mode}'")
+        elif isinstance(matching_mode, TaskNavigationEfficiencyMatchingMode):
+            self.matching_mode = matching_mode
+        else:
+            raise EvaluationException(
+                f"matching_mode must be a string with one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]} or a TaskNavigationEfficiencyMatchingMode enum member, got {type(matching_mode)}",
+                internal_message=str(matching_mode),
+                target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                category=ErrorCategory.INVALID_VALUE,
+            )
+
         super().__init__()
 
-        # Type checking for threshold parameters
-        for name, value in [
-            ("precision_threshold", precision_threshold),
-            ("recall_threshold", recall_threshold),
-            ("f1_score_threshold", f1_score_threshold),
-        ]:
-            if not isinstance(value, float):
-                raise TypeError(f"{name} must be a float, got {type(value)}")
-
-        self._threshold = {
-            "path_efficiency_precision": precision_threshold,
-            "path_efficiency_recall": recall_threshold,
-            "path_efficiency_f1": f1_score_threshold,
-        }
 
     def _prepare_steps_for_comparison(
         self,
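
As an aside, the string-to-enum coercion in the constructor above is plain Python Enum behavior. A self-contained illustration, using a stand-in Mode class rather than SDK code:

from enum import Enum


class Mode(str, Enum):  # stand-in for TaskNavigationEfficiencyMatchingMode
    EXACT_MATCH = "exact_match"
    IN_ORDER_MATCH = "in_order_match"
    ANY_ORDER_MATCH = "any_order_match"


# Value lookup turns a raw string into the corresponding member...
assert Mode("in_order_match") is Mode.IN_ORDER_MATCH
# ...and unknown values raise ValueError, which __init__ above catches and re-raises.
try:
    Mode("fuzzy_match")
except ValueError as err:
    print(err)  # 'fuzzy_match' is not a valid Mode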
@@ -192,14 +225,20 @@ def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List
         # Check if agent has at least as many occurrences of each ground truth step
         return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
 
+    _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
+        TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
+        TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
+        TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
+    }
+
     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]:
         """Produce a path efficiency evaluation result.
 
         :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
         :type eval_input: Dict
         :return: The evaluation result.
-        :rtype: Dict[str, Union[float, str]]
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
         response = eval_input["response"]
         ground_truth = eval_input["ground_truth"]
@@ -244,12 +283,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
             ground_truth_names = [name.strip() for name in tool_names_list]
             ground_truth_params_dict = params_dict
             use_parameter_matching = True
-
         elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
             # List format: just tool names
             ground_truth_names = [step.strip() for step in ground_truth]
             use_parameter_matching = False
-
         else:
             raise TypeError(
                 "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
@@ -267,42 +304,45 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         )
 
         # Calculate precision, recall, and F1 scores
-        metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
-
-        # Calculate binary match metrics
-        exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps)
-        in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps)
-        any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps)
+        additional_properties_metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
 
+
         # Convert metrics to floats, using nan for None or non-convertible values
-        path_efficiency_precision = (
-            float(metrics["precision_score"]) if metrics["precision_score"] is not None else float("nan")
-        )
-        path_efficiency_recall = float(metrics["recall_score"]) if metrics["recall_score"] is not None else float("nan")
-        path_efficiency_f1_score = float(metrics["f1_score"]) if metrics["f1_score"] is not None else float("nan")
+        for metric, score in additional_properties_metrics.items():
+            additional_properties_metrics[metric] = (
+                float(score) if score is not None else float("nan")
+            )
 
-        return {
-            "path_efficiency_precision_score": path_efficiency_precision,
-            "path_efficiency_recall_score": path_efficiency_recall,
-            "path_efficiency_f1_score": path_efficiency_f1_score,
-            "path_efficiency_exact_match_result": EVALUATION_PASS_FAIL_MAPPING[exact_match],
-            "path_efficiency_in_order_match_result": EVALUATION_PASS_FAIL_MAPPING[in_order_match],
-            "path_efficiency_any_order_match_result": EVALUATION_PASS_FAIL_MAPPING[any_order_match],
-        }
+
+        if self.matching_mode in self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS:
+            # Calculate the binary match metric for the selected matching mode
+            match_result = self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[self.matching_mode](self, agent_steps, ground_truth_steps)
+
+            return {
+                "task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result],
+                "properties": additional_properties_metrics,
+            }
+        else:
+            raise EvaluationException(
+                f"Unsupported matching_mode '{self.matching_mode}'",
+                internal_message=str(self.matching_mode),
+                target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                category=ErrorCategory.INVALID_VALUE,
+            )
 
     @overload
     def __call__(  # type: ignore
         self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
-    ) -> Dict[str, Union[float, str]]:
+    ) -> Dict[str, Union[float, str, Dict[str, float]]]:
         """
-        Evaluate the path efficiency of an agent's action sequence.
+        Evaluate the task navigation efficiency of an agent's action sequence.
 
         :keyword response: The agent's response containing tool calls.
         :paramtype response: Union[str, List[Dict[str, Any]]]
         :keyword ground_truth: List of expected tool/action steps.
         :paramtype ground_truth: List[str]
-        :return: The path efficiency scores and results.
-        :rtype: Dict[str, Union[float, str]]
+        :return: The task navigation efficiency scores and results.
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
 
     @overload
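
One subtlety in the dispatch above: functions referenced from a dictionary defined in the class body are plain function objects, not bound methods, which is why the call site passes self explicitly. A minimal demonstration of the pattern:

class Demo:
    def _exact(self, a, b):
        return a == b

    # Defined in the class body, so it stores the raw function object
    _DISPATCH = {"exact_match": _exact}

    def run(self, mode, a, b):
        # No bound method here; `self` must be supplied by hand
        return self._DISPATCH[mode](self, a, b)


assert Demo().run("exact_match", [1, 2], [1, 2]) is True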
@@ -311,16 +351,16 @@ def __call__(  # type: ignore
         *,
         response: Union[str, List[Dict[str, Any]]],
         ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
-    ) -> Dict[str, Union[float, str]]:
+    ) -> Dict[str, Union[float, str, Dict[str, float]]]:
         """
-        Evaluate the path efficiency of an agent's action sequence with tool parameters.
+        Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.
 
         :keyword response: The agent's response containing tool calls.
         :paramtype response: Union[str, List[Dict[str, Any]]]
         :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
         :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
-        :return: The path efficiency scores and results.
-        :rtype: Dict[str, Union[float, str]]
+        :return: The task navigation efficiency scores and results.
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
 
     @override
@@ -330,13 +370,13 @@ def __call__(
         **kwargs,
     ):
         """
-        Evaluate path efficiency.
+        Evaluate task navigation efficiency.
 
         :keyword response: The agent's response containing tool calls.
         :paramtype response: Union[str, List[Dict[str, Any]]]
         :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
         :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
-        :return: The path efficiency scores and results.
-        :rtype: Dict[str, Union[float, str]]
+        :return: The task navigation efficiency scores and results.
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
         return super().__call__(*args, **kwargs)
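
To make the three matching modes concrete, here is a standalone sketch of their semantics as described in the enum docstrings. These are illustrative re-implementations, not the SDK's internal _calculate_* methods:

from collections import Counter
from typing import List


def exact_match(agent_steps: List[str], ground_truth_steps: List[str]) -> bool:
    # Identical sequences: same steps, same order, no extras
    return agent_steps == ground_truth_steps


def in_order_match(agent_steps: List[str], ground_truth_steps: List[str]) -> bool:
    # Ground truth must appear as a subsequence of the agent's steps;
    # `in` on an iterator consumes it, so order is enforced
    steps = iter(agent_steps)
    return all(step in steps for step in ground_truth_steps)


def any_order_match(agent_steps: List[str], ground_truth_steps: List[str]) -> bool:
    # Each ground truth step must occur at least as often in the agent's steps
    agent_counts, truth_counts = Counter(agent_steps), Counter(ground_truth_steps)
    return all(agent_counts[s] >= truth_counts[s] for s in truth_counts)


agent = ["search", "lookup", "search", "answer"]
truth = ["search", "answer"]
assert not exact_match(agent, truth)  # extra steps fail exact matching
assert in_order_match(agent, truth)   # required steps occur in order
assert any_order_match(agent, truth)  # counts suffice regardless of order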

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ class ErrorTarget(Enum):
     ECI_EVALUATOR = "ECIEvaluator"
     F1_EVALUATOR = "F1Evaluator"
     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
+    TASK_NAVIGATION_EFFICIENCY_EVALUATOR = "TaskNavigationEfficiencyEvaluator"
     PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
     INTENT_RESOLUTION_EVALUATOR = "IntentResolutionEvaluator"
     RELEVANCE_EVALUATOR = "RelevanceEvaluator"
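
Downstream code now receives a single pass/fail result plus a properties bag. A hypothetical consumer sketch; the property key names are assumptions based on _calculate_precision_recall_f1_scores and are not confirmed by this diff:

# Shape of the new payload, per the _do_eval return statement above.
result = {
    "task_navigation_efficiency_result": "pass",  # EVALUATION_PASS_FAIL_MAPPING[True]
    "properties": {"precision_score": 0.67, "recall_score": 1.0, "f1_score": 0.8},
}

if result["task_navigation_efficiency_result"] == "pass":
    print("Trajectory matched the ground truth under the selected matching mode.")
for name, score in result["properties"].items():
    print(f"{name}: {score:.2f}")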
