This file was deleted.

@@ -0,0 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._task_navigation_efficiency import TaskNavigationEfficiencyEvaluator, TaskNavigationEfficiencyMatchingMode

__all__ = ["TaskNavigationEfficiencyEvaluator", "TaskNavigationEfficiencyMatchingMode"]
@@ -1,40 +1,73 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json
from enum import Enum
from collections import Counter
import json
from typing import Dict, List, Union, Any, Tuple
from typing_extensions import overload, override

from azure.ai.evaluation._evaluators._common import EvaluatorBase
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._evaluators._common import EvaluatorBase
from azure.ai.evaluation._exceptions import (
ErrorCategory,
ErrorTarget,
EvaluationException,
)


class TaskNavigationEfficiencyMatchingMode(str, Enum):
"""
Enumeration of task navigation efficiency matching modes.

This enum allows you to specify which single matching technique should be used when evaluating
the efficiency of an agent's tool call sequence against a ground truth path.
"""

EXACT_MATCH = "exact_match"
"""
Binary metric indicating whether the agent's tool calls exactly match the ground truth.

Returns True only if the agent's tool call sequence is identical to the expected sequence
in both order and content (no extra steps, no missing steps, correct order).
"""

IN_ORDER_MATCH = "in_order_match"
"""
Binary metric allowing extra steps but requiring correct order of required tool calls.

Returns True if all ground truth steps appear in the agent's sequence in the correct
order, even if there are additional steps interspersed.
"""

ANY_ORDER_MATCH = "any_order_match"
"""
Binary metric allowing both extra steps and different ordering.

Returns True if all ground truth steps appear in the agent's sequence with sufficient
frequency, regardless of order. Most lenient matching criterion.
"""


class PathEfficiencyEvaluator(EvaluatorBase):
class TaskNavigationEfficiencyEvaluator(EvaluatorBase):
"""
Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.

The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
between the agent's tool usage trajectory and the ground truth expected steps. It also provides
three binary match metrics: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps.
It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
It also returns precision, recall, and F1 scores in the properties bag.

:param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
:type precision_threshold: float
:param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
:type recall_threshold: float
:param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
:type f1_score_threshold: float
:param matching_mode: The matching mode to use. Default is "exact_match".
:type matching_mode: Union[str, TaskNavigationEfficiencyMatchingMode]

.. admonition:: Example:

.. code-block:: python

from azure.ai.evaluation import PathEfficiencyEvaluator
from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator

path_efficiency_eval = PathEfficiencyEvaluator(
precision_threshold=0.7,
recall_threshold=0.8,
f1_score_threshold=0.75
task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator(
matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
)

# Example 1: Using simple tool names list
@@ -64,36 +97,39 @@ class PathEfficiencyEvaluator(EvaluatorBase):
)
"""

_DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5

id = "azureai://built-in/evaluators/path_efficiency"
id = "azureai://built-in/evaluators/task_navigation_efficiency"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

matching_mode: TaskNavigationEfficiencyMatchingMode
"""The matching mode to use."""

@override
def __init__(
self,
*,
precision_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
recall_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
f1_score_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
matching_mode: Union[
str, TaskNavigationEfficiencyMatchingMode
] = TaskNavigationEfficiencyMatchingMode.EXACT_MATCH,
):
self._higher_is_better = True
super().__init__()
# Type checking for the matching_mode parameter
if isinstance(matching_mode, str):
try:
self.matching_mode = TaskNavigationEfficiencyMatchingMode(matching_mode)
except ValueError:
raise ValueError(
f"matching_mode must be one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]}, got '{matching_mode}'"
)
elif isinstance(matching_mode, TaskNavigationEfficiencyMatchingMode):
self.matching_mode = matching_mode
else:
raise EvaluationException(
f"matching_mode must be a string with one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]} or TaskNavigationEfficiencyMatchingMode enum, got {type(matching_mode)}",
internal_message=str(matching_mode),
target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
category=ErrorCategory.INVALID_VALUE,
)

# Type checking for threshold parameters
for name, value in [
("precision_threshold", precision_threshold),
("recall_threshold", recall_threshold),
("f1_score_threshold", f1_score_threshold),
]:
if not isinstance(value, float):
raise TypeError(f"{name} must be a float, got {type(value)}")

self._threshold = {
"path_efficiency_precision": precision_threshold,
"path_efficiency_recall": recall_threshold,
"path_efficiency_f1": f1_score_threshold,
}
super().__init__()

def _prepare_steps_for_comparison(
self,
@@ -192,14 +228,20 @@ def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List
# Check if agent has at least as many occurrences of each ground truth step
return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
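# Worked example of the multiset check above (hypothetical tool names): Counter(["a", "b", "a"])
# yields {"a": 2, "b": 1}; a ground truth of ["a", "a"] requires {"a": 2}, so the check passes
# even though "b" is an extra, unrequested step.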

_TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
}
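# Note: the mapping above stores plain function objects from the class body, so _do_eval
# passes `self` explicitly when it invokes the selected matching function.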

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]:
"""Produce a path efficiency evaluation result.

:param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
:type eval_input: Dict
:return: The evaluation result.
:rtype: Dict[str, Union[float, str]]
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
"""
response = eval_input["response"]
ground_truth = eval_input["ground_truth"]
@@ -244,12 +286,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
ground_truth_names = [name.strip() for name in tool_names_list]
ground_truth_params_dict = params_dict
use_parameter_matching = True

elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
# List format: just tool names
ground_truth_names = [step.strip() for step in ground_truth]
use_parameter_matching = False

else:
raise TypeError(
"ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
@@ -267,42 +307,43 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
)

# Calculate precision, recall, and F1 scores
metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)

# Calculate binary match metrics
exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps)
in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps)
any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps)
additional_properties_metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)

# Convert metrics to floats, using nan for None or non-convertible values
path_efficiency_precision = (
float(metrics["precision_score"]) if metrics["precision_score"] is not None else float("nan")
)
path_efficiency_recall = float(metrics["recall_score"]) if metrics["recall_score"] is not None else float("nan")
path_efficiency_f1_score = float(metrics["f1_score"]) if metrics["f1_score"] is not None else float("nan")
for metric, score in additional_properties_metrics.items():
additional_properties_metrics[metric] = float(score) if score is not None else float("nan")

return {
"path_efficiency_precision_score": path_efficiency_precision,
"path_efficiency_recall_score": path_efficiency_recall,
"path_efficiency_f1_score": path_efficiency_f1_score,
"path_efficiency_exact_match_result": EVALUATION_PASS_FAIL_MAPPING[exact_match],
"path_efficiency_in_order_match_result": EVALUATION_PASS_FAIL_MAPPING[in_order_match],
"path_efficiency_any_order_match_result": EVALUATION_PASS_FAIL_MAPPING[any_order_match],
}
if self.matching_mode in self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS:
# Calculate binary match metrics
match_result = self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[self.matching_mode](
self, agent_steps, ground_truth_steps
)

return {
"task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result],
"properties": additional_properties_metrics,
}
else:
raise EvaluationException(
f"Unsupported matching_mode '{self.matching_mode}'",
internal_message=str(self.matching_mode),
target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
category=ErrorCategory.INVALID_VALUE,
)

@overload
def __call__( # type: ignore
self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
) -> Dict[str, Union[float, str]]:
) -> Dict[str, Union[float, str, Dict[str, float]]]:
"""
Evaluate the path efficiency of an agent's action sequence.
Evaluate the task navigation efficiency of an agent's action sequence.

:keyword response: The agent's response containing tool calls.
:paramtype response: Union[str, List[Dict[str, Any]]]
:keyword ground_truth: List of expected tool/action steps.
:paramtype ground_truth: List[str]
:return: The path efficiency scores and results.
:rtype: Dict[str, Union[float, str]]
:return: The task navigation efficiency scores and results.
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
"""

@overload
@@ -311,16 +352,16 @@ def __call__( # type: ignore
*,
response: Union[str, List[Dict[str, Any]]],
ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
) -> Dict[str, Union[float, str]]:
) -> Dict[str, Union[float, str, Dict[str, float]]]:
"""
Evaluate the path efficiency of an agent's action sequence with tool parameters.
Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.

:keyword response: The agent's response containing tool calls.
:paramtype response: Union[str, List[Dict[str, Any]]]
:keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
:paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
:return: The path efficiency scores and results.
:rtype: Dict[str, Union[float, str]]
:return: The task navigation efficiency scores and results.
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
"""

@override
@@ -330,13 +371,13 @@ def __call__(
**kwargs,
):
"""
Evaluate path efficiency.
Evaluate task navigation efficiency.

:keyword response: The agent's response containing tool calls.
:paramtype response: Union[str, List[Dict[str, Any]]]
:keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
:paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
:return: The path efficiency scores and results.
:rtype: Dict[str, Union[float, str]]
:return: The task navigation efficiency scores and results.
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
"""
return super().__call__(*args, **kwargs)
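
# Minimal usage sketch (illustrative only; the tool names are hypothetical and `agent_messages`
# is assumed to be a response payload containing the agent's tool calls):
#
#     evaluator = TaskNavigationEfficiencyEvaluator(
#         matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH
#     )
#     result = evaluator(
#         response=agent_messages,
#         ground_truth=["search_flights", "book_flight", "send_confirmation"],
#     )
#     # result["task_navigation_efficiency_result"] holds the pass/fail outcome;
#     # result["properties"] holds the precision/recall/F1 scores.
#
#     # When exact parameter matching is needed, ground_truth is a (names, params) tuple, e.g.:
#     # ground_truth=(["search_flights"], {"search_flights": {"destination": "SEA"}})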
@@ -78,6 +78,7 @@ class ErrorTarget(Enum):
ECI_EVALUATOR = "ECIEvaluator"
F1_EVALUATOR = "F1Evaluator"
GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
TASK_NAVIGATION_EFFICIENCY_EVALUATOR = "TaskNavigationEfficiencyEvaluator"
PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
INTENT_RESOLUTION_EVALUATOR = "IntentResolutionEvaluator"
RELEVANCE_EVALUATOR = "RelevanceEvaluator"