diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py deleted file mode 100644 index 13347bc717f2..000000000000 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - -from ._path_efficiency import PathEfficiencyEvaluator - -__all__ = ["PathEfficiencyEvaluator"] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py new file mode 100644 index 000000000000..ad89c4170ce9 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py @@ -0,0 +1,7 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +from ._task_navigation_efficiency import TaskNavigationEfficiencyEvaluator, TaskNavigationEfficiencyMatchingMode + +__all__ = ["TaskNavigationEfficiencyEvaluator", "TaskNavigationEfficiencyMatchingMode"] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py similarity index 68% rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py index 65fb0c3b4eaf..304474944e84 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py @@ -1,40 +1,73 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -import json +from enum import Enum from collections import Counter +import json from typing import Dict, List, Union, Any, Tuple from typing_extensions import overload, override -from azure.ai.evaluation._evaluators._common import EvaluatorBase from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING +from azure.ai.evaluation._evaluators._common import EvaluatorBase +from azure.ai.evaluation._exceptions import ( + ErrorCategory, + ErrorTarget, + EvaluationException, +) + + +class TaskNavigationEfficiencyMatchingMode(str, Enum): + """ + Enumeration of task navigation efficiency matching mode. + + This enum allows you to specify which single matching technique should be used when evaluating + the efficiency of an agent's tool calls sequence against a ground truth path. + """ + + EXACT_MATCH = "exact_match" + """ + Binary metric indicating whether the agent's tool calls exactly match the ground truth. 
+ + Returns True only if the agent's tool calls sequence is identical to the expected sequence + in both order and content (no extra steps, no missing steps, correct order). + """ + + IN_ORDER_MATCH = "in_order_match" + """ + Binary metric allowing extra steps but requiring correct order of required tool calls. + + Returns True if all ground truth steps appear in the agent's sequence in the correct + order, even if there are additional steps interspersed. + """ + + ANY_ORDER_MATCH = "any_order_match" + """ + Binary metric allowing both extra steps and different ordering. + + Returns True if all ground truth steps appear in the agent's sequence with sufficient + frequency, regardless of order. Most lenient matching criterion. + """ -class PathEfficiencyEvaluator(EvaluatorBase): +class TaskNavigationEfficiencyEvaluator(EvaluatorBase): """ Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns. - The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison - between the agent's tool usage trajectory and the ground truth expected steps. It also provides - three binary match metrics: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order). + The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps. + It supports three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order). + It also returns precision, recall, and F1 scores in the properties bag. - :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5. - :type precision_threshold: float - :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5. - :type recall_threshold: float - :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5. - :type f1_score_threshold: float + :param matching_mode: The matching mode to use. Default is "exact_match". + :type matching_mode: Union[str, TaskNavigationEfficiencyMatchingMode] .. admonition:: Example: .. 
code-block:: python - from azure.ai.evaluation import PathEfficiencyEvaluator + from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator - path_efficiency_eval = PathEfficiencyEvaluator( - precision_threshold=0.7, - recall_threshold=0.8, - f1_score_threshold=0.75 + task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator( + matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH ) # Example 1: Using simple tool names list @@ -64,36 +97,39 @@ class PathEfficiencyEvaluator(EvaluatorBase): ) """ - _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5 - - id = "azureai://built-in/evaluators/path_efficiency" + id = "azureai://built-in/evaluators/task_navigation_efficiency" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + matching_mode: TaskNavigationEfficiencyMatchingMode + """The matching mode to use.""" + @override def __init__( self, *, - precision_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD, - recall_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD, - f1_score_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD, + matching_mode: Union[ + str, TaskNavigationEfficiencyMatchingMode + ] = TaskNavigationEfficiencyMatchingMode.EXACT_MATCH, ): - self._higher_is_better = True - super().__init__() + # Type checking for metric parameter + if isinstance(matching_mode, str): + try: + self.matching_mode = TaskNavigationEfficiencyMatchingMode(matching_mode) + except ValueError: + raise ValueError( + f"matching_mode must be one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]}, got '{matching_mode}'" + ) + elif isinstance(matching_mode, TaskNavigationEfficiencyMatchingMode): + self.matching_mode = matching_mode + else: + raise EvaluationException( + f"matching_mode must be a string with one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]} or TaskNavigationEfficiencyMatchingMode enum, got {type(matching_mode)}", + internal_message=str(matching_mode), + target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR, + category=ErrorCategory.INVALID_VALUE, + ) - # Type checking for threshold parameters - for name, value in [ - ("precision_threshold", precision_threshold), - ("recall_threshold", recall_threshold), - ("f1_score_threshold", f1_score_threshold), - ]: - if not isinstance(value, float): - raise TypeError(f"{name} must be a float, got {type(value)}") - - self._threshold = { - "path_efficiency_precision": precision_threshold, - "path_efficiency_recall": recall_threshold, - "path_efficiency_f1": f1_score_threshold, - } + super().__init__() def _prepare_steps_for_comparison( self, @@ -192,14 +228,20 @@ def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List # Check if agent has at least as many occurrences of each ground truth step return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts) + _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = { + TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match, + TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match, + TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match, + } + @override - async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: + async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]: """Produce a path efficiency evaluation result. :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth". 
:type eval_input: Dict :return: The evaluation result. - :rtype: Dict[str, Union[float, str]] + :rtype: Dict[str, Union[float, str, Dict[str, float]]] """ response = eval_input["response"] ground_truth = eval_input["ground_truth"] @@ -244,12 +286,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: ground_truth_names = [name.strip() for name in tool_names_list] ground_truth_params_dict = params_dict use_parameter_matching = True - elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth): # List format: just tool names ground_truth_names = [step.strip() for step in ground_truth] use_parameter_matching = False - else: raise TypeError( "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])" @@ -267,42 +307,43 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: ) # Calculate precision, recall, and F1 scores - metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps) - - # Calculate binary match metrics - exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps) - in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps) - any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps) + additional_properties_metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps) # Convert metrics to floats, using nan for None or non-convertible values - path_efficiency_precision = ( - float(metrics["precision_score"]) if metrics["precision_score"] is not None else float("nan") - ) - path_efficiency_recall = float(metrics["recall_score"]) if metrics["recall_score"] is not None else float("nan") - path_efficiency_f1_score = float(metrics["f1_score"]) if metrics["f1_score"] is not None else float("nan") + for metric, score in additional_properties_metrics.items(): + additional_properties_metrics[metric] = float(score) if score is not None else float("nan") - return { - "path_efficiency_precision_score": path_efficiency_precision, - "path_efficiency_recall_score": path_efficiency_recall, - "path_efficiency_f1_score": path_efficiency_f1_score, - "path_efficiency_exact_match_result": EVALUATION_PASS_FAIL_MAPPING[exact_match], - "path_efficiency_in_order_match_result": EVALUATION_PASS_FAIL_MAPPING[in_order_match], - "path_efficiency_any_order_match_result": EVALUATION_PASS_FAIL_MAPPING[any_order_match], - } + if self.matching_mode in self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS: + # Calculate binary match metrics + match_result = self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[self.matching_mode]( + self, agent_steps, ground_truth_steps + ) + + return { + "task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result], + "properties": additional_properties_metrics, + } + else: + raise EvaluationException( + f"Unsupported matching_mode '{self.matching_mode}'", + internal_message=str(self.matching_mode), + target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR, + category=ErrorCategory.INVALID_VALUE, + ) @overload def __call__( # type: ignore self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str] - ) -> Dict[str, Union[float, str]]: + ) -> Dict[str, Union[float, str, Dict[str, float]]]: """ - Evaluate the path efficiency of an agent's action sequence. + Evaluate the task navigation efficiency of an agent's action sequence. :keyword response: The agent's response containing tool calls. 
:paramtype response: Union[str, List[Dict[str, Any]]] :keyword ground_truth: List of expected tool/action steps. :paramtype ground_truth: List[str] - :return: The path efficiency scores and results. - :rtype: Dict[str, Union[float, str]] + :return: The task navigation efficiency scores and results. + :rtype: Dict[str, Union[float, str, Dict[str, float]]] """ @overload @@ -311,16 +352,16 @@ def __call__( # type: ignore *, response: Union[str, List[Dict[str, Any]]], ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]], - ) -> Dict[str, Union[float, str]]: + ) -> Dict[str, Union[float, str, Dict[str, float]]]: """ - Evaluate the path efficiency of an agent's action sequence with tool parameters. + Evaluate the task navigation efficiency of an agent's action sequence with tool parameters. :keyword response: The agent's response containing tool calls. :paramtype response: Union[str, List[Dict[str, Any]]] :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly. :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]] - :return: The path efficiency scores and results. - :rtype: Dict[str, Union[float, str]] + :return: The task navigation efficiency scores and results. + :rtype: Dict[str, Union[float, str, Dict[str, float]]] """ @override @@ -330,13 +371,13 @@ def __call__( **kwargs, ): """ - Evaluate path efficiency. + Evaluate task navigation efficiency. :keyword response: The agent's response containing tool calls. :paramtype response: Union[str, List[Dict[str, Any]]] :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict). :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]] - :return: The path efficiency scores and results. - :rtype: Dict[str, Union[float, str]] + :return: The task navigation efficiency scores and results. 
+ :rtype: Dict[str, Union[float, str, Dict[str, float]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py index 9b28686b9bf6..fa24d6c72aed 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py @@ -78,6 +78,7 @@ class ErrorTarget(Enum): ECI_EVALUATOR = "ECIEvaluator" F1_EVALUATOR = "F1Evaluator" GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator" + TASK_NAVIGATION_EFFICIENCY_EVALUATOR = "TaskNavigationEfficiencyEvaluator" PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator" INTENT_RESOLUTION_EVALUATOR = "IntentResolutionEvaluator" RELEVANCE_EVALUATOR = "RelevanceEvaluator" diff --git a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/path_efficiency.ipynb b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/task_navigation_efficiency.ipynb similarity index 64% rename from sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/path_efficiency.ipynb rename to sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/task_navigation_efficiency.ipynb index 7be23dd4167d..924c4c9c01a0 100644 --- a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/path_efficiency.ipynb +++ b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/task_navigation_efficiency.ipynb @@ -5,7 +5,7 @@ "id": "d1de6f2b", "metadata": {}, "source": [ - "# Path Efficiency Evaluator" + "# Task Navigation Efficiency Evaluator" ] }, { @@ -15,12 +15,13 @@ "source": [ "### Getting Started\n", "\n", - "This sample demonstrates how to use Path Efficiency Evaluator\n", + "This sample demonstrates how to use the Task Navigation Efficiency Evaluator to evaluate whether an agent's sequence of actions follows optimal decision-making patterns.\n", + "\n", "Before running the sample:\n", "```bash\n", "pip install azure-ai-projects azure-identity azure-ai-evaluation\n", "```\n", - "Note: The Path Efficiency Evaluator does not require Azure OpenAI configuration as it's a rule-based evaluator." + "Note: The Task Navigation Efficiency Evaluator does not require Azure OpenAI configuration as it's a rule-based evaluator." 
] }, { @@ -28,23 +29,26 @@ "id": "dbc5612b", "metadata": {}, "source": [ - "The Path Efficiency Evaluator measures how efficient an agent's sequence of actions is compared to an optimal path.\n", + "The Task Navigation Efficiency Evaluator measures how efficiently an agent navigates through a sequence of actions compared to an optimal task completion path.\n", + "\n", + "The evaluator provides comprehensive evaluation with both binary matching results and additional detailed P\\R\\F1 results:\n", + "\n", + "**Primary Result:**\n", + "- **Binary Match Result**: Pass/Fail based on the selected matching mode\n", "\n", - "The evaluator provides multiple metrics:\n", + "**Available Matching Modes:**\n", + "- **Exact Match**: Agent's tool calls must exactly match the ground truth (default)\n", + "- **In-Order Match**: All ground truth steps must appear in correct order (allows extra steps)\n", + "- **Any-Order Match**: All ground truth steps must appear with sufficient frequency (most lenient)\n", "\n", - "**Numeric Scores (0.0 - 1.0):**\n", + "**Properties Bag Additional Metrics (0.0 - 1.0):**\n", "- **Precision**: How many of the agent's steps were necessary (relevant to ground truth)\n", "- **Recall**: How many of the required steps were executed by the agent \n", "- **F1 Score**: Harmonic mean of precision and recall\n", "\n", - "**Binary Match Results (pass/fail):**\n", - "- **Exact Match**: Whether the agent's steps exactly match the ground truth\n", - "- **In-Order Match**: Whether all ground truth steps appear in correct order (allows extra steps)\n", - "- **Any-Order Match**: Whether all ground truth steps appear with sufficient frequency (ignores order, allows extra steps)\n", - "\n", "The evaluation requires the following inputs:\n", "- **Response**: The agent's response containing tool calls as a list of messages\n", - "- **Ground Truth**: List of expected tool/action steps as strings" + "- **Ground Truth**: List of expected tool/action steps as strings, or tuple with parameters for matching" ] }, { @@ -52,7 +56,7 @@ "id": "1be910ff", "metadata": {}, "source": [ - "### Initialize Path Efficiency Evaluator" + "### Initialize Task Navigation Efficiency Evaluator" ] }, { @@ -62,18 +66,23 @@ "metadata": {}, "outputs": [], "source": [ - "from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator\n", + "from azure.ai.evaluation._evaluators._task_navigation_efficiency import TaskNavigationEfficiencyEvaluator, TaskNavigationEfficiencyMatchingMode\n", "from pprint import pprint\n", "\n", - "# Initialize with custom thresholds\n", - "path_efficiency_evaluator = PathEfficiencyEvaluator(\n", - " precision_threshold=0.7,\n", - " recall_threshold=0.8, \n", - " f1_score_threshold=0.75\n", + "# Initialize with exact match mode\n", + "task_navigation_efficiency_evaluator = TaskNavigationEfficiencyEvaluator(\n", + " matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH\n", ")\n", "\n", - "# Or use default thresholds (0.5 for all metrics)\n", - "# path_efficiency_evaluator = PathEfficiencyEvaluator()" + "# Other examples:\n", + "# For in-order matching (allows extra steps but requires correct order)\n", + "# task_navigation_efficiency_evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH)\n", + "\n", + "# For any-order matching (most lenient - allows extra steps and different order) \n", + "# task_navigation_efficiency_evaluator = 
TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH)\n", + "\n", + "# Or use defaults (exact match mode)\n", + "# task_navigation_efficiency_evaluator = TaskNavigationEfficiencyEvaluator()" ] }, { @@ -81,7 +90,7 @@ "id": "0247c79d", "metadata": {}, "source": [ - "### Samples" + "### Task Navigation Efficiency Examples" ] }, { @@ -117,7 +126,7 @@ "\n", "ground_truth = [\"search\", \"analyze\", \"report\"]\n", "\n", - "result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + "result = task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", "print(\"Perfect Path Results:\")\n", "pprint(result)" ] @@ -137,7 +146,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Agent includes extra steps but maintains correct order\n", + "# Agent performs all required steps but with extra unnecessary step\n", "response = [\n", " {\n", " \"role\": \"assistant\",\n", @@ -159,7 +168,7 @@ "\n", "ground_truth = [\"search\", \"analyze\", \"report\"]\n", "\n", - "result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + "result = task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", "print(\"\\nPath with Extra Steps Results:\")\n", "pprint(result)" ] @@ -197,7 +206,10 @@ "\n", "ground_truth = [\"search\", \"analyze\", \"report\"]\n", "\n", - "result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + "# Using in-order matching mode to demonstrate the difference\n", + "in_order_task_navigation_efficiency_evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH)\n", + "\n", + "result = in_order_task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", "print(\"\\nWrong Order Results:\")\n", "pprint(result)" ] @@ -217,7 +229,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Agent misses some required steps\n", + "# Agent performs only some of the required steps (incomplete)\n", "response = [\n", " {\n", " \"role\": \"assistant\",\n", @@ -231,7 +243,7 @@ "\n", "ground_truth = [\"search\", \"analyze\", \"report\"]\n", "\n", - "result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + "result = task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", "print(\"\\nMissing Steps Results:\")\n", "pprint(result)" ] @@ -251,7 +263,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Customer service agent handling order inquiry\n", + "# Real-world example: Customer service agent handling a refund request\n", "response = [\n", " {\n", " \"role\": \"assistant\",\n", @@ -273,7 +285,7 @@ "\n", "ground_truth = [\"lookup_order\", \"calculate_refund\", \"process_refund\"]\n", "\n", - "result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + "result = task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", "print(\"\\nCustomer Service Results:\")\n", "pprint(result)" ] @@ -319,7 +331,7 @@ "\n", "ground_truth = [\"search\", \"analyze\", \"report\"]\n", "\n", - "result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + "result = task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", "print(\"\\nComplex Path with Duplicates Results:\")\n", "pprint(result)" ] @@ -339,12 +351,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Test with empty agent response\n", + "# Test edge 
cases\n", + "\n", + "# Test with empty response\n", "try:\n", " response = []\n", " ground_truth = [\"search\", \"analyze\", \"report\"]\n", " \n", - " result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + " result = task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", " print(\"\\nEmpty Response Results:\")\n", " pprint(result)\n", "except Exception as e:\n", @@ -360,7 +374,7 @@ " ]\n", " ground_truth = []\n", " \n", - " result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + " result = task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", " print(\"\\nEmpty Ground Truth Results:\")\n", " pprint(result)\n", "except Exception as e:\n", @@ -382,7 +396,7 @@ "metadata": {}, "outputs": [], "source": [ - "# PathEfficiencyEvaluator also supports tuple format with parameters for exact parameter matching\n", + "# TaskNavigationEfficiencyEvaluator also supports tuple format with parameters for exact parameter matching\n", "response_with_params = [\n", " {\n", " \"role\": \"assistant\",\n", @@ -394,7 +408,7 @@ "# Parameters must match exactly for tools to be considered matching\n", "ground_truth_with_params = ([\"search\"], {\"search\": {\"query\": \"test\"}})\n", "\n", - "result = path_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params)\n", + "result = task_navigation_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params)\n", "print(\"\\nTuple Format with Parameters Results:\")\n", "pprint(result)" ] @@ -414,32 +428,60 @@ "metadata": {}, "outputs": [], "source": [ - "def analyze_path_efficiency(response, ground_truth, scenario_name):\n", + "# Helper functions for analysis\n", + "\n", + "def analyze_task_navigation_efficiency(response, ground_truth, scenario_name, evaluator=None):\n", " \"\"\"\n", - " Helper function to analyze and display path efficiency results\n", + " Helper function to analyze and display task navigation efficiency results\n", " \"\"\"\n", - " result = path_efficiency_evaluator(response=response, ground_truth=ground_truth)\n", + " if evaluator is None:\n", + " evaluator = task_navigation_efficiency_evaluator\n", + " \n", + " result = evaluator(response=response, ground_truth=ground_truth)\n", " \n", " print(f\"\\n{'='*50}\")\n", " print(f\"Analysis for: {scenario_name}\")\n", " print(f\"{'='*50}\")\n", " \n", " print(f\"Ground Truth Steps: {ground_truth}\")\n", + " print(f\"Evaluator Matching Mode: {evaluator.matching_mode.value}\")\n", " print(f\"{'='*50}\")\n", " \n", - " # Numeric scores\n", - " print(\"Numeric Scores:\")\n", - " print(f\" Precision: {result['path_efficiency_precision_score']:.3f}\")\n", - " print(f\" Recall: {result['path_efficiency_recall_score']:.3f}\")\n", - " print(f\" F1 Score: {result['path_efficiency_f1_score']:.3f}\")\n", - "\n", - " # Binary matches\n", - " print(\"\\nBinary Match Results:\")\n", - " print(f\" Exact Match: {result['path_efficiency_exact_match_result']}\")\n", - " print(f\" In-Order Match: {result['path_efficiency_in_order_match_result']}\")\n", - " print(f\" Any-Order Match: {result['path_efficiency_any_order_match_result']}\")\n", - "\n", - " return result" + " # Display the returned results\n", + " for key, value in result.items():\n", + " if key == \"properties\":\n", + " print(f\" {key}:\")\n", + " for prop_key, prop_value in value.items():\n", + " print(f\" {prop_key}: {prop_value:.3f}\")\n", + " else:\n", + " print(f\" 
{key}: {value}\")\n", + "\n", + " return result\n", + "\n", + "# Example with different matching modes\n", + "def compare_matching_modes(response, ground_truth, scenario_name):\n", + " \"\"\"\n", + " Compare results across different matching modes for the same scenario\n", + " \"\"\"\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\"Matching Mode Comparison for: {scenario_name}\")\n", + " print(f\"{'='*60}\")\n", + " \n", + " matching_modes_to_test = [\n", + " TaskNavigationEfficiencyMatchingMode.EXACT_MATCH,\n", + " TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH,\n", + " TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH\n", + " ]\n", + " \n", + " for mode in matching_modes_to_test:\n", + " evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=mode)\n", + " result = evaluator(response=response, ground_truth=ground_truth)\n", + " \n", + " # Get the main result value\n", + " result_value = result.get(\"task_navigation_efficiency_result\", \"N/A\")\n", + " print(f\" {mode.value.upper():15}: {result_value}\")\n", + " \n", + " return" ] }, { @@ -467,7 +509,7 @@ "]\n", "perfect_ground_truth = [\"authenticate\", \"fetch_data\", \"process_result\"]\n", "\n", - "analyze_path_efficiency(perfect_response, perfect_ground_truth, \"Perfect Efficiency Example\")\n", + "analyze_task_navigation_efficiency(perfect_response, perfect_ground_truth, \"Perfect Efficiency Example\")\n", "\n", "# Scenario 2: Inefficient with extra steps\n", "inefficient_response = [\n", @@ -479,7 +521,34 @@ "]\n", "inefficient_ground_truth = [\"authenticate\", \"fetch_data\", \"process_result\"]\n", "\n", - "analyze_path_efficiency(inefficient_response, inefficient_ground_truth, \"Inefficient Path with Extra Steps\")" + "analyze_task_navigation_efficiency(inefficient_response, inefficient_ground_truth, \"Inefficient Path with Extra Steps\")\n", + "\n", + "# Demonstrate different matching modes\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"COMPARING DIFFERENT MATCHING MODES\")\n", + "print(\"=\"*60)\n", + "\n", + "compare_matching_modes(inefficient_response, inefficient_ground_truth, \"Inefficient Path Analysis\")\n", + "\n", + "# Example: Creating evaluators with different matching modes\n", + "print(f\"\\n{'='*60}\")\n", + "print(\"INDIVIDUAL MATCHING MODE EXAMPLES\")\n", + "print(\"=\"*60)\n", + "\n", + "# Exact match evaluator\n", + "exact_match_evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)\n", + "exact_result = exact_match_evaluator(response=perfect_response, ground_truth=perfect_ground_truth)\n", + "print(f\"Exact Match Evaluator: {exact_result}\")\n", + "\n", + "# In-order match evaluator\n", + "in_order_evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH)\n", + "in_order_result = in_order_evaluator(response=inefficient_response, ground_truth=inefficient_ground_truth)\n", + "print(f\"In-Order Match Evaluator: {in_order_result}\")\n", + "\n", + "# Any-order match evaluator (most lenient)\n", + "any_order_evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH)\n", + "any_order_result = any_order_evaluator(response=inefficient_response, ground_truth=inefficient_ground_truth)\n", + "print(f\"Any-Order Match Evaluator: {any_order_result}\")" ] } ], diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py index 
a6a0f3b6805d..3f96ccad9de9 100644 --- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py @@ -575,11 +575,14 @@ def evaluation_evaluate_classes_methods(self): ) # [END tool_call_accuracy_evaluator] - # [START path_efficiency_evaluator] - from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator + # [START task_navigation_efficiency_evaluator] + from azure.ai.evaluation._evaluators._task_navigation_efficiency import ( + TaskNavigationEfficiencyEvaluator, + TaskNavigationEfficiencyMatchingMode, + ) - path_efficiency_evaluator = PathEfficiencyEvaluator( - precision_threshold=0.7, recall_threshold=0.8, f1_score_threshold=0.75 + task_navigation_efficiency_evaluator = TaskNavigationEfficiencyEvaluator( + matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH ) response = [ @@ -598,7 +601,7 @@ def evaluation_evaluate_classes_methods(self): ] ground_truth = ["search", "analyze", "report"] - path_efficiency_evaluator(response=response, ground_truth=ground_truth) + task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth) # Also supports tuple format with parameters for exact parameter matching response_with_params = [ @@ -611,8 +614,8 @@ def evaluation_evaluate_classes_methods(self): ] ground_truth_with_params = (["search"], {"search": {"query": "test"}}) - path_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params) - # [END path_efficiency_evaluator] + task_navigation_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params) + # [END task_navigation_efficiency_evaluator] # [START document_retrieval_evaluator] from azure.ai.evaluation import DocumentRetrievalEvaluator diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py index d6b023a581b1..cd6e6b8b36fe 100644 --- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py +++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py @@ -581,11 +581,14 @@ def evaluation_evaluate_classes_methods(self): ) # [END tool_call_accuracy_evaluator] - # [START path_efficiency_evaluator] - from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator + # [START task_navigation_efficiency_evaluator] + from azure.ai.evaluation._evaluators._task_navigation_efficiency import ( + TaskNavigationEfficiencyEvaluator, + TaskNavigationEfficiencyMatchingMode, + ) - path_efficiency_evaluator = PathEfficiencyEvaluator( - precision_threshold=0.7, recall_threshold=0.8, f1_score_threshold=0.75 + task_navigation_efficiency_evaluator = TaskNavigationEfficiencyEvaluator( + matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH ) response = [ @@ -604,7 +607,7 @@ def evaluation_evaluate_classes_methods(self): ] ground_truth = ["search", "analyze", "report"] - path_efficiency_evaluator(response=response, ground_truth=ground_truth) + task_navigation_efficiency_evaluator(response=response, ground_truth=ground_truth) # Also supports tuple format with parameters for exact parameter matching response_with_params = [ @@ -617,8 +620,8 @@ def evaluation_evaluate_classes_methods(self): ] ground_truth_with_params = (["search"], {"search": {"query": "test"}}) - path_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params) - # [END 
path_efficiency_evaluator] + task_navigation_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params) + # [END task_navigation_efficiency_evaluator] # [START document_retrieval_evaluator] from azure.ai.evaluation import DocumentRetrievalEvaluator diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_path_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_path_efficiency_evaluators.py deleted file mode 100644 index 3b3deea754a8..000000000000 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_path_efficiency_evaluators.py +++ /dev/null @@ -1,499 +0,0 @@ -import pytest -from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator - - -@pytest.mark.unittest -class TestPathEfficiencyEvaluator: - def test_exact_match_scenario(self): - """Test when agent steps exactly match ground truth.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "report", "arguments": {}}], - }, - ] - ground_truth = ["search", "analyze", "report"] - - result = evaluator(response=response, ground_truth=ground_truth) - - assert result["path_efficiency_precision_score"] == 1.0 - assert ( - result["path_efficiency_precision_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_precision_result"] == "pass" - assert result["path_efficiency_recall_score"] == 1.0 - assert ( - result["path_efficiency_recall_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_recall_result"] == "pass" - assert result["path_efficiency_f1_score"] == 1.0 - assert ( - result["path_efficiency_f1_threshold"] == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_f1_result"] == "pass" - assert result["path_efficiency_exact_match_result"] == "pass" - assert result["path_efficiency_in_order_match_result"] == "pass" - assert result["path_efficiency_any_order_match_result"] == "pass" - - def test_in_order_match_with_extra_steps(self): - """Test when agent has extra steps but maintains order.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "extra_step", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "analyze", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report", "arguments": {}}], - }, - ] - ground_truth = ["search", "analyze", "report"] - - result = evaluator(response=response, ground_truth=ground_truth) - - assert result["path_efficiency_precision_score"] == 0.75 # 3/4 - assert ( - result["path_efficiency_precision_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_precision_result"] == "pass" - assert result["path_efficiency_recall_score"] == 1.0 # 
3/3 - assert ( - result["path_efficiency_recall_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_recall_result"] == "pass" - assert result["path_efficiency_f1_score"] == pytest.approx(0.857, rel=1e-2) - assert ( - result["path_efficiency_f1_threshold"] == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_f1_result"] == "pass" - assert result["path_efficiency_exact_match_result"] == "fail" - assert result["path_efficiency_in_order_match_result"] == "pass" - assert result["path_efficiency_any_order_match_result"] == "pass" - - def test_any_order_match(self): - """Test when agent has all steps but in wrong order.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "report", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "search", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "analyze", "arguments": {}}], - }, - ] - ground_truth = ["search", "analyze", "report"] - - result = evaluator(response=response, ground_truth=ground_truth) - - assert result["path_efficiency_precision_score"] == 1.0 - assert ( - result["path_efficiency_precision_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_precision_result"] == "pass" - assert result["path_efficiency_recall_score"] == 1.0 - assert ( - result["path_efficiency_recall_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_recall_result"] == "pass" - assert result["path_efficiency_f1_score"] == 1.0 - assert ( - result["path_efficiency_f1_threshold"] == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_f1_result"] == "pass" - assert result["path_efficiency_exact_match_result"] == "fail" - assert result["path_efficiency_in_order_match_result"] == "fail" - assert result["path_efficiency_any_order_match_result"] == "pass" - - def test_partial_match(self): - """Test when agent misses some steps and has extra steps.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "wrong_step", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "report", "arguments": {}}], - }, - ] - ground_truth = ["search", "analyze", "report"] - - result = evaluator(response=response, ground_truth=ground_truth) - - assert result["path_efficiency_precision_score"] == pytest.approx(0.667, rel=1e-2) # 2/3 - assert ( - result["path_efficiency_precision_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_precision_result"] == "pass" - assert result["path_efficiency_recall_score"] == pytest.approx(0.667, rel=1e-2) # 2/3 - assert ( - result["path_efficiency_recall_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_recall_result"] == "pass" - assert result["path_efficiency_f1_score"] == pytest.approx(0.667, 
rel=1e-2) - assert ( - result["path_efficiency_f1_threshold"] == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_f1_result"] == "pass" - assert result["path_efficiency_exact_match_result"] == "fail" - assert result["path_efficiency_in_order_match_result"] == "fail" - assert result["path_efficiency_any_order_match_result"] == "fail" - - def test_no_matching_steps(self): - """Test when agent has no matching steps.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "wrong1", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "wrong2", "arguments": {}}], - }, - ] - ground_truth = ["search", "analyze", "report"] - - result = evaluator(response=response, ground_truth=ground_truth) - - assert result["path_efficiency_precision_score"] == 0.0 - assert ( - result["path_efficiency_precision_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_precision_result"] == "fail" - assert result["path_efficiency_recall_score"] == 0.0 - assert ( - result["path_efficiency_recall_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_recall_result"] == "fail" - assert result["path_efficiency_f1_score"] == 0.0 - assert ( - result["path_efficiency_f1_threshold"] == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_f1_result"] == "fail" - assert result["path_efficiency_exact_match_result"] == "fail" - assert result["path_efficiency_in_order_match_result"] == "fail" - assert result["path_efficiency_any_order_match_result"] == "fail" - - def test_empty_agent_steps(self): - """Test when agent has no tool calls.""" - evaluator = PathEfficiencyEvaluator() - - response = [] - ground_truth = ["search", "analyze", "report"] - - result = evaluator(response=response, ground_truth=ground_truth) - - assert result["path_efficiency_precision_score"] == 0.0 - assert ( - result["path_efficiency_precision_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_precision_result"] == "fail" - assert result["path_efficiency_recall_score"] == 0.0 - assert ( - result["path_efficiency_recall_threshold"] - == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_recall_result"] == "fail" - assert result["path_efficiency_f1_score"] == 0.0 - assert ( - result["path_efficiency_f1_threshold"] == PathEfficiencyEvaluator._DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD - ) - assert result["path_efficiency_f1_result"] == "fail" - assert result["path_efficiency_exact_match_result"] == "fail" - assert result["path_efficiency_in_order_match_result"] == "fail" - assert result["path_efficiency_any_order_match_result"] == "fail" - - def test_call_method(self): - """Test using the __call__ method.""" - evaluator = PathEfficiencyEvaluator(precision_threshold=0.8, recall_threshold=0.8, f1_score_threshold=0.8) - - response = [ - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], - }, - { - "role": "assistant", - "content": [{"type": "tool_call", 
"tool_call_id": "call_3", "name": "report", "arguments": {}}], - }, - ] - ground_truth = ["search", "analyze", "report"] - - result = evaluator(response=response, ground_truth=ground_truth) - - assert result["path_efficiency_precision_score"] == 1.0 - assert result["path_efficiency_recall_score"] == 1.0 - assert result["path_efficiency_f1_score"] == 1.0 - assert result["path_efficiency_precision_result"] == "pass" - assert result["path_efficiency_recall_result"] == "pass" - assert result["path_efficiency_f1_result"] == "pass" - assert result["path_efficiency_precision_threshold"] == 0.8 - assert result["path_efficiency_recall_threshold"] == 0.8 - assert result["path_efficiency_f1_threshold"] == 0.8 - assert result["path_efficiency_exact_match_result"] == "pass" - assert result["path_efficiency_in_order_match_result"] == "pass" - assert result["path_efficiency_any_order_match_result"] == "pass" - - def test_invalid_ground_truth(self): - """Test with invalid ground truth steps.""" - evaluator = PathEfficiencyEvaluator() - - with pytest.raises(TypeError): - evaluator(response=[], ground_truth="not_a_list") # type: ignore - - with pytest.raises(ValueError): - evaluator(response=[], ground_truth=[]) - - def test_tuple_format_with_parameters_exact_match(self): - """Test tuple format with exact parameter matching.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [ - { - "type": "tool_call", - "tool_call_id": "call_1", - "name": "search", - "arguments": {"query": "weather", "location": "NYC"}, - } - ], - }, - { - "role": "assistant", - "content": [ - { - "type": "tool_call", - "tool_call_id": "call_2", - "name": "format_result", - "arguments": {"format": "json", "style": "brief"}, - } - ], - }, - ] - - # Ground truth with tuple format: (tool_names, parameters_dict) - ground_truth = ( - ["search", "format_result"], - {"search": {"query": "weather", "location": "NYC"}, "format_result": {"format": "json", "style": "brief"}}, - ) - - result = evaluator(response=response, ground_truth=ground_truth) - - # Should have perfect scores since everything matches exactly - assert result["path_efficiency_precision_score"] == 1.0 - assert result["path_efficiency_recall_score"] == 1.0 - assert result["path_efficiency_f1_score"] == 1.0 - assert result["path_efficiency_exact_match_result"] == "pass" - assert result["path_efficiency_in_order_match_result"] == "pass" - assert result["path_efficiency_any_order_match_result"] == "pass" - - def test_tuple_format_with_parameters_mismatch(self): - """Test tuple format with parameter mismatches.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [ - { - "type": "tool_call", - "tool_call_id": "call_1", - "name": "search", - "arguments": {"query": "weather", "location": "LA"}, # Different location - } - ], - }, - { - "role": "assistant", - "content": [ - { - "type": "tool_call", - "tool_call_id": "call_2", - "name": "format_result", - "arguments": {"format": "xml", "style": "detailed"}, # Different parameters - } - ], - }, - ] - - # Ground truth with tuple format - ground_truth = ( - ["search", "format_result"], - {"search": {"query": "weather", "location": "NYC"}, "format_result": {"format": "json", "style": "brief"}}, - ) - - result = evaluator(response=response, ground_truth=ground_truth) - - # Should have zero scores since parameters don't match exactly - assert result["path_efficiency_precision_score"] == 0.0 - assert result["path_efficiency_recall_score"] == 0.0 - assert 
result["path_efficiency_f1_score"] == 0.0 - assert result["path_efficiency_exact_match_result"] == "fail" - assert result["path_efficiency_in_order_match_result"] == "fail" - assert result["path_efficiency_any_order_match_result"] == "fail" - - def test_tuple_format_with_parameters_partial_match(self): - """Test tuple format with some matching and some non-matching tools.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [ - { - "type": "tool_call", - "tool_call_id": "call_1", - "name": "search", - "arguments": {"query": "weather", "location": "NYC"}, # Matches exactly - } - ], - }, - { - "role": "assistant", - "content": [ - { - "type": "tool_call", - "tool_call_id": "call_2", - "name": "format_result", - "arguments": {"format": "xml", "style": "detailed"}, # Different parameters - } - ], - }, - { - "role": "assistant", - "content": [ - { - "type": "tool_call", - "tool_call_id": "call_3", - "name": "extra_tool", - "arguments": {"param": "value"}, # Extra tool not in ground truth - } - ], - }, - ] - - # Ground truth with tuple format - ground_truth = ( - ["search", "format_result"], - {"search": {"query": "weather", "location": "NYC"}, "format_result": {"format": "json", "style": "brief"}}, - ) - - result = evaluator(response=response, ground_truth=ground_truth) - - # Only "search" matches (1 out of 3 agent tools, 1 out of 2 ground truth tools) - expected_precision = 1.0 / 3.0 # 1 match out of 3 agent tools - expected_recall = 1.0 / 2.0 # 1 match out of 2 ground truth tools - expected_f1 = 2 * expected_precision * expected_recall / (expected_precision + expected_recall) - - assert result["path_efficiency_precision_score"] == pytest.approx(expected_precision, rel=1e-3) - assert result["path_efficiency_recall_score"] == pytest.approx(expected_recall, rel=1e-3) - assert result["path_efficiency_f1_score"] == pytest.approx(expected_f1, rel=1e-3) - assert result["path_efficiency_exact_match_result"] == "fail" - assert result["path_efficiency_in_order_match_result"] == "fail" - assert result["path_efficiency_any_order_match_result"] == "fail" - - def test_tuple_format_with_empty_parameters(self): - """Test tuple format where some tools have no parameters.""" - evaluator = PathEfficiencyEvaluator() - - response = [ - { - "role": "assistant", - "content": [ - {"type": "tool_call", "tool_call_id": "call_1", "name": "ping", "arguments": {}} # No parameters - ], - }, - { - "role": "assistant", - "content": [ - {"type": "tool_call", "tool_call_id": "call_2", "name": "log", "arguments": {"message": "success"}} - ], - }, - ] - - # Ground truth with tuple format including empty parameters - ground_truth = (["ping", "log"], {"ping": {}, "log": {"message": "success"}}) # Empty parameters dict - - result = evaluator(response=response, ground_truth=ground_truth) - - # Should have perfect scores since everything matches exactly - assert result["path_efficiency_precision_score"] == 1.0 - assert result["path_efficiency_recall_score"] == 1.0 - assert result["path_efficiency_f1_score"] == 1.0 - assert result["path_efficiency_exact_match_result"] == "pass" - assert result["path_efficiency_in_order_match_result"] == "pass" - assert result["path_efficiency_any_order_match_result"] == "pass" - - def test_tuple_format_invalid_inputs(self): - """Test tuple format with invalid input validation.""" - evaluator = PathEfficiencyEvaluator() - - response = [] # Empty response for testing validation - - # Test invalid tuple length - with pytest.raises(TypeError): - 
evaluator(response=response, ground_truth=("only_one_element",)) # type: ignore - - # Test invalid first element (not a list) - with pytest.raises(TypeError): - evaluator(response=response, ground_truth=("not_a_list", {})) # type: ignore - - # Test invalid second element (not a dict) - with pytest.raises(TypeError): - evaluator(response=response, ground_truth=(["tool"], "not_a_dict")) # type: ignore - - # Test invalid parameter values (not json serializable) - with pytest.raises(TypeError): - evaluator(response=response, ground_truth=(["tool"], {"tool": {"key": object}})) # type: ignore diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py new file mode 100644 index 000000000000..02913da64da8 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py @@ -0,0 +1,184 @@ +import pytest +from azure.ai.evaluation._evaluators._task_navigation_efficiency import ( + TaskNavigationEfficiencyEvaluator, + TaskNavigationEfficiencyMatchingMode, +) + + +@pytest.mark.unittest +class TestTaskNavigationEfficiencyEvaluator: + def test_exact_match_scenario(self): + """Test when agent steps exactly match ground truth.""" + evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + response = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "report", "arguments": {}}], + }, + ] + ground_truth = ["search", "analyze", "report"] + + result = evaluator(response=response, ground_truth=ground_truth) + assert result["task_navigation_efficiency_result"] == "pass" + assert "properties" in result + assert result["properties"]["precision_score"] == 1.0 + assert result["properties"]["recall_score"] == 1.0 + assert result["properties"]["f1_score"] == 1.0 + + def test_in_order_match_with_extra_steps(self): + """Test when agent has extra steps but maintains order.""" + evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH) + + response = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "extra_step", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "analyze", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report", "arguments": {}}], + }, + ] + ground_truth = ["search", "analyze", "report"] + + result = evaluator(response=response, ground_truth=ground_truth) + assert result["task_navigation_efficiency_result"] == "pass" + assert result["properties"]["precision_score"] == 0.75 # 3/4 + assert result["properties"]["recall_score"] == 1.0 # 3/3 + assert result["properties"]["f1_score"] == pytest.approx(0.857, rel=1e-2) + + def test_any_order_match(self): + """Test when agent has all steps but in wrong order.""" + evaluator = TaskNavigationEfficiencyEvaluator( + 
matching_mode=TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH + ) + + response = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "report", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "analyze", "arguments": {}}], + }, + ] + ground_truth = ["search", "analyze", "report"] + + result = evaluator(response=response, ground_truth=ground_truth) + assert result["task_navigation_efficiency_result"] == "pass" + assert result["properties"]["precision_score"] == 1.0 + assert result["properties"]["recall_score"] == 1.0 + assert result["properties"]["f1_score"] == 1.0 + + def test_exact_match_failure(self): + """Test when exact match fails but other matches succeed.""" + exact_evaluator = TaskNavigationEfficiencyEvaluator( + matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH + ) + in_order_evaluator = TaskNavigationEfficiencyEvaluator( + matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH + ) + + response = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "extra_step", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "analyze", "arguments": {}}], + }, + ] + ground_truth = ["search", "analyze"] + + exact_result = exact_evaluator(response=response, ground_truth=ground_truth) + assert exact_result["task_navigation_efficiency_result"] == "fail" + + in_order_result = in_order_evaluator(response=response, ground_truth=ground_truth) + assert in_order_result["task_navigation_efficiency_result"] == "pass" + + def test_invalid_ground_truth(self): + """Test with invalid ground truth steps.""" + evaluator = TaskNavigationEfficiencyEvaluator() + + with pytest.raises(TypeError): + evaluator(response=[], ground_truth="not_a_list") # type: ignore + + with pytest.raises(ValueError): + evaluator(response=[], ground_truth=[]) + + def test_tuple_format_with_parameters(self): + """Test tuple format with exact parameter matching.""" + evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + response = [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_1", + "name": "search", + "arguments": {"query": "weather", "location": "NYC"}, + } + ], + }, + ] + + # Ground truth with tuple format: (tool_names, parameters_dict) + ground_truth = ( + ["search"], + {"search": {"query": "weather", "location": "NYC"}}, + ) + + result = evaluator(response=response, ground_truth=ground_truth) + assert result["task_navigation_efficiency_result"] == "pass" + assert result["properties"]["precision_score"] == 1.0 + assert result["properties"]["recall_score"] == 1.0 + assert result["properties"]["f1_score"] == 1.0 + + def test_matching_mode_validation(self): + """Test validation of matching_mode parameter.""" + # Test valid string mode + evaluator1 = TaskNavigationEfficiencyEvaluator(matching_mode="exact_match") + assert evaluator1.matching_mode == TaskNavigationEfficiencyMatchingMode.EXACT_MATCH + + # Test valid enum mode + evaluator2 = TaskNavigationEfficiencyEvaluator( + 
matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH + ) + assert evaluator2.matching_mode == TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH + + # Test invalid string mode + with pytest.raises(ValueError): + TaskNavigationEfficiencyEvaluator(matching_mode="invalid_mode") + + # Test invalid type for mode + with pytest.raises(Exception): # EvaluationException + TaskNavigationEfficiencyEvaluator(matching_mode=123) # type: ignore
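For quick reference, a minimal usage sketch of the renamed evaluator, mirroring the samples and tests in this patch. It assumes the package-internal import path used in the samples above; the extra "lookup" tool call is a hypothetical stand-in for an unnecessary step.

from pprint import pprint

from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
    TaskNavigationEfficiencyEvaluator,
    TaskNavigationEfficiencyMatchingMode,
)

# Agent trajectory: one tool call per assistant message, in the same format as the notebook examples.
response = [
    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}]},
    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "lookup", "arguments": {}}]},  # hypothetical extra step
    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "analyze", "arguments": {}}]},
    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report", "arguments": {}}]},
]
ground_truth = ["search", "analyze", "report"]

# In-order matching tolerates the extra step as long as the required steps appear in order,
# so this should yield a "pass" (see test_in_order_match_with_extra_steps above).
evaluator = TaskNavigationEfficiencyEvaluator(matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH)
result = evaluator(response=response, ground_truth=ground_truth)

pprint(result)
# Expected shape per _do_eval: {"task_navigation_efficiency_result": "pass",
#   "properties": {"precision_score": 0.75, "recall_score": 1.0, "f1_score": ~0.857}}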