diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 75e9067e9b47..3b74169c923f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -30,7 +30,7 @@ from ._evaluators._xpia import IndirectAttackEvaluator from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator -from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator +from ._evaluators._tool_call_quality import ToolCallQualityEvaluator from ._evaluators._document_retrieval import DocumentRetrievalEvaluator from ._model_configurations import ( AzureAIProject, @@ -130,7 +130,8 @@ def lazy_import(): "EvaluationResult", "CodeVulnerabilityEvaluator", "UngroundedAttributesEvaluator", - "ToolCallAccuracyEvaluator", + "ToolCallQualityEvaluator", + "ToolCallAccuracyEvaluator", # Backward compatibility alias "AzureOpenAIGrader", "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", @@ -141,6 +142,9 @@ def lazy_import(): __all__.extend([p for p in _patch_all if p not in __all__]) +# Backward compatibility alias +ToolCallAccuracyEvaluator = ToolCallQualityEvaluator + def __getattr__(name): """Handle lazy imports for optional dependencies.""" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py index e77708057173..645ae17dbbcd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py @@ -37,7 +37,8 @@ SexualEvaluator, SimilarityEvaluator, TaskAdherenceEvaluator, - ToolCallAccuracyEvaluator, + ToolCallAccuracyEvaluator, # Backward compatibility alias + ToolCallQualityEvaluator, UngroundedAttributesEvaluator, ViolenceEvaluator, ) @@ -69,7 +70,8 @@ SimilarityEvaluator: "similarity", TaskAdherenceEvaluator: "task_adherence", TaskCompletionEvaluator: "task_completion", - ToolCallAccuracyEvaluator: "tool_call_accuracy", + ToolCallAccuracyEvaluator: "tool_call_quality", # Backward compatibility + ToolCallQualityEvaluator: "tool_call_quality", UngroundedAttributesEvaluator: "ungrounded_attributes", ViolenceEvaluator: "violence", } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/__init__.py similarity index 68% rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/__init__.py index 687f98d624f0..701b2af5fe32 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/__init__.py @@ -2,8 +2,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from ._tool_call_accuracy import ToolCallAccuracyEvaluator +from ._tool_call_quality import ToolCallQualityEvaluator __all__ = [ - "ToolCallAccuracyEvaluator", + "ToolCallQualityEvaluator", ] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/_tool_call_quality.py similarity index 90% rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/_tool_call_quality.py index 99d939034bc4..3330f79d5ef8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/_tool_call_quality.py @@ -58,14 +58,14 @@ def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]: @experimental -class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): - """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining: +class ToolCallQualityEvaluator(PromptyEvaluatorBase[Union[str, float]]): + """The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining: - Relevance to the conversation. - Parameter correctness according to tool definitions. - Parameter value extraction from the conversation. The evaluator uses a scoring rubric of 1 to 5: - - Score 1: The tool calls are irrelevant + - Score 1: The tool calls are irrelevant. - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed. - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made. - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded. @@ -82,20 +82,20 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): .. admonition:: Example: .. literalinclude:: ../samples/evaluation_samples_evaluate.py - :start-after: [START tool_call_accuracy_evaluator] - :end-before: [END tool_call_accuracy_evaluator] + :start-after: [START tool_call_quality_evaluator] + :end-before: [END tool_call_quality_evaluator] :language: python :dedent: 8 - :caption: Initialize and call a ToolCallAccuracyEvaluator. + :caption: Initialize and call a ToolCallQualityEvaluator. .. admonition:: Example using Azure AI Project URL: .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py - :start-after: [START tool_call_accuracy_evaluator] - :end-before: [END tool_call_accuracy_evaluator] + :start-after: [START tool_call_quality_evaluator] + :end-before: [END tool_call_quality_evaluator] :language: python :dedent: 8 - :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format + :caption: Initialize and call ToolCallQualityEvaluator using Azure AI Project URL in the following format https://{resource_name}.services.ai.azure.com/api/projects/{project_name} .. note:: @@ -105,25 +105,25 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): however, it is recommended to use the new key moving forward as the old key will be deprecated in the future. 
""" - _PROMPTY_FILE = "tool_call_accuracy.prompty" - _RESULT_KEY = "tool_call_accuracy" + _PROMPTY_FILE = "tool_call_quality.prompty" + _RESULT_KEY = "tool_call_quality" - _MAX_TOOL_CALL_ACCURACY_SCORE = 5 - _MIN_TOOL_CALL_ACCURACY_SCORE = 1 - _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3 + _MAX_TOOL_CALL_QUALITY_SCORE = 5 + _MIN_TOOL_CALL_QUALITY_SCORE = 1 + _DEFAULT_TOOL_CALL_QUALITY_SCORE = 3 _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls." _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided." _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided." - _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5." + _INVALID_SCORE_MESSAGE = "Tool call quality score must be between 1 and 5." _LLM_SCORE_KEY = "tool_calls_success_level" - id = "azureai://built-in/evaluators/tool_call_accuracy" + id = "azureai://built-in/evaluators/tool_call_quality" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override - def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs): + def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_QUALITY_SCORE, credential=None, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self.threshold = threshold @@ -241,11 +241,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t score = llm_output.get(self._LLM_SCORE_KEY, None) if not score or not check_score_is_valid( score, - ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, - ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE, + ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE, + ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE, ): raise EvaluationException( - message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", + message=f"Invalid score value: {score}. 
Expected a number in range [{ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE}, {ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE}].", internal_message="Invalid score value.", category=ErrorCategory.FAILED_EXECUTION, blame=ErrorBlame.SYSTEM_ERROR, @@ -266,10 +266,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t else: raise EvaluationException( - message="Tool call accuracy evaluator returned invalid output.", + message="Tool call quality evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, category=ErrorCategory.FAILED_EXECUTION, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR, ) async def _real_call(self, **kwargs): @@ -346,14 +346,14 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): message=f"Tool definition for {tool_name} not found", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR, ) else: raise EvaluationException( message=f"Tool call missing name: {tool_call}", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR, ) else: # Unsupported tool format - only converter format is supported @@ -361,7 +361,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): message=f"Unsupported tool call format. Only converter format is supported: {tool_call}", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR, ) else: # Tool call is not a dictionary @@ -369,7 +369,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): message=f"Tool call is not a dictionary: {tool_call}", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR, ) return needed_tool_definitions diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/tool_call_quality.prompty similarity index 99% rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/tool_call_quality.prompty index 4713b65f4ee9..a7fb6ad25bd3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_quality/tool_call_quality.prompty @@ -1,6 +1,6 @@ --- -name: Tool Call Accuracy -description: Evaluates Tool Call Accuracy for tool used by agent +name: Tool Call Quality +description: Evaluates Tool Call Quality for tool used by agent model: api: chat parameters: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py index 9b28686b9bf6..59ce1f711db4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py @@ -96,6 +96,7 @@ class ErrorTarget(Enum): UNKNOWN = "Unknown" 
CONVERSATION = "Conversation" TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator" + TOOL_CALL_QUALITY_EVALUATOR = "ToolCallQualityEvaluator" RED_TEAM = "RedTeam" AOAI_GRADER = "AoaiGrader" CONVERSATION_HISTORY_PARSING = "_get_conversation_history" diff --git a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_quality.ipynb similarity index 93% rename from sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb rename to sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_quality.ipynb index c08365505d6f..579f4290d4a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb +++ b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_quality.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Tool Call Accuracy Evaluator" + "# Tool Call Quality Evaluator" ] }, { @@ -13,7 +13,7 @@ "source": [ "### Getting Started\n", "\n", - "This sample demonstrates how to use Tool Call Accuracy Evaluator\n", + "This sample demonstrates how to use Tool Call Quality Evaluator\n", "Before running the sample:\n", "```bash\n", "pip install azure-ai-projects azure-identity azure-ai-evaluation\n", @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:\n", + "The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining:\n", "- Relevance to the conversation\n", "- Parameter correctness according to tool definitions\n", "- Parameter value extraction from the conversation\n", @@ -53,10 +53,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Tool Call Accuracy requires following input:\n", + "Tool Call Quality requires following input:\n", "- Query - This can be a single query or a list of messages(conversation history with agent). Latter helps to determine if Agent used the information in history to make right tool calls.\n", "- Tool Calls - Tool Call(s) made by Agent to answer the query. Optional - if response has tool calls, if not provided evaluator will look for tool calls in response.\n", - "- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Accuracy Evaluator will look at response for tool calls.\n", + "- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Quality Evaluator will look at response for tool calls.\n", "- Tool Definitions - Tool(s) definition used by Agent to answer the query. 
\n" ] }, @@ -64,7 +64,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Initialize Tool Call Accuracy Evaluator\n" + "### Initialize Tool Call Quality Evaluator\n" ] }, { @@ -74,7 +74,7 @@ "outputs": [], "source": [ "import os\n", - "from azure.ai.evaluation import ToolCallAccuracyEvaluator , AzureOpenAIModelConfiguration\n", + "from azure.ai.evaluation import ToolCallQualityEvaluator , AzureOpenAIModelConfiguration\n", "from pprint import pprint\n", "\n", "model_config = AzureOpenAIModelConfiguration(\n", @@ -85,7 +85,7 @@ ")\n", "\n", "\n", - "tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)" + "tool_call_quality = ToolCallQualityEvaluator(model_config=model_config)" ] }, { @@ -140,7 +140,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = tool_call_accuracy(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n", + "response = tool_call_quality(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n", "pprint(response)" ] }, @@ -197,7 +197,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = tool_call_accuracy(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n", + "response = tool_call_quality(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n", "pprint(response)" ] }, @@ -206,7 +206,7 @@ "metadata": {}, "source": [ "#### Tool Calls passed as part of `Response` (common for agent case)\n", - "- Tool Call Accuracy Evaluator extracts tool calls from response" + "- Tool Call Quality Evaluator extracts tool calls from response" ] }, { diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py index a6a0f3b6805d..b32896e41014 100644 --- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py @@ -541,9 +541,9 @@ def evaluation_evaluate_classes_methods(self): ) # [END groundedness_pro_evaluator] - # [START tool_call_accuracy_evaluator] + # [START tool_call_quality_evaluator] import os - from azure.ai.evaluation import ToolCallAccuracyEvaluator + from azure.ai.evaluation import ToolCallQualityEvaluator model_config = { "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"), @@ -551,8 +551,8 @@ def evaluation_evaluate_classes_methods(self): "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"), } - tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config) - tool_call_accuracy_evaluator( + tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config) + tool_call_quality_evaluator( query="How is the weather in New York?", response="The weather in New York is sunny.", tool_calls={ @@ -573,7 +573,7 @@ def evaluation_evaluate_classes_methods(self): }, }, ) - # [END tool_call_accuracy_evaluator] + # [END tool_call_quality_evaluator] # [START path_efficiency_evaluator] from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py index d6b023a581b1..29c324ec16fc 100644 --- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py +++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py @@ -547,9 +547,9 @@ def evaluation_evaluate_classes_methods(self): ) # [END 
groundedness_pro_evaluator] - # [START tool_call_accuracy_evaluator] + # [START tool_call_quality_evaluator] import os - from azure.ai.evaluation import ToolCallAccuracyEvaluator + from azure.ai.evaluation import ToolCallQualityEvaluator model_config = { "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"), # https://.services.ai.azure.com @@ -557,8 +557,8 @@ def evaluation_evaluate_classes_methods(self): "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"), } - tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config) - tool_call_accuracy_evaluator( + tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config) + tool_call_quality_evaluator( query="How is the weather in New York?", response="The weather in New York is sunny.", tool_calls={ @@ -579,7 +579,7 @@ def evaluation_evaluate_classes_methods(self): }, }, ) - # [END tool_call_accuracy_evaluator] + # [END tool_call_quality_evaluator] # [START path_efficiency_evaluator] from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py index 3b3580817eb5..e9de7635c046 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py @@ -1,16 +1,16 @@ import pytest -from azure.ai.evaluation import evaluate, ToolCallAccuracyEvaluator +from azure.ai.evaluation import evaluate, ToolCallQualityEvaluator from azure.ai.evaluation._exceptions import EvaluationException @pytest.mark.usefixtures("mock_model_config") @pytest.mark.unittest class TestEvaluate: - def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): - tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=mock_model_config) + def test_tool_call_quality_evaluator_missing_inputs(self, mock_model_config): + tool_call_quality = ToolCallQualityEvaluator(model_config=mock_model_config) # Test with missing tool_calls and response - result = tool_call_accuracy( + result = tool_call_quality( query="Where is the Eiffel Tower?", tool_definitions=[ { @@ -28,14 +28,13 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } ], ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert result[ToolCallQualityEvaluator._RESULT_KEY] == ToolCallQualityEvaluator._NOT_APPLICABLE_RESULT assert ( - ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] + ToolCallQualityEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallQualityEvaluator._RESULT_KEY}_reason"] ) # Test with missing tool_definitions - result = tool_call_accuracy( + result = tool_call_quality( query="Where is the Eiffel Tower?", tool_definitions=[], tool_calls=[ @@ -46,14 +45,14 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } ], ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert result[ToolCallQualityEvaluator._RESULT_KEY] == ToolCallQualityEvaluator._NOT_APPLICABLE_RESULT assert ( - ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE - in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] + ToolCallQualityEvaluator._NO_TOOL_DEFINITIONS_MESSAGE + in result[f"{ToolCallQualityEvaluator._RESULT_KEY}_reason"] ) # Test 
with response that has no tool calls - result = tool_call_accuracy( + result = tool_call_quality( query="Where is the Eiffel Tower?", response="The Eiffel Tower is in Paris.", tool_definitions=[ @@ -72,14 +71,13 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } ], ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert result[ToolCallQualityEvaluator._RESULT_KEY] == ToolCallQualityEvaluator._NOT_APPLICABLE_RESULT assert ( - ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] + ToolCallQualityEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallQualityEvaluator._RESULT_KEY}_reason"] ) # Test with tool call for which definition is not provided - result = tool_call_accuracy( + result = tool_call_quality( query="Where is the Eiffel Tower?", tool_calls=[{"type": "tool_call", "name": "some_other_tool", "arguments": {}}], tool_definitions=[ @@ -98,8 +96,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } ], ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert result[ToolCallQualityEvaluator._RESULT_KEY] == ToolCallQualityEvaluator._NOT_APPLICABLE_RESULT assert ( - ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] + ToolCallQualityEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + in result[f"{ToolCallQualityEvaluator._RESULT_KEY}_reason"] ) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_quality_evaluator.py similarity index 88% rename from sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py rename to sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_quality_evaluator.py index 714b1b4073e2..ea63f7e79fc4 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_quality_evaluator.py @@ -1,7 +1,7 @@ from unittest.mock import MagicMock import pytest -from azure.ai.evaluation import ToolCallAccuracyEvaluator +from azure.ai.evaluation import ToolCallQualityEvaluator from azure.ai.evaluation._exceptions import EvaluationException @@ -71,9 +71,9 @@ async def flow_side_effect(timeout, **kwargs): @pytest.mark.usefixtures("mock_model_config") @pytest.mark.unittest -class TestToolCallAccuracyEvaluator: +class TestToolCallQualityEvaluator: def test_evaluate_tools_valid1(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with one good and one bad tool call @@ -124,18 +124,18 @@ def test_evaluate_tools_valid1(self, mock_model_config): ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert key in result and f"{key}_result" in result and f"{key}_threshold" in result assert result[key] == 3.0 # Mixed good/bad gets score 3 assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + 
assert result[f"{key}_threshold"] == ToolCallQualityEvaluator._DEFAULT_TOOL_CALL_QUALITY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." assert "details" in result def test_evaluate_tools_valid2(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with two bad tool calls @@ -186,18 +186,18 @@ def test_evaluate_tools_valid2(self, mock_model_config): ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert key in result and f"{key}_result" in result and f"{key}_threshold" in result assert result[key] == 1.0 # All bad gets score 1 assert result[f"{key}_result"] == "fail" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert result[f"{key}_threshold"] == ToolCallQualityEvaluator._DEFAULT_TOOL_CALL_QUALITY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." assert "details" in result def test_evaluate_tools_valid3(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with two good tool calls @@ -248,19 +248,19 @@ def test_evaluate_tools_valid3(self, mock_model_config): ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert key in result and f"{key}_result" in result and f"{key}_threshold" in result assert result[key] == 5.0 # All good gets score 5 assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert result[f"{key}_threshold"] == ToolCallQualityEvaluator._DEFAULT_TOOL_CALL_QUALITY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." 
assert "details" in result def test_evaluate_tools_one_eval_fails(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with an invalid tool call ID to trigger failure @@ -294,7 +294,7 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config): assert "Invalid score value" in str(exc_info.value) def test_evaluate_tools_some_not_applicable(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test with one function tool and one non-function tool @@ -345,16 +345,16 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert result[key] == ToolCallQualityEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + assert result[f"{key}_threshold"] == ToolCallQualityEvaluator._DEFAULT_TOOL_CALL_QUALITY_SCORE + assert result[f"{key}_reason"] == ToolCallQualityEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["details"] == {} def test_evaluate_tools_all_not_applicable(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test with only non-function tools @@ -385,16 +385,16 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert result[key] == ToolCallQualityEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + assert result[f"{key}_threshold"] == ToolCallQualityEvaluator._DEFAULT_TOOL_CALL_QUALITY_SCORE + assert result[f"{key}_reason"] == ToolCallQualityEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["details"] == {} def test_evaluate_tools_no_tools(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test with no tool calls provided @@ -418,16 +418,16 @@ def test_evaluate_tools_no_tools(self, mock_model_config): ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY 
assert result is not None - assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert result[key] == ToolCallQualityEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE + assert result[f"{key}_threshold"] == ToolCallQualityEvaluator._DEFAULT_TOOL_CALL_QUALITY_SCORE + assert result[f"{key}_reason"] == ToolCallQualityEvaluator._NO_TOOL_CALLS_MESSAGE assert result["details"] == {} def test_evaluate_bing_custom_search(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test relevant bing custom search - converter format @@ -445,13 +445,13 @@ def test_evaluate_bing_custom_search(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" def test_evaluate_bing_grounding(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test relevant bing grounding for house prices - converter format @@ -469,13 +469,13 @@ def test_evaluate_bing_grounding(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" def test_evaluate_file_search(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test file search for credit card statement - converter format @@ -491,13 +491,13 @@ def test_evaluate_file_search(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" def test_evaluate_azure_ai_search(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test Azure AI Search for real estate - converter format @@ -513,13 +513,13 @@ def test_evaluate_azure_ai_search(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" def test_evaluate_fabric_dataagent(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = 
ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test Fabric Data Agent for financial analysis - converter format @@ -535,13 +535,13 @@ def test_evaluate_fabric_dataagent(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" def test_evaluate_code_interpreter(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test code interpreter for statistical analysis - converter format @@ -559,13 +559,13 @@ def test_evaluate_code_interpreter(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" def test_evaluate_sharepoint_grounding(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test SharePoint grounding for document search - converter format @@ -581,13 +581,13 @@ def test_evaluate_sharepoint_grounding(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" def test_evaluate_open_api(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test OpenAPI function call for exchange rates - converter format @@ -603,13 +603,13 @@ def test_evaluate_open_api(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == "not applicable" assert result[f"{key}_result"] == "pass" def test_evaluate_open_api_with_tool_definition(self, mock_model_config): - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator = ToolCallQualityEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test OpenAPI function call for exchange rates - converter format @@ -680,7 +680,7 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config): ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._RESULT_KEY + key = ToolCallQualityEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass"
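
The `__init__.py` hunk above keeps the old name importable by binding `ToolCallAccuracyEvaluator` to `ToolCallQualityEvaluator` at module level. A minimal sketch of what that alias implies for callers, assuming a build of `azure-ai-evaluation` with this change applied; the printed value is the `id` attribute defined in the renamed evaluator:

```python
# Both names are exported in __all__, and the module-level alias makes them the
# same class object, so existing code importing ToolCallAccuracyEvaluator keeps working.
from azure.ai.evaluation import ToolCallAccuracyEvaluator, ToolCallQualityEvaluator

assert ToolCallAccuracyEvaluator is ToolCallQualityEvaluator
print(ToolCallQualityEvaluator.id)  # azureai://built-in/evaluators/tool_call_quality
```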
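For completeness, a usage sketch of the renamed evaluator in the style of the updated `[START tool_call_quality_evaluator]` sample: the `model_config` keys follow that sample (the `api_key` entry is assumed, since it falls outside the visible hunk), and the `fetch_weather` tool call and its definition are illustrative payloads rather than content taken from this diff.

```python
import os
from pprint import pprint

from azure.ai.evaluation import ToolCallQualityEvaluator

# Model configuration built from environment variables, as in the updated samples.
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),  # assumed key name
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config)

# Illustrative tool call and tool definition (hypothetical fetch_weather tool).
result = tool_call_quality_evaluator(
    query="How is the weather in New York?",
    response="The weather in New York is sunny.",
    tool_calls=[
        {
            "type": "tool_call",
            "name": "fetch_weather",
            "arguments": {"location": "New York"},
        }
    ],
    tool_definitions=[
        {
            "type": "function",
            "name": "fetch_weather",
            "description": "Fetches weather information for a location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City name."}
                },
            },
        }
    ],
)

# Per the renamed _RESULT_KEY, the score lands under "tool_call_quality", with
# "tool_call_quality_result", "tool_call_quality_threshold", and
# "tool_call_quality_reason" companions, as asserted by the updated unit tests.
pprint(result)
```

Per the updated constructor signature, the evaluator also accepts an optional `threshold` keyword (defaulting to `_DEFAULT_TOOL_CALL_QUALITY_SCORE`, i.e. 3) and a `credential`.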