Merged
@@ -30,7 +30,7 @@
from ._evaluators._xpia import IndirectAttackEvaluator
from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
from ._evaluators._tool_call_quality import ToolCallQualityEvaluator
from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
from ._model_configurations import (
AzureAIProject,
@@ -130,7 +130,8 @@ def lazy_import():
"EvaluationResult",
"CodeVulnerabilityEvaluator",
"UngroundedAttributesEvaluator",
"ToolCallAccuracyEvaluator",
"ToolCallQualityEvaluator",
"ToolCallAccuracyEvaluator", # Backward compatibility alias
"AzureOpenAIGrader",
"AzureOpenAILabelGrader",
"AzureOpenAIStringCheckGrader",
@@ -141,6 +142,9 @@ def lazy_import():

__all__.extend([p for p in _patch_all if p not in __all__])

# Backward compatibility alias
ToolCallAccuracyEvaluator = ToolCallQualityEvaluator


def __getattr__(name):
"""Handle lazy imports for optional dependencies."""
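A minimal sketch of the compatibility contract the alias above creates. Only the public import path shown in this file is assumed; the assert is illustrative, not part of the package:

from azure.ai.evaluation import ToolCallAccuracyEvaluator, ToolCallQualityEvaluator

# After this change both names resolve to the same class object,
# so existing code written against the old name keeps working unchanged.
assert ToolCallAccuracyEvaluator is ToolCallQualityEvaluator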
@@ -37,7 +37,8 @@
SexualEvaluator,
SimilarityEvaluator,
TaskAdherenceEvaluator,
ToolCallAccuracyEvaluator,
ToolCallAccuracyEvaluator, # Backward compatibility alias
ToolCallQualityEvaluator,
UngroundedAttributesEvaluator,
ViolenceEvaluator,
)
@@ -69,7 +70,8 @@
SimilarityEvaluator: "similarity",
TaskAdherenceEvaluator: "task_adherence",
TaskCompletionEvaluator: "task_completion",
ToolCallAccuracyEvaluator: "tool_call_accuracy",
ToolCallAccuracyEvaluator: "tool_call_quality", # Backward compatibility
ToolCallQualityEvaluator: "tool_call_quality",
UngroundedAttributesEvaluator: "ungrounded_attributes",
ViolenceEvaluator: "violence",
}
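Because of that alias, the two entries above are the same dictionary key, so the mapping collapses to a single entry for the class. A hedged sketch of this behavior; the real variable name of the mapping is not shown in this hunk, so _EVALUATOR_METRIC_MAPPING below is a placeholder:

from azure.ai.evaluation import ToolCallAccuracyEvaluator, ToolCallQualityEvaluator

# Placeholder name for the class-to-metric mapping shown above.
_EVALUATOR_METRIC_MAPPING = {
    ToolCallAccuracyEvaluator: "tool_call_quality",  # Backward compatibility
    ToolCallQualityEvaluator: "tool_call_quality",
}

# The alias makes both keys identical, so the dict holds one entry and
# lookups by either name return the same metric string.
assert len(_EVALUATOR_METRIC_MAPPING) == 1
assert _EVALUATOR_METRIC_MAPPING[ToolCallAccuracyEvaluator] == "tool_call_quality"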
@@ -2,8 +2,8 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._tool_call_accuracy import ToolCallAccuracyEvaluator
from ._tool_call_quality import ToolCallQualityEvaluator

__all__ = [
"ToolCallAccuracyEvaluator",
"ToolCallQualityEvaluator",
]
@@ -58,14 +58,14 @@ def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:


@experimental
class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
"""The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
class ToolCallQualityEvaluator(PromptyEvaluatorBase[Union[str, float]]):
"""The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining:
- Relevance to the conversation.
- Parameter correctness according to tool definitions.
- Parameter value extraction from the conversation.

The evaluator uses a scoring rubric of 1 to 5:
- Score 1: The tool calls are irrelevant
- Score 1: The tool calls are irrelevant.
- Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
- Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
- Score 4: The tool calls are relevant, but some tools returned errors and the agent retried calling them and succeeded.
@@ -82,20 +82,20 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
.. admonition:: Example:

.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START tool_call_accuracy_evaluator]
:end-before: [END tool_call_accuracy_evaluator]
:start-after: [START tool_call_quality_evaluator]
:end-before: [END tool_call_quality_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a ToolCallAccuracyEvaluator.
:caption: Initialize and call a ToolCallQualityEvaluator.

.. admonition:: Example using Azure AI Project URL:

.. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
:start-after: [START tool_call_accuracy_evaluator]
:end-before: [END tool_call_accuracy_evaluator]
:start-after: [START tool_call_quality_evaluator]
:end-before: [END tool_call_quality_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
:caption: Initialize and call ToolCallQualityEvaluator using Azure AI Project URL in the following format
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

.. note::
@@ -105,25 +105,25 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

_PROMPTY_FILE = "tool_call_accuracy.prompty"
_RESULT_KEY = "tool_call_accuracy"
_PROMPTY_FILE = "tool_call_quality.prompty"
_RESULT_KEY = "tool_call_quality"

_MAX_TOOL_CALL_ACCURACY_SCORE = 5
_MIN_TOOL_CALL_ACCURACY_SCORE = 1
_DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3
_MAX_TOOL_CALL_QUALITY_SCORE = 5
_MIN_TOOL_CALL_QUALITY_SCORE = 1
_DEFAULT_TOOL_CALL_QUALITY_SCORE = 3

_NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
_NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
_TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
_INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
_INVALID_SCORE_MESSAGE = "Tool call quality score must be between 1 and 5."

_LLM_SCORE_KEY = "tool_calls_success_level"

id = "azureai://built-in/evaluators/tool_call_accuracy"
id = "azureai://built-in/evaluators/tool_call_quality"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_QUALITY_SCORE, credential=None, **kwargs):
current_dir = os.path.dirname(__file__)
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
self.threshold = threshold
@@ -241,11 +241,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
score = llm_output.get(self._LLM_SCORE_KEY, None)
if not score or not check_score_is_valid(
score,
ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE,
ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE,
):
raise EvaluationException(
message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
message=f"Invalid score value: {score}. Expected a number in range [{ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE}, {ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE}].",
internal_message="Invalid score value.",
category=ErrorCategory.FAILED_EXECUTION,
blame=ErrorBlame.SYSTEM_ERROR,
@@ -266,10 +266,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

else:
raise EvaluationException(
message="Tool call accuracy evaluator returned invalid output.",
message="Tool call quality evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
)

async def _real_call(self, **kwargs):
@@ -346,30 +346,30 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
message=f"Tool definition for {tool_name} not found",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
)
else:
raise EvaluationException(
message=f"Tool call missing name: {tool_call}",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
)
else:
# Unsupported tool format - only converter format is supported
raise EvaluationException(
message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
)
else:
# Tool call is not a dictionary
raise EvaluationException(
message=f"Tool call is not a dictionary: {tool_call}",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
)

return needed_tool_definitions
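The checks above accept only converter-format tool calls. A hedged sketch of inputs that would pass them; field names are inferred from the error paths in this diff and the samples later in this PR, so treat the exact shapes as assumptions rather than the documented schema:

# Hypothetical converter-format tool call: a dict carrying the tool name and arguments.
tool_call = {
    "type": "tool_call",
    "tool_call_id": "call_001",
    "name": "fetch_weather",
    "arguments": {"location": "New York"},
}

# Matching definition; without it the evaluator raises "Tool definition for fetch_weather not found".
tool_definition = {
    "name": "fetch_weather",
    "description": "Fetches the weather for a given location.",
    "parameters": {
        "type": "object",
        "properties": {"location": {"type": "string"}},
    },
}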
@@ -1,6 +1,6 @@
---
name: Tool Call Accuracy
description: Evaluates Tool Call Accuracy for tool used by agent
name: Tool Call Quality
description: Evaluates Tool Call Quality for tools used by the agent
model:
api: chat
parameters:
@@ -96,6 +96,7 @@ class ErrorTarget(Enum):
UNKNOWN = "Unknown"
CONVERSATION = "Conversation"
TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
TOOL_CALL_QUALITY_EVALUATOR = "ToolCallQualityEvaluator"
RED_TEAM = "RedTeam"
AOAI_GRADER = "AoaiGrader"
CONVERSATION_HISTORY_PARSING = "_get_conversation_history"
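A short sketch of how the new enum member is used at the evaluator's raise sites, mirroring the calls earlier in this diff. The private module path is an assumption, not something this hunk shows:

# Assumed internal module for the exception helpers referenced in this PR.
from azure.ai.evaluation._exceptions import (
    ErrorBlame,
    ErrorCategory,
    ErrorTarget,
    EvaluationException,
)


def raise_invalid_output() -> None:
    # Mirrors the failure path in ToolCallQualityEvaluator._do_eval.
    raise EvaluationException(
        message="Tool call quality evaluator returned invalid output.",
        blame=ErrorBlame.SYSTEM_ERROR,
        category=ErrorCategory.FAILED_EXECUTION,
        target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
    )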
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tool Call Accuracy Evaluator"
"# Tool Call Quality Evaluator"
]
},
{
@@ -13,7 +13,7 @@
"source": [
"### Getting Started\n",
"\n",
"This sample demonstrates how to use Tool Call Accuracy Evaluator\n",
"This sample demonstrates how to use Tool Call Quality Evaluator\n",
"Before running the sample:\n",
"```bash\n",
"pip install azure-ai-projects azure-identity azure-ai-evaluation\n",
@@ -33,7 +33,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:\n",
"The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining:\n",
"- Relevance to the conversation\n",
"- Parameter correctness according to tool definitions\n",
"- Parameter value extraction from the conversation\n",
@@ -53,18 +53,18 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Tool Call Accuracy requires following input:\n",
"Tool Call Quality requires following input:\n",
"- Query - This can be a single query or a list of messages(conversation history with agent). Latter helps to determine if Agent used the information in history to make right tool calls.\n",
"- Tool Calls - Tool Call(s) made by Agent to answer the query. Optional - if response has tool calls, if not provided evaluator will look for tool calls in response.\n",
"- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Accuracy Evaluator will look at response for tool calls.\n",
"- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Quality Evaluator will look at response for tool calls.\n",
"- Tool Definitions - Tool(s) definition used by Agent to answer the query. \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Initialize Tool Call Accuracy Evaluator\n"
"### Initialize Tool Call Quality Evaluator\n"
]
},
{
Expand All @@ -74,7 +74,7 @@
"outputs": [],
"source": [
"import os\n",
"from azure.ai.evaluation import ToolCallAccuracyEvaluator , AzureOpenAIModelConfiguration\n",
"from azure.ai.evaluation import ToolCallQualityEvaluator , AzureOpenAIModelConfiguration\n",
"from pprint import pprint\n",
"\n",
"model_config = AzureOpenAIModelConfiguration(\n",
@@ -85,7 +85,7 @@
")\n",
"\n",
"\n",
"tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)"
"tool_call_quality = ToolCallQualityEvaluator(model_config=model_config)"
]
},
{
@@ -140,7 +140,7 @@
"metadata": {},
"outputs": [],
"source": [
"response = tool_call_accuracy(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n",
"response = tool_call_quality(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n",
"pprint(response)"
]
},
@@ -197,7 +197,7 @@
"metadata": {},
"outputs": [],
"source": [
"response = tool_call_accuracy(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n",
"response = tool_call_quality(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n",
"pprint(response)"
]
},
@@ -206,7 +206,7 @@
"metadata": {},
"source": [
"#### Tool Calls passed as part of `Response` (common for agent case)\n",
"- Tool Call Accuracy Evaluator extracts tool calls from response"
"- Tool Call Quality Evaluator extracts tool calls from response"
]
},
{
@@ -541,18 +541,18 @@ def evaluation_evaluate_classes_methods(self):
)
# [END groundedness_pro_evaluator]

# [START tool_call_accuracy_evaluator]
# [START tool_call_quality_evaluator]
import os
from azure.ai.evaluation import ToolCallAccuracyEvaluator
from azure.ai.evaluation import ToolCallQualityEvaluator

model_config = {
"azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
"api_key": os.environ.get("AZURE_OPENAI_KEY"),
"azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
tool_call_accuracy_evaluator(
tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config)
tool_call_quality_evaluator(
query="How is the weather in New York?",
response="The weather in New York is sunny.",
tool_calls={
@@ -573,7 +573,7 @@
},
},
)
# [END tool_call_accuracy_evaluator]
# [END tool_call_quality_evaluator]

# [START path_efficiency_evaluator]
from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator
@@ -547,18 +547,18 @@ def evaluation_evaluate_classes_methods(self):
)
# [END groundedness_pro_evaluator]

# [START tool_call_accuracy_evaluator]
# [START tool_call_quality_evaluator]
import os
from azure.ai.evaluation import ToolCallAccuracyEvaluator
from azure.ai.evaluation import ToolCallQualityEvaluator

model_config = {
"azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"), # https://<account_name>.services.ai.azure.com
"api_key": os.environ.get("AZURE_OPENAI_KEY"),
"azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
tool_call_accuracy_evaluator(
tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config)
tool_call_quality_evaluator(
query="How is the weather in New York?",
response="The weather in New York is sunny.",
tool_calls={
@@ -579,7 +579,7 @@
},
},
)
# [END tool_call_accuracy_evaluator]
# [END tool_call_quality_evaluator]

# [START path_efficiency_evaluator]
from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator