math evaluator update (#37283)

ninghu · web-flow · commit 7aeef6f433e9 · 2024-09-10T17:41:30.000-07:00
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
@@ -6,7 +6,7 @@
 except ImportError:
     import constants
 
-from typing import List, cast
+from typing import List
 
 import nltk
 import numpy as np
@@ -45,14 +45,11 @@ def get_harm_severity_level(harm_score: int) -> str:
 def nltk_tokenize(text: str) -> List[str]:
     """Tokenize the input text using the NLTK tokenizer."""
 
-    is_latin_or_numeric = all(
-        ("\u0020" <= c <= "\u007E")  # Basic Latin
-        or ("\u00A0" <= c <= "\u00FF")  # Latin-1 Supplement
-        or ("0" <= c <= "9")  # Digits
-        for c in text
-    )
+    if not text.isascii():
+        # Use NISTTokenizer for international tokenization
+        tokens = NISTTokenizer().international_tokenize(text)
+    else:
+        # By default, use NLTK word tokenizer
+        tokens = nltk.word_tokenize(text)
 
-    if is_latin_or_numeric:
-        return cast(List[str], nltk.word_tokenize(text))
-
-    return list(NISTTokenizer().international_tokenize(text))
+    return list(tokens)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluators/_bleu/_bleu.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluators/_bleu/_bleu.py
@@ -15,6 +15,7 @@ async def __call__(self, *, answer: str, ground_truth: str, **kwargs):
         reference_tokens = nltk_tokenize(ground_truth)
         hypothesis_tokens = nltk_tokenize(answer)
 
+        # Chen & Cherry smoothing method 4 (NLTK's NIST geometric smoothing is method3)
         smoothing_function = SmoothingFunction().method4
         score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluators/_meteor/_meteor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluators/_meteor/_meteor.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from nltk.translate.meteor_score import single_meteor_score
+from nltk.translate.meteor_score import meteor_score
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from azure.ai.evaluation._common.utils import nltk_tokenize
@@ -17,8 +17,8 @@ async def __call__(self, *, ground_truth: str, answer: str, **kwargs):
         reference_tokens = nltk_tokenize(ground_truth)
         hypothesis_tokens = nltk_tokenize(answer)
 
-        score = single_meteor_score(
-            reference_tokens,
+        score = meteor_score(
+            [reference_tokens],
             hypothesis_tokens,
             alpha=self._alpha,
             beta=self._beta,