DEF_MAX_LENGTH_QWEN = 8192


+def reranking_base_on_causallm_arch(config):
+    return config.model_type == "qwen3" and "Qwen3ForCausalLM" in config.architectures
+
+
def preprocess_fn(example):
    return {
        "query": example["query"],
@@ -27,7 +31,7 @@ def preprocess_fn(example):
def prepare_default_data(num_samples=None):
    DATASET_NAME = "microsoft/ms_marco"
    NUM_SAMPLES = num_samples if num_samples else 24
-    set_seed(70)
+    set_seed(42)
    default_dataset = datasets.load_dataset(
        DATASET_NAME, 'v2.1', split="test", streaming=True
    ).shuffle(42).take(NUM_SAMPLES)
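For reference, the streaming pattern above can be exercised standalone; a sketch assuming network access to the Hugging Face Hub (same dataset name, config, and split as in the diff):

import datasets

# streaming=True avoids downloading all of MS MARCO; shuffle(42) fixes the sample
# order so repeated runs evaluate the same queries.
ds = datasets.load_dataset("microsoft/ms_marco", 'v2.1', split="test", streaming=True)
for example in ds.shuffle(42).take(2):
    print(example["query"])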
@@ -65,7 +69,7 @@ def __init__(
        self.gt_data = pd.read_csv(gt_data, keep_default_na=False)

        self.similarity = RerankingSimilarity()
-        # self.last_cmp = None
+        self.last_cmp = None

    def get_generation_fn(self):
        return self.generation_fn
@@ -106,6 +110,9 @@ def default_gen_answer(model, tokenizer, query, passages):
    device = "cpu"
    if hasattr(model, "device"):
        device = model.device
+
+    # pre-/post-processing for qwen models added per the transformers Qwen3-Reranker-0.6B model card:
+    # https://huggingface.co/Qwen/Qwen3-Reranker-0.6B#transformers-usage
    if model.config.model_type == "qwen3":
        prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the ' \
            + 'Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
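For context, the prefix shown here is the first half of the chat template from the linked model card; a sketch of the full prompt it frames, where the suffix, instruction text, and query/document values follow the card's example and are not part of this diff:

prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the ' \
    + 'Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"

# Per the model card, each query/passage pair is rendered between prefix and suffix as:
instruction = "Given a web search query, retrieve relevant passages that answer the query"
pair = f"<Instruct>: {instruction}\n<Query>: What is the capital of China?\n<Document>: The capital of China is Beijing."
print(prefix + pair + suffix)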
@@ -122,9 +129,7 @@ def default_gen_answer(model, tokenizer, query, passages):
        )
        for i, ele in enumerate(input_data["input_ids"]):
            input_data["input_ids"][i] = prefix_tokens + ele + suffix_tokens
-        input_data = tokenizer.pad(input_data, padding=True, return_tensors="pt", max_length=DEF_MAX_LENGTH_QWEN)
-        for key in input_data:
-            input_data[key] = input_data[key].to(device)
+        input_data = tokenizer.pad(input_data, padding=True, return_tensors="pt", max_length=DEF_MAX_LENGTH_QWEN).to(device)
    else:
        tokenizer_kwargs = {"truncation": True, "padding": True, "max_length": DEF_MAX_LENGTH}
        inputs = [query] * len(passages)
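The one-liner introduced above works because tokenizer.pad returns a transformers BatchEncoding, whose .to() moves every contained tensor in one call; a sketch, assuming the tokenizer from the linked model card is available locally:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", padding_side="left")
batch = tok.pad({"input_ids": [tok.encode("short"), tok.encode("a longer input")]},
                padding=True, return_tensors="pt")
# BatchEncoding.to() relocates input_ids and attention_mask together,
# replacing the removed per-key loop.
batch = batch.to("cpu")
print(batch["input_ids"].shape, batch["attention_mask"].shape)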
@@ -133,9 +138,8 @@ def default_gen_answer(model, tokenizer, query, passages):
    with torch.no_grad():
        outputs = model(**input_data).logits

-    if model.config.model_type == "qwen3":
+    if reranking_base_on_causallm_arch(model.config):
        batch_scores = outputs[:, -1, :]
-
        token_false_id = tokenizer.convert_tokens_to_ids("no")
        token_true_id = tokenizer.convert_tokens_to_ids("yes")
        true_vector = batch_scores[:, token_true_id]
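Beyond the end of this hunk, the model card turns the two token logits into a relevance score with a log-softmax over just the "yes"/"no" pair; a toy sketch of that step with stand-in logits and token ids (nothing here comes from the diff itself):

import torch

# Toy last-token logits for a 2-passage batch over a 3-token vocabulary.
batch_scores = torch.tensor([[0.1, 2.0, -1.0], [0.3, -0.5, 1.5]])
token_true_id, token_false_id = 1, 2  # stand-ins for the ids of "yes" / "no"

true_vector = batch_scores[:, token_true_id]
false_vector = batch_scores[:, token_false_id]
# Softmax over the two candidate answers; column 1 is P("yes"), used as the score.
pair = torch.stack([false_vector, true_vector], dim=1)
scores = torch.nn.functional.log_softmax(pair, dim=1)[:, 1].exp()
print(scores)  # higher = more relevant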