From c20a0eed863d78367c336e277e4cd693913dc5f9 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Wed, 1 Oct 2025 19:17:53 +0100
Subject: [PATCH 1/3] [wwb] Add text reranking pipeline

---
 .../tests/test_cli_reranking.py               | 140 +++++++++++++
 .../whowhatbench/__init__.py                  |   2 +
 .../whowhatbench/model_loaders.py             |  76 ++++++-
 .../whowhatbench/reranking_evaluator.py       | 187 ++++++++++++++++++
 .../whowhatbench/whowhat_metrics.py           |  36 ++++
 tools/who_what_benchmark/whowhatbench/wwb.py  |  38 +++-
 6 files changed, 475 insertions(+), 4 deletions(-)
 create mode 100644 tools/who_what_benchmark/tests/test_cli_reranking.py
 create mode 100644 tools/who_what_benchmark/whowhatbench/reranking_evaluator.py

diff --git a/tools/who_what_benchmark/tests/test_cli_reranking.py b/tools/who_what_benchmark/tests/test_cli_reranking.py
new file mode 100644
index 0000000000..4ddeb890c3
--- /dev/null
+++ b/tools/who_what_benchmark/tests/test_cli_reranking.py
@@ -0,0 +1,140 @@
+import subprocess  # nosec B404
+import pytest
+import logging
+from test_cli_image import run_wwb
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.parametrize(
+    ("model_id", "model_type"),
+    [
+        ("cross-encoder/ms-marco-TinyBERT-L2-v2", "text-reranking"),
+    ],
+)
+def test_reranking_basic(model_id, model_type, tmp_path):
+    GT_FILE = tmp_path / "gt.csv"
+    MODEL_PATH = tmp_path / model_id.replace("/", "--")
+
+    result = subprocess.run(["optimum-cli", "export",
+                             "openvino", "-m", model_id,
+                             MODEL_PATH, "--task",
+                             "text-classification",
+                             "--trust-remote-code"],
+                            capture_output=True,
+                            text=True,
+                            )
+    assert result.returncode == 0
+
+    # Collect reference with HF model
+    run_wwb([
+        "--base-model",
+        model_id,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--hf",
+    ])
+
+    # test Optimum
+    run_wwb([
+        "--target-model",
+        MODEL_PATH,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+    ])
+
+    # test GenAI
+    run_wwb([
+        "--target-model",
+        MODEL_PATH,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--genai",
+        "--output",
+        tmp_path,
+    ])
+
+    # test w/o models
+    run_wwb([
+        "--target-data",
+        tmp_path / "target.csv",
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--genai",
+    ])
+
+
+@pytest.mark.parametrize(
+    ("model_id", "model_type"),
+    [
+        ("Qwen/Qwen3-Reranker-0.6B", "text-reranking"),
+    ],
+)
+def test_reranking_qwen(model_id, model_type, tmp_path):
+    GT_FILE = tmp_path / "gt.csv"
+    MODEL_PATH = tmp_path / model_id.replace("/", "--")
+
+    result = subprocess.run(["optimum-cli", "export",
+                             "openvino", "-m", model_id,
+                             MODEL_PATH, "--task",
+                             "text-generation",
+                             "--trust-remote-code"],
+                            capture_output=True,
+                            text=True,
+                            )
+    assert result.returncode == 0
+
+    # Collect reference with HF model
+    run_wwb([
+        "--base-model",
+        model_id,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+        "--hf",
+    ])
+
+    # test Optimum
+    run_wwb([
+        "--target-model",
+        MODEL_PATH,
+        "--num-samples",
+        "1",
+        "--gt-data",
+        GT_FILE,
+        "--device",
+        "CPU",
+        "--model-type",
+        model_type,
+    ])
diff --git a/tools/who_what_benchmark/whowhatbench/__init__.py b/tools/who_what_benchmark/whowhatbench/__init__.py
index 1fe4511880..dc37e25954 100644
--- a/tools/who_what_benchmark/whowhatbench/__init__.py
+++ b/tools/who_what_benchmark/whowhatbench/__init__.py
@@ -6,6 +6,7 @@
 from .im2im_evaluator import Image2ImageEvaluator
 from .inpaint_evaluator import InpaintingEvaluator
 from .embeddings_evaluator import EmbeddingsEvaluator
+from .reranking_evaluator import RerankingEvaluator
 
 
 __all__ = [
@@ -17,5 +18,6 @@
     "Image2ImageEvaluator",
     "InpaintingEvaluator",
     "EmbeddingsEvaluator",
+    "RerankingEvaluator",
     "EVALUATOR_REGISTRY",
 ]
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index 7b8de76d40..cd69aa219e 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -4,7 +4,9 @@
 from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, AutoModelForVision2Seq, AutoTokenizer
 
-from .embeddings_evaluator import DEFAULT_MAX_LENGTH
+from .embeddings_evaluator import DEFAULT_MAX_LENGTH as EMBED_DEFAULT_MAX_LENGTH
+from .reranking_evaluator import DEFAULT_MAX_LENGTH as RERANK_DEFAULT_MAX_LENGTH
+from .reranking_evaluator import DEFAULT_TOP_K as RERANK_DEFAULT_TOP_K
 from .utils import mock_torch_cuda_is_available, mock_AwqQuantizer_validate_environment
 
 
@@ -21,7 +23,7 @@ def __init__(self, model, model_dir, model_type):
         self.model = model
         self.model_type = model_type
 
-        if model_type in ["text", "visual-text", "text-embedding"]:
+        if model_type in ["text", "visual-text", "text-embedding", "text-reranking"]:
             try:
                 self.config = AutoConfig.from_pretrained(model_dir)
             except Exception:
@@ -444,7 +446,7 @@ def load_embedding_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwa
         config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.LAST_TOKEN
     else:
         config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.CLS
-    config.max_length = DEFAULT_MAX_LENGTH
+    config.max_length = EMBED_DEFAULT_MAX_LENGTH
     config.normalize = kwargs.get("embeds_normalize", False)
     config.pad_to_max_length = True
 
@@ -485,6 +487,72 @@ def load_embedding_model(model_id, device="CPU", ov_config=None, use_hf=False, u
         )
     return model
 
+def load_reranking_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError as e:
+        logger.error("Failed to import openvino_genai package. Please install it. Details:\n%s", e)
+        exit(-1)
+
+    logger.info("Using OpenVINO GenAI TextRerankPipeline API")
+
+    config = openvino_genai.TextRerankPipeline.Config()
+    config.top_n = RERANK_DEFAULT_TOP_K
+    config.max_length = RERANK_DEFAULT_MAX_LENGTH
+
+    pipeline = openvino_genai.TextRerankPipeline(model_dir, device.upper(), config, **ov_config)
+
+    return GenAIModelWrapper(
+        pipeline,
+        model_dir,
+        "text-reranking"
+    )
+
+
+def load_reranking_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False):
+    try:
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=False)
+    except Exception:
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        if reranking_base_on_causallm_arch(config):
+            from transformers import AutoModelForCausalLM
+            model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
+        else:
+            from transformers import AutoModelForSequenceClassification
+            model = AutoModelForSequenceClassification.from_pretrained(model_id, trust_remote_code=True)
+    elif use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_reranking_genai_pipeline(model_id, device, ov_config)
+    else:
+        logger.info("Using Optimum API")
+        model_cls = None
+        if reranking_base_on_causallm_arch(config):
+            from optimum.intel.openvino import OVModelForCausalLM
+            model_cls = OVModelForCausalLM
+        else:
+            from optimum.intel.openvino import OVModelForSequenceClassification
+            model_cls = OVModelForSequenceClassification
+
+        try:
+            model = model_cls.from_pretrained(
+                model_id, device=device, ov_config=ov_config, safety_checker=None,
+            )
+        except ValueError as e:
+            logger.error("Failed to load reranking pipeline, an attempt will be made again with updated parameters. Details:\n%s", e)
+            model = model_cls.from_pretrained(
+                model_id,
+                trust_remote_code=True,
+                use_cache=False,
+                device=device,
+                ov_config=ov_config,
+                safety_checker=None
+            )
+
+    return model
+
 def load_model(
     model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False,
     **kwargs
@@ -512,5 +580,7 @@ def load_model(
         return load_inpainting_model(model_id, device, ov_options, use_hf, use_genai)
     elif model_type == "text-embedding":
         return load_embedding_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
+    elif model_type == "text-reranking":
+        return load_reranking_model(model_id, device, ov_options, use_hf, use_genai)
     else:
         raise ValueError(f"Unsupported model type: {model_type}")
diff --git a/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py b/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
new file mode 100644
index 0000000000..2f7a2594f8
--- /dev/null
+++ b/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
@@ -0,0 +1,187 @@
+from typing import Any, Union
+
+import os
+import scipy
+import torch
+import pandas as pd
+from tqdm import tqdm
+from .registry import register_evaluator, BaseEvaluator
+from .whowhat_metrics import RerankingSimilarity
+from transformers import set_seed
+import datasets
+import numpy as np
+
+
+DEF_TOP_K = 5
+DEFAULT_MAX_LENGTH = 200
+DEFAULT_MAX_LENGTH_QWEN = 8192
+
+
+def preprocess_fn(example):
+    return {
+        "query": example["query"],
+        "passages": example["passages"]["passage_text"],
+    }
+
+
+def prepare_default_data(num_samples=None):
+    DATASET_NAME = "microsoft/ms_marco"
+    NUM_SAMPLES = num_samples if num_samples else 24
+    set_seed(70)
+    default_dataset = datasets.load_dataset(
+        DATASET_NAME, 'v2.1', split="test", streaming=True
+    ).shuffle(42).take(NUM_SAMPLES)
+    return default_dataset.map(
+        lambda x: preprocess_fn(x), remove_columns=default_dataset.column_names
+    )
+
+
+@register_evaluator(
+    "text-reranking"
+)
+class RerankingEvaluator(BaseEvaluator):
+    def __init__(
+        self,
+        base_model: Any = None,
+        tokenizer: Any = None,
+        gt_data: str = None,
+        test_data: Union[str, list] = None,
+        num_samples=None,
+        gen_rerank_fn=None
+    ) -> None:
+        assert (
+            base_model is not None or gt_data is not None
+        ), "Text generation pipeline for evaluation or ground truth data must be defined"
+
+        self.test_data = test_data
+        self.tokenizer = tokenizer
+        self.num_samples = num_samples
+        self.generation_fn = gen_rerank_fn
+        self.gt_dir = os.path.dirname(gt_data)
+
+        if base_model:
+            self.gt_data = self._generate_data(base_model, gen_rerank_fn)
+        else:
+            self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
+
+        self.similarity = RerankingSimilarity()
+        # self.last_cmp = None
+
+    def get_generation_fn(self):
+        return self.generation_fn
+
+    def score(self, model_or_data, gen_answer_fn=None, output_dir=None, **kwargs):
+        if output_dir is None:
+            result_folder = os.path.join(self.gt_dir, "target")
+        else:
+            result_folder = os.path.join(output_dir, "target")
+
+        if isinstance(model_or_data, str) and os.path.exists(model_or_data):
+            predictions = pd.read_csv(model_or_data, keep_default_na=False)
+        else:
+            predictions = self._generate_data(model_or_data, gen_answer_fn, result_folder)
+        self.predictions = predictions
+
+        all_metrics, all_metrics_per_query = self.similarity.evaluate(
+            self.gt_data, predictions
+        )
+
+        self.last_cmp = all_metrics_per_query
+        self.last_cmp["query"] = predictions["query"].values
+        self.last_cmp["passages"] = predictions["passages"].values
+        self.last_cmp["source_model"] = self.gt_data["top_n_scores_path"].values
+        self.last_cmp["optimized_model"] = predictions["top_n_scores_path"].values
+        self.last_cmp = pd.DataFrame(self.last_cmp)
+
+        return pd.DataFrame(all_metrics_per_query), pd.DataFrame([all_metrics])
+
+    def worst_examples(self, top_k: int = 5, metric="similarity"):
+        assert self.last_cmp is not None
+        res = self.last_cmp.nsmallest(top_k, metric)
+        res = list(row for idx, row in res.iterrows())
+        return res
+
+    def _generate_data(self, model, gen_answer_fn=None, result_dir="reference"):
+        def default_gen_answer(model, tokenizer, query, passages):
+            device = "cpu"
+            if hasattr(model, "device"):
+                device = model.device
+            if model.config.model_type == "qwen3":
+                prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the'\
+                    + 'Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
+                suffix = "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"
+                task = "Given a web search query, retrieve relevant passages that answer the query"
+                prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
+                suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
+                pairs = []
+                for doc in passages:
+                    pairs.append(f": {task}\n: {query}\n: {doc}")
+                input_data = tokenizer(
+                    pairs, padding=False, truncation="longest_first", return_attention_mask=False,
+                    max_length=DEFAULT_MAX_LENGTH_QWEN - len(prefix_tokens) - len(suffix_tokens)
+                )
+                for i, ele in enumerate(input_data["input_ids"]):
+                    input_data["input_ids"][i] = prefix_tokens + ele + suffix_tokens
+                input_data = tokenizer.pad(input_data, padding=True, return_tensors="pt", max_length=DEFAULT_MAX_LENGTH_QWEN)
+                for key in input_data:
+                    input_data[key] = input_data[key].to(device)
+            else:
+                tokenizer_kwargs = {"truncation": True, "padding": True, "max_length": DEFAULT_MAX_LENGTH}
+                inputs = [query] * len(passages)
+                input_data = tokenizer(inputs, passages, return_tensors="pt", **tokenizer_kwargs)
+
+            with torch.no_grad():
+                outputs = model(**input_data).logits
+
+            if model.config.model_type == "qwen3":
+                batch_scores = outputs[:, -1, :]
+
+                token_false_id = tokenizer.convert_tokens_to_ids("no")
+                token_true_id = tokenizer.convert_tokens_to_ids("yes")
+                true_vector = batch_scores[:, token_true_id]
+                false_vector = batch_scores[:, token_false_id]
+                batch_scores = torch.stack([false_vector, true_vector], dim=1)
+                batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+                scores = batch_scores[:, 1].exp()
+            else:
+                if outputs.shape[1] > 1:
+                    scores = outputs[:, 1]
+                else:
+                    scores = outputs.flatten()
+                scores = scipy.special.expit(scores)
+            sorted_scores = []
+            for index, (score, _) in enumerate(zip(scores, passages)):
+                sorted_scores.append(np.array([index, score.numpy()]))
+            sorted_scores.sort(key=lambda x: x[1], reverse=True)
+            return np.array(sorted_scores[:DEF_TOP_K])
+
+        gen_answer_fn = gen_answer_fn or default_gen_answer
+
+        # TODO: add possibility to use custom dataset/csv
+        data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+
+        scores_path = []
+        passages = []
+        query = []
+        inptus = (
+            data.values
+            if self.num_samples is None
+            else data.values[: self.num_samples]
+        )
+
+        if not os.path.exists(result_dir):
+            os.makedirs(result_dir)
+
+        for i, data in tqdm(enumerate(inptus), desc="Evaluate pipeline"):
+            result = gen_answer_fn(model, self.tokenizer, data[0], data[1])
+            query.append(data[0])
+            passages.append(data[1])
+            result_path = os.path.join(result_dir, f"scores_{i}.npy")
+            with open(result_path, 'wb') as f:
+                np.save(f, result)
+            scores_path.append(result_path)
+
+        res_data = {"query": query, "passages": passages, "top_n_scores_path": scores_path}
+        df = pd.DataFrame(res_data)
+
+        return df
diff --git a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
index 26d141abb8..fa1a5df331 100644
--- a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
+++ b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
@@ -12,6 +12,7 @@
 from sentence_transformers import SentenceTransformer, util
 from transformers import CLIPImageProcessor, CLIPModel
 from tqdm import tqdm
+import math
 
 
 def evaluate_similarity(model, data_gold, data_prediction):
@@ -195,3 +196,38 @@ def evaluate(self, data_gold, data_prediction):
         metric_dict = {"similarity": np.mean(metric_per_gen)}
 
         return metric_dict, {"similarity": metric_per_gen, "similarity_per_passages": metric_per_passages}
+
+
+class RerankingSimilarity:
+    def evaluate(self, data_gold, data_prediction):
+        gold_results = data_gold["top_n_scores_path"].values
+        prediction_results = data_prediction["top_n_scores_path"].values
+
+        metric_per_query = []
+        similarity_per_query = []
+        for gold, prediction in tqdm(
+            zip(gold_results, prediction_results), desc="Reranking Similarity evaluation"
+        ):
+            with open(gold, 'rb') as f:
+                gold_data = np.load(f)
+
+            with open(prediction, 'rb') as f:
+                prediction_data = np.load(f)
+
+            per_query_text = []
+            for i, score in enumerate(gold_data):
+                # the documents at the same position of top_n differ
+                if i >= len(prediction_data) or int(score[0]) != int(prediction_data[i][0]):
+                    per_query_text.append(math.inf)
+                else:
+                    per_query_text.append(abs(score[1] - prediction_data[i][1]))
+            metric_per_query.append(per_query_text)
+
+            if math.inf in per_query_text:
+                similarity_per_query.append(0)
+            else:
+                dist = np.linalg.norm(per_query_text)
+                similarity_per_query.append(1 / (1 + dist))
+
+        metric_dict = {"similarity": np.mean(similarity_per_query)}
+
+        return metric_dict, {"similarity": similarity_per_query, "per_text_score_list": metric_per_query}
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 0bd848a0b4..73c347d992 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -62,7 +62,7 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding"],
+        choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
        default="text",
         help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
         "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt",
@@ -218,6 +218,12 @@ def parse_args():
         choices=["left", "right"],
         default=None,
         help="Side to use for padding 'left' or 'right'. Applicable only for text embeddings")
+    parser.add_argument(
+        "--rag-config",
+        type=str,
+        default=None,
+        help="Path to the JSON file with config for Embedding/Reranker Pipeline",
+    )
 
     return parser.parse_args()
 
@@ -434,6 +440,10 @@ def genai_gen_embedding(model, tokenizer, passages, **kwargs):
     return embeddings
 
 
+def genai_gen_reranking(model, tokenizer, query, documents):
+    return model.rerank(query, documents)
+
+
 def is_model_with_automatic_crop(config):
     return "internvl" in config.model_type or "minicpmv" in config.model_type
 
@@ -538,6 +548,15 @@ def create_evaluator(base_model, args):
                 normalize=args.embeds_normalize,
                 padding_side=args.embeds_padding_side,
             )
+        elif task == "text-reranking":
+            return EvaluatorCLS(
+                base_model=base_model,
+                tokenizer=load_tokenizer(args),
+                gt_data=args.gt_data,
+                test_data=prompts,
+                num_samples=args.num_samples,
+                gen_rerank_fn=genai_gen_reranking if args.genai else None
+            )
         else:
             raise ValueError(f"Unsupported task: {task}")
     except KeyError as e:
@@ -616,6 +635,21 @@ def read_cb_config(path):
         return {}
 
 
+def print_rag_results(evaluator):
+    metric_of_interest = "similarity"
+    worst_examples = evaluator.worst_examples(
+        top_k=5, metric=metric_of_interest)
+    for i, e in enumerate(worst_examples):
+        logger.info(
+            "======================================================================================================="
+        )
+        logger.info(f"Top-{i+1} example:")
+        logger.info("## Query:\n%s\n", e["query"])
+        logger.info("## Passages num:\n%s\n", len(e["passages"]))
+        logger.info("## Similarity:\n%s\n", e["similarity"])
+        logger.info("## Top_n scores:\n%s\n", e["per_text_score_list"])
+
+
 def main():
     args = parse_args()
     check_args(args)
@@ -708,6 +742,8 @@ def main():
         print_image_results(evaluator)
     elif args.model_type in ['text-embedding']:
         print_embeds_results(evaluator)
+    elif args.model_type in ['text-reranking']:
+        print_rag_results(evaluator)
 
 
 if __name__ == "__main__":

From 8896ffbf7431abd8f2361698de5e1819044a5b8d Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Fri, 3 Oct 2025 18:58:14 +0100
Subject: [PATCH 2/3] update

---
 tools/who_what_benchmark/requirements.txt |  3 ++-
 .../whowhatbench/model_loaders.py         |  1 +
 .../whowhatbench/reranking_evaluator.py   | 18 +++++++++++-------
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/tools/who_what_benchmark/requirements.txt b/tools/who_what_benchmark/requirements.txt
index 03b150e718..1df1d49fff 100644
--- a/tools/who_what_benchmark/requirements.txt
+++ b/tools/who_what_benchmark/requirements.txt
@@ -11,4 +11,5 @@ datasets>=3.6.0
 auto-gptq; sys_platform == "linux"
 autoawq<0.2.8; sys_platform == "linux"
 sentencepiece
-jinja2>=3.1.0
\ No newline at end of file
+jinja2>=3.1.0
+scipy
\ No newline at end of file
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index cd69aa219e..a72458f0e1 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -7,6 +7,7 @@
 from .embeddings_evaluator import DEFAULT_MAX_LENGTH as EMBED_DEFAULT_MAX_LENGTH
 from .reranking_evaluator import DEFAULT_MAX_LENGTH as RERANK_DEFAULT_MAX_LENGTH
 from .reranking_evaluator import DEFAULT_TOP_K as RERANK_DEFAULT_TOP_K
+from .reranking_evaluator import reranking_base_on_causallm_arch
 from .utils import mock_torch_cuda_is_available, mock_AwqQuantizer_validate_environment
 
 
diff --git a/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py b/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
index 2f7a2594f8..b5d5f6700a 100644
--- a/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
@@ -17,6 +17,10 @@
 DEFAULT_MAX_LENGTH_QWEN = 8192
 
 
+def reranking_base_on_causallm_arch(config):
+    return config.model_type == "qwen3" and "Qwen3ForCausalLM" in config.architectures
+
+
 def preprocess_fn(example):
     return {
         "query": example["query"],
@@ -27,7 +31,7 @@ def preprocess_fn(example):
 def prepare_default_data(num_samples=None):
     DATASET_NAME = "microsoft/ms_marco"
     NUM_SAMPLES = num_samples if num_samples else 24
-    set_seed(70)
+    set_seed(42)
     default_dataset = datasets.load_dataset(
         DATASET_NAME, 'v2.1', split="test", streaming=True
     ).shuffle(42).take(NUM_SAMPLES)
@@ -65,7 +69,7 @@ def __init__(
             self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
 
         self.similarity = RerankingSimilarity()
-        # self.last_cmp = None
+        self.last_cmp = None
 
     def get_generation_fn(self):
         return self.generation_fn
@@ -106,6 +110,9 @@ def default_gen_answer(model, tokenizer, query, passages):
             device = "cpu"
             if hasattr(model, "device"):
                 device = model.device
+
+            # post/pre processing for qwen models added according to the transformers Qwen3-Reranker-0.6B model card:
+            # https://huggingface.co/Qwen/Qwen3-Reranker-0.6B#transformers-usage
             if model.config.model_type == "qwen3":
                 prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the'\
                     + 'Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
@@ -122,9 +129,7 @@ def default_gen_answer(model, tokenizer, query, passages):
                 )
                 for i, ele in enumerate(input_data["input_ids"]):
                     input_data["input_ids"][i] = prefix_tokens + ele + suffix_tokens
-                input_data = tokenizer.pad(input_data, padding=True, return_tensors="pt", max_length=DEFAULT_MAX_LENGTH_QWEN)
-                for key in input_data:
-                    input_data[key] = input_data[key].to(device)
+                input_data = tokenizer.pad(input_data, padding=True, return_tensors="pt", max_length=DEFAULT_MAX_LENGTH_QWEN).to(device)
             else:
                 tokenizer_kwargs = {"truncation": True, "padding": True, "max_length": DEFAULT_MAX_LENGTH}
                 inputs = [query] * len(passages)
@@ -133,9 +138,8 @@ def default_gen_answer(model, tokenizer, query, passages):
             with torch.no_grad():
                 outputs = model(**input_data).logits
 
-            if model.config.model_type == "qwen3":
+            if reranking_base_on_causallm_arch(model.config):
                 batch_scores = outputs[:, -1, :]
-
                 token_false_id = tokenizer.convert_tokens_to_ids("no")
                 token_true_id = tokenizer.convert_tokens_to_ids("yes")
                 true_vector = batch_scores[:, token_true_id]

From cf1457b02eb0067d9b15e2782d8d893575d24db2 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Mon, 6 Oct 2025 13:52:14 +0100
Subject: [PATCH 3/3] fix metric and Qwen3-Reranker accuracy

---
 .../whowhatbench/model_loaders.py            |  7 ++-
 .../whowhatbench/reranking_evaluator.py      | 51 +++++++++++--------
 tools/who_what_benchmark/whowhatbench/wwb.py |  3 +-
 3 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index a72458f0e1..98479b87f9 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -5,9 +5,7 @@
 from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, AutoModelForVision2Seq, AutoTokenizer
 
 from .embeddings_evaluator import DEFAULT_MAX_LENGTH as EMBED_DEFAULT_MAX_LENGTH
-from .reranking_evaluator import DEFAULT_MAX_LENGTH as RERANK_DEFAULT_MAX_LENGTH
-from .reranking_evaluator import DEFAULT_TOP_K as RERANK_DEFAULT_TOP_K
-from .reranking_evaluator import reranking_base_on_causallm_arch
+from .reranking_evaluator import DEFAULT_MAX_LENGTH as RERANK_DEFAULT_MAX_LENGTH, DEFAULT_TOP_K as RERANK_DEFAULT_TOP_K, reranking_base_on_causallm_arch
 from .utils import mock_torch_cuda_is_available, mock_AwqQuantizer_validate_environment
 
 
@@ -488,6 +486,7 @@ def load_embedding_model(model_id, device="CPU", ov_config=None, use_hf=False, u
         )
     return model
 
+
 def load_reranking_genai_pipeline(model_dir, device="CPU", ov_config=None):
     try:
         import openvino_genai
@@ -553,7 +552,7 @@ def load_reranking_model(model_id, device="CPU", ov_config=None, use_hf=False, u
             )
 
     return model
-
+
 
 def load_model(
     model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False,
     **kwargs
diff --git a/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py b/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
index b5d5f6700a..00580d2b9c 100644
--- a/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/reranking_evaluator.py
@@ -12,7 +12,7 @@
 import numpy as np
 
 
-DEF_TOP_K = 5
+DEFAULT_TOP_K = 5
 DEFAULT_MAX_LENGTH = 200
 DEFAULT_MAX_LENGTH_QWEN = 8192
 
@@ -114,22 +114,31 @@ def default_gen_answer(model, tokenizer, query, passages):
             # post/pre processing for qwen models added according to the transformers Qwen3-Reranker-0.6B model card:
             # https://huggingface.co/Qwen/Qwen3-Reranker-0.6B#transformers-usage
             if model.config.model_type == "qwen3":
-                prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the'\
+                prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the '\
                     + 'Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
                 suffix = "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"
                 task = "Given a web search query, retrieve relevant passages that answer the query"
-                prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
-                suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
                 pairs = []
-                for doc in passages:
-                    pairs.append(f": {task}\n: {query}\n: {doc}")
-                input_data = tokenizer(
-                    pairs, padding=False, truncation="longest_first", return_attention_mask=False,
-                    max_length=DEFAULT_MAX_LENGTH_QWEN - len(prefix_tokens) - len(suffix_tokens)
-                )
-                for i, ele in enumerate(input_data["input_ids"]):
-                    input_data["input_ids"][i] = prefix_tokens + ele + suffix_tokens
-                input_data = tokenizer.pad(input_data, padding=True, return_tensors="pt", max_length=DEFAULT_MAX_LENGTH_QWEN).to(device)
+                if reranking_base_on_causallm_arch(model.config):
+                    for doc in passages:
+                        pairs.append(f": {task}\n: {query}\n: {doc}")
+                    prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
+                    suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
+                    input_data = tokenizer(
+                        pairs, padding=False, truncation="longest_first", return_attention_mask=False,
+                        max_length=DEFAULT_MAX_LENGTH_QWEN - len(prefix_tokens) - len(suffix_tokens)
+                    )
+                    for i, ele in enumerate(input_data["input_ids"]):
+                        input_data["input_ids"][i] = prefix_tokens + ele + suffix_tokens
+                    input_data = tokenizer.pad(input_data,
+                                               padding=True,
+                                               return_tensors="pt",
+                                               max_length=DEFAULT_MAX_LENGTH_QWEN,
+                                               padding_side="left").to(device)
+                else:
+                    for doc in passages:
+                        pairs.append(f"{prefix}: {task}\n: {query}\n: {doc}{suffix}")
+                    input_data = tokenizer(pairs, padding=True, truncation=True, max_length=DEFAULT_MAX_LENGTH_QWEN, return_tensors="pt", padding_side="left")
             else:
                 tokenizer_kwargs = {"truncation": True, "padding": True, "max_length": DEFAULT_MAX_LENGTH}
                 inputs = [query] * len(passages)
@@ -157,26 +166,26 @@ def default_gen_answer(model, tokenizer, query, passages):
             for index, (score, _) in enumerate(zip(scores, passages)):
                 sorted_scores.append(np.array([index, score.numpy()]))
             sorted_scores.sort(key=lambda x: x[1], reverse=True)
-            return np.array(sorted_scores[:DEF_TOP_K])
+            return np.array(sorted_scores[:DEFAULT_TOP_K])
 
         gen_answer_fn = gen_answer_fn or default_gen_answer
 
         # TODO: add possibility to use custom dataset/csv
-        data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+        df = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
 
         scores_path = []
         passages = []
         query = []
-        inptus = (
-            data.values
+        inputs = (
+            df.values
             if self.num_samples is None
-            else data.values[: self.num_samples]
+            else df.values[: self.num_samples]
         )
 
         if not os.path.exists(result_dir):
             os.makedirs(result_dir)
 
-        for i, data in tqdm(enumerate(inptus), desc="Evaluate pipeline"):
+        for i, data in tqdm(enumerate(inputs), desc="Evaluate pipeline"):
             result = gen_answer_fn(model, self.tokenizer, data[0], data[1])
             query.append(data[0])
             passages.append(data[1])
@@ -186,6 +195,6 @@ def default_gen_answer(model, tokenizer, query, passages):
             scores_path.append(result_path)
 
         res_data = {"query": query, "passages": passages, "top_n_scores_path": scores_path}
-        df = pd.DataFrame(res_data)
+        df_result = pd.DataFrame(res_data)
 
-        return df
+        return df_result
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 73c347d992..cf127c3aa5 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -65,7 +65,8 @@ def parse_args():
         choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
         default="text",
         help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
-        "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt",
+        "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt, "
+        "image-inpainting - for image generation based on image, mask and prompt, text-reranking - for reranking a list of texts based on relevance to a query",
     )
     parser.add_argument(
         "--data-encoder",