Skip to content

Commit b6274d7

Browse files
committed
[wwb] Add text embeddings pipeline
1 parent 4274a9a commit b6274d7

File tree

6 files changed

+415
-3
lines changed

6 files changed

+415
-3
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import subprocess # nosec B404
2+
import pytest
3+
import logging
4+
from test_cli_image import run_wwb
5+
6+
7+
logging.basicConfig(level=logging.INFO)
8+
logger = logging.getLogger(__name__)
9+
10+
11+
@pytest.mark.parametrize(
    ("model_id", "model_type"),
    [
        ("BAAI/bge-small-en-v1.5", "text-embedding"),
        ("Qwen/Qwen3-Embedding-0.6B", "text-embedding"),
    ],
)
def test_embeddings_basic(model_id, model_type, tmp_path):
    """End-to-end smoke test for the text-embedding wwb pipeline.

    Exports the HF model to OpenVINO IR, then runs wwb four times:
    reference collection with the HF model, Optimum target, GenAI target,
    and comparison from precomputed target data only.
    """
    GT_FILE = tmp_path / "gt.csv"
    MODEL_PATH = tmp_path / model_id.replace("/", "--")

    # Export to OpenVINO IR. Arguments are a list (shell=False), so the
    # model id is never shell-interpreted. str() the Path for portability
    # across subprocess implementations.
    result = subprocess.run(
        ["optimum-cli", "export", "openvino",
         "-m", model_id,
         str(MODEL_PATH),
         "--task", "feature-extraction",
         "--trust-remote-code"],
        capture_output=True,
        text=True,
    )
    # Include the exporter's output in the failure message instead of a
    # bare assert, so CI logs show why the export failed.
    assert result.returncode == 0, f"optimum-cli export failed:\n{result.stdout}\n{result.stderr}"

    # Collect reference with HF model
    run_wwb([
        "--base-model",
        model_id,
        "--num-samples",
        "1",
        "--gt-data",
        GT_FILE,
        "--device",
        "CPU",
        "--model-type",
        model_type,
        "--hf",
    ])

    # test Optimum
    run_wwb([
        "--target-model",
        MODEL_PATH,
        "--num-samples",
        "1",
        "--gt-data",
        GT_FILE,
        "--device",
        "CPU",
        "--model-type",
        model_type,
    ])

    # test GenAI
    run_wwb([
        "--target-model",
        MODEL_PATH,
        "--num-samples",
        "1",
        "--gt-data",
        GT_FILE,
        "--device",
        "CPU",
        "--model-type",
        model_type,
        "--genai",
        "--output",
        tmp_path,
    ])

    # test w/o models
    run_wwb([
        "--target-data",
        tmp_path / "target.csv",
        "--num-samples",
        "1",
        "--gt-data",
        GT_FILE,
        "--device",
        "CPU",
        "--model-type",
        model_type,
        "--genai",
    ])

tools/who_what_benchmark/whowhatbench/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from .visualtext_evaluator import VisualTextEvaluator
66
from .im2im_evaluator import Image2ImageEvaluator
77
from .inpaint_evaluator import InpaintingEvaluator
8+
from .embeddings_evaluator import EmbeddingsEvaluator
89

910

1011
__all__ = [
@@ -15,5 +16,6 @@
1516
"VisualTextEvaluator",
1617
"Image2ImageEvaluator",
1718
"InpaintingEvaluator",
19+
"EmbeddingsEvaluator",
1820
"EVALUATOR_REGISTRY",
1921
]
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
from typing import Any, Union
2+
3+
import os
4+
import torch
5+
import numpy as np
6+
import pandas as pd
7+
from tqdm import tqdm
8+
from importlib.resources import files
9+
from .registry import register_evaluator, BaseEvaluator
10+
from .whowhat_metrics import EmbedsSimilarity
11+
from .utils import patch_awq_for_inference, get_ignore_parameters_flag
12+
from transformers import set_seed
13+
import datasets
14+
from torch import Tensor
15+
16+
DEF_MAX_LENGTH = 100
17+
18+
19+
def prepare_default_data(num_samples=None):
    """Stream a deterministic sample of MS MARCO v2.1 passages.

    Returns a streaming dataset whose only column, ``passages``, holds the
    raw passage texts. Defaults to 24 samples when *num_samples* is falsy.
    """
    dataset_name = "microsoft/ms_marco"
    sample_count = num_samples if num_samples else 24
    # Fixed seeds keep reference and target runs comparable across invocations.
    set_seed(42)
    sampled = datasets.load_dataset(
        dataset_name, 'v2.1', split="test", streaming=True
    ).shuffle(42).take(sample_count)
    # Flatten the nested passages structure down to the passage texts only.
    return sampled.map(
        lambda record: {'passages': record['passages']['passage_text']},
        remove_columns=sampled.column_names,
    )
30+
31+
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pool each sequence to the hidden state of its last attended token.

    With left padding the last position of every row is a real token, so the
    final time step can be taken directly; with right padding the index of the
    last attended token is recovered from the attention mask per row.
    """
    # All rows attend at the final position iff the batch is left-padded.
    batch = last_hidden_states.shape[0]
    is_left_padded = attention_mask[:, -1].sum() == batch
    if is_left_padded:
        return last_hidden_states[:, -1]
    # Right padding: last real token sits at (number of attended tokens - 1).
    final_positions = attention_mask.sum(dim=1) - 1
    row_idx = torch.arange(batch, device=last_hidden_states.device)
    return last_hidden_states[row_idx, final_positions]
42+
43+
def mean_pooling(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Average the hidden states of attended tokens only.

    Padding positions are zeroed out via the broadcast attention mask, and
    the divisor is clamped to avoid division by zero for empty masks.
    """
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_states.size())
    mask = mask.to(last_hidden_states.dtype)
    token_sum = (last_hidden_states * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return token_sum / token_count
52+
53+
54+
@register_evaluator(
    "text-embedding"
)
class EmbeddingsEvaluator(BaseEvaluator):
    """Evaluator for text-embedding models.

    Generates an embedding per input passage, persists each as a .npy file,
    and records (passage, embedding path) pairs in a DataFrame/CSV. Ground
    truth and target predictions are then compared with EmbedsSimilarity.
    """

    def __init__(
        self,
        base_model: Any = None,
        tokenizer: Any = None,
        gt_data: str = None,
        test_data: Union[str, list] = None,
        num_samples=None,
        gen_embeds_fn=None,
        pooling_type=None,
        normalize=None,
        padding_side=None
    ) -> None:
        assert (
            base_model is not None or gt_data is not None
        ), "Text generation pipeline for evaluation or ground truth data must be defined"

        self.test_data = test_data
        self.tokenizer = tokenizer
        self.num_samples = num_samples
        self.generation_fn = gen_embeds_fn
        self.pooling_type = pooling_type or 'cls'
        self.normalize = normalize or False
        self.padding_side = padding_side or 'right'
        # gt_data may be None when only a base model is supplied; fall back
        # to the current directory instead of crashing in os.path.dirname.
        self.gt_dir = os.path.dirname(gt_data) if gt_data else "."

        if base_model:
            self.gt_data = self._generate_data(base_model, gen_embeds_fn)
        else:
            self.gt_data = pd.read_csv(gt_data, keep_default_na=False)

        self.similarity = EmbedsSimilarity()
        self.last_cmp = None

    def get_generation_fn(self):
        """Return the user-supplied embedding-generation callable (or None)."""
        return self.generation_fn

    def score(self, model_or_data, gen_answer_fn=None, output_dir=None, **kwargs):
        """Score a target model (or a precomputed predictions CSV path).

        Returns a tuple of (per-prompt metrics DataFrame, aggregate metrics
        DataFrame) and caches the per-prompt comparison for worst_examples().
        """
        if output_dir is None:
            result_folder = os.path.join(self.gt_dir, "target")
        else:
            result_folder = os.path.join(output_dir, "target")

        if isinstance(model_or_data, str) and os.path.exists(model_or_data):
            predictions = pd.read_csv(model_or_data, keep_default_na=False)
        else:
            predictions = self._generate_data(model_or_data, gen_answer_fn, result_folder)
        self.predictions = predictions

        all_metrics, all_metrics_per_prompt = self.similarity.evaluate(
            self.gt_data, predictions
        )

        self.last_cmp = all_metrics_per_prompt
        self.last_cmp["passages"] = predictions["passages"].values
        self.last_cmp["source_model"] = self.gt_data["embeds_path"].values
        self.last_cmp["optimized_model"] = predictions["embeds_path"].values
        self.last_cmp = pd.DataFrame(self.last_cmp)

        return pd.DataFrame(all_metrics_per_prompt), pd.DataFrame([all_metrics])

    def worst_examples(self, top_k: int = 5, metric="similarity"):
        """Return the top_k comparison rows with the lowest metric value."""
        assert self.last_cmp is not None
        res = self.last_cmp.nsmallest(top_k, metric)
        return [row for _, row in res.iterrows()]

    def _generate_data(self, model, gen_answer_fn=None, result_dir="reference"):
        """Run the model over the test data, saving one .npy file per passage.

        Returns a DataFrame with columns "passages" and "embeds_path".
        """
        def default_gen_answer(model, tokenizer, passages, **kwargs):
            device = "cpu"
            if hasattr(model, "device"):
                device = model.device
            tokenizer_kwargs = {'padding': 'max_length', 'max_length': DEF_MAX_LENGTH,
                                'truncation': True, 'padding_side': kwargs.get('padding_side', 'right')}
            inputs = self.tokenizer(passages, return_tensors="pt", **tokenizer_kwargs).to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            # BUGFIX: compare the pooling type by value. The original used
            # kwargs.get("pooling_type", "last_token") directly as the
            # condition, which is truthy for ANY configured pooling type
            # (including the default 'cls'), so last-token pooling was
            # always selected.
            pooling_type = kwargs.get("pooling_type")
            if model.config.model_type == "qwen3" or pooling_type == "last_token":
                embeddings = last_token_pool(outputs.last_hidden_state, inputs["attention_mask"])
            elif pooling_type == "mean":
                embeddings = mean_pooling(outputs.last_hidden_state, inputs["attention_mask"])
            else:
                # Default: CLS-token pooling (first token's hidden state).
                embeddings = outputs.last_hidden_state[:, 0]

            if kwargs.get("normalize", False):
                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            return embeddings

        gen_answer_fn = gen_answer_fn or default_gen_answer

        if self.test_data:
            if isinstance(self.test_data, str):
                data = pd.read_csv(self.test_data)
            else:
                if isinstance(self.test_data, dict):
                    assert "prompts" in self.test_data
                    data = dict(self.test_data)
                else:
                    data = {"prompts": list(self.test_data)}
                data = pd.DataFrame.from_dict(data)
        else:
            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))

        embeds_paths = []
        passages = []
        inputs = (
            data.values
            if self.num_samples is None
            else data.values[: self.num_samples]
        )

        if not os.path.exists(result_dir):
            os.makedirs(result_dir)

        for i, row in tqdm(enumerate(inputs), desc="Evaluate pipeline"):
            kwargs = {'padding_side': self.padding_side,
                      'pooling_type': self.pooling_type,
                      'normalize': self.normalize}
            result = gen_answer_fn(model, self.tokenizer, row[0], **kwargs)
            passages.append(row[0])
            result_path = os.path.join(result_dir, f"embeds_{i}.npy")
            with open(result_path, 'wb') as f:
                np.save(f, result)
            embeds_paths.append(result_path)

        res_data = {"passages": passages, "embeds_path": embeds_paths}
        return pd.DataFrame(res_data)

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, AutoModelForVision2Seq, AutoTokenizer
66

7+
from .embeddings_evaluator import DEF_MAX_LENGTH
78
from .utils import mock_torch_cuda_is_available, mock_AwqQuantizer_validate_environment
89

910

@@ -20,7 +21,7 @@ def __init__(self, model, model_dir, model_type):
2021
self.model = model
2122
self.model_type = model_type
2223

23-
if model_type == "text" or model_type == "visual-text":
24+
if model_type in ["text", "visual-text", "text-embedding"]:
2425
try:
2526
self.config = AutoConfig.from_pretrained(model_dir)
2627
except Exception:
@@ -428,6 +429,62 @@ def load_inpainting_model(
428429
return model
429430

430431

432+
def load_embedding_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwargs):
    """Load an OpenVINO GenAI TextEmbeddingPipeline wrapped for wwb evaluation.

    Pooling type, normalization, and max length are configured from kwargs so
    the GenAI pipeline matches the HF/Optimum reference settings.
    """
    try:
        import openvino_genai
    except ImportError as e:
        # BUGFIX: the exception was passed as an unused positional arg to
        # logger.error and never printed; use lazy %-formatting instead.
        logger.error("Failed to import openvino_genai package. Please install it. Details: %s", e)
        exit(-1)

    config = openvino_genai.TextEmbeddingPipeline.Config()
    pooling_type = kwargs.get("pooling_type")
    if pooling_type:
        if pooling_type == "mean":
            config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.MEAN
        elif pooling_type == "last_token":
            config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.LAST_TOKEN
        else:
            config.pooling_type = openvino_genai.TextEmbeddingPipeline.PoolingType.CLS
    # Pad to a fixed max length so embeddings are comparable with the
    # reference tokenization (see DEF_MAX_LENGTH in embeddings_evaluator).
    config.max_length = DEF_MAX_LENGTH
    config.normalize = kwargs.get("normalize", False)
    config.pad_to_max_length = True

    logger.info("Using OpenVINO GenAI TextEmbeddingPipeline API")
    # BUGFIX: guard against ov_config=None — the original `**ov_config`
    # raised TypeError when no OV options were supplied.
    pipeline = openvino_genai.TextEmbeddingPipeline(
        model_dir, device.upper(), config, **(ov_config or {})
    )

    return GenAIModelWrapper(
        pipeline,
        model_dir,
        "text-embedding"
    )
459+
460+
461+
def load_embedding_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, **kwargs):
    """Load a text-embedding model via HF Transformers, GenAI, or Optimum.

    Selection order: use_hf takes precedence, then use_genai, otherwise the
    Optimum OVModelForFeatureExtraction path is used with a trust_remote_code
    retry for models that require custom code.
    """
    if use_hf:
        logger.info("Using HF Transformers API")
        # AutoModel is already imported at module level.
        model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
    elif use_genai:
        logger.info("Using OpenVINO GenAI API")
        model = load_embedding_genai_pipeline(model_id, device, ov_config, **kwargs)
    else:
        logger.info("Using Optimum API")
        from optimum.intel.openvino import OVModelForFeatureExtraction
        try:
            model = OVModelForFeatureExtraction.from_pretrained(
                model_id, device=device, ov_config=ov_config, safety_checker=None,
            )
        except ValueError as e:
            # BUGFIX: the original message said "inpaiting pipeline" (copied
            # from the inpainting loader) and passed `e` as an unused
            # logger.error positional; log the real context lazily instead.
            logger.error("Failed to load embedding model, retrying with trust_remote_code. Details: %s", e)
            model = OVModelForFeatureExtraction.from_pretrained(
                model_id,
                trust_remote_code=True,
                use_cache=True,
                device=device,
                ov_config=ov_config,
                safety_checker=None
            )
    return model
487+
431488
def load_model(
432489
model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False, **kwargs
433490
):
@@ -452,5 +509,7 @@ def load_model(
452509
return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai)
453510
elif model_type == "image-inpainting":
454511
return load_inpainting_model(model_id, device, ov_options, use_hf, use_genai)
512+
elif model_type == "text-embedding":
513+
return load_embedding_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
455514
else:
456515
raise ValueError(f"Unsupported model type: {model_type}")

0 commit comments

Comments
 (0)