6 changes: 6 additions & 0 deletions docs/source/models/supported_models.rst
@@ -584,6 +584,12 @@ Multimodal Embedding
- :code:`TIGER-Lab/VLM2Vec-Full`
- 🚧
- ✅︎
* - :code:`Qwen2VLForConditionalGeneration`
- Qwen2-VL
- T + I
- :code:`MrLight/dse-qwen2-2b-mrl-v1`
-
- ✅︎

.. important::
Some model architectures support both generation and embedding tasks.
17 changes: 17 additions & 0 deletions docs/source/models/vlm.rst
@@ -310,4 +310,21 @@ Since the request schema is not defined by OpenAI client, we post a request to t
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])

Here is an example of serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model.

.. code-block:: bash

vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \
--trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja

.. important::

As with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings,
which is handled by the Jinja template.
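
For reference, a prompt rendered by ``examples/template_dse_qwen2_vl.jinja`` for a single image-plus-text user message looks roughly like the following, ending in the required EOS token:

.. code-block:: text

    <|im_start|>system
    You are a helpful assistant.<|im_end|>
    <|im_start|>user
    <|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>
    <|im_start|>assistant
    <|endoftext|>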

.. important::

``MrLight/dse-qwen2-2b-mrl-v1`` also requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details.
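
For reference, a minimal sketch of constructing such a placeholder image as a base64 data URL (variable names here are illustrative; it mirrors the full example linked below):

.. code-block:: python

    import base64
    import io

    from PIL import Image

    # Minimum-size placeholder image used for text-only query embeddings.
    buffer = io.BytesIO()
    Image.new("RGB", (56, 56)).save(buffer, "png")
    buffer.seek(0)
    placeholder_b64 = base64.b64encode(buffer.read()).decode("utf-8")
    placeholder_url = f"data:image/jpeg;base64,{placeholder_b64}"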

A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py>`_.
111 changes: 93 additions & 18 deletions examples/openai_chat_embedding_client_for_multimodal.py
@@ -1,33 +1,108 @@
import base64
import io

import requests
from PIL import Image

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"


def vlm2vec():
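# Embed a single image with TIGER-Lab/VLM2Vec-Full via the OpenAI-compatible
# /v1/embeddings endpoint of a running vLLM server.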
response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model":
"TIGER-Lab/VLM2Vec-Full",
"messages": [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "Represent the given image."
},
],
}],
"encoding_format":
"float",
},
)
response.raise_for_status()
response_json = response.json()

print("Embedding output:", response_json["data"][0]["embedding"])


def dse_qwen2_vl(inp: dict):
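# Embed either an image or a text query with MrLight/dse-qwen2-2b-mrl-v1:
# inp["dtype"] selects the mode; "image" embeds inp["image_url"], anything else
# embeds inp["content"] as a text query alongside a minimal placeholder image.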
# Embedding an Image
if inp["dtype"] == "image":
messages = [{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": inp["image_url"],
}
}, {
"type": "text",
"text": "What is shown in this image?"
}]
}]
# Embedding a Text Query
else:
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer = io.BytesIO()
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode('utf-8')
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_placeholder}",
}
},
{
"type": "text",
"text": f"Query: {inp['content']}"
},
]
}]

response = requests.post(
"http://localhost:8000/v1/embeddings",
json={
"model": "MrLight/dse-qwen2-2b-mrl-v1",
"messages": messages,
"encoding_format": "float",
},
)
response.raise_for_status()
response_json = response.json()

print("Embedding output:", response_json["data"][0]["embedding"])


if __name__ == '__main__':
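# Both calls assume a vLLM server is already serving the corresponding model on
# localhost:8000 (see docs/source/models/vlm.rst for the serve commands).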
vlm2vec()

dse_qwen2_vl({
"dtye": "image",
"image_url": image_url,
})
dse_qwen2_vl({
"dtype": "text",
"content": "What is the weather like today?",
})
7 changes: 7 additions & 0 deletions examples/template_dse_qwen2_vl.jinja
@@ -0,0 +1,7 @@
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system
You are a helpful assistant.<|im_end|>
{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %}
{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %}
{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %}
{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %}
{% endraw %}{% endif %}<|endoftext|>
228 changes: 228 additions & 0 deletions tests/models/embedding/vision_language/test_dse_qwen2_vl.py
@@ -0,0 +1,228 @@
import os
from typing import List, Type

import pytest
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ..utils import check_embeddings_close

HF_TEXT_PROMPTS = [
# T -> X
(
"Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501,
Image.new("RGB", (56, 56))),
# T -> X
("Query: Retrieve an image of this caption: cherry blossom",
Image.new("RGB", (56, 56))),
]

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"What is shown in this image?",
"cherry_blossom":
"What is shown in this image?"
})

MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]


class QwenVLEncoder:

def __init__(self):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
attn = "flash_attention_2" if self.device == "cuda" else None

os.environ["TOKENIZERS_PARALLELISM"] = "true"
self.processor = AutoProcessor.from_pretrained(MODELS[0])
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
MODELS[0], attn_implementation=attn,
torch_dtype=torch.bfloat16).to(self.device).eval()
self.processor.tokenizer.padding_side = "left"
self.model.padding_side = "left"
self.base_embed_dim = 1536

def _get_embedding(self, last_hidden_state: torch.Tensor,
dimension: int) -> torch.Tensor:
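# Last-token pooling: take the hidden state of the final token, slice it to
# `dimension`, and L2-normalize the result.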
reps = last_hidden_state[:, -1]
reps = torch.nn.functional.normalize(reps[0, :dimension], p=2, dim=-1)
return reps

def embed(self, inp: dict, embed_dim: int = 1536) -> torch.Tensor:
"""
inp: dict
{
"dtype": "image",
"image": PIL.Image,
}
or
{
"dtype": "text",
"question": (str) the question to embed,
}
embed_dim: int
Will slice embeddings like emb[:embed_dim]
"""
if inp["dtype"] == "image":
messages = [[{
"role":
"user",
"content": [{
"type": "image",
"image": inp["image"]
}, {
"type": "text",
"text": "What is shown in this image?"
}]
}]]
else:
messages = [[{
"role":
"user",
"content": [
{
"type": "image",
"image": Image.new("RGB", (28, 28)),
"resized_height": 1,
"resized_width": 1
}, # a dummy image is needed here to make processing easier.
{
"type": "text",
"text": f"{inp['question']}"
},
]
}]]
image_inputs, _ = process_vision_info(messages)

texts = [
self.processor.apply_chat_template(
msg, tokenize=False, add_generation_prompt=True) +
"<|endoftext|>" for msg in messages
]
inputs = self.processor(text=texts,
images=image_inputs,
padding="longest",
return_tensors="pt").to(self.device)
inputs = self.model.prepare_inputs_for_generation(**inputs,
use_cache=False)

with torch.no_grad():
output = self.model(**inputs,
return_dict=True,
output_hidden_states=True)

embeddings = self._get_embedding(output.hidden_states[-1], embed_dim)
return embeddings


def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
input_texts: List[str],
input_images: PromptImageInput,
model: str,
*,
dtype: str,
) -> None:
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
processor = AutoProcessor.from_pretrained(MODELS[0])
with vllm_runner(model,
task="embedding",
dtype=dtype,
enforce_eager=True,
max_model_len=8192) as vllm_model:
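# Build prompts the same way the serving chat template does: apply the chat
# template and append the <|endoftext|> EOS token required by the model.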
texts = [
processor.apply_chat_template([{
"role":
"user",
"content": [
{
"type": "image",
"image": Image.new("RGB", (28, 28)),
"resized_height": 1,
"resized_width": 1
},
{
"type": "text",
"text": text
},
]
}],
tokenize=False,
add_generation_prompt=True) +
"<|endoftext|>" for text in input_texts
]
vllm_outputs = vllm_model.encode(texts, images=input_images)

hf_model = QwenVLEncoder()
hf_outputs = []
for text, image in zip(input_texts, input_images):
if text.startswith("Query:"):
inp = {"dtype": "text", "question": text}
else:
inp = {"dtype": "image", "image": image}
hf_outputs.append(hf_model.embed(inp).tolist())
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_text(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, image_placeholder)
for text, image_placeholder in HF_TEXT_PROMPTS]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images, # type: ignore
model,
dtype=dtype,
)


@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]

_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images,
model,
dtype=dtype,
)