From ebae2f31828d01f9749a5bcba1f05a28ef8aeaad Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:20:20 +0000
Subject: [PATCH 1/5] Use a proper chat template for VLM2Vec

---
 docs/source/models/vlm.rst                     | 12 ++++++++---
 ..._chat_completion_client_for_multimodal.py} |  0
 ...ai_chat_embedding_client_for_multimodal.py | 21 +++++++++++++++++++
 examples/template_vlm2vec.jinja                | 16 ++++++++++++++
 vllm/entrypoints/chat_utils.py                 | 15 +++++++++----
 5 files changed, 57 insertions(+), 7 deletions(-)
 rename examples/{openai_api_client_for_multimodal.py => openai_chat_completion_client_for_multimodal.py} (100%)
 create mode 100644 examples/openai_chat_embedding_client_for_multimodal.py
 create mode 100644 examples/template_vlm2vec.jinja

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index ac6405b9807a..77b126cf9f2a 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -240,8 +240,7 @@ To consume the server, you can use the OpenAI client like in the example below:
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
 
-
-A full code example can be found in `examples/openai_api_client_for_multimodal.py `_.
+A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_.
 
 .. tip::
     There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
@@ -269,13 +268,18 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
 .. code-block:: bash
 
     vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
-      --trust-remote-code --max-model-len 4096
+      --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
 
 .. important::
 
     Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
     to run this model in embedding mode instead of text generation mode.
 
+.. important::
+
+    VLM2Vec does not expect chat-based input. We use a `custom chat template `_
+    to combine the text and images together.
+
 Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
 .. code-block:: python
@@ -301,3 +305,5 @@ Since this schema is not defined by OpenAI client, we post a request to the serv
     response.raise_for_status()
     response_json = response.json()
     print("Embedding output:", response_json["data"][0]["embedding"])
+
+A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_.
diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
similarity index 100%
rename from examples/openai_api_client_for_multimodal.py
rename to examples/openai_chat_completion_client_for_multimodal.py
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py
new file mode 100644
index 000000000000..1e07fb2759c7
--- /dev/null
+++ b/examples/openai_chat_embedding_client_for_multimodal.py
@@ -0,0 +1,21 @@
+import requests
+
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+response = requests.post(
+    "http://localhost:8000/v1/embeddings",
+    json={
+        "model": "TIGER-Lab/VLM2Vec-Full",
+        "messages": [{
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": "Represent the given image."},
+            ],
+        }],
+        "encoding_format": "float",
+    },
+)
+response.raise_for_status()
+response_json = response.json()
+print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec.jinja
new file mode 100644
index 000000000000..489b99604af3
--- /dev/null
+++ b/examples/template_vlm2vec.jinja
@@ -0,0 +1,16 @@
+{%- if messages | length > 1 -%}
+    {{ raise_exception('Embedding models should only embed one message at a time') }}
+{%- endif -%}
+
+{% set vars = namespace(parts=[], next_image_id=1) %}
+{%- for message in messages -%}
+    {%- for content in message['content'] -%}
+        {%- if content['type'] == 'text' -%}
+            {%- set vars.parts = vars.parts + [content['text']] %}
+        {%- elif content['type'] == 'image' -%}
+            {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %}
+            {%- set vars.next_image_id = vars.next_image_id + 1 %}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endfor -%}
+{{ vars.parts | join(' ') }}
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index ce36f20760f4..e38380214a6c 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -156,6 +156,10 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
 
         self._items: List[_T] = []
 
+    @property
+    def model_config(self) -> ModelConfig:
+        return self._model_config
+
     @staticmethod
     @lru_cache(maxsize=None)
     def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -491,10 +495,13 @@ def _parse_chat_message_content_parts(
     content: List[Union[str, Dict[str, str]]] = []
 
     mm_parser = mm_tracker.create_parser()
-    wrap_dicts = \
-        mm_tracker._model_config.hf_config.model_type in \
-        MODEL_KEEP_MULTI_MODAL_CONTENT or \
-        (chat_template_text_format == "openai")
+    model_config = mm_tracker.model_config
+
+    wrap_dicts = (
+        chat_template_text_format == "openai"
+        or model_config.task == "embedding" and model_config.is_multimodal_model
+        or model_config.hf_config.model_type in MODEL_KEEP_MULTI_MODAL_CONTENT
+    )
 
     for part in parts:
         parse_res = _parse_chat_message_content_part(

From 44db47c0bf4ec5bc7a6af8460d3a948d2a4f8591 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:25:05 +0000
Subject: [PATCH 2/5] format

---
 ...ai_chat_embedding_client_for_multimodal.py | 22 ++++++++++++++-----
 vllm/entrypoints/chat_utils.py                | 10 ++++-----
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py
index 1e07fb2759c7..effb588e1387 100644
--- a/examples/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/openai_chat_embedding_client_for_multimodal.py
@@ -5,17 +5,29 @@
 response = requests.post(
     "http://localhost:8000/v1/embeddings",
     json={
-        "model": "TIGER-Lab/VLM2Vec-Full",
+        "model":
+        "TIGER-Lab/VLM2Vec-Full",
         "messages": [{
-            "role": "user",
+            "role":
+            "user",
             "content": [
-                {"type": "image_url", "image_url": {"url": image_url}},
-                {"type": "text", "text": "Represent the given image."},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "Represent the given image."
+                },
             ],
         }],
-        "encoding_format": "float",
+        "encoding_format":
+        "float",
     },
 )
 response.raise_for_status()
 response_json = response.json()
+
 print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index e38380214a6c..bc2de2d16247 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -497,11 +497,11 @@ def _parse_chat_message_content_parts(
     mm_parser = mm_tracker.create_parser()
     model_config = mm_tracker.model_config
 
-    wrap_dicts = (
-        chat_template_text_format == "openai"
-        or model_config.task == "embedding" and model_config.is_multimodal_model
-        or model_config.hf_config.model_type in MODEL_KEEP_MULTI_MODAL_CONTENT
-    )
+    wrap_dicts = (chat_template_text_format == "openai"
+                  or (model_config.task == "embedding"
+                      and model_config.is_multimodal_model)
+                  or (model_config.hf_config.model_type
+                      in MODEL_KEEP_MULTI_MODAL_CONTENT))
 
     for part in parts:
         parse_res = _parse_chat_message_content_part(

From 0649949db75fa590fd4258b64060626095abc091 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:34:55 +0000
Subject: [PATCH 3/5] Fix link

---
 docs/source/models/vlm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 77b126cf9f2a..fbb8119eb94c 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -277,7 +277,7 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
 
 .. important::
 
-    VLM2Vec does not expect chat-based input. We use a `custom chat template `_
+    VLM2Vec does not expect chat-based input. We use a `custom chat template `_
     to combine the text and images together.
 
 Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
 .. code-block:: python

From c07662443468a01e6c4ea6c298cc1780ebc7ce9f Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:35:37 +0000
Subject: [PATCH 4/5] Reword

---
 docs/source/models/vlm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index fbb8119eb94c..3377502a6db2 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -280,7 +280,7 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
     VLM2Vec does not expect chat-based input. We use a `custom chat template `_
     to combine the text and images together.
 
-Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
 .. code-block:: python

From dd73c7167e0122cdc103e52a067aabd8d6c50d94 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 12:38:21 +0000
Subject: [PATCH 5/5] Fix tests

---
 tests/entrypoints/openai/test_vision_embedding.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 73a69da32e43..d0c43b47bf0a 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -6,11 +6,14 @@
 
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
 MAXIMUM_IMAGES = 2
 
+vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
+assert vlm2vec_jinja_path.exists()
+
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -35,6 +38,8 @@ def server():
         "--trust-remote-code",
         "--limit-mm-per-prompt",
         f"image={MAXIMUM_IMAGES}",
+        "--chat-template",
+        str(vlm2vec_jinja_path),
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -90,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
     assert len(embeddings["data"]) == 1
     assert len(embeddings["data"][0]["embedding"]) == 3072
     assert embeddings["usage"]["completion_tokens"] == 0
-    assert embeddings["usage"]["prompt_tokens"] == 771
-    assert embeddings["usage"]["total_tokens"] == 771
+    assert embeddings["usage"]["prompt_tokens"] == 762
+    assert embeddings["usage"]["total_tokens"] == 762
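
As a quick sanity check, the template added in examples/template_vlm2vec.jinja can be rendered locally with plain Jinja2. The sketch below is illustrative only and not part of the patch: the file path, the stand-in raise_exception() helper (vLLM injects its own into chat templates), and the sample message are assumptions, chosen as a plausible stand-in for the parsed content that _parse_chat_message_content_parts() produces once wrap_dicts is enabled.

    # Illustrative sketch: render the VLM2Vec chat template outside of vLLM.
    from pathlib import Path

    import jinja2

    # Assumed path; adjust to wherever examples/template_vlm2vec.jinja lives.
    template_path = Path("examples/template_vlm2vec.jinja")

    env = jinja2.Environment()

    # Stand-in for the raise_exception() helper available to chat templates.
    def raise_exception(message):
        raise jinja2.TemplateError(message)

    env.globals["raise_exception"] = raise_exception
    template = env.from_string(template_path.read_text())

    # One user message with an image part followed by a text part,
    # mirroring the example client added in this series.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Represent the given image."},
        ],
    }]

    # Should print something like: <|image_1|> Represent the given image.
    print(template.render(messages=messages).strip())

The rendered prompt interleaves <|image_N|> placeholders with the instruction text, which is presumably also why the prompt_tokens/total_tokens expectations in test_vision_embedding.py change from 771 to 762 once the server is started with this template.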