From ebae2f31828d01f9749a5bcba1f05a28ef8aeaad Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:20:20 +0000
Subject: [PATCH 1/5] Use a proper chat template for VLM2Vec

---
 docs/source/models/vlm.rst                     | 12 ++++++++---
 ..._chat_completion_client_for_multimodal.py} |  0
 ...ai_chat_embedding_client_for_multimodal.py | 21 +++++++++++++++++++
 examples/template_vlm2vec.jinja                | 16 ++++++++++++++
 vllm/entrypoints/chat_utils.py                 | 15 +++++++++----
 5 files changed, 57 insertions(+), 7 deletions(-)
 rename examples/{openai_api_client_for_multimodal.py => openai_chat_completion_client_for_multimodal.py} (100%)
 create mode 100644 examples/openai_chat_embedding_client_for_multimodal.py
 create mode 100644 examples/template_vlm2vec.jinja

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index ac6405b9807a..77b126cf9f2a 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -240,8 +240,7 @@ To consume the server, you can use the OpenAI client like in the example below:
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
 
-
-A full code example can be found in `examples/openai_api_client_for_multimodal.py `_.
+A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_.
 
 .. tip::
     There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
@@ -269,13 +268,18 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
 .. code-block:: bash
 
     vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
-      --trust-remote-code --max-model-len 4096
+      --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
 
 .. important::
 
     Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
     to run this model in embedding mode instead of text generation mode.
 
+.. important::
+
+    VLM2Vec does not expect chat-based input. We use a `custom chat template `_
+    to combine the text and images together.
+
 Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
 .. code-block:: python
@@ -301,3 +305,5 @@ Since this schema is not defined by OpenAI client, we post a request to the serv
     response.raise_for_status()
     response_json = response.json()
     print("Embedding output:", response_json["data"][0]["embedding"])
+
+A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_.
diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
similarity index 100%
rename from examples/openai_api_client_for_multimodal.py
rename to examples/openai_chat_completion_client_for_multimodal.py
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py
new file mode 100644
index 000000000000..1e07fb2759c7
--- /dev/null
+++ b/examples/openai_chat_embedding_client_for_multimodal.py
@@ -0,0 +1,21 @@
+import requests
+
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+response = requests.post(
+    "http://localhost:8000/v1/embeddings",
+    json={
+        "model": "TIGER-Lab/VLM2Vec-Full",
+        "messages": [{
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": "Represent the given image."},
+            ],
+        }],
+        "encoding_format": "float",
+    },
+)
+response.raise_for_status()
+response_json = response.json()
+print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec.jinja
new file mode 100644
index 000000000000..489b99604af3
--- /dev/null
+++ b/examples/template_vlm2vec.jinja
@@ -0,0 +1,16 @@
+{%- if messages | length > 1 -%}
+    {{ raise_exception('Embedding models should only embed one message at a time') }}
+{%- endif -%}
+
+{% set vars = namespace(parts=[], next_image_id=1) %}
+{%- for message in messages -%}
+    {%- for content in message['content'] -%}
+        {%- if content['type'] == 'text' -%}
+            {%- set vars.parts = vars.parts + [content['text']] %}
+        {%- elif content['type'] == 'image' -%}
+            {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %}
+            {%- set vars.next_image_id = vars.next_image_id + 1 %}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endfor -%}
+{{ vars.parts | join(' ') }}
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index ce36f20760f4..e38380214a6c 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -156,6 +156,10 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
 
         self._items: List[_T] = []
 
+    @property
+    def model_config(self) -> ModelConfig:
+        return self._model_config
+
     @staticmethod
     @lru_cache(maxsize=None)
     def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -491,10 +495,13 @@ def _parse_chat_message_content_parts(
     content: List[Union[str, Dict[str, str]]] = []
 
     mm_parser = mm_tracker.create_parser()
-    wrap_dicts = \
-        mm_tracker._model_config.hf_config.model_type in \
-        MODEL_KEEP_MULTI_MODAL_CONTENT or \
-        (chat_template_text_format == "openai")
+    model_config = mm_tracker.model_config
+
+    wrap_dicts = (
+        chat_template_text_format == "openai"
+        or model_config.task == "embedding" and model_config.is_multimodal_model
+        or model_config.hf_config.model_type in MODEL_KEEP_MULTI_MODAL_CONTENT
+    )
 
     for part in parts:
         parse_res = _parse_chat_message_content_part(

From 44db47c0bf4ec5bc7a6af8460d3a948d2a4f8591 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:25:05 +0000
Subject: [PATCH 2/5] format

---
 ...ai_chat_embedding_client_for_multimodal.py | 22 ++++++++++++++-----
 vllm/entrypoints/chat_utils.py                | 10 ++++-----
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py
index 1e07fb2759c7..effb588e1387 100644
--- a/examples/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/openai_chat_embedding_client_for_multimodal.py
@@ -5,17 +5,29 @@
 response = requests.post(
     "http://localhost:8000/v1/embeddings",
     json={
-        "model": "TIGER-Lab/VLM2Vec-Full",
+        "model":
+        "TIGER-Lab/VLM2Vec-Full",
         "messages": [{
-            "role": "user",
+            "role":
+            "user",
             "content": [
-                {"type": "image_url", "image_url": {"url": image_url}},
-                {"type": "text", "text": "Represent the given image."},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "Represent the given image."
+                },
             ],
         }],
-        "encoding_format": "float",
+        "encoding_format":
+        "float",
     },
 )
 response.raise_for_status()
 response_json = response.json()
+
 print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index e38380214a6c..bc2de2d16247 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -497,11 +497,11 @@ def _parse_chat_message_content_parts(
     mm_parser = mm_tracker.create_parser()
     model_config = mm_tracker.model_config
 
-    wrap_dicts = (
-        chat_template_text_format == "openai"
-        or model_config.task == "embedding" and model_config.is_multimodal_model
-        or model_config.hf_config.model_type in MODEL_KEEP_MULTI_MODAL_CONTENT
-    )
+    wrap_dicts = (chat_template_text_format == "openai"
+                  or (model_config.task == "embedding"
+                      and model_config.is_multimodal_model)
+                  or (model_config.hf_config.model_type
+                      in MODEL_KEEP_MULTI_MODAL_CONTENT))
 
     for part in parts:
         parse_res = _parse_chat_message_content_part(

From 0649949db75fa590fd4258b64060626095abc091 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:34:55 +0000
Subject: [PATCH 3/5] Fix link

---
 docs/source/models/vlm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 77b126cf9f2a..fbb8119eb94c 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -277,7 +277,7 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
 
 .. important::
 
-    VLM2Vec does not expect chat-based input. We use a `custom chat template `_
+    VLM2Vec does not expect chat-based input. We use a `custom chat template `_
     to combine the text and images together.
 
 Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
 .. code-block:: python

From c07662443468a01e6c4ea6c298cc1780ebc7ce9f Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 10:35:37 +0000
Subject: [PATCH 4/5] Reword

---
 docs/source/models/vlm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index fbb8119eb94c..3377502a6db2 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -280,7 +280,7 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
     VLM2Vec does not expect chat-based input. We use a `custom chat template `_
     to combine the text and images together.
 
-Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
 
 .. code-block:: python

From dd73c7167e0122cdc103e52a067aabd8d6c50d94 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 1 Nov 2024 12:38:21 +0000
Subject: [PATCH 5/5] Fix tests

---
 tests/entrypoints/openai/test_vision_embedding.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 73a69da32e43..d0c43b47bf0a 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -6,11 +6,14 @@
 
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
 
 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
 MAXIMUM_IMAGES = 2
 
+vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
+assert vlm2vec_jinja_path.exists()
+
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -35,6 +38,8 @@ def server():
         "--trust-remote-code",
         "--limit-mm-per-prompt",
         f"image={MAXIMUM_IMAGES}",
+        "--chat-template",
+        str(vlm2vec_jinja_path),
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -90,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
     assert len(embeddings["data"]) == 1
     assert len(embeddings["data"][0]["embedding"]) == 3072
     assert embeddings["usage"]["completion_tokens"] == 0
-    assert embeddings["usage"]["prompt_tokens"] == 771
-    assert embeddings["usage"]["total_tokens"] == 771
+    assert embeddings["usage"]["prompt_tokens"] == 762
+    assert embeddings["usage"]["total_tokens"] == 762
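
As a quick sanity check, the template added in examples/template_vlm2vec.jinja can be rendered locally with plain Jinja2. The sketch below is illustrative only and not part of the patch: the file path, the stand-in raise_exception() helper (vLLM injects its own into chat templates), and the sample message are assumptions, chosen as a plausible stand-in for the parsed content that _parse_chat_message_content_parts() produces once wrap_dicts is enabled.

    # Illustrative sketch: render the VLM2Vec chat template outside of vLLM.
    from pathlib import Path

    import jinja2

    # Assumed path; adjust to wherever examples/template_vlm2vec.jinja lives.
    template_path = Path("examples/template_vlm2vec.jinja")

    env = jinja2.Environment()

    # Stand-in for the raise_exception() helper available to chat templates.
    def raise_exception(message):
        raise jinja2.TemplateError(message)

    env.globals["raise_exception"] = raise_exception
    template = env.from_string(template_path.read_text())

    # One user message with an image part followed by a text part,
    # mirroring the example client added in this series.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Represent the given image."},
        ],
    }]

    # Should print something like: <|image_1|> Represent the given image.
    print(template.render(messages=messages).strip())

The rendered prompt interleaves <|image_N|> placeholders with the instruction text, which is presumably also why the prompt_tokens/total_tokens expectations in test_vision_embedding.py change from 771 to 762 once the server is started with this template.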