
Commit 1fff606

DarkLight1337 authored and garg-amit committed
[Model] Support E5-V (vllm-project#9576)
Signed-off-by: Amit Garg <[email protected]>
1 parent cc6534b commit 1fff606

File tree

12 files changed: +532, -90 lines


docs/source/models/supported_models.rst

Lines changed: 14 additions & 0 deletions
@@ -334,6 +334,14 @@ The following modalities are supported depending on the model:
 - **V**\ ideo
 - **A**\ udio
 
+Any combination of modalities joined by :code:`+` are supported.
+
+- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs.
+
+On the other hand, modalities separated by :code:`/` are mutually exclusive.
+
+- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
+
 .. _supported_vlms:
 
 Text Generation
@@ -484,6 +492,12 @@ Multimodal Embedding
      - Example HF Models
      - :ref:`LoRA <lora>`
      - :ref:`PP <distributed_serving>`
+  * - :code:`LlavaNextForConditionalGeneration`
+    - LLaVA-NeXT-based
+    - T / I
+    - :code:`royokong/e5-v`
+    -
+    - ✅︎
   * - :code:`Phi3VForCausalLM`
     - Phi-3-Vision-based
     - T + I
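
To ground the notation: the new :code:`T / I` row means :code:`royokong/e5-v` accepts a text-only prompt or an image-only prompt, but not text combined with an image in one request. Below is a minimal sketch of the image-only case, distilled from the run_e5_v example added elsewhere in this commit; the prompt template and model name come from that example, and the sample image is one of vLLM's bundled assets. Treat it as an illustration, not additional official documentation.

from vllm import LLM
from vllm.assets.image import ImageAsset

# Prompt template used by E5-V in the example script added by this commit.
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embedding", max_model_len=4096)

# Image-only request (the "I" in "T / I"): the prompt carries the <image>
# placeholder, and the pixels are passed via multi_modal_data.
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
outputs = llm.encode({
    "prompt": llama3_template.format("<image>\nSummary above image in one word: "),
    "multi_modal_data": {"image": image},
})
print(len(outputs[0].outputs.embedding))  # embedding dimensionality

The text-only case (the "T") swaps in llama3_template.format(f"{text}\nSummary above sentence in one word: ") and omits multi_modal_data entirely.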

examples/offline_inference_vision_language.py

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 """
-This example shows how to use vLLM for running offline inference
-with the correct prompt format on vision language models.
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for text generation.
 
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
@@ -450,7 +450,7 @@ def main(args):
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description='Demo on using vLLM for offline inference with '
-        'vision language models')
+        'vision language models for text generation')
     parser.add_argument('--model-type',
                         '-m',
                         type=str,
Lines changed: 169 additions & 21 deletions
@@ -1,22 +1,170 @@
+"""
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for multimodal embedding.
+
+For most models, the prompt format should follow corresponding examples
+on HuggingFace model repository.
+"""
+from argparse import Namespace
+from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+
+from PIL.Image import Image
+
 from vllm import LLM
-from vllm.assets.image import ImageAsset
-
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-prompt = "<|image_1|> Represent the given image with the following question: What is in the image"  # noqa: E501
-
-# Create an LLM.
-llm = LLM(
-    model="TIGER-Lab/VLM2Vec-Full",
-    task="embedding",
-    trust_remote_code=True,
-    max_model_len=4096,
-    max_num_seqs=2,
-    mm_processor_kwargs={"num_crops": 16},
-)
-
-# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
-
-# Print the outputs.
-for output in outputs:
-    print(output.outputs.embedding)  # list of 3072 floats
+from vllm.multimodal.utils import fetch_image
+from vllm.utils import FlexibleArgumentParser
+
+
+class TextQuery(TypedDict):
+    modality: Literal["text"]
+    text: str
+
+
+class ImageQuery(TypedDict):
+    modality: Literal["image"]
+    image: Image
+
+
+class TextImageQuery(TypedDict):
+    modality: Literal["text+image"]
+    text: str
+    image: Image
+
+
+QueryModality = Literal["text", "image", "text+image"]
+Query = Union[TextQuery, ImageQuery, TextImageQuery]
+
+
+class ModelRequestData(NamedTuple):
+    llm: LLM
+    prompt: str
+    image: Optional[Image]
+
+
+def run_e5_v(query: Query):
+    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+
+    if query["modality"] == "text":
+        text = query["text"]
+        prompt = llama3_template.format(
+            f"{text}\nSummary above sentence in one word: ")
+        image = None
+    elif query["modality"] == "image":
+        prompt = llama3_template.format(
+            "<image>\nSummary above image in one word: ")
+        image = query["image"]
+    else:
+        modality = query['modality']
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    llm = LLM(
+        model="royokong/e5-v",
+        task="embedding",
+        max_model_len=4096,
+    )
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        image=image,
+    )
+
+
+def run_vlm2vec(query: Query):
+    if query["modality"] == "text":
+        text = query["text"]
+        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        image = None
+    elif query["modality"] == "image":
+        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        image = query["image"]
+    elif query["modality"] == "text+image":
+        text = query["text"]
+        prompt = f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
+        image = query["image"]
+    else:
+        modality = query['modality']
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    llm = LLM(
+        model="TIGER-Lab/VLM2Vec-Full",
+        task="embedding",
+        trust_remote_code=True,
+        mm_processor_kwargs={"num_crops": 4},
+    )
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        image=image,
+    )
+
+
+def get_query(modality: QueryModality):
+    if modality == "text":
+        return TextQuery(modality="text", text="A dog sitting in the grass")
+
+    if modality == "image":
+        return ImageQuery(
+            modality="image",
+            image=fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
+            ),
+        )
+
+    if modality == "text+image":
+        return TextImageQuery(
+            modality="text+image",
+            text="A cat standing in the snow.",
+            image=fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg"  # noqa: E501
+            ),
+        )
+
+    msg = f"Modality {modality} is not supported."
+    raise ValueError(msg)
+
+
+def run_encode(model: str, modality: QueryModality):
+    query = get_query(modality)
+    req_data = model_example_map[model](query)
+
+    mm_data = {}
+    if req_data.image is not None:
+        mm_data["image"] = req_data.image
+
+    outputs = req_data.llm.encode({
+        "prompt": req_data.prompt,
+        "multi_modal_data": mm_data,
+    })
+
+    for output in outputs:
+        print(output.outputs.embedding)
+
+
+def main(args: Namespace):
+    run_encode(args.model_name, args.modality)
+
+
+model_example_map = {
+    "e5_v": run_e5_v,
+    "vlm2vec": run_vlm2vec,
+}
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for multimodal embedding')
+    parser.add_argument('--model-name',
+                        '-m',
+                        type=str,
+                        default="vlm2vec",
+                        choices=model_example_map.keys(),
+                        help='The name of the embedding model.')
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        choices=get_args(QueryModality),
+                        help='Modality of the input.')
+    args = parser.parse_args()
+    main(args)
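
Because llm.encode returns plain lists of floats, the embeddings produced by the example script above (its file path is not shown on this page) can be compared directly in downstream code. The following rough sketch is not part of the commit; it reuses the llama3 template, model name, and sample image URL from the script above and scores a text query against an image with cosine similarity.

import torch
import torch.nn.functional as F

from vllm import LLM
from vllm.multimodal.utils import fetch_image

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

# One shared engine for both requests (run_e5_v above builds a fresh LLM per
# call, which is fine for a one-shot demo but wasteful here).
llm = LLM(model="royokong/e5-v", task="embedding", max_model_len=4096)

text_out = llm.encode({
    "prompt": llama3_template.format(
        "A dog sitting in the grass\nSummary above sentence in one word: "),
})
image_out = llm.encode({
    "prompt": llama3_template.format("<image>\nSummary above image in one word: "),
    "multi_modal_data": {
        "image": fetch_image(
            "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg")  # noqa: E501
    },
})

# Score the text query against the image with cosine similarity.
sim = F.cosine_similarity(torch.tensor(text_out[0].outputs.embedding),
                          torch.tensor(image_out[0].outputs.embedding),
                          dim=0)
print(f"text-image cosine similarity: {sim.item():.3f}")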

examples/offline_inference_vision_language_multi_image.py

Lines changed: 4 additions & 3 deletions
@@ -1,7 +1,7 @@
 """
 This example shows how to use vLLM for running offline inference with
-multi-image input on vision language models, using the chat template defined
-by the model.
+multi-image input on vision language models for text generation,
+using the chat template defined by the model.
 """
 from argparse import Namespace
 from typing import List, NamedTuple, Optional
@@ -334,7 +334,8 @@ def main(args: Namespace):
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description='Demo on using vLLM for offline inference with '
-        'vision language models that support multi-image input')
+        'vision language models that support multi-image input for text '
+        'generation')
     parser.add_argument('--model-type',
                         '-m',
                         type=str,

tests/conftest.py

Lines changed: 36 additions & 24 deletions
@@ -43,10 +43,12 @@
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 
-PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
-PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
-                         List[List[Tuple[np.ndarray, int]]]]
-PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
+_M = TypeVar("_M")
+_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
+
+PromptImageInput = _PromptMultiModalInput[Image.Image]
+PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 
 
 def _read_prompts(filename: str) -> List[str]:
@@ -318,12 +320,12 @@ def get_inputs(
                 "text": prompt,
                 "return_tensors": "pt",
             }
-            if images is not None and images[i] is not None:
-                processor_kwargs["images"] = images[i]
-            if videos is not None and videos[i] is not None:
-                processor_kwargs["videos"] = videos[i]
-            if audios is not None and audios[i] is not None:
-                audio, sr = audios[i]
+            if images is not None and (image := images[i]) is not None:
+                processor_kwargs["images"] = image
+            if videos is not None and (video := videos[i]) is not None:
+                processor_kwargs["videos"] = video
+            if audios is not None and (audio_tuple := audios[i]) is not None:
+                audio, sr = audio_tuple
                 processor_kwargs["audio"] = audio
                 processor_kwargs["sampling_rate"] = sr
 
@@ -338,7 +340,7 @@ def generate(
         self,
         prompts: List[str],
         images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
         **kwargs: Any,
     ) -> List[Tuple[List[List[int]], List[str]]]:
@@ -368,7 +370,7 @@ def generate_greedy(
         prompts: List[str],
         max_tokens: int,
         images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
         **kwargs: Any,
     ) -> List[Tuple[List[int], str]]:
@@ -409,7 +411,7 @@ def generate_greedy_logprobs(
         prompts: List[str],
         max_tokens: int,
         images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
         **kwargs: Any,
     ) -> List[List[torch.Tensor]]:
@@ -488,7 +490,7 @@ def generate_greedy_logprobs_limit(
         num_logprobs: int,
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
         **kwargs: Any,
     ) -> List[TokensTextLogprobs]:
         all_inputs = self.get_inputs(prompts,
@@ -657,15 +659,18 @@ def get_inputs(
         inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
         if images is not None:
             for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = {"image": image}
+                if image is not None:
+                    inputs[i]["multi_modal_data"] = {"image": image}
 
         if videos is not None:
             for i, video in enumerate(videos):
-                inputs[i]["multi_modal_data"] = {"video": video}
+                if video is not None:
+                    inputs[i]["multi_modal_data"] = {"video": video}
 
         if audios is not None:
             for i, audio in enumerate(audios):
-                inputs[i]["multi_modal_data"] = {"audio": audio}
+                if audio is not None:
+                    inputs[i]["multi_modal_data"] = {"audio": audio}
 
         return inputs
 
@@ -837,13 +842,20 @@ def generate_beam_search(
             returned_outputs.append((token_ids, texts))
         return returned_outputs
 
-    def encode(self, prompts: List[str]) -> List[List[float]]:
-        req_outputs = self.model.encode(prompts)
-        outputs = []
-        for req_output in req_outputs:
-            embedding = req_output.outputs.embedding
-            outputs.append(embedding)
-        return outputs
+    def encode(
+        self,
+        prompts: List[str],
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
+    ) -> List[List[float]]:
+        inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+
+        req_outputs = self.model.encode(inputs)
+        return [req_output.outputs.embedding for req_output in req_outputs]
 
     def __enter__(self):
         return self
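
The net effect of these VllmRunner changes is that embedding tests can feed per-prompt images (or videos/audio) through the same get_inputs path used by the generation helpers, and prompts without media simply skip the multi_modal_data key. Roughly, the new encode() reduces to the sketch below, written against vLLM's public API rather than the test fixtures; the wrapper function name is hypothetical, and only the TextPrompt/encode calls mirror the diff above.

from typing import List, Optional

from PIL import Image

from vllm import LLM
from vllm.inputs import TextPrompt


def encode_prompts(llm: LLM,
                   prompts: List[str],
                   images: Optional[List[Optional[Image.Image]]] = None
                   ) -> List[List[float]]:
    # Mirror VllmRunner.get_inputs: one TextPrompt per prompt, attaching
    # multi_modal_data only where an image is actually provided.
    inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
    if images is not None:
        for i, image in enumerate(images):
            if image is not None:
                inputs[i]["multi_modal_data"] = {"image": image}

    # Mirror the new VllmRunner.encode: one batched call, embeddings unwrapped.
    req_outputs = llm.encode(inputs)
    return [req_output.outputs.embedding for req_output in req_outputs]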

tests/models/embedding/utils.py

Lines changed: 2 additions & 1 deletion
@@ -16,7 +16,8 @@ def check_embeddings_close(
 
     for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
-        assert len(embeddings_0) == len(embeddings_1)
+        assert len(embeddings_0) == len(embeddings_1), (
+            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
 
         sim = F.cosine_similarity(torch.tensor(embeddings_0),
                                   torch.tensor(embeddings_1),
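
For readers who only see this hunk: the surrounding helper pairs up two embedding runs of the same prompts (for example HF vs. vLLM) and checks that they agree. A condensed, self-contained sketch follows; the similarity threshold and any parameters beyond the two embedding lists are illustrative assumptions, not taken from the file.

import torch
import torch.nn.functional as F


def check_embeddings_close_sketch(embeddings_0_lst, embeddings_1_lst):
    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
        # The new assertion message makes dimension mismatches easy to spot.
        assert len(embeddings_0) == len(embeddings_1), (
            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

        sim = F.cosine_similarity(torch.tensor(embeddings_0),
                                  torch.tensor(embeddings_1),
                                  dim=0)
        # Illustrative tolerance; the real helper's threshold is not shown here.
        assert sim >= 0.99, f"Embeddings for prompt {prompt_idx} diverge: {sim=}"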

0 commit comments
