Merged

86 commits
d29f791
[Core] Use key-only cache in P0
DarkLight1337 Aug 16, 2025
e996375
Fix
DarkLight1337 Aug 16, 2025
6b5fa6f
Rename
DarkLight1337 Aug 16, 2025
ca8fbe1
Simplify
DarkLight1337 Aug 16, 2025
67e7f8e
Fix mypy
DarkLight1337 Aug 16, 2025
c75474a
Clean
DarkLight1337 Aug 16, 2025
68c0104
Handle DP inside factory
DarkLight1337 Aug 16, 2025
15beccd
Doc
DarkLight1337 Aug 16, 2025
79aefee
Reorder to fit docs
DarkLight1337 Aug 16, 2025
353730a
[Refactor] Allow optional MultiModalKwargsItem
DarkLight1337 Aug 16, 2025
3b6dc78
Fix
DarkLight1337 Aug 16, 2025
c78f259
Merge branch 'optional-mm-item' into mm-cache-interface
DarkLight1337 Aug 16, 2025
00a4e73
Message
DarkLight1337 Aug 16, 2025
a05829b
Fix type annotation
DarkLight1337 Aug 16, 2025
6881cf3
Update
DarkLight1337 Aug 16, 2025
219381e
Update
DarkLight1337 Aug 16, 2025
b7d1b17
More WIP
DarkLight1337 Aug 16, 2025
5147bb1
[Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs
DarkLight1337 Aug 17, 2025
28cc31d
Update docs
DarkLight1337 Aug 17, 2025
a032c23
Fixes
DarkLight1337 Aug 17, 2025
01c9a24
Fix test
DarkLight1337 Aug 17, 2025
3055f82
Fix cache
DarkLight1337 Aug 17, 2025
af3fb3d
Fix cache
DarkLight1337 Aug 17, 2025
220bdfc
Rename
DarkLight1337 Aug 17, 2025
9fa89a1
Fix common processor test
DarkLight1337 Aug 17, 2025
ed614e1
Fix equality check
DarkLight1337 Aug 17, 2025
4df611a
Fix mypy
DarkLight1337 Aug 17, 2025
b6b0ff8
Fixes
DarkLight1337 Aug 17, 2025
09acec2
Fix serialization
DarkLight1337 Aug 18, 2025
95ed7ec
Fix caching
DarkLight1337 Aug 18, 2025
3b0096f
Fix test
DarkLight1337 Aug 18, 2025
5fbd824
Merge branch 'main' into separate-mm-kwargs
DarkLight1337 Aug 18, 2025
946958a
Add deprecation
DarkLight1337 Aug 18, 2025
ef8e93b
Update types
DarkLight1337 Aug 18, 2025
dc5ee9b
Merge branch 'separate-mm-kwargs' into mm-cache-interface
DarkLight1337 Aug 18, 2025
4844d33
[Refactor] Get prompt updates earlier
DarkLight1337 Aug 18, 2025
b5cb48d
Fix
DarkLight1337 Aug 18, 2025
dbb7ccd
Merge branch 'prompt-updates' into mm-cache-interface
DarkLight1337 Aug 18, 2025
c74b413
define proxy
DarkLight1337 Aug 18, 2025
ac738bb
Update
DarkLight1337 Aug 18, 2025
2f58b27
Remove requires_out_mm
DarkLight1337 Aug 18, 2025
480d5fc
Update
DarkLight1337 Aug 18, 2025
05937e2
Fixes
DarkLight1337 Aug 18, 2025
5199f41
Fix mypy
DarkLight1337 Aug 18, 2025
3cb51ac
Fix
DarkLight1337 Aug 18, 2025
b7404d6
Clean up
DarkLight1337 Aug 18, 2025
40fcfea
Fix incorrect index
DarkLight1337 Aug 19, 2025
c909f6f
Improve doc
DarkLight1337 Aug 19, 2025
96341c1
Improve doc
DarkLight1337 Aug 19, 2025
486f2fc
Simplify
DarkLight1337 Aug 19, 2025
fbf8aca
Typo
DarkLight1337 Aug 19, 2025
cd4be9a
Fix type annotation
DarkLight1337 Aug 19, 2025
a8b4607
Fix MiniCPMV
DarkLight1337 Aug 19, 2025
c3661ba
Simplify
DarkLight1337 Aug 19, 2025
0585a3e
Fix missing prompt updates
DarkLight1337 Aug 19, 2025
a6e2516
Fix Phi3V and Phi4MM
DarkLight1337 Aug 19, 2025
03c18b6
Fix cache checking
DarkLight1337 Aug 19, 2025
d837dcc
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 20, 2025
232b92a
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 20, 2025
a27faaf
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 20, 2025
d96ef41
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 21, 2025
674109c
Remove warning
DarkLight1337 Aug 21, 2025
589c89a
Update docstring
DarkLight1337 Aug 21, 2025
14e802b
Initialize cache internally
DarkLight1337 Aug 21, 2025
fa2ae90
Simplify
DarkLight1337 Aug 21, 2025
c4d5991
Update cache factories
DarkLight1337 Aug 21, 2025
16ec368
Remove unused logger
DarkLight1337 Aug 21, 2025
149076d
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 21, 2025
985fad4
[Refactor] Enable multiple targets per prompt update definition
DarkLight1337 Aug 22, 2025
ba91f95
Wrong format specifier
DarkLight1337 Aug 22, 2025
afb22c8
Comment
DarkLight1337 Aug 22, 2025
42a9788
Merge branch 'one-replacement-per-modality' into mm-cache-interface
DarkLight1337 Aug 22, 2025
ab0fa80
Update processing
DarkLight1337 Aug 22, 2025
603240d
Clean up
DarkLight1337 Aug 22, 2025
b6cabf4
Comment
DarkLight1337 Aug 22, 2025
bd2a382
Merge branch 'one-replacement-per-modality' into mm-cache-interface
DarkLight1337 Aug 23, 2025
79704be
Update
DarkLight1337 Aug 23, 2025
0aa7a3c
Fix
DarkLight1337 Aug 23, 2025
7e87cfa
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 25, 2025
b31b619
Fix cache tests
DarkLight1337 Aug 25, 2025
e80c84b
Fix Phi3V
DarkLight1337 Aug 25, 2025
9818ed2
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 25, 2025
d617f25
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 25, 2025
0704b07
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 26, 2025
a88c606
Fixes
DarkLight1337 Aug 26, 2025
99049ec
Merge branch 'main' into mm-cache-interface
DarkLight1337 Aug 26, 2025
2 changes: 1 addition & 1 deletion docs/configuration/conserving_memory.md
@@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",

If you run out of CPU RAM, try the following options:

- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process)
- (Multi-modal models only) you can set the size of multi-modal cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB).
- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).

## Multi-modal input limits
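
As a minimal illustration of the two options touched in the hunk above (the model name and sizes are placeholders, not values prescribed by this PR):

```python
import os

from vllm import LLM

# CPU backend only: cap the KV cache space (in GiB) before engine start-up.
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "2"

# Multi-modal models only: shrink the multi-modal cache (0 disables it).
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=2,
)
```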
44 changes: 35 additions & 9 deletions docs/configuration/optimization.md
@@ -204,20 +204,33 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2
to avoid CPU resource exhaustion.

!!! note
[Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled
because it requires a one-to-one correspondence between API and engine core processes.
API server scale-out disables [multi-modal IPC caching](#ipc-caching)
because it requires a one-to-one correspondence between API and engine core processes.

## Multi-Modal Caching
This does not impact [multi-modal processor caching](#processor-caching).

### Processor Cache
## Multi-Modal Caching

By default, the multi-modal processor cache is enabled to avoid repeatedly processing
the same multi-modal inputs via Hugging Face `AutoProcessor`,
Multi-modal caching avoids repeated transfer or processing of the same multi-modal data,
which commonly occurs in multi-turn conversations.

You can adjust the size of the cache by setting the value of `mm_processor_cache_gb`
(default 4 GiB per API process + 4 GiB per engine core process).
If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`.
### Processor Caching

Multi-modal processor caching is automatically enabled
to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalProcessor`.

### IPC Caching

Multi-modal IPC caching is automatically enabled when
there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes,
to avoid repeatedly transferring the same multi-modal inputs between them.

### Configuration

You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB).

If you do not benefit much from the cache, you can disable both IPC
and processor caching completely via `mm_processor_cache_gb=0`.

Examples:

@@ -230,3 +243,16 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=0)
```

### Cache Placement

Based on the configuration, the contents of the multi-modal caches on `P0` and `P1` are as follows:

| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max. Memory |
|-------------------|-------------|------------|------------|-------------|
| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` |
| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
| ❌ | ❌ | N/A | N/A | `0` |

K: Stores the hashes of multi-modal items
V: Stores the processed tensor data of multi-modal items
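
As a rough companion to the table, the sketch below constructs the two per-process caches with the factory helpers exercised by the tests in this PR; treat the exact constructor arguments as assumptions rather than a reference.

```python
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
from vllm.multimodal.cache import (processor_cache_from_config,
                                   receiver_cache_from_config)
from vllm.multimodal.registry import MultiModalRegistry

# Hypothetical setup: 4 GiB cache and a single DP rank, i.e. a one-to-one
# P0/P1 correspondence, so both processor and IPC caching apply.
vllm_config = VllmConfig(
    model_config=ModelConfig(mm_processor_cache_gb=4),
    parallel_config=ParallelConfig(data_parallel_size=1),
)
mm_registry = MultiModalRegistry()

# P0 (API process): key-only cache while IPC caching is active.
p0_cache = processor_cache_from_config(vllm_config, mm_registry)

# P1 (engine core process): key + value cache; None when IPC caching is off.
p1_cache = receiver_cache_from_config(vllm_config, mm_registry)

# Worst-case memory follows the table, e.g. 4 GiB * data_parallel_size
# when both caching layers are enabled.
```

Either factory may return `None` when the corresponding cache is disabled, which is what the tests below rely on.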
8 changes: 5 additions & 3 deletions tests/models/multimodal/processing/test_common.py
@@ -14,8 +14,9 @@
from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens)
@@ -63,6 +64,8 @@ def _test_processing_correctness(
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
# Ensure that the cache can fit all of the data
mm_processor_cache_gb=2048,
)

model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
@@ -71,8 +74,7 @@
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
# Ensure that it can fit all of the data
cache = ProcessingCache(capacity_gb=2048)
cache = MultiModalProcessorOnlyCache(model_config)

processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
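
Condensed, the change above moves the cache capacity into the model config and replaces the standalone `ProcessingCache` with `MultiModalProcessorOnlyCache`; a sketch of the new setup (the model name is a placeholder and the rest of the test scaffolding is elided):

```python
from vllm.config import ModelConfig
from vllm.multimodal.cache import MultiModalProcessorOnlyCache

# The capacity now comes from the model config; 2048 GiB ensures that the
# test data always fits in the cache.
model_config = ModelConfig(
    model="Qwen/Qwen2.5-VL-3B-Instruct",  # placeholder
    mm_processor_cache_gb=2048,
)
cache = MultiModalProcessorOnlyCache(model_config)
```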
182 changes: 174 additions & 8 deletions tests/multimodal/test_cache.py
@@ -1,32 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import numpy as np
import pytest
import torch

from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
from vllm.multimodal.cache import (MultiModalCache,
MultiModalProcessorCacheItem,
MultiModalProcessorCacheItemMetadata,
processor_cache_from_config,
receiver_cache_from_config)
from vllm.multimodal.hasher import MultiModalHasher
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
MultiModalKwargsItems,
MultiModalSharedField)
from vllm.multimodal.processing import PromptInsertion
from vllm.multimodal.registry import MultiModalRegistry


def _dummy_elem(
modality: str,
key: str,
size: int,
*,
rng: Optional[np.random.RandomState] = None,
):
if rng is None:
data = torch.empty((size, ), dtype=torch.int8)
else:
data = torch.from_numpy(rng.randint(4, size=(size, ), dtype=np.int8))

def _dummy_elem(modality: str, key: str, size: int):
return MultiModalFieldElem(
modality=modality,
key=key,
data=torch.empty((size, ), dtype=torch.int8),
data=data,
field=MultiModalSharedField(1),
)


def _dummy_item(modality: str, size_by_key: dict[str, int]):
def _dummy_item(
modality: str,
size_by_key: dict[str, int],
*,
rng: Optional[np.random.RandomState] = None,
):
return MultiModalKwargsItem.from_elems([
_dummy_elem(modality, key, size) for key, size in size_by_key.items()
_dummy_elem(modality, key, size, rng=rng)
for key, size in size_by_key.items()
])


def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]):
def _dummy_items(
size_by_key_modality: dict[str, dict[str, int]],
*,
rng: Optional[np.random.RandomState] = None,
):
return MultiModalKwargsItems.from_seq([
_dummy_item(modality, size_by_key)
_dummy_item(modality, size_by_key, rng=rng)
for modality, size_by_key in size_by_key_modality.items()
])

@@ -48,5 +80,139 @@ def test_cache_item_size(item, expected_size):
cache[""] = item
assert cache.currsize == expected_size

cache[""] = MultiModalCacheItemMetadata.wraps(item)
prompt_update = PromptInsertion("dummy", "target", "insertion") \
.resolve(0)

cache[""] = MultiModalProcessorCacheItem(item, [prompt_update])
assert cache.currsize == expected_size

cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update])
assert cache.currsize == expected_size


def _create_vllm_config(
*,
mm_processor_cache_gb: float,
enable_ipc: bool,
):
return VllmConfig(
model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb),
parallel_config=ParallelConfig(
data_parallel_size=1 if enable_ipc else 2),
)


def _compare_caches(
config_0: VllmConfig,
config_1: VllmConfig,
*,
item_capacity: int = 8,
hit_rate: float = 0.5,
max_items_per_iter: int = 3,
is_cached_calls_per_iter: int,
n_iter: int = 100,
seed: int = 0,
):
mm_registry = MultiModalRegistry()
cache_0_p0 = processor_cache_from_config(config_0, mm_registry)
cache_0_p1 = receiver_cache_from_config(config_0, mm_registry)
cache_1_p0 = processor_cache_from_config(config_1, mm_registry)
cache_1_p1 = receiver_cache_from_config(config_1, mm_registry)

cache_size_gb = max(
config_0.model_config.mm_processor_cache_gb,
config_1.model_config.mm_processor_cache_gb,
)
item_size_gb = int(cache_size_gb / item_capacity)

rng = np.random.RandomState(seed)
all_items = [
_dummy_item("item", {"key": item_size_gb}, rng=rng)
for _ in range(int(item_capacity / hit_rate))
]
all_hashes = [
MultiModalHasher.hash_kwargs(item=item.get_data())
for item in all_items
]

# Should not be used since there is nothing to convert to text
prompt_update = PromptInsertion("dummy", "target", "insertion")

for it in range(n_iter):
num_items_to_select = rng.randint(0, max_items_per_iter)
item_idxs_to_select = rng.choice(len(all_items), num_items_to_select)

selected_items = [all_items[idx] for idx in item_idxs_to_select]
selected_hashes = [all_hashes[idx] for idx in item_idxs_to_select]

if cache_0_p0 is None:
cache_0_p0_out = selected_items
else:
for _ in range(is_cached_calls_per_iter):
cache_0_p0.is_cached(selected_hashes)
cache_0_p0_out = [
item for item, _ in cache_0_p0.get_and_update(
[(item, prompt_update.content) for item in selected_items],
selected_hashes,
)
]

if cache_1_p0 is None:
cache_1_p0_out = selected_items
else:
for _ in range(is_cached_calls_per_iter):
cache_1_p0.is_cached(selected_hashes)
cache_1_p0_out = [
item for item, _ in cache_1_p0.get_and_update(
[(item, prompt_update.content) for item in selected_items],
selected_hashes,
)
]

if cache_0_p1 is None:
cache_0_p1_out = cache_0_p0_out
else:
cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out,
selected_hashes)

if cache_1_p1 is None:
cache_1_p1_out = cache_1_p0_out
else:
cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out,
selected_hashes)

assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}"


@pytest.mark.parametrize("is_cached_calls_per_iter", [1, 2, 3])
def test_ipc_enable_disable_consistency(is_cached_calls_per_iter):
cache_size_gb = 1 / (1 << 20)

vllm_config_ipc_enabled = _create_vllm_config(
mm_processor_cache_gb=cache_size_gb,
enable_ipc=True,
)
vllm_config_ipc_disabled = _create_vllm_config(
mm_processor_cache_gb=0,
enable_ipc=False,
)
vllm_config_cache_disabled = _create_vllm_config(
mm_processor_cache_gb=cache_size_gb,
enable_ipc=True,
)

_compare_caches(
vllm_config_ipc_enabled,
vllm_config_ipc_disabled,
is_cached_calls_per_iter=is_cached_calls_per_iter,
)
_compare_caches(
vllm_config_ipc_disabled,
vllm_config_cache_disabled,
is_cached_calls_per_iter=is_cached_calls_per_iter,
)
_compare_caches(
vllm_config_cache_disabled,
vllm_config_ipc_enabled,
is_cached_calls_per_iter=is_cached_calls_per_iter,
)
26 changes: 2 additions & 24 deletions vllm/config/__init__.py
@@ -437,7 +437,7 @@ class ModelConfig:
from `AutoProcessor.from_pretrained`. The available overrides depend on the
model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`.
"""
mm_processor_cache_gb: int = 4
mm_processor_cache_gb: float = 4
"""The size (in GiB) of the multi-modal processor cache, which is used to
avoid re-processing past multi-modal inputs.

@@ -884,12 +884,6 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:

return None

def set_mm_processor_cache_gb(self, value: int) -> None:
mm_config = self.get_multimodal_config()

self.mm_processor_cache_gb = value
mm_config.mm_processor_cache_gb = value

def _get_encoder_config(self):
return get_sentence_transformer_tokenizer_config(
self.model, self.revision)
@@ -1697,22 +1691,6 @@ def uses_mrope(self) -> bool:
def is_multimodal_model(self) -> bool:
return self.multimodal_config is not None

@property
def enable_mm_processor_cache(self) -> bool:
"""Whether the multi-modal processor cache should be enabled."""
mm_config = self.multimodal_config
if mm_config is None:
return False

return mm_config.mm_processor_cache_gb > 0

def get_mm_input_cache_gb(self) -> int:
mm_config = self.multimodal_config
if mm_config is None:
return 0

return envs.VLLM_MM_INPUT_CACHE_GIB

@property
def is_cross_encoder(self) -> bool:
return (self._model_info.supports_cross_encoding
@@ -2561,7 +2539,7 @@ class MultiModalConfig:
`{"num_crops": 4}`.
"""

mm_processor_cache_gb: int = 4
mm_processor_cache_gb: float = 4
"""
The size (in GiB) of the multi-modal processor cache, which is used to

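
With `set_mm_processor_cache_gb`, `enable_mm_processor_cache`, and `get_mm_input_cache_gb` gone, callers are expected to read the config (or use the cache factories) directly; a hedged sketch of an equivalent check, not necessarily the exact replacement used elsewhere in this PR:

```python
from vllm.config import ModelConfig


def mm_cache_enabled(model_config: ModelConfig) -> bool:
    # Mirrors the removed `enable_mm_processor_cache` property: caching is
    # active only for multi-modal models with a non-zero cache size.
    mm_config = model_config.multimodal_config
    return mm_config is not None and mm_config.mm_processor_cache_gb > 0
```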
14 changes: 1 addition & 13 deletions vllm/engine/arg_utils.py
@@ -351,7 +351,7 @@ class EngineArgs:
mm_processor_kwargs: Optional[Dict[str, Any]] = \
MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = False # DEPRECATED
mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
# LoRA fields
@@ -1293,18 +1293,6 @@ def create_engine_config(
worker_extension_cls=self.worker_extension_cls,
)

if model_config.is_multimodal_model:
dp_supports_mm_processor_cache = (self.data_parallel_size == 1
or data_parallel_external_lb)
if (not dp_supports_mm_processor_cache
and model_config.mm_processor_cache_gb > 0):
logger.warning(
"Multi-modal processor cache is disabled because "
"it is not compatible with data parallelism when "
"there does not exist a one-to-one correspondance "
"between API and engine core processes.")
model_config.set_mm_processor_cache_gb(0)

speculative_config = self.create_speculative_config(
target_model_config=model_config,
target_parallel_config=parallel_config,
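
The data-parallel compatibility check deleted from `create_engine_config` now lives with the cache construction; reconstructed from the removed lines, the condition amounts to the following sketch (the helper name is hypothetical):

```python
def ipc_caching_supported(data_parallel_size: int,
                          data_parallel_external_lb: bool,
                          mm_processor_cache_gb: float) -> bool:
    # IPC caching needs a one-to-one correspondence between API and engine
    # core processes; plain data parallelism without an external load
    # balancer breaks that correspondence, leaving only processor caching.
    one_to_one = data_parallel_size == 1 or data_parallel_external_lb
    return one_to_one and mm_processor_cache_gb > 0
```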