139 changes: 139 additions & 0 deletions tests/core/block/test_cpu_offloading_block_allocator.py
@@ -0,0 +1,139 @@
import pytest

from vllm.core.block.cpu_offloading_block_allocator import (
CpuOffloadingBlockAllocator)
from vllm.utils import Device, chunk_list


@pytest.mark.parametrize("num_cpu_blocks", [1024])
@pytest.mark.parametrize("num_gpu_blocks", [256])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["prefix_caching"])
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuOffloadingBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)

assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

gpu_blocks = [
allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
for _ in range(num_gpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(0.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == 0


@pytest.mark.parametrize("num_cpu_blocks", [1024])
@pytest.mark.parametrize("num_gpu_blocks", [256])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["prefix_caching"])
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuOffloadingBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)

unique_token_ids = list(
range((num_cpu_blocks + num_gpu_blocks) * block_size))
gpu_token_ids = list(
chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
gpu_token_ids2 = list(
chunk_list(
unique_token_ids[num_gpu_blocks * block_size:2 * num_gpu_blocks *
block_size], block_size))

gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]

assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(0.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

allocator.mark_blocks_as_computed([block.block_id for block in gpu_blocks])
blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)
assert len(blocks_to_swap_out) + len(blocks_to_swap_in) == num_gpu_blocks
assert len(allocator._uncached_blocks) == 0

_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == 0

# allocate another GPU sequence to flush out the GPU cache
gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids2
]

assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert all([
not allocator._allocators[Device.GPU].block_is_computed(block.block_id)
for block in gpu_blocks
])

_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(2.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == 0

# allocate the original GPU sequence. It should hit the CPU cache.
gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]

delta = num_cpu_blocks - num_gpu_blocks
assert allocator.get_num_free_blocks(Device.CPU) == delta
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert all([
allocator._allocators[Device.GPU].block_is_computed(block.block_id)
for block in gpu_blocks
])

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(3.0)
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
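A condensed sketch of the offloading lifecycle the test above exercises. It uses only calls that appear in the test; the block counts and timestamps are illustrative, and it assumes the CpuOffloadingBlockAllocator module added by this PR is importable.

from vllm.core.block.cpu_offloading_block_allocator import (
    CpuOffloadingBlockAllocator)
from vllm.utils import Device

allocator = CpuOffloadingBlockAllocator.create(
    allocator_type="prefix_caching",
    num_gpu_blocks=16,
    num_cpu_blocks=64,
    block_size=16,
)

# 1. A freshly allocated immutable GPU block is tracked as uncached.
block = allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=list(range(16)),
                                           device=Device.GPU)

# 2. Marking the block as computed makes it eligible for offloading; the
#    pending GPU->CPU copies are drained by the next get_and_reset_swaps call.
allocator.mark_blocks_as_computed([block.block_id])
blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)

# 3. Freeing the GPU block keeps the CPU copy, so allocating the same token
#    ids again later is served from the CPU cache instead of being recomputed.
allocator.free(block)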
9 changes: 9 additions & 0 deletions vllm/config.py
@@ -970,6 +970,7 @@ def __init__(
sliding_window: Optional[int] = None,
enable_prefix_caching: bool = False,
cpu_offload_gb: float = 0,
block_allocator: str = "CpuGpuBlockAllocator",
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
@@ -980,6 +981,7 @@ def __init__
self.sliding_window = sliding_window
self.enable_prefix_caching = enable_prefix_caching
self.cpu_offload_gb = cpu_offload_gb
self.block_allocator = block_allocator

self._verify_args()
self._verify_cache_dtype()
@@ -1004,6 +1006,13 @@ def _verify_args(self) -> None:
raise ValueError("CUDA Paged Attention kernel only supports "
f"block sizes up to 32. Got {self.block_size}.")

if self.block_allocator not in [
"CpuGpuBlockAllocator", "CpuOffloadingBlockAllocator"
]:
raise ValueError(
"Only CpuGpuBlockAllocator and CpuOffloadingBlockAllocator is "
f"supported. Got {self.block_allocator}.")

def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto":
pass
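For reference, a standalone sketch of the validation this diff adds to CacheConfig._verify_args. It mirrors the check above without importing vLLM; the helper name is made up for illustration.

_SUPPORTED_BLOCK_ALLOCATORS = ("CpuGpuBlockAllocator",
                               "CpuOffloadingBlockAllocator")

def verify_block_allocator(block_allocator: str) -> None:
    # Same rule as the check added to CacheConfig._verify_args in the hunk above.
    if block_allocator not in _SUPPORTED_BLOCK_ALLOCATORS:
        raise ValueError(
            "Only CpuGpuBlockAllocator and CpuOffloadingBlockAllocator are "
            f"supported. Got {block_allocator}.")

verify_block_allocator("CpuOffloadingBlockAllocator")  # passes silently
# verify_block_allocator("ArenaAllocator")  # would raise ValueError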
16 changes: 11 additions & 5 deletions vllm/core/block/cpu_gpu_block_allocator.py
@@ -339,17 +339,23 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float:
assert device in self._allocators
return self._allocators[device].get_prefix_cache_hit_rate()

def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
def get_and_reset_swaps(self,
now: float) -> Tuple[List[Tuple[int, int]], ...]:
"""Returns and clears the mapping of source to destination block IDs.
Will be called after every swapping operation for now, and after every
schedule once BlockManagerV2 becomes the default. Currently not useful.


Args:
now (float): The time stamp.
Returns:
List[Tuple[int, int]]: A mapping of source to destination block IDs.
A tuple of two lists: (blocks_to_swap_out, blocks_to_swap_in).
Each list is a List[Tuple[int, int]], containing the mapping of
source to destination block IDs. The block IDs are physical block
IDs, and they are expected to be used by the cache engine directly.
"""
mapping = self._swap_mapping.copy()
self._swap_mapping.clear()
return list(mapping.items())
# Return empty lists to keep compatibility with the previous behavior
return [], []

def find_cached_blocks_prefix(
self,
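A hedged sketch of how a caller might consume the new return shape. Only get_and_reset_swaps(now) and its (blocks_to_swap_out, blocks_to_swap_in) result come from this diff; the drain_swaps wrapper and the cache_engine.swap_out / swap_in calls are illustrative assumptions, not vLLM API.

import time
from typing import List, Tuple

def drain_swaps(allocator, cache_engine) -> None:
    # After this change the base CpuGpuBlockAllocator always returns two
    # empty lists; the CpuOffloadingBlockAllocator variant returns the
    # pending GPU->CPU and CPU->GPU physical block id mappings.
    blocks_to_swap_out: List[Tuple[int, int]]
    blocks_to_swap_in: List[Tuple[int, int]]
    blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(
        time.time())

    # The ids are physical block ids, so per the docstring they can be handed
    # to the cache engine directly (swap_out/swap_in are assumed names here).
    if blocks_to_swap_out:
        cache_engine.swap_out(blocks_to_swap_out)
    if blocks_to_swap_in:
        cache_engine.swap_in(blocks_to_swap_in)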