139 changes: 139 additions & 0 deletions tests/core/block/test_cpu_offloading_block_allocator.py
@@ -0,0 +1,139 @@
import pytest

from vllm.core.block.cpu_offloading_block_allocator import (
CpuOffloadingBlockAllocator)
from vllm.utils import Device, chunk_list


@pytest.mark.parametrize("num_cpu_blocks", [1024])
@pytest.mark.parametrize("num_gpu_blocks", [256])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["prefix_caching"])
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuOffloadingBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)

assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

gpu_blocks = [
allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
for _ in range(num_gpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(0.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == 0


@pytest.mark.parametrize("num_cpu_blocks", [1024])
@pytest.mark.parametrize("num_gpu_blocks", [256])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["prefix_caching"])
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuOffloadingBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)

unique_token_ids = list(
range((num_cpu_blocks + num_gpu_blocks) * block_size))
gpu_token_ids = list(
chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
gpu_token_ids2 = list(
chunk_list(
unique_token_ids[num_gpu_blocks * block_size:2 * num_gpu_blocks *
block_size], block_size))

gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]

assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(0.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == num_gpu_blocks

allocator.mark_blocks_as_computed([block.block_id for block in gpu_blocks])
blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)
assert len(blocks_to_swap_out) + len(blocks_to_swap_in) == num_gpu_blocks
assert len(allocator._uncached_blocks) == 0

_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == 0

# allocate another GPU sequence to flush out the GPU cache
gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids2
]

assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert all([
not allocator._allocators[Device.GPU].block_is_computed(block.block_id)
for block in gpu_blocks
])

_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(2.0)
assert len(blocks_to_swap_out) == 0
assert len(blocks_to_swap_in) == 0
assert len(allocator._uncached_blocks) == 0

# allocate the original GPU sequence. It should hit the CPU cache.
gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]

delta = num_cpu_blocks - num_gpu_blocks
assert allocator.get_num_free_blocks(Device.CPU) == delta
assert allocator.get_num_free_blocks(Device.GPU) == 0
assert all([
allocator._allocators[Device.GPU].block_is_computed(block.block_id)
for block in gpu_blocks
])

blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(3.0)
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
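A condensed sketch of the offloading lifecycle the test above exercises. It uses only calls that appear in the test; the block counts and timestamps are illustrative, and it assumes the CpuOffloadingBlockAllocator module added by this PR is importable.

from vllm.core.block.cpu_offloading_block_allocator import (
    CpuOffloadingBlockAllocator)
from vllm.utils import Device

allocator = CpuOffloadingBlockAllocator.create(
    allocator_type="prefix_caching",
    num_gpu_blocks=16,
    num_cpu_blocks=64,
    block_size=16,
)

# 1. A freshly allocated immutable GPU block is tracked as uncached.
block = allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=list(range(16)),
                                           device=Device.GPU)

# 2. Marking the block as computed makes it eligible for offloading; the
#    pending GPU->CPU copies are drained by the next get_and_reset_swaps call.
allocator.mark_blocks_as_computed([block.block_id])
blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(1.0)

# 3. Freeing the GPU block keeps the CPU copy, so allocating the same token
#    ids again later is served from the CPU cache instead of being recomputed.
allocator.free(block)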
9 changes: 9 additions & 0 deletions vllm/config.py
@@ -970,6 +970,7 @@ def __init__(
sliding_window: Optional[int] = None,
enable_prefix_caching: bool = False,
cpu_offload_gb: float = 0,
block_allocator: str = "CpuGpuBlockAllocator",
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
@@ -980,6 +981,7 @@ def __init__
self.sliding_window = sliding_window
self.enable_prefix_caching = enable_prefix_caching
self.cpu_offload_gb = cpu_offload_gb
self.block_allocator = block_allocator

self._verify_args()
self._verify_cache_dtype()
@@ -1004,6 +1006,13 @@ def _verify_args(self) -> None:
raise ValueError("CUDA Paged Attention kernel only supports "
f"block sizes up to 32. Got {self.block_size}.")

if self.block_allocator not in [
"CpuGpuBlockAllocator", "CpuOffloadingBlockAllocator"
]:
raise ValueError(
"Only CpuGpuBlockAllocator and CpuOffloadingBlockAllocator is "
f"supported. Got {self.block_allocator}.")

def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto":
pass
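For reference, a standalone sketch of the validation this diff adds to CacheConfig._verify_args. It mirrors the check above without importing vLLM; the helper name is made up for illustration.

_SUPPORTED_BLOCK_ALLOCATORS = ("CpuGpuBlockAllocator",
                               "CpuOffloadingBlockAllocator")

def verify_block_allocator(block_allocator: str) -> None:
    # Same rule as the check added to CacheConfig._verify_args in the hunk above.
    if block_allocator not in _SUPPORTED_BLOCK_ALLOCATORS:
        raise ValueError(
            "Only CpuGpuBlockAllocator and CpuOffloadingBlockAllocator are "
            f"supported. Got {block_allocator}.")

verify_block_allocator("CpuOffloadingBlockAllocator")  # passes silently
# verify_block_allocator("ArenaAllocator")  # would raise ValueError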
16 changes: 11 additions & 5 deletions vllm/core/block/cpu_gpu_block_allocator.py
@@ -339,17 +339,23 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float:
assert device in self._allocators
return self._allocators[device].get_prefix_cache_hit_rate()

def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
def get_and_reset_swaps(self,
now: float) -> Tuple[List[Tuple[int, int]], ...]:
"""Returns and clears the mapping of source to destination block IDs.
Will be called after every swapping operation for now, and after every
schedule once BlockManagerV2 becomes the default. Currently not useful.


Args:
now (float): The time stamp.
Returns:
List[Tuple[int, int]]: A mapping of source to destination block IDs.
A tuple of two lists: (blocks_to_swap_out, blocks_to_swap_in).
Each list is a List[Tuple[int, int]], containing the mapping of
source to destination block IDs. The block IDs are physical block
IDs, and they are expected to be used by the cache engine directly.
"""
mapping = self._swap_mapping.copy()
self._swap_mapping.clear()
return list(mapping.items())
# Return empty lists to keep compatibility with the previous behavior
return [], []

def find_cached_blocks_prefix(
self,
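A hedged sketch of how a caller might consume the new return shape. Only get_and_reset_swaps(now) and its (blocks_to_swap_out, blocks_to_swap_in) result come from this diff; the drain_swaps wrapper and the cache_engine.swap_out / swap_in calls are illustrative assumptions, not vLLM API.

import time
from typing import List, Tuple

def drain_swaps(allocator, cache_engine) -> None:
    # After this change the base CpuGpuBlockAllocator always returns two
    # empty lists; the CpuOffloadingBlockAllocator variant returns the
    # pending GPU->CPU and CPU->GPU physical block id mappings.
    blocks_to_swap_out: List[Tuple[int, int]]
    blocks_to_swap_in: List[Tuple[int, int]]
    blocks_to_swap_out, blocks_to_swap_in = allocator.get_and_reset_swaps(
        time.time())

    # The ids are physical block ids, so per the docstring they can be handed
    # to the cache engine directly (swap_out/swap_in are assumed names here).
    if blocks_to_swap_out:
        cache_engine.swap_out(blocks_to_swap_out)
    if blocks_to_swap_in:
        cache_engine.swap_in(blocks_to_swap_in)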