1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -194,6 +194,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- VLLM_ENABLE_V1_MULTIPROCESSING=0 VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- VLLM_USE_V1=1 pytest -v -s v1/core
- VLLM_USE_V1=1 pytest -v -s v1/engine
- VLLM_USE_V1=1 pytest -v -s v1/sample
3 changes: 3 additions & 0 deletions tests/distributed/test_torchrun_example.py
@@ -47,6 +47,9 @@ def test_consistent_across_ranks(obj):
llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
test_consistent_across_ranks(
llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
Member Author: This is to test if we can directly access the model with llm.llm_engine.model_executor.driver_worker.worker.model_runner.model. It is used in https://github.com/volcengine/verl/blob/0a1b16f800c25ac80504038fd8b8be4282d6c606/verl/workers/sharding_manager/fsdp_vllm.py#L84

Collaborator: Maybe worth a comment?

model.parameters())
test_consistent_across_ranks(len(params))

# all ranks should have the same outputs
for output in outputs:
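To make the author's comment above concrete, here is a minimal sketch of the access pattern being tested: an LLM created under torchrun with the external-launcher backend, whose underlying torch.nn.Module is reached through the model_executor chain so that an external training framework (such as verl) can read or overwrite the weights in place. The model name and the distributed_executor_backend argument are illustrative assumptions; neither appears in this diff.

# Sketch only, not the actual test; launch it the same way as the pipeline entry above:
#   VLLM_ENABLE_V1_MULTIPROCESSING=0 VLLM_USE_V1=1 torchrun --nproc-per-node=2 access_model_sketch.py
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",  # illustrative choice; any small model works
    tensor_parallel_size=2,
    # processes/ranks are created by torchrun, not by vLLM itself
    distributed_executor_backend="external_launcher",
)

# The access chain from the review comment: with multiprocessing disabled the
# executor lives in-process, so the module holding this rank's weights is
# reachable directly.
model = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model

# A training framework can now inspect or load weights in place, for example:
num_params = sum(p.numel() for p in model.parameters())
print(f"rank-local parameter count: {num_params}")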
6 changes: 4 additions & 2 deletions tests/v1/engine/test_engine_core.py
@@ -5,6 +5,7 @@
import time
import uuid
from concurrent.futures import Future
from typing import List

import pytest
from transformers import AutoTokenizer
@@ -211,8 +212,9 @@ def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest:

class DummyExecutor(UniProcExecutor):

def initialize(self, kv_cache_config: KVCacheConfig) -> None:
super().initialize(kv_cache_config)
def initialize_from_config(
self, kv_cache_configs: List[KVCacheConfig]) -> None:
super().initialize_from_config(kv_cache_configs)

# This executor actually can only run 1 batch at a time
self.semaphore = threading.Semaphore(1)
7 changes: 4 additions & 3 deletions vllm/executor/uniproc_executor.py
@@ -93,9 +93,10 @@ def _init_executor(self) -> None:
("ExecutorWithExternalLauncher needs deterministic "
"execution, so it"
"does not support delay_factor in scheduling")
assert not envs.VLLM_USE_V1, \
("V1 architecture cannot guarantee deterministic execution, "
"so it is not supported in ExecutorWithExternalLauncher.")
if envs.VLLM_USE_V1:
assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \
("To get deterministic execution in V1, "
"please set VLLM_ENABLE_V1_MULTIPROCESSING=0")
self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
rpc_rank=0)
# engines are launched in torchrun-compatible launchers
2 changes: 1 addition & 1 deletion vllm/v1/engine/core.py
@@ -110,7 +110,7 @@ def _initialize_kv_caches(self,
num_cpu_blocks = 0

# Initialize kv cache and warmup the execution
self.model_executor.initialize(kv_cache_configs)
self.model_executor.initialize_from_config(kv_cache_configs)

elapsed = time.time() - start
logger.info(("init engine (profile, create kv cache, "
5 changes: 5 additions & 0 deletions vllm/v1/engine/llm_engine.py
@@ -44,6 +44,7 @@ def __init__(
use_cached_outputs: bool = False,
multiprocess_mode: bool = False,
) -> None:
self.vllm_config = vllm_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config

@@ -76,6 +77,10 @@ def __init__(
log_stats=False, # FIXME: implement
)

if not multiprocess_mode:
# for v0 compatibility
self.model_executor = self.engine_core.engine_core.model_executor # type: ignore

@classmethod
def from_engine_args(
cls,
20 changes: 17 additions & 3 deletions vllm/v1/executor/abstract.py
@@ -3,6 +3,9 @@
from concurrent.futures import Future
from typing import List, Type, Union

import torch
import torch.distributed as dist

from vllm.config import VllmConfig
from vllm.executor.executor_base import ExecutorBase
from vllm.executor.uniproc_executor import ( # noqa
@@ -49,12 +52,14 @@ def get_class(vllm_config: VllmConfig) -> Type["Executor"]:
f"{distributed_executor_backend}")
return executor_class

def initialize(self, kv_cache_configs: List[KVCacheConfig]) -> None:
def initialize_from_config(self,
kv_cache_configs: List[KVCacheConfig]) -> None:
"""
Initialize the KV caches and begin the model execution loop of the
underlying workers.
"""
self.collective_rpc("initialize_cache", args=(kv_cache_configs, ))
self.collective_rpc("initialize_from_config",
args=(kv_cache_configs, ))
self.collective_rpc("compile_or_warm_up_model")

def determine_available_memory(self) -> int: # in bytes
@@ -89,4 +94,13 @@ class UniProcExecutor(UniProcExecutorV0, Executor):


class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
pass

def determine_available_memory(self) -> int: # in bytes
# same as determine_num_available_blocks below,
# we need to get the min across all ranks.
memory = super().determine_available_memory()
from vllm.distributed.parallel_state import get_world_group
cpu_group = get_world_group().cpu_group
memory_tensor = torch.tensor([memory], device="cpu", dtype=torch.int64)
dist.all_reduce(memory_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
return memory_tensor.item()
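The determine_available_memory override above is what keeps the KV cache sized identically on every rank: each rank profiles its own free memory, and the smallest value wins so that all ranks end up creating the same number of blocks. Below is a standalone sketch of that min-reduction, using a plain gloo process group in place of vLLM's get_world_group().cpu_group; the script name and the fake memory numbers are made up for illustration.

# Sketch of the min-reduction; launch with: torchrun --nproc-per-node=2 min_reduce_sketch.py
import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")  # CPU group, standing in for get_world_group().cpu_group
rank = dist.get_rank()

# Pretend each rank profiled a different amount of free memory (in bytes).
local_free_bytes = 10_000_000_000 - rank * 1_000_000_000

memory_tensor = torch.tensor([local_free_bytes], device="cpu", dtype=torch.int64)
dist.all_reduce(memory_tensor, op=dist.ReduceOp.MIN)

# Every rank now agrees on the same (smallest) memory budget for its KV cache.
print(f"rank {rank}: local={local_free_bytes}, agreed={memory_tensor.item()}")

dist.destroy_process_group()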
7 changes: 3 additions & 4 deletions vllm/v1/worker/gpu_worker.py
@@ -2,7 +2,7 @@
"""A GPU worker class."""
import gc
import os
from typing import TYPE_CHECKING, List, Optional
from typing import TYPE_CHECKING, Optional

import torch
import torch.distributed
@@ -185,9 +185,8 @@ def determine_available_memory(self) -> int:
def get_kv_cache_spec(self) -> KVCacheSpec:
return self.model_runner.get_kv_cache_spec()

def initialize_cache(self, kv_cache_configs: List[KVCacheConfig]) -> None:
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config."""
kv_cache_config = kv_cache_configs[self.rank]
if self.vllm_config.model_config.enable_sleep_mode:
allocator = CuMemAllocator.get_instance()
context = allocator.use_memory_pool(tag="kv_cache")
@@ -225,7 +224,7 @@ def execute_model(
scheduler_output: "SchedulerOutput",
) -> Optional[ModelRunnerOutput]:
output = self.model_runner.execute_model(scheduler_output)
return output if self.rank == 0 else None
return output if self.is_driver_worker else None

def profile(self, is_start: bool = True):
if self.profiler is None:
3 changes: 1 addition & 2 deletions vllm/v1/worker/tpu_worker.py
@@ -170,9 +170,8 @@ def get_model(self) -> nn.Module:
def get_kv_cache_spec(self) -> KVCacheSpec:
return self.model_runner.get_kv_cache_spec()

def initialize_cache(self, kv_cache_configs: List[KVCacheConfig]) -> None:
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config."""
kv_cache_config = kv_cache_configs[self.rank]
self.model_runner.initialize_kv_cache(kv_cache_config)

def check_health(self) -> None:
4 changes: 4 additions & 0 deletions vllm/worker/worker_base.py
@@ -567,6 +567,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
self.worker = worker_class(**kwargs)
assert self.worker is not None

def initialize_from_config(self, kv_cache_configs: List[Any]) -> None:
kv_cache_config = kv_cache_configs[self.rpc_rank]
Member Author: @ruisearch42 @comaniac FYI, if a method needs to send different arguments to different ranks, the indexing should use self.rpc_rank, and it should happen in this WorkerWrapperBase.

self.worker.initialize_from_config(kv_cache_config) # type: ignore

def execute_method(self, method: Union[str, bytes], *args, **kwargs):
try:
target = self if self.worker is None else self.worker
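To spell out the rpc_rank convention from the comment above, here is a pared-down sketch: the executor hands the same per-rank list to every wrapper, and it is the wrapper, not the worker, that picks out its own element. TinyWorker and TinyWorkerWrapper are simplified stand-ins for illustration, not the real vLLM classes; only the rpc_rank indexing mirrors the change in this file.

# Sketch of per-rank argument dispatch; class names are stand-ins.
from typing import Any, List


class TinyWorker:
    """Stand-in for a rank-local worker; it only ever sees its own config."""

    def initialize_from_config(self, kv_cache_config: Any) -> None:
        self.kv_cache_config = kv_cache_config


class TinyWorkerWrapper:
    """Stand-in for WorkerWrapperBase: demultiplexes per-rank arguments."""

    def __init__(self, rpc_rank: int) -> None:
        self.rpc_rank = rpc_rank
        self.worker = TinyWorker()

    def initialize_from_config(self, kv_cache_configs: List[Any]) -> None:
        # Index by this wrapper's rpc_rank so the worker's signature stays
        # rank-agnostic: it receives one config, not the whole list.
        self.worker.initialize_from_config(kv_cache_configs[self.rpc_rank])


# The executor side broadcasts the same list to every wrapper, standing in
# for collective_rpc("initialize_from_config", args=(kv_cache_configs,)).
wrappers = [TinyWorkerWrapper(rpc_rank=r) for r in range(2)]
per_rank_configs = ["kv-cache-config-for-rank-0", "kv-cache-config-for-rank-1"]
for wrapper in wrappers:
    wrapper.initialize_from_config(per_rank_configs)

assert wrappers[1].worker.kv_cache_config == "kv-cache-config-for-rank-1"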