|
3 | 3 | from concurrent.futures import Future
|
4 | 4 | from typing import List, Type, Union
|
5 | 5 |
|
| 6 | +import torch |
| 7 | +import torch.distributed as dist |
| 8 | + |
6 | 9 | from vllm.config import VllmConfig
|
7 | 10 | from vllm.executor.executor_base import ExecutorBase
|
8 | 11 | from vllm.executor.uniproc_executor import ( # noqa
|
@@ -49,12 +52,14 @@ def get_class(vllm_config: VllmConfig) -> Type["Executor"]:
|
49 | 52 | f"{distributed_executor_backend}")
|
50 | 53 | return executor_class
|
51 | 54 |
|
52 |
| - def initialize(self, kv_cache_configs: List[KVCacheConfig]) -> None: |
def initialize_from_config(self,
                           kv_cache_configs: List[KVCacheConfig]) -> None:
    """
    Initialize the KV caches and begin the model execution loop of the
    underlying workers.

    Two collective RPCs are issued in order: ``initialize_from_config``,
    which receives ``kv_cache_configs`` as its only positional argument,
    and then ``compile_or_warm_up_model`` with no arguments.
    """
    rpc_args = (kv_cache_configs, )
    self.collective_rpc("initialize_from_config", args=rpc_args)
    self.collective_rpc("compile_or_warm_up_model")
|
59 | 64 |
|
60 | 65 | def determine_available_memory(self) -> int: # in bytes
|
@@ -89,4 +94,13 @@ class UniProcExecutor(UniProcExecutorV0, Executor):
|
89 | 94 |
|
90 | 95 |
|
class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
    """Executor built on ExecutorWithExternalLauncherV0.

    Overrides the available-memory query so that the value returned is the
    minimum over the ranks of the world group.
    """

    def determine_available_memory(self) -> int:  # in bytes
        """Return the smallest available-memory value across ranks, in bytes.

        The parent implementation supplies this rank's value; it is then
        MIN-reduced in place over the world group's CPU process group.
        """
        # Same as determine_num_available_blocks in v0: the min across all
        # ranks is what we need.
        local_bytes = super().determine_available_memory()
        from vllm.distributed.parallel_state import get_world_group
        min_bytes = torch.tensor([local_bytes],
                                 device="cpu",
                                 dtype=torch.int64)
        dist.all_reduce(min_bytes,
                        group=get_world_group().cpu_group,
                        op=dist.ReduceOp.MIN)
        return min_bytes.item()
0 commit comments