11import asyncio
2+ import os
23from typing import Any , Callable , List , Optional , Union
34
45import cloudpickle
1011from vllm .logger import init_logger
1112from vllm .model_executor .layers .sampler import SamplerOutput
1213from vllm .sequence import ExecuteModelRequest
13- from vllm .utils import (_run_task_with_lock , get_distributed_init_method ,
14- get_ip , get_open_port , make_async , run_method )
14+ from vllm .utils import (_run_task_with_lock , cuda_device_count_stateless ,
15+ get_distributed_init_method , get_ip , get_open_port ,
16+ make_async , run_method , update_environment_variables )
1517from vllm .worker .worker_base import WorkerWrapperBase
1618
1719logger = init_logger (__name__ )
@@ -22,7 +24,39 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
2224
    # This executor spawns and manages local worker processes itself;
    # it never schedules work through Ray.
    uses_ray: bool = False
2426
27+ def _check_cuda (self ) -> None :
28+ """Check that the number of GPUs is sufficient for the parallel
29+ configuration. Separate from _init_executor to reduce the number of
30+ indented blocks.
31+ """
32+ parallel_config = self .parallel_config
33+ world_size = parallel_config .world_size
34+ tensor_parallel_size = parallel_config .tensor_parallel_size
35+
36+ cuda_device_count = cuda_device_count_stateless ()
37+ # Use confusing message for more common TP-only case.
38+ if tensor_parallel_size > cuda_device_count :
39+ raise RuntimeError (
40+ f"please set tensor_parallel_size ({ tensor_parallel_size } ) "
41+ f"to less than max local gpu count ({ cuda_device_count } )" )
42+
43+ if world_size > cuda_device_count :
44+ raise RuntimeError (
45+ f"please ensure that world_size ({ world_size } ) "
46+ f"is less than than max local gpu count ({ cuda_device_count } )" )
47+
48+ # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
49+ if "CUDA_VISIBLE_DEVICES" not in os .environ :
50+ update_environment_variables ({
51+ "CUDA_VISIBLE_DEVICES" : ("," .join (map (str , range (world_size ))))
52+ })
53+
2554 def _init_executor (self ) -> None :
55+
56+ from vllm .platforms import current_platform
57+ if current_platform .is_cuda_alike ():
58+ self ._check_cuda ()
59+
2660 # Create the parallel GPU workers.
2761 world_size = self .parallel_config .world_size
2862 tensor_parallel_size = self .parallel_config .tensor_parallel_size
0 commit comments