diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index e8847354bb09..82d575f24690 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -521,15 +521,11 @@ def __post_init__(self) -> None:
                 current_platform.is_cuda()
                 and cuda_device_count_stateless() < self.world_size
             ):
-                if not ray_found:
-                    raise ValueError(
-                        "Unable to load Ray: "
-                        f"{ray_utils.ray_import_err}. Ray is "
-                        "required for multi-node inference, "
-                        "please install Ray with `pip install "
-                        "ray`."
-                    )
-                backend = "ray"
+                gpu_count = cuda_device_count_stateless()
+                raise ValueError(
+                    f"Tensor parallel size ({self.world_size}) cannot be "
+                    f"larger than the number of available GPUs ({gpu_count})."
+                )
             elif self.data_parallel_backend == "ray":
                 logger.info(
                     "Using ray distributed inference because "
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 518f1582faeb..382f008266e6 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
     try:
         ray.get(pg_ready_ref, timeout=0)
     except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` and `ray list nodes` to make sure the cluster has "
-            "enough resources."
-        ) from None
+        # Provide more helpful error message when GPU count is exceeded
+        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
+        # If more than one GPU is required for the placement group, provide a
+        # more specific error message.
+        # We use >1 here because multi-GPU (tensor parallel) jobs are more
+        # likely to fail due to insufficient cluster resources, and users may
+        # need to adjust tensor_parallel_size to fit available GPUs.
+        if total_gpu_required > 1:
+            raise ValueError(
+                f"Cannot provide a placement group requiring "
+                f"{total_gpu_required} GPUs "
+                f"(placement_group_specs={placement_group_specs}) within "
+                f"{PG_WAIT_TIMEOUT} seconds.\n"
+                f"Tensor parallel size may exceed available GPUs in your "
+                f"cluster. Check resources with `ray status` and "
+                f"`ray list nodes`.\n"
+                f"If running on K8s with limited GPUs, consider reducing "
+                f"--tensor-parallel-size to match available GPU resources."
+            ) from None
+        else:
+            raise ValueError(
+                "Cannot provide a placement group of "
+                f"{placement_group_specs=} within "
+                f"{PG_WAIT_TIMEOUT} seconds. See "
+                "`ray status` and `ray list nodes` to make sure the cluster "
+                "has enough resources."
+            ) from None
 
 
 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
@@ -299,6 +320,23 @@ def initialize_ray_cluster(
     assert_ray_available()
     from vllm.platforms import current_platform
 
+    # Prevalidate GPU requirements before Ray processing
+    if current_platform.is_cuda() and parallel_config.world_size > 1:
+        from vllm.utils import cuda_device_count_stateless
+
+        available_gpus = cuda_device_count_stateless()
+        if parallel_config.world_size > available_gpus:
+            logger.warning(
+                "Tensor parallel size (%d) exceeds available GPUs (%d). "
+                "This may result in Ray placement group allocation failures. "
+                "Consider reducing tensor_parallel_size to %d or less, "
+                "or ensure your Ray cluster has %d GPUs available.",
+                parallel_config.world_size,
+                available_gpus,
+                available_gpus,
+                parallel_config.world_size,
+            )
+
     if ray.is_initialized():
         logger.info("Ray is already initialized. Skipping Ray initialization.")
     elif current_platform.is_rocm() or current_platform.is_xpu():