
Commit f5e5ffc

sairampillai authored and ilmarkov committed
[Bugfix] Improve GPU validation logging in Ray fallback scenarios (vllm-project#25775)
Signed-off-by: Sairam Pillai <[email protected]>
1 parent 05fd241 commit f5e5ffc

File tree

vllm/config/parallel.py
vllm/v1/executor/ray_utils.py

2 files changed: +49 −15 lines changed

vllm/config/parallel.py

Lines changed: 5 additions & 9 deletions

@@ -531,15 +531,11 @@ def __post_init__(self) -> None:
             current_platform.is_cuda()
             and cuda_device_count_stateless() < self.world_size
         ):
-            if not ray_found:
-                raise ValueError(
-                    "Unable to load Ray: "
-                    f"{ray_utils.ray_import_err}. Ray is "
-                    "required for multi-node inference, "
-                    "please install Ray with `pip install "
-                    "ray`."
-                )
-            backend = "ray"
+            gpu_count = cuda_device_count_stateless()
+            raise ValueError(
+                f"Tensor parallel size ({self.world_size}) cannot be "
+                f"larger than the number of available GPUs ({gpu_count})."
+            )
         elif self.data_parallel_backend == "ray":
             logger.info(
                 "Using ray distributed inference because "

vllm/v1/executor/ray_utils.py

Lines changed: 44 additions & 6 deletions

@@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
     try:
         ray.get(pg_ready_ref, timeout=0)
     except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` and `ray list nodes` to make sure the cluster has "
-            "enough resources."
-        ) from None
+        # Provide more helpful error message when GPU count is exceeded
+        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
+        # If more than one GPU is required for the placement group, provide a
+        # more specific error message.
+        # We use >1 here because multi-GPU (tensor parallel) jobs are more
+        # likely to fail due to insufficient cluster resources, and users may
+        # need to adjust tensor_parallel_size to fit available GPUs.
+        if total_gpu_required > 1:
+            raise ValueError(
+                f"Cannot provide a placement group requiring "
+                f"{total_gpu_required} GPUs "
+                f"(placement_group_specs={placement_group_specs}) within "
+                f"{PG_WAIT_TIMEOUT} seconds.\n"
+                f"Tensor parallel size may exceed available GPUs in your "
+                f"cluster. Check resources with `ray status` and "
+                f"`ray list nodes`.\n"
+                f"If running on K8s with limited GPUs, consider reducing "
+                f"--tensor-parallel-size to match available GPU resources."
+            ) from None
+        else:
+            raise ValueError(
+                "Cannot provide a placement group of "
+                f"{placement_group_specs=} within "
+                f"{PG_WAIT_TIMEOUT} seconds. See "
+                "`ray status` and `ray list nodes` to make sure the cluster "
+                "has enough resources."
+            ) from None


 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
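
The new branch keys off the total number of GPUs requested across the placement group bundles. A small standalone sketch of that selection logic is below; the placement_group_specs values are illustrative, not taken from the commit.

# Sketch of the branch selection added above, with illustrative bundle dicts.
placement_group_specs = [{"GPU": 1.0, "CPU": 1.0}, {"GPU": 1.0}]

total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
if total_gpu_required > 1:
    # Multi-GPU request: the detailed message suggesting a smaller
    # --tensor-parallel-size is raised.
    print("detailed multi-GPU error message")
else:
    # Single-GPU (or CPU-only) request: the original generic message,
    # pointing at `ray status` and `ray list nodes`, is kept.
    print("generic error message")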
@@ -299,6 +320,23 @@ def initialize_ray_cluster(
     assert_ray_available()
     from vllm.platforms import current_platform

+    # Prevalidate GPU requirements before Ray processing
+    if current_platform.is_cuda() and parallel_config.world_size > 1:
+        from vllm.utils import cuda_device_count_stateless
+
+        available_gpus = cuda_device_count_stateless()
+        if parallel_config.world_size > available_gpus:
+            logger.warning(
+                "Tensor parallel size (%d) exceeds available GPUs (%d). "
+                "This may result in Ray placement group allocation failures. "
+                "Consider reducing tensor_parallel_size to %d or less, "
+                "or ensure your Ray cluster has %d GPUs available.",
+                parallel_config.world_size,
+                available_gpus,
+                available_gpus,
+                parallel_config.world_size,
+            )
+
     if ray.is_initialized():
         logger.info("Ray is already initialized. Skipping Ray initialization.")
     elif current_platform.is_rocm() or current_platform.is_xpu():
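
The prevalidation only warns rather than raising, since a Ray cluster may still provide enough GPUs on other nodes. A runnable sketch of what the warning looks like is below, assuming 4 requested ranks on a host reporting 2 GPUs; the logger name is a stand-in for the module's logger.

# Reproduces the warning format added above with illustrative values.
import logging

logging.basicConfig(format="%(levelname)s %(name)s: %(message)s")
logger = logging.getLogger("vllm.v1.executor.ray_utils")

world_size, available_gpus = 4, 2  # illustrative values
if world_size > available_gpus:
    logger.warning(
        "Tensor parallel size (%d) exceeds available GPUs (%d). "
        "This may result in Ray placement group allocation failures. "
        "Consider reducing tensor_parallel_size to %d or less, "
        "or ensure your Ray cluster has %d GPUs available.",
        world_size,
        available_gpus,
        available_gpus,
        world_size,
    )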
