@@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
     try:
         ray.get(pg_ready_ref, timeout=0)
     except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` and `ray list nodes` to make sure the cluster has "
-            "enough resources."
-        ) from None
+        # Provide a more helpful error message when the GPU count is exceeded.
+        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
+        # If more than one GPU is required for the placement group, provide a
+        # more specific error message. We use > 1 here because multi-GPU
+        # (tensor parallel) jobs are more likely to fail due to insufficient
+        # cluster resources, and users may need to adjust tensor_parallel_size
+        # to fit the available GPUs.
+        if total_gpu_required > 1:
+            raise ValueError(
+                f"Cannot provide a placement group requiring "
+                f"{total_gpu_required} GPUs "
+                f"(placement_group_specs={placement_group_specs}) within "
+                f"{PG_WAIT_TIMEOUT} seconds.\n"
+                f"Tensor parallel size may exceed available GPUs in your "
+                f"cluster. Check resources with `ray status` and "
+                f"`ray list nodes`.\n"
+                f"If running on K8s with limited GPUs, consider reducing "
+                f"--tensor-parallel-size to match available GPU resources."
+            ) from None
+        else:
+            raise ValueError(
+                "Cannot provide a placement group of "
+                f"{placement_group_specs=} within "
+                f"{PG_WAIT_TIMEOUT} seconds. See "
+                "`ray status` and `ray list nodes` to make sure the cluster "
+                "has enough resources."
+            ) from None
 
 
 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
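For reference, a minimal standalone sketch of the message-selection logic introduced above. The `placement_group_specs` value here is a hypothetical example (in the real code it is the list of resource bundles requested for the placement group), and the `PG_WAIT_TIMEOUT` value is assumed purely for illustration:

```python
# Standalone sketch of the error-message selection logic, outside of Ray.
PG_WAIT_TIMEOUT = 1800  # assumed timeout (seconds), for illustration only

# Hypothetical bundle specs, e.g. what a tensor_parallel_size=2 job might request.
placement_group_specs = [{"GPU": 1.0}, {"GPU": 1.0}]

total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
if total_gpu_required > 1:
    # Multi-GPU request: point the user at --tensor-parallel-size and
    # cluster GPU capacity.
    message = (
        f"Cannot provide a placement group requiring {total_gpu_required} GPUs "
        f"(placement_group_specs={placement_group_specs}) within "
        f"{PG_WAIT_TIMEOUT} seconds.\n"
        "Tensor parallel size may exceed available GPUs in your cluster."
    )
else:
    # Single-GPU request: keep the original, generic message.
    message = (
        f"Cannot provide a placement group of {placement_group_specs=} "
        f"within {PG_WAIT_TIMEOUT} seconds."
    )
print(message)
```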
@@ -299,6 +320,23 @@ def initialize_ray_cluster(
     assert_ray_available()
     from vllm.platforms import current_platform
 
+    # Prevalidate GPU requirements before Ray processing
+    if current_platform.is_cuda() and parallel_config.world_size > 1:
+        from vllm.utils import cuda_device_count_stateless
+
+        available_gpus = cuda_device_count_stateless()
+        if parallel_config.world_size > available_gpus:
+            logger.warning(
+                "Tensor parallel size (%d) exceeds available GPUs (%d). "
+                "This may result in Ray placement group allocation failures. "
+                "Consider reducing tensor_parallel_size to %d or less, "
+                "or ensure your Ray cluster has %d GPUs available.",
+                parallel_config.world_size,
+                available_gpus,
+                available_gpus,
+                parallel_config.world_size,
+            )
+
     if ray.is_initialized():
         logger.info("Ray is already initialized. Skipping Ray initialization.")
     elif current_platform.is_rocm() or current_platform.is_xpu():
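A minimal sketch of the prevalidation warning in isolation, using `torch.cuda.device_count()` as a stand-in for vLLM's `cuda_device_count_stateless` and a hypothetical `world_size` value:

```python
import logging

import torch

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Hypothetical tensor-parallel world size; in vLLM this comes from
# parallel_config.world_size.
world_size = 4

# Stand-in for vllm.utils.cuda_device_count_stateless used in the diff above.
available_gpus = torch.cuda.device_count()

if world_size > 1 and world_size > available_gpus:
    # Warn early, before Ray spends the full placement-group wait timeout
    # failing to allocate bundles that can never be satisfied.
    logger.warning(
        "Tensor parallel size (%d) exceeds available GPUs (%d). "
        "Consider reducing tensor_parallel_size to %d or less.",
        world_size,
        available_gpus,
        available_gpus,
    )
```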