From 3246dd7c95128e238e51be184788ff21f33a0417 Mon Sep 17 00:00:00 2001
From: Rui Qiao
Date: Wed, 5 Mar 2025 23:41:57 +0000
Subject: [PATCH 1/3] [misc] Mention `ray list nodes` command to troubleshoot ray issues

Signed-off-by: Rui Qiao
---
 vllm/executor/ray_utils.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 6067f9a3c13b..5d8b48ac67b1 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -184,8 +184,9 @@ def _verify_bundles(placement_group: "PlacementGroup",
             f"group {placement_group.id}. Node id -> bundles "
             f"{node_id_to_bundle}. "
             "You don't have enough GPUs available in a current node. Check "
-            "`ray status` to see if you have available GPUs in a node "
-            f"{driver_node_id} before starting an vLLM engine.")
+            "`ray status` and `ray list nodes` to see if you have available "
+            f"GPUs in a node `{driver_node_id}` before starting a vLLM engine."
+        )
 
     for node_id, bundles in node_id_to_bundle.items():
         if len(bundles) < parallel_config.tensor_parallel_size:
@@ -225,8 +226,8 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
         wait_interval *= 2
         logger.info(
             "Waiting for creating a placement group of specs for "
-            "%d seconds. specs=%s. Check "
-            "`ray status` to see if you have enough resources,"
+            "%d seconds. specs=%s. Check `ray status` and "
+            "`ray list nodes` to see if you have enough resources,"
             " and make sure the IP addresses used by ray cluster"
             " are the same as VLLM_HOST_IP environment variable"
             " specified in each node if you are running on a multi-node.",
@@ -238,8 +239,8 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
         raise ValueError(
             "Cannot provide a placement group of "
             f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` to make sure the cluster has enough resources."
-        ) from None
+            "`ray status` and `ray list nodes` to make sure the cluster has "
+            "enough resources.") from None
 
 
 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):

From 2a69d7b3a7e22a80201a9e5762ed24f67aec369d Mon Sep 17 00:00:00 2001
From: Rui Qiao
Date: Wed, 5 Mar 2025 23:53:36 +0000
Subject: [PATCH 2/3] up

Signed-off-by: Rui Qiao
---
 docs/source/serving/distributed_serving.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index 54c7ded20421..a395f9293824 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -81,7 +81,7 @@ Then you get a ray cluster of **containers**. Note that you need to keep the she
 Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`.
 :::
 
-Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
+Then, on any node, use `docker exec -it node /bin/bash` to enter the container, then execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
 
 After that, on any node, use `docker exec -it node /bin/bash` to enter the container again.
 **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:

From 8f672c46132b7c46381a04721c156b732190e007 Mon Sep 17 00:00:00 2001
From: Rui Qiao
Date: Wed, 5 Mar 2025 23:54:59 +0000
Subject: [PATCH 3/3] up

Signed-off-by: Rui Qiao
---
 docs/source/serving/distributed_serving.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index a395f9293824..e6be644b7393 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -111,5 +111,5 @@ When you use huggingface repo id to refer to the model, you should append your h
 :::
 
 :::{warning}
-If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` to see the IP address used by Ray. See for more information.
+If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` and `ray list nodes` to see the IP address used by Ray. See for more information.
 :::
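
For reviewers who want to exercise the docs scenario these messages refer to, here is a minimal sketch (not part of the patch) of the 2-node, 16-GPU example: it assumes the Ray cluster started by `run_cluster.sh` is already up and that `ray status` / `ray list nodes` report two nodes with 8 GPUs each; the model id is a placeholder.

```python
# Minimal sketch, not part of this patch: launch vLLM across 2 nodes x 8 GPUs,
# assuming the Ray cluster is already running and visible via
# `ray status` / `ray list nodes`.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
    tensor_parallel_size=8,        # GPUs per node
    pipeline_parallel_size=2,      # number of nodes
    distributed_executor_backend="ray",
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
print(outputs[0].outputs[0].text)
```

If placement-group creation hangs or fails here, the updated messages above now point to both `ray status` and `ray list nodes`, which together show per-node GPU availability and the node IP addresses to compare against `VLLM_HOST_IP`.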