Skip to content

Commit a4409a6

Browse files
khluu authored and LeiWang1999 committed
Add support for multi-node on CI (vllm-project#5955)
Signed-off-by: kevin <[email protected]> Signed-off-by: LeiWang1999 <[email protected]>
1 parent 2e8c6f6 commit a4409a6

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

.buildkite/run-multi-node-test.sh

Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
#!/bin/bash
#
# Run a multi-node test on one host: one Docker container is started per
# "node", and each node is handed its own command to execute.
#
# Usage: run-multi-node-test.sh NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND...
#   NUM_NODES     number of node containers
#   NUM_GPUS      number of GPUs assigned to each node
#   DOCKER_IMAGE  image every node container is started from
#   COMMAND...    exactly NUM_NODES commands, one per node, in node order

set -euox pipefail

if [[ $# -lt 3 ]]; then
  echo "Usage: ${0##*/} NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND..." >&2
  echo "Please provide the number of nodes, the number of GPUs per node, the Docker image, and one command per node." >&2
  exit 1
fi

NUM_NODES=$1
NUM_GPUS=$2
DOCKER_IMAGE=$3

# Both counts are used in shell arithmetic further down, so reject anything
# that is not a plain non-negative integer instead of failing later with a
# confusing arithmetic error.
if ! [[ "$NUM_NODES" =~ ^[0-9]+$ && "$NUM_GPUS" =~ ^[0-9]+$ ]]; then
  echo "The number of nodes and the number of GPUs per node must be non-negative integers." >&2
  exit 1
fi

# Everything after the first three arguments is the per-node command list.
shift 3
COMMANDS=("$@")
if [ "${#COMMANDS[@]}" -ne "$NUM_NODES" ]; then
  echo "The number of commands must be equal to the number of nodes." >&2
  echo "Number of nodes: $NUM_NODES" >&2
  echo "Number of commands: ${#COMMANDS[@]}" >&2
  exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
  # Quoted so each command is shown exactly as given (no word splitting or
  # glob expansion of its contents).
  printf '%s\n' "$command"
done
27+
28+
# Create the user-defined Docker network that the node containers attach to.
# Globals:   none
# Arguments: none
# Returns:   exit status of `docker network create`
start_network() {
  local -r subnet='192.168.10.0/24'
  local -r network_name='docker-net'
  docker network create "--subnet=${subnet}" "${network_name}"
}
31+
32+
# Start one detached, idle container per node on the docker-net network.
#
# Node i is named "node<i>", is given the address 192.168.10.<10 + i>, and is
# passed NUM_GPUS consecutive device indices starting at i * NUM_GPUS. Each
# container runs `tail -f /dev/null`, so it stays up until it is stopped.
#
# Globals:   NUM_NODES, NUM_GPUS, DOCKER_IMAGE (read)
# Arguments: none
# Returns:   status of the last `docker run` executed
start_nodes() {
  local node node_gpu gpu_list
  for ((node = 0; node < NUM_NODES; node++)); do
    # Build the comma-separated device index list for this node.
    gpu_list=''
    for ((node_gpu = 0; node_gpu < NUM_GPUS; node_gpu++)); do
      gpu_list+="${gpu_list:+,}$((node * NUM_GPUS + node_gpu))"
    done
    # The double quotes embedded in the --gpus value are intentional and are
    # passed through to docker as part of the argument, matching the form
    # Docker documents for comma-separated device lists.
    docker run -d --gpus "\"device=${gpu_list}\"" --name "node${node}" \
      --network docker-net --ip "192.168.10.$((10 + node))" --rm \
      "$DOCKER_IMAGE" tail -f /dev/null
  done
}
47+
48+
# Execute each node's command inside its container.
#
# For node i the command COMMANDS[i] is run via `/bin/bash -c` in container
# "node<i>". Every node except the last is launched with `docker exec -d`
# (detached); the last node runs in the foreground, so this function blocks
# on it and returns its exit status.
#
# Globals:   NUM_NODES, NUM_GPUS, COMMANDS (read)
# Arguments: none
# Outputs:   one "Running node..." line per node on stdout
# Returns:   status of the last `docker exec` executed
run_nodes() {
  local node node_gpu gpu_list
  local -r last_node=$((NUM_NODES - 1))
  for ((node = 0; node < NUM_NODES; node++)); do
    # The device list is only used for the log line below; it mirrors the
    # index range i * NUM_GPUS .. i * NUM_GPUS + NUM_GPUS - 1.
    gpu_list=''
    for ((node_gpu = 0; node_gpu < NUM_GPUS; node_gpu++)); do
      gpu_list+="${gpu_list:+,}$((node * NUM_GPUS + node_gpu))"
    done
    echo "Running node$node with GPU devices: \"device=${gpu_list}\""
    if ((node < last_node)); then
      docker exec -d "node${node}" /bin/bash -c "${COMMANDS[node]}"
    else
      docker exec "node${node}" /bin/bash -c "${COMMANDS[node]}"
    fi
  done
}
67+
# Stop every node container and remove the docker-net network.
#
# Cleanup is best-effort: a failure to stop one container (for example one
# that was never started) is reported on stderr and the remaining steps still
# run. Without the `||` guards, errexit (`set -e`) would end this function at
# the first failing docker command and leave containers and the network
# behind.
#
# Globals:   NUM_NODES (read)
# Arguments: none
# Returns:   0
cleanup() {
  local node
  for ((node = 0; node < NUM_NODES; node++)); do
    docker stop "node${node}" \
      || echo "cleanup: failed to stop node${node}" >&2
  done
  docker network rm docker-net \
    || echo "cleanup: failed to remove network docker-net" >&2
}
73+
# Entry point: register cleanup as the EXIT handler first so it runs on every
# exit path, then create the network, start the node containers, and run the
# per-node commands.
main() {
  trap cleanup EXIT
  start_network
  start_nodes
  run_nodes
}

main
77+

0 commit comments

Comments
 (0)