Commit ad91641

Author: Chenxi Yang (committed)

[Cuda2CPU][P/D] Add cuda2cpu support in NixlConnector

1 parent 79ac59f commit ad91641
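The new test scripts below exercise the cuda2cpu path through the --kv-transfer-config flag; as a minimal launch sketch, with the model and flags copied from the diff (kv_buffer_device selects where KV blocks are staged, and "cpu" is the path this commit adds):

# Serve with KV transfer staged through host memory ("cuda2cpu"):
vllm serve Qwen/Qwen3-0.6B \
  --enforce-eager \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device":"cpu"}'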

File tree

5 files changed (+1758, -1045 lines)
Lines changed: 183 additions & 0 deletions

@@ -0,0 +1,183 @@
#!/bin/bash
set -xe

# Models to run
MODELS=(
    "Qwen/Qwen3-0.6B"
)

# Number of prefill and decode instances to create
NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1}  # Default to 1
NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}    # Default to 1
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}

# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)

SMI_BIN=$(which nvidia-smi || which rocm-smi)

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT

# Waits for a vLLM server to start answering on the given port.
wait_for_server() {
  local port=$1
  timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

# Clean up any previously running vLLM instances.
cleanup_instances() {
  echo "Cleaning up any running vLLM instances..."
  pkill -f "vllm serve" || true
  sleep 2
}

# Helper to get model-specific arguments (deepseek needs extra flags).
get_model_args() {
  local model_name=$1
  local extra_args=""

  if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then
    extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code"
  fi

  echo "$extra_args"
}

get_num_gpus() {
  if [[ "$SMI_BIN" == *"nvidia"* ]]; then
    echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
  else
    echo "$($SMI_BIN -l | grep GPU | wc -l)"
  fi
}

# Function to run tests for a specific model
run_tests_for_model() {
  local model_name=$1
  echo "================================"
  echo "Testing model: $model_name"
  echo "================================"

  # Get model-specific arguments
  local model_args=$(get_model_args "$model_name")

  # Arrays to store all hosts and ports
  PREFILL_HOSTS=()
  PREFILL_PORTS=()
  DECODE_HOSTS=()
  DECODE_PORTS=()

  # Start prefill instances
  for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
    # Calculate GPU ID - we'll distribute across available GPUs
    GPU_ID=$((i % $(get_num_gpus)))

    # Calculate port number (base port + instance number)
    PORT=$((8100 + i))
    # Calculate side channel port; avoid clashing with TP workers.
    SIDE_CHANNEL_PORT=$((5559 + i))

    echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"

    # Build the command with or without model-specific args
    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
      --port $PORT \
      --enforce-eager \
      --gpu-memory-utilization 0.2 \
      --tensor-parallel-size $PREFILLER_TP_SIZE \
      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"

    if [ -n "$model_args" ]; then
      FULL_CMD="$BASE_CMD $model_args"
    else
      FULL_CMD="$BASE_CMD"
    fi

    eval "$FULL_CMD &"

    # Store host and port for proxy configuration
    PREFILL_HOSTS+=("localhost")
    PREFILL_PORTS+=($PORT)
  done

  # Start decode instances
  for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
    # Calculate GPU ID - distribute across available GPUs, starting after the prefill GPUs
    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
    # Calculate port number (base port + instance number)
    PORT=$((8200 + i))
    # Calculate side channel port
    SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE))

    echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"

    # Build the command with or without model-specific args
    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
      --port $PORT \
      --enforce-eager \
      --gpu-memory-utilization 0.2 \
      --tensor-parallel-size $DECODER_TP_SIZE \
      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"

    if [ -n "$model_args" ]; then
      FULL_CMD="$BASE_CMD $model_args"
    else
      FULL_CMD="$BASE_CMD"
    fi

    eval "$FULL_CMD &"

    # Store host and port for proxy configuration
    DECODE_HOSTS+=("localhost")
    DECODE_PORTS+=($PORT)
  done

  # Wait for all instances to start
  for PORT in "${PREFILL_PORTS[@]}"; do
    echo "Waiting for prefill instance on port $PORT to start..."
    wait_for_server $PORT
  done

  for PORT in "${DECODE_PORTS[@]}"; do
    echo "Waiting for decode instance on port $PORT to start..."
    wait_for_server $PORT
  done

  # Build the command for the proxy server with all the hosts and ports
  PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"

  # Add all prefill hosts and ports
  PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}"
  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}"

  # Add all decode hosts and ports
  PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}"
  PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}"

  # Start the proxy server
  echo "Starting proxy server with command: $PROXY_CMD"
  $PROXY_CMD &

  # Wait for the proxy to start
  sleep 5

  # Run lm eval for this model
  echo "Running tests for $model_name"
  TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py

  # Clean up before running the next model
  cleanup_instances
  sleep 3
}

# Run tests for each model
for model in "${MODELS[@]}"; do
  run_tests_for_model "$model"
done

echo "All tests completed!"
Lines changed: 121 additions & 0 deletions

@@ -0,0 +1,121 @@
#!/bin/bash
set -xe

# Models to run
MODELS=(
    "Qwen/Qwen3-0.6B"
)

# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT

# Waits for a vLLM server to start answering on the given port.
wait_for_server() {
  local port=$1
  timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

# Clean up any previously running vLLM instances.
cleanup_instances() {
  echo "Cleaning up any running vLLM instances..."
  pkill -f "vllm serve" || true
  sleep 2
}

# Helper to get model-specific arguments (deepseek needs extra flags).
get_model_args() {
  local model_name=$1
  local extra_args=""

  if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then
    extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code"
  fi

  echo "$extra_args"
}

# Function to run tests for a specific model
run_tests_for_model() {
  local model_name=$1
  echo "================================"
  echo "Testing model: $model_name"
  echo "================================"

  # Get model-specific arguments
  local model_args=$(get_model_args "$model_name")

  # Start prefill instance
  PREFILL_PORT=8001

  BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
    --port $PREFILL_PORT \
    --enforce-eager \
    --gpu-memory-utilization 0.2 \
    --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"

  if [ -n "$model_args" ]; then
    FULL_CMD="$BASE_CMD $model_args"
  else
    FULL_CMD="$BASE_CMD"
  fi

  eval "$FULL_CMD &"

  # Start decode instance
  DECODE_PORT=8002

  # Build the command with or without model-specific args
  BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
    --port $DECODE_PORT \
    --enforce-eager \
    --gpu-memory-utilization 0.2 \
    --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"

  if [ -n "$model_args" ]; then
    FULL_CMD="$BASE_CMD $model_args"
  else
    FULL_CMD="$BASE_CMD"
  fi

  eval "$FULL_CMD &"

  # Wait for both instances to start
  echo "Waiting for prefill instance on port $PREFILL_PORT to start..."
  wait_for_server $PREFILL_PORT
  echo "Waiting for decode instance on port $DECODE_PORT to start..."
  wait_for_server $DECODE_PORT

  # Build the command for the proxy server with the hosts and ports
  PROXY_PORT=8192
  PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port $PROXY_PORT"
  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
  PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
  # Start the proxy server
  echo "Starting proxy server with command: $PROXY_CMD"
  $PROXY_CMD &

  # Wait for the proxy to start
  sleep 5

  # Run the edge-case tests for this model
  echo "Running tests for $model_name"
  PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_edge_cases.py

  # Clean up before running the next model
  cleanup_instances
  sleep 3
}

# Run tests for each model
for model in "${MODELS[@]}"; do
  run_tests_for_model "$model"
done

echo "All tests completed!"
