#!/bin/bash
set -xe

# Models to run
MODELS=(
  "Qwen/Qwen3-0.6B"
)

# Number of prefill and decode instances to create
NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1}  # Default to 1
NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}  # Default to 1
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
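
# Example invocation with overrides (the script filename below is a placeholder;
# substitute whatever this file is saved as):
#   NUM_PREFILL_INSTANCES=2 NUM_DECODE_INSTANCES=2 DECODER_TP_SIZE=1 \
#     bash run_nixl_accuracy_test.sh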

# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)

SMI_BIN=$(which nvidia-smi || which rocm-smi)

# Kill all running background jobs on Ctrl+C (SIGINT), SIGTERM, or script exit
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT

# Wait for a vLLM server on the given port to start answering HTTP requests
# (up to 20 minutes).
wait_for_server() {
  local port=$1
  timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

# Function to clean up previous instances
cleanup_instances() {
  echo "Cleaning up any running vLLM instances..."
  pkill -f "vllm serve" || true
  sleep 2
}
# Helper that returns model-specific extra arguments (currently only needed for deepseek)
get_model_args() {
  local model_name=$1
  local extra_args=""

  if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then
    extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code"
  fi

  echo "$extra_args"
}

# Count the GPUs visible to nvidia-smi (or rocm-smi on AMD)
get_num_gpus() {
  if [[ "$SMI_BIN" == *"nvidia"* ]]; then
    echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
  else
    echo "$($SMI_BIN -l | grep GPU | wc -l)"
  fi
}

# Function to run tests for a specific model
run_tests_for_model() {
  local model_name=$1
  echo "================================"
  echo "Testing model: $model_name"
  echo "================================"

  # Get model-specific arguments
  local model_args=$(get_model_args "$model_name")

  # Arrays to store all hosts and ports
  PREFILL_HOSTS=()
  PREFILL_PORTS=()
  DECODE_HOSTS=()
  DECODE_PORTS=()

  # Start prefill instances
  for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
    # Calculate GPU ID - we'll distribute across available GPUs
    GPU_ID=$((i % $(get_num_gpus)))

    # Calculate port number (base port + instance number)
    PORT=$((8100 + i))
    # Calculate side channel port. Avoid clash with TP workers.
    SIDE_CHANNEL_PORT=$((5559 + i))

    echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"

    # Build the command with or without model-specific args
    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
      --port $PORT \
      --enforce-eager \
      --gpu-memory-utilization 0.2 \
      --tensor-parallel-size $PREFILLER_TP_SIZE \
      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"

    if [ -n "$model_args" ]; then
      FULL_CMD="$BASE_CMD $model_args"
    else
      FULL_CMD="$BASE_CMD"
    fi

    eval "$FULL_CMD &"

    # Store host and port for proxy configuration
    PREFILL_HOSTS+=("localhost")
    PREFILL_PORTS+=($PORT)
  done

  # Start decode instances
  for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
    # Calculate GPU ID - distribute across available GPUs, continuing after the
    # GPUs used by the prefill instances
    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
    # Calculate port number (base port + instance number)
    PORT=$((8200 + i))
    # Calculate side channel port, spaced by the decoder TP size to avoid
    # clashes between TP workers of different instances
    SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE))

    echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"

    # Build the command with or without model-specific args
    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
      --port $PORT \
      --enforce-eager \
      --gpu-memory-utilization 0.2 \
      --tensor-parallel-size $DECODER_TP_SIZE \
      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"

    if [ -n "$model_args" ]; then
      FULL_CMD="$BASE_CMD $model_args"
    else
      FULL_CMD="$BASE_CMD"
    fi

    eval "$FULL_CMD &"

    # Store host and port for proxy configuration
    DECODE_HOSTS+=("localhost")
    DECODE_PORTS+=($PORT)
  done

  # Wait for all instances to start
  for PORT in "${PREFILL_PORTS[@]}"; do
    echo "Waiting for prefill instance on port $PORT to start..."
    wait_for_server $PORT
  done

  for PORT in "${DECODE_PORTS[@]}"; do
    echo "Waiting for decode instance on port $PORT to start..."
    wait_for_server $PORT
  done

  # Build the command for the proxy server with all the hosts and ports
  PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"

  # Add all prefill hosts and ports
  PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}"
  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}"

  # Add all decode hosts and ports
  PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}"
  PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}"

  # Start the proxy server
  echo "Starting proxy server with command: $PROXY_CMD"
  $PROXY_CMD &

  # Wait for the proxy to start
  sleep 5
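
  # Optionally sanity-check that the proxy is reachable before running the
  # tests (assumes it serves the same /v1/completions route it forwards):
  #   curl -s localhost:8192/v1/completions > /dev/null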

  # Run the lm-eval-based accuracy test for this model
  echo "Running tests for $model_name"
  TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py

  # Clean up before running the next model
  cleanup_instances
  sleep 3
}

# Run tests for each model
for model in "${MODELS[@]}"; do
  run_tests_for_model "$model"
done

echo "All tests completed!"