From 72714f32e1004c854149812d9997ac5eecc37244 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Mon, 21 Jul 2025 23:42:53 -0700 Subject: [PATCH 1/6] resolve merge conflict Signed-off-by: Ye (Charlotte) Qi --- .../scripts/run-nightly-benchmarks.sh | 29 +- .../scripts/run-performance-benchmarks.sh | 6 +- .../scripts/hardware_ci/run-cpu-test.sh | 10 +- .buildkite/scripts/run-benchmarks.sh | 6 +- .buildkite/scripts/tpu/run_bm.sh | 2 +- benchmarks/README.md | 71 +-- benchmarks/auto_tune/auto_tune.sh | 33 +- docs/contributing/profiling.md | 12 +- docs/design/v1/p2p_nccl_connector.md | 18 +- .../disagg_example_p2p_nccl_xpyd.sh | 10 +- .../disagg_example_nixl.sh | 4 +- vllm_onboarding_guide.md | 568 ++++++++++++++++++ 12 files changed, 669 insertions(+), 100 deletions(-) create mode 100644 vllm_onboarding_guide.md diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 4d01a314adc4..86153e8408cf 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -73,7 +73,7 @@ get_current_llm_serving_engine() { echo "Container: vllm" # move to a completely irrelevant directory, to avoid import vllm from current folder export CURRENT_LLM_SERVING_ENGINE=vllm - + return fi } @@ -225,7 +225,7 @@ run_serving_tests() { if [[ "$dataset_name" = "sharegpt" ]]; then - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -235,9 +235,10 @@ run_serving_tests() { --port $port \ --save-result \ --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ + --result-filename ${test_name}.json \ --request-rate $qps \ - --ignore-eos \ + --metadata "tensor_parallel_size=$tp" \ + $common_params_str" $client_args" elif [[ "$dataset_name" = "sonnet" ]]; then @@ -246,7 +247,7 @@ run_serving_tests() { sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -265,13 +266,13 @@ run_serving_tests() { $client_args" else - + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." exit 1 fi - + echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" @@ -302,7 +303,7 @@ run_serving_tests() { } run_genai_perf_tests() { - # run genai-perf tests + # run genai-perf tests # $1: a json file specifying genai-perf test cases local genai_perf_test_file @@ -311,14 +312,14 @@ run_genai_perf_tests() { # Iterate over genai-perf tests jq -c '.[]' "$genai_perf_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - + test_name=$(echo "$params" | jq -r '.test_name') + # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." 
continue fi - + # prepend the current serving engine to the test name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} @@ -369,10 +370,10 @@ run_genai_perf_tests() { qps=$num_prompts echo "now qps is $qps" fi - + new_test_name=$test_name"_qps_"$qps backend=$CURRENT_LLM_SERVING_ENGINE - + if [[ "$backend" == *"vllm"* ]]; then backend="vllm" fi @@ -413,7 +414,7 @@ prepare_dataset() { do cat sonnet.txt >> sonnet_4x.txt done - + } main() { diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index f05040618981..4eafe435f87f 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -205,7 +205,7 @@ run_latency_tests() { fi fi - latency_command=" $latency_envs python3 benchmark_latency.py \ + latency_command=" $latency_envs vllm bench latency \ --output-json $RESULTS_FOLDER/${test_name}.json \ $latency_args" @@ -272,7 +272,7 @@ run_throughput_tests() { fi fi - throughput_command=" $throughput_envs python3 benchmark_throughput.py \ + throughput_command=" $throughput_envs vllm bench throughput \ --output-json $RESULTS_FOLDER/${test_name}.json \ $throughput_args" @@ -393,7 +393,7 @@ run_serving_tests() { # pass the tensor parallel size to the client so that it can be displayed # on the benchmark dashboard - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 90cc9c844622..7c7dbb461ce0 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1} export CMAKE_BUILD_PARALLEL_LEVEL=32 # Setup cleanup -remove_docker_container() { - set -e; - docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container @@ -69,7 +69,7 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" # Note: disable it until supports V1 # Run AWQ test @@ -83,7 +83,7 @@ function cpu_tests() { set -e VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --dataset-name random \ --model meta-llama/Llama-3.2-3B-Instruct \ diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh index 195a8063fd74..72812218cb66 100644 --- a/.buildkite/scripts/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) # run python-based benchmarks and upload the result to buildkite -python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt +vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt bench_latency_exit_code=$? -python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt +vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt bench_throughput_exit_code=$? # run server-based benchmarks and upload the result to buildkite @@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r # wait for server to start, timeout after 600 seconds timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --dataset-name sharegpt \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index 877669cd956a..beecaf7a740a 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -77,7 +77,7 @@ done echo "run benchmark test..." echo "logging to $BM_LOG" echo -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name sonnet \ diff --git a/benchmarks/README.md b/benchmarks/README.md index fb8690d42db9..ef2c57a7c079 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -98,38 +98,39 @@ Then run the benchmarking script ```bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ --dataset-name sharegpt \ --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --num-prompts 10 + --num-prompts 1000 \ + --request-rate 3 # By default is inf ``` If successful, you will see the following output ``` ============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 5.78 -Total input tokens: 1369 -Total generated tokens: 2212 -Request throughput (req/s): 1.73 -Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 ---------------Time to First Token---------------- -Mean TTFT (ms): 71.54 -Median TTFT (ms): 73.88 -P99 TTFT (ms): 79.49 +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 -----Time per Output Token (excl. 
1st token)------ -Mean TPOT (ms): 7.91 -Median TPOT (ms): 7.96 -P99 TPOT (ms): 8.03 +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 ---------------Inter-token Latency---------------- -Mean ITL (ms): 7.74 -Median ITL (ms): 7.70 -P99 ITL (ms): 8.39 +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 ================================================== ``` @@ -141,7 +142,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you {"prompt": "What is the capital of India?"} {"prompt": "What is the capital of Iran?"} {"prompt": "What is the capital of China?"} -``` +``` ```bash # start server @@ -150,7 +151,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests ```bash # run benchmarking script -python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ +vllm bench serve --port 9001 --save-result --save-detailed \ --backend vllm \ --model meta-llama/Llama-3.1-8B-Instruct \ --endpoint /v1/completions \ @@ -174,7 +175,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -194,7 +195,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ ``` ``` bash -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --model meta-llama/Meta-Llama-3-8B-Instruct \ --dataset-name hf \ --dataset-path likaixin/InstructCoder \ @@ -210,7 +211,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests **`lmms-lab/LLaVA-OneVision-Data`** ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -224,20 +225,20 @@ python3 vllm/benchmarks/benchmark_serving.py \ **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ --hf-split train \ - --num-prompts 10 + --num-prompts 1000 ``` **`AI-MO/aimo-validation-aime`** ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path AI-MO/aimo-validation-aime \ @@ -248,7 +249,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ **`philschmid/mt-bench`** ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path philschmid/mt-bench \ @@ -261,7 +262,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command: ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ @@ -296,7 +297,7 @@ The following arguments can be used to control the ramp-up:
```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --dataset-name sonnet \ --dataset-path vllm/benchmarks/sonnet.txt \ @@ -314,7 +315,7 @@ Total num output tokens: 1500 **VisionArena Benchmark for Vision Language Models** ``` bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -336,7 +337,7 @@ Total num output tokens: 1280 ``` bash VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_USE_V1=1 \ -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --dataset-name=hf \ --dataset-path=likaixin/InstructCoder \ --model=meta-llama/Meta-Llama-3-8B-Instruct \ @@ -360,7 +361,7 @@ Total num output tokens: 204800 **`lmms-lab/LLaVA-OneVision-Data`** ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -373,7 +374,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -385,7 +386,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ **`AI-MO/aimo-validation-aime`** ```bash -python3 benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/QwQ-32B \ --backend vllm \ --dataset-name hf \ @@ -399,7 +400,7 @@ python3 benchmarks/benchmark_throughput.py \ ``` bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model meta-llama/Llama-2-7b-hf \ --backend vllm \ --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index eaa28ea5c92b..ef7c7c94c883 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. # See details in README (benchmarks/auto_tune/README.md). TAG=$(date +"%Y_%m_%d_%H_%M") @@ -47,7 +47,7 @@ start_server() { local max_num_batched_tokens=$3 local vllm_log=$4 local profile_dir=$5 - + pkill -f vllm VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ @@ -64,9 +64,9 @@ start_server() { # wait for 10 minutes... server_started=0 - for i in {1..60}; do + for i in {1..60}; do RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) - STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break @@ -89,10 +89,10 @@ update_best_profile() { selected_profile_file= if [[ "$SYSTEM" == "TPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" - fi + fi if [[ "$SYSTEM" == "GPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}" - fi + fi rm -f $PROFILE_PATH/* cp $selected_profile_file $PROFILE_PATH } @@ -120,28 +120,28 @@ run_benchmark() { echo "server started." fi echo - + echo "run benchmark test..." 
meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) adjusted_input_len=$(( INPUT_LEN - prefix_len )) - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ - --ignore-eos \ - --disable-tqdm \ - --request-rate inf \ - --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 1000 \ --random-prefix-len $prefix_len \ + --num-prompts $NUM_PROMPTS \ --port 8004 \ - --profile &> "$bm_log" + --save-result \ + --result-dir $LOG_FOLDER \ + --result-filename bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.json \ + --request-rate inf \ + --ignore-eos \ + 2>&1 | tee $bm_log throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') @@ -160,7 +160,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ @@ -245,4 +245,3 @@ done echo "finish permutations" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" - diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index a5851cfe963d..bd450fb61e00 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -38,13 +38,13 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \ benchmark_serving.py: ```bash -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model meta-llama/Meta-Llama-3-70B \ --dataset-name sharegpt \ --dataset-path sharegpt.json \ --profile \ - --num-prompts 2 + --num-prompts 10 ``` ## Profile with NVIDIA Nsight Systems @@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script: nsys profile -o report.nsys-rep \ --trace-fork-before-exec=true \ --cuda-graph-trace=node \ - python benchmarks/benchmark_latency.py \ +vllm bench latency \ --model meta-llama/Llama-3.1-8B-Instruct \ --num-iters-warmup 5 \ --num-iters 1 \ @@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \ vllm serve meta-llama/Llama-3.1-8B-Instruct # client -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model meta-llama/Llama-3.1-8B-Instruct \ --num-prompts 1 \ @@ -132,7 +132,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p ...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): - Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- ----------- ----------- -------- --------- ----------- ---------------------------------------------------------------------------------------------------- 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of… 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of… @@ -143,7 +143,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p 2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kern… 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, in… 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0… - ... + ... ``` GUI example: diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index 9f6acf3291dd..9d334f8873d9 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica # Detailed Design ## Overall Process -As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: - -1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. -2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. -3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. -4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. -5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. -6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. +As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: + +1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. +2. 
The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. +3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. +4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. +5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. +6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. 7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**. ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7) @@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ ??? console "Command" ```shell - python3 benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model base_model \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \ diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh index 2966f386c93a..8dbadae41007 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh @@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001} PREFILL_GPUS=${PREFILL_GPUS:-0} DECODE_GPUS=${DECODE_GPUS:-1,2,3} PREFILL_PORTS=${PREFILL_PORTS:-20003} -DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} +DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change." 
echo "" @@ -163,7 +163,7 @@ main() { local gpu_id=${PREFILL_GPU_ARRAY[$i]} local port=${PREFILL_PORT_ARRAY[$i]} local kv_port=$((21001 + i)) - + echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \ --enforce-eager \ @@ -192,7 +192,7 @@ main() { local gpu_id=${DECODE_GPU_ARRAY[$i]} local port=${DECODE_PORT_ARRAY[$i]} local kv_port=$((22001 + i)) - + echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ --enforce-eager \ @@ -232,7 +232,7 @@ main() { # Run Benchmark # ============================================================================= cd ../../../benchmarks/ - python3 benchmark_serving.py --port 10001 --seed $(date +%s) \ + vllm bench serve --port 10001 --seed $(date +%s) \ --model $MODEL \ --dataset-name random --random-input-len 7500 --random-output-len 200 \ --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log @@ -242,4 +242,4 @@ main() { cleanup } -main \ No newline at end of file +main diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh index 0b6c9213ebff..1178681f1533 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh @@ -122,7 +122,7 @@ main() { # begin benchmark cd ../../../../benchmarks/ - python3 benchmark_serving.py --port 9000 --seed $(date +%s) \ + vllm bench serve --port 9000 --seed $(date +%s) \ --model meta-llama/Llama-3.1-8B-Instruct \ --dataset-name random --random-input-len 7500 --random-output-len 200 \ --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log @@ -133,4 +133,4 @@ main() { } -main \ No newline at end of file +main diff --git a/vllm_onboarding_guide.md b/vllm_onboarding_guide.md new file mode 100644 index 000000000000..86254182ba57 --- /dev/null +++ b/vllm_onboarding_guide.md @@ -0,0 +1,568 @@ +# vLLM v1 Developer Onboarding Guide + +## 1. 🧭 Overview + +vLLM v1 is a high-performance large language model serving framework designed for **easy, fast, and cheap LLM serving**. It represents a major architectural upgrade from v0 with significant performance improvements and cleaner code organization. + +### Key Features +- **High-throughput serving** with state-of-the-art performance +- **PagedAttention** for efficient memory management of attention key-value pairs +- **Continuous batching** of incoming requests for optimal resource utilization +- **Speculative decoding** and **chunked prefill** for faster inference +- **Multi-modal support** (text, vision, audio) with unified processing +- **Distributed inference** with tensor and pipeline parallelism +- **Prefix caching** for improved efficiency on repeated prompts +- **Multiple hardware support** (NVIDIA GPUs, AMD, Intel, TPU, AWS Neuron) + +### Technologies Used +- **Language**: Python 3.8+ with C++/CUDA kernels +- **Framework**: PyTorch with custom CUDA kernels +- **Distributed**: Ray for multi-node coordination, multiprocessing for local parallelism +- **Memory Management**: Custom block-based KV cache with PagedAttention +- **API**: OpenAI-compatible REST API server +- **Build System**: CMake for C++/CUDA components, setuptools for Python + +--- + +## 2. 
🧱 High-Level Architecture Diagram + +```mermaid +graph TB + subgraph "Client Layer" + API[API Server] + CLI[CLI Interface] + SDK[Python SDK] + end + + subgraph "Engine Layer" + LLMEngine[LLM Engine] + Processor[Input Processor] + OutputProcessor[Output Processor] + end + + subgraph "Core Execution" + EngineCore[Engine Core] + Scheduler[Scheduler] + KVManager[KV Cache Manager] + StructuredOutput[Structured Output Manager] + end + + subgraph "Execution Layer" + Executor[Executor] + subgraph "Workers" + GPUWorker[GPU Worker] + CPUWorker[CPU Worker] + TPUWorker[TPU Worker] + end + ModelRunner[Model Runner] + end + + subgraph "Storage & Cache" + KVCache[(KV Cache Blocks)] + EncoderCache[(Encoder Cache)] + PrefixCache[(Prefix Cache)] + end + + subgraph "External Systems" + HF[Hugging Face Models] + Distributed[Ray Cluster] + Monitoring[Prometheus Metrics] + end + + API --> LLMEngine + CLI --> LLMEngine + SDK --> LLMEngine + + LLMEngine --> Processor + LLMEngine --> OutputProcessor + LLMEngine --> EngineCore + + EngineCore --> Scheduler + EngineCore --> KVManager + EngineCore --> StructuredOutput + + Scheduler --> Executor + Executor --> GPUWorker + Executor --> CPUWorker + Executor --> TPUWorker + + GPUWorker --> ModelRunner + CPUWorker --> ModelRunner + TPUWorker --> ModelRunner + + KVManager --> KVCache + KVManager --> EncoderCache + KVManager --> PrefixCache + + ModelRunner --> HF + Executor --> Distributed + LLMEngine --> Monitoring +``` + +### Component Explanations + +- **LLM Engine**: Main orchestrator that coordinates all components and provides the public API +- **Engine Core**: Core execution engine that manages request lifecycle and coordinates scheduling +- **Scheduler**: Intelligent request scheduler that manages resource allocation and batching decisions +- **KV Cache Manager**: Sophisticated memory manager using PagedAttention for efficient key-value storage +- **Workers**: Hardware-specific execution units that run the actual model inference +- **Executor**: Coordinates distributed execution across multiple workers/nodes + +--- + +## 3. 🔎 Component Breakdown + +### Component: LLM Engine (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/llm_engine.py`) + +**Purpose**: +Main entry point and orchestrator for the entire vLLM system. Provides backward compatibility with v0 API while leveraging v1 architecture improvements. + +**Key Elements**: +- `LLMEngine.__init__()`: Initializes all core components and establishes communication channels +- `add_request()`: Processes and queues new inference requests with validation +- `step()`: Executes one inference iteration, coordinating scheduling and execution +- `abort_request()`: Handles request cancellation and resource cleanup +- `get_tokenizer_group()`: Provides access to tokenization services + +**Depends On**: +- Internal: `EngineCoreClient`, `Processor`, `OutputProcessor`, `Executor` +- External: PyTorch, Hugging Face Transformers, Ray (optional) + +--- + +### Component: Engine Core (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/core.py`) + +**Purpose**: +Core execution engine that manages the request lifecycle, coordinates between scheduler and workers, and handles distributed execution. 
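To make the `add_request()`/`step()` loop that the LLM Engine exposes (and that Engine Core drives internally) concrete, here is a minimal offline driver sketch. The model name is illustrative and exact signatures may differ between vLLM releases, so treat this as a sketch rather than canonical usage:

```python
# Minimal driver loop for the add_request()/step() API described above.
# Model name is illustrative; exact signatures can differ between vLLM releases.
from vllm import EngineArgs, LLMEngine, SamplingParams


def main() -> None:
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(temperature=0.8, max_tokens=32)

    # Queue a couple of requests; the engine batches them internally.
    engine.add_request("req-0", "Hello, my name is", params)
    engine.add_request("req-1", "The capital of France is", params)

    # Each step() schedules one iteration and returns per-request outputs.
    while engine.has_unfinished_requests():
        for output in engine.step():
            if output.finished:
                print(output.request_id, output.outputs[0].text)


if __name__ == "__main__":
    main()
```
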
+ +**Key Elements**: +- `EngineCore.add_request()`: Validates and queues requests for scheduling +- `EngineCore.get_output()`: Retrieves completed inference results +- `EngineCore.abort_requests()`: Handles request cancellation +- `EngineCoreClient`: Client interface for multiprocess communication + +**Depends On**: +- Internal: `Scheduler`, `Executor`, `ModelRunner` +- External: Multiprocessing, asyncio + +--- + +### Component: Scheduler (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/scheduler.py`) + +**Purpose**: +Intelligent request scheduler that makes optimal batching decisions, manages resource allocation, and handles advanced features like speculative decoding and prefix caching. + +**Key Elements**: +- `Scheduler.schedule()`: Core scheduling algorithm that batches requests optimally +- `_try_schedule_encoder_inputs()`: Handles multi-modal input scheduling +- `update_from_output()`: Processes model outputs and updates request states +- `_make_cached_request_data()`: Optimizes data structures for cached requests + +**Depends On**: +- Internal: `KVCacheManager`, `StructuredOutputManager`, `RequestQueue` +- External: None (pure Python logic) + +--- + +### Component: KV Cache Manager (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/kv_cache_manager.py`) + +**Purpose**: +Sophisticated memory management system implementing PagedAttention for efficient key-value cache storage and retrieval. + +**Key Elements**: +- `KVCacheManager.allocate_slots()`: Allocates memory blocks for new requests +- `get_computed_blocks()`: Retrieves cached computation results +- `free()`: Releases memory blocks when requests complete +- `cache_blocks()`: Implements prefix caching for repeated prompts + +**Depends On**: +- Internal: `BlockPool`, `KVCacheUtils` +- External: PyTorch tensors + +--- + +### Component: Workers (`/data/users/yeq/gitrepos/vllm/vllm/v1/worker/`) + +**Purpose**: +Hardware-specific execution units that perform the actual model inference on different accelerators. + +**Key Elements**: +- `GPUWorker`: NVIDIA GPU-optimized execution with CUDA kernels +- `CPUWorker`: CPU-based inference for cost-effective serving +- `TPUWorker`: Google TPU integration for specialized workloads +- `ModelRunner`: Coordinates model execution and batch processing + +**Depends On**: +- Internal: `InputBatch`, `BlockTable`, model loading utilities +- External: PyTorch, hardware-specific libraries (CUDA, TPU) + +--- + +### Component: Executors (`/data/users/yeq/gitrepos/vllm/vllm/v1/executor/`) + +**Purpose**: +Coordinates distributed execution across multiple workers and handles different parallelism strategies. + +**Key Elements**: +- `MultiprocessExecutor`: Local multi-GPU execution +- `RayDistributedExecutor`: Multi-node distributed execution via Ray +- `AbstractExecutor`: Base interface for all execution strategies + +**Depends On**: +- Internal: `Worker` implementations +- External: Ray (for distributed), multiprocessing + +--- + +### Component: Request Processing (`/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`, `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py`) + +**Purpose**: +Handles request lifecycle management, input validation, and output formatting. 
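As a mental model for the lifecycle bookkeeping that `Request` and `RequestStatus` provide (detailed in the data-model section below), the following deliberately simplified sketch shows the kind of state a request carries; it is not the real vLLM classes, which track many more fields:

```python
# Deliberately simplified sketch of the lifecycle bookkeeping described above;
# the real vllm.v1.request.Request / RequestStatus have more states and fields
# (speculative tokens, multi-modal inputs, prefix-cache counters, ...).
from __future__ import annotations

import enum
import time
from dataclasses import dataclass, field


class Status(enum.Enum):
    WAITING = enum.auto()                 # queued for scheduling
    RUNNING = enum.auto()                 # actively being processed
    PREEMPTED = enum.auto()               # paused to make room for other requests
    FINISHED_STOPPED = enum.auto()        # hit a stop token/string
    FINISHED_LENGTH_CAPPED = enum.auto()  # hit max_tokens


@dataclass
class MiniRequest:
    request_id: str
    prompt_token_ids: list[int]
    max_tokens: int
    status: Status = Status.WAITING
    num_computed_tokens: int = 0
    output_token_ids: list[int] = field(default_factory=list)
    arrival_time: float = field(default_factory=time.time)

    def append_output(self, token_id: int, stop_token_id: int) -> None:
        """Record one generated token and update the lifecycle state."""
        self.output_token_ids.append(token_id)
        self.num_computed_tokens += 1
        if token_id == stop_token_id:
            self.status = Status.FINISHED_STOPPED
        elif len(self.output_token_ids) >= self.max_tokens:
            self.status = Status.FINISHED_LENGTH_CAPPED
```
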
+ +**Key Elements**: +- `Request`: Core request data structure with state management +- `RequestStatus`: Enum tracking request lifecycle states +- `ModelRunnerOutput`: Structured output from model execution +- `SamplerOutput`: Token sampling results with logprobs + +**Depends On**: +- Internal: Sampling/pooling parameters, multi-modal inputs +- External: PyTorch tensors + +--- + +## 4. 🔁 Data Flow & Call Flow Examples + +### Example Flow: Single Request Processing + +**Description**: +A client submits a text generation request that goes through the complete vLLM pipeline from input processing to response generation. + +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client + participant LLMEngine + participant Processor + participant EngineCore + participant Scheduler + participant KVManager + participant Executor + participant Worker + participant ModelRunner + + Client->>LLMEngine: add_request(prompt, sampling_params) + LLMEngine->>Processor: process_inputs(prompt, params) + Processor-->>LLMEngine: EngineCoreRequest + LLMEngine->>EngineCore: add_request(core_request) + EngineCore->>Scheduler: add_request(request) + + Note over Scheduler: Request queued in WAITING state + + Client->>LLMEngine: step() - Execute inference + LLMEngine->>EngineCore: get_output() + EngineCore->>Scheduler: schedule() + + Scheduler->>KVManager: allocate_slots(request, num_tokens) + KVManager-->>Scheduler: allocated_blocks + Scheduler-->>EngineCore: SchedulerOutput + + EngineCore->>Executor: execute_model(scheduler_output) + Executor->>Worker: execute_model_async(model_input) + Worker->>ModelRunner: execute_model(model_input) + + Note over ModelRunner: Forward pass through transformer + + ModelRunner-->>Worker: ModelRunnerOutput + Worker-->>Executor: ModelRunnerOutput + Executor-->>EngineCore: ModelRunnerOutput + + EngineCore->>Scheduler: update_from_output(output) + Scheduler-->>EngineCore: EngineCoreOutputs + EngineCore-->>LLMEngine: EngineCoreOutputs + + LLMEngine->>LLMEngine: output_processor.process_outputs() + LLMEngine-->>Client: RequestOutput +``` + +--- + +### Example Flow: Batched Request Processing + +**Description**: +Multiple requests are intelligently batched together for efficient GPU utilization, demonstrating vLLM's continuous batching capabilities. 
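Before the sequence diagram, here is a deliberately simplified illustration of the batching decision: admit waiting requests in arrival order until a per-step token budget or the sequence limit is exhausted. This helper is a sketch only; the real `Scheduler.schedule()` additionally handles preemption, KV-cache block allocation, chunked prefill, and prefix-cache hits.

```python
# Simplified illustration of the batching decision shown in the diagram below:
# admit waiting requests in arrival order until the per-step token budget or the
# sequence limit is exhausted.
from collections import deque


def select_requests_for_batch(
    waiting: deque,          # (request_id, num_prompt_tokens) awaiting prefill
    running: list,           # request ids currently decoding (1 token each)
    max_num_batched_tokens: int,
    max_num_seqs: int,
):
    # Running requests each need one decode token this step.
    scheduled = {req_id: 1 for req_id in running}
    budget = max_num_batched_tokens - len(running)

    while waiting and len(scheduled) < max_num_seqs:
        req_id, prompt_tokens = waiting[0]
        if prompt_tokens > budget:
            break  # keep FCFS order: do not skip ahead of this request
        waiting.popleft()
        scheduled[req_id] = prompt_tokens
        budget -= prompt_tokens
    return scheduled


# Two decodes plus the 1500-token prefill fit in a 4096-token budget;
# the 3000-token prefill waits for a later step.
print(select_requests_for_batch(
    deque([("req-2", 1500), ("req-3", 3000)]), ["req-0", "req-1"], 4096, 8))
```
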
+ +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client1 + participant Client2 + participant Client3 + participant LLMEngine + participant Scheduler + participant KVManager + participant Worker + + Client1->>LLMEngine: add_request(req1) + Client2->>LLMEngine: add_request(req2) + Client3->>LLMEngine: add_request(req3) + + Note over LLMEngine: Multiple requests queued + + LLMEngine->>Scheduler: schedule() + + Note over Scheduler: Batch optimization logic + Scheduler->>Scheduler: calculate_token_budget() + Scheduler->>Scheduler: select_requests_for_batch() + + loop For each selected request + Scheduler->>KVManager: allocate_slots(request) + KVManager-->>Scheduler: blocks_allocated + end + + Scheduler-->>LLMEngine: SchedulerOutput(batched_requests) + + LLMEngine->>Worker: execute_model(batch) + + Note over Worker: Single forward pass for all requests + + Worker-->>LLMEngine: ModelRunnerOutput(batch_results) + + LLMEngine->>LLMEngine: split_batch_outputs() + LLMEngine-->>Client1: RequestOutput(req1_result) + LLMEngine-->>Client2: RequestOutput(req2_result) + LLMEngine-->>Client3: RequestOutput(req3_result) +``` + +--- + +### Example Flow: Prefix Caching Hit + +**Description**: +A request benefits from prefix caching when its prompt shares a common prefix with a previously processed request. + +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client + participant LLMEngine + participant Scheduler + participant KVManager + participant PrefixCache + + Client->>LLMEngine: add_request("Explain quantum physics...") + LLMEngine->>Scheduler: schedule() + + Scheduler->>KVManager: get_computed_blocks(request) + KVManager->>PrefixCache: lookup_prefix_hash(prompt_tokens) + + alt Cache Hit + PrefixCache-->>KVManager: cached_blocks(num_cached_tokens=50) + KVManager-->>Scheduler: computed_blocks + cache_info + + Note over Scheduler: Skip computation for cached tokens + Scheduler->>Scheduler: schedule_remaining_tokens(total-cached) + + else Cache Miss + PrefixCache-->>KVManager: no_cache_found + KVManager-->>Scheduler: empty_blocks + + Note over Scheduler: Full computation required + Scheduler->>Scheduler: schedule_all_tokens() + end + + Scheduler-->>LLMEngine: SchedulerOutput + Note over LLMEngine: Execution continues with optimized token count +``` + +--- + +### Example Flow: Multi-Modal Request Processing + +**Description**: +Processing a request that includes both text and image inputs, demonstrating vLLM's multi-modal capabilities. 
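From the client's perspective, this flow is triggered by an ordinary OpenAI-style chat request that mixes text and image parts. A hedged sketch using the `openai` client, where the server URL, model name, and image URL are placeholders and a vision model is assumed to be already serving:

```python
# Client-side view of the multi-modal flow below, assuming a vLLM
# OpenAI-compatible server is already running a vision model, e.g.:
#   vllm serve Qwen/Qwen2-VL-7B-Instruct
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in one sentence."},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }],
    max_tokens=64,
)
print(response.choices[0].message.content)
```
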
+ +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client + participant LLMEngine + participant Processor + participant Scheduler + participant EncoderCache + participant Worker + participant VisionEncoder + + Client->>LLMEngine: add_request(text="Describe image", image=img_data) + LLMEngine->>Processor: process_inputs(multimodal_input) + + Processor->>Processor: tokenize_text() + Processor->>Processor: process_image_placeholders() + Processor-->>LLMEngine: Request(mm_inputs, mm_positions) + + LLMEngine->>Scheduler: schedule() + Scheduler->>Scheduler: _try_schedule_encoder_inputs() + + alt Encoder Input Needed + Scheduler->>EncoderCache: can_allocate(request, input_id) + EncoderCache-->>Scheduler: cache_available + + Scheduler->>EncoderCache: allocate(request, input_id) + Scheduler-->>LLMEngine: SchedulerOutput(encoder_inputs=[0]) + + LLMEngine->>Worker: execute_model(scheduler_output) + Worker->>VisionEncoder: encode_image(image_data) + VisionEncoder-->>Worker: image_embeddings + + Worker->>Worker: merge_text_image_embeddings() + Worker-->>LLMEngine: ModelRunnerOutput + + else Encoder Cached + Scheduler->>EncoderCache: get_cached_embeddings() + EncoderCache-->>Scheduler: cached_embeddings + Note over Scheduler: Skip encoder computation + end + + LLMEngine-->>Client: RequestOutput(generated_text) +``` + +--- + +## 5. 🗃️ Data Models (Entities) + +### Entity: Request + +- **Class**: `Request` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py` +- **Fields**: + - `request_id: str` – unique identifier for the request + - `prompt_token_ids: list[int]` – tokenized input prompt + - `sampling_params: SamplingParams` – generation parameters (temperature, top_p, etc.) + - `pooling_params: PoolingParams` – for embedding/pooling requests + - `status: RequestStatus` – current lifecycle state (WAITING, RUNNING, FINISHED_*) + - `num_computed_tokens: int` – number of tokens already processed + - `max_tokens: int` – maximum tokens to generate + - `arrival_time: float` – timestamp when request was received + - `priority: int` – scheduling priority (higher = more important) + +- **Relations**: + - Contains `MultiModalKwargs` for vision/audio inputs + - References `LoRARequest` for adapter-specific inference + - Links to `StructuredOutputRequest` for guided generation + +- **Notes**: + - Immutable token lists use `ConstantList` wrapper for safety + - Supports speculative decoding with `spec_token_ids` + - Tracks prefix cache hits via `num_cached_tokens` + +--- + +### Entity: RequestStatus + +- **Enum**: `RequestStatus` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py` +- **Values**: + - `WAITING` – queued for scheduling + - `WAITING_FOR_FSM` – waiting for structured output compilation + - `WAITING_FOR_REMOTE_KVS` – waiting for distributed KV transfer + - `RUNNING` – actively being processed + - `PREEMPTED` – temporarily paused for higher priority requests + - `FINISHED_STOPPED` – completed normally (stop token/string) + - `FINISHED_LENGTH_CAPPED` – completed due to max length + - `FINISHED_ABORTED` – cancelled by client + - `FINISHED_IGNORED` – rejected due to constraints + +- **Relations**: + - Maps to `FinishReason` enum for API compatibility + - Used by scheduler for state transitions + +- **Notes**: + - States > PREEMPTED are considered finished + - Supports graceful degradation and error handling + +--- + +### Entity: ModelRunnerOutput + +- **Class**: `ModelRunnerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py` +- **Fields**: + - `req_ids: list[str]` – request 
identifiers in batch order + - `req_id_to_index: dict[str, int]` – mapping for efficient lookup + - `sampled_token_ids: list[list[int]]` – generated tokens per request + - `spec_token_ids: list[list[int]]` – speculative tokens (if enabled) + - `logprobs: LogprobsLists` – token probabilities for each request + - `prompt_logprobs_dict: dict[str, LogprobsTensors]` – prompt token probabilities + - `pooler_output: list[torch.Tensor]` – embeddings for pooling requests + +- **Relations**: + - Consumed by `Scheduler.update_from_output()` + - Converted to `RequestOutput` by `OutputProcessor` + +- **Notes**: + - Uses lists instead of tensors for efficient serialization + - Supports variable-length outputs per request in batch + +--- + +### Entity: SchedulerOutput + +- **Class**: `SchedulerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/output.py` +- **Fields**: + - `scheduled_new_reqs: list[NewRequestData]` – first-time scheduled requests + - `scheduled_cached_reqs: CachedRequestData` – continuing requests + - `num_scheduled_tokens: dict[str, int]` – tokens per request this step + - `total_num_scheduled_tokens: int` – total batch size + - `scheduled_encoder_inputs: dict[str, list[int]]` – multi-modal inputs to process + - `num_common_prefix_blocks: list[int]` – shared prefix optimization data + +- **Relations**: + - Produced by `Scheduler.schedule()` + - Consumed by `Executor.execute_model()` + +- **Notes**: + - Optimizes memory layout for different request types + - Includes metadata for advanced features (speculative decoding, prefix caching) + +--- + +### Entity: KVCacheConfig + +- **Class**: `KVCacheConfig` in `/data/users/yeq/gitrepos/vllm/vllm/v1/kv_cache_interface.py` +- **Fields**: + - `block_size: int` – tokens per memory block (typically 16) + - `num_gpu_blocks: int` – total GPU memory blocks available + - `num_cpu_blocks: int` – CPU memory blocks for offloading + - `cache_dtype: torch.dtype` – data type for cache storage + - `kv_cache_groups: list[KVCacheGroup]` – cache organization + +- **Relations**: + - Used by `KVCacheManager` for memory allocation + - Configured based on model and hardware constraints + +- **Notes**: + - Block-based design enables efficient memory management + - Supports heterogeneous memory hierarchies (GPU/CPU) + +--- + +### Entity: SamplingParams + +- **Class**: `SamplingParams` in `vllm/sampling_params.py` +- **Fields**: + - `n: int` – number of output sequences to generate + - `max_tokens: int` – maximum tokens to generate + - `temperature: float` – sampling randomness (0.0 = deterministic) + - `top_p: float` – nucleus sampling threshold + - `top_k: int` – top-k sampling limit + - `stop: list[str]` – stop strings to terminate generation + - `logprobs: int` – number of log probabilities to return + +- **Relations**: + - Embedded in `Request` objects + - Used by sampling kernels during generation + +- **Notes**: + - Supports advanced sampling strategies (beam search, parallel sampling) + - Extensible for custom sampling algorithms# vLLM Developer Onboarding Guide From cd501306f964501394c5ab5a4574fa6a1cd09a8b Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Mon, 21 Jul 2025 23:44:27 -0700 Subject: [PATCH 2/6] remove unneeded files Signed-off-by: Ye (Charlotte) Qi --- vllm_onboarding_guide.md | 568 --------------------------------------- 1 file changed, 568 deletions(-) delete mode 100644 vllm_onboarding_guide.md diff --git a/vllm_onboarding_guide.md b/vllm_onboarding_guide.md deleted file mode 100644 index 86254182ba57..000000000000 --- 
a/vllm_onboarding_guide.md +++ /dev/null @@ -1,568 +0,0 @@ -# vLLM v1 Developer Onboarding Guide - -## 1. 🧭 Overview - -vLLM v1 is a high-performance large language model serving framework designed for **easy, fast, and cheap LLM serving**. It represents a major architectural upgrade from v0 with significant performance improvements and cleaner code organization. - -### Key Features -- **High-throughput serving** with state-of-the-art performance -- **PagedAttention** for efficient memory management of attention key-value pairs -- **Continuous batching** of incoming requests for optimal resource utilization -- **Speculative decoding** and **chunked prefill** for faster inference -- **Multi-modal support** (text, vision, audio) with unified processing -- **Distributed inference** with tensor and pipeline parallelism -- **Prefix caching** for improved efficiency on repeated prompts -- **Multiple hardware support** (NVIDIA GPUs, AMD, Intel, TPU, AWS Neuron) - -### Technologies Used -- **Language**: Python 3.8+ with C++/CUDA kernels -- **Framework**: PyTorch with custom CUDA kernels -- **Distributed**: Ray for multi-node coordination, multiprocessing for local parallelism -- **Memory Management**: Custom block-based KV cache with PagedAttention -- **API**: OpenAI-compatible REST API server -- **Build System**: CMake for C++/CUDA components, setuptools for Python - ---- - -## 2. 🧱 High-Level Architecture Diagram - -```mermaid -graph TB - subgraph "Client Layer" - API[API Server] - CLI[CLI Interface] - SDK[Python SDK] - end - - subgraph "Engine Layer" - LLMEngine[LLM Engine] - Processor[Input Processor] - OutputProcessor[Output Processor] - end - - subgraph "Core Execution" - EngineCore[Engine Core] - Scheduler[Scheduler] - KVManager[KV Cache Manager] - StructuredOutput[Structured Output Manager] - end - - subgraph "Execution Layer" - Executor[Executor] - subgraph "Workers" - GPUWorker[GPU Worker] - CPUWorker[CPU Worker] - TPUWorker[TPU Worker] - end - ModelRunner[Model Runner] - end - - subgraph "Storage & Cache" - KVCache[(KV Cache Blocks)] - EncoderCache[(Encoder Cache)] - PrefixCache[(Prefix Cache)] - end - - subgraph "External Systems" - HF[Hugging Face Models] - Distributed[Ray Cluster] - Monitoring[Prometheus Metrics] - end - - API --> LLMEngine - CLI --> LLMEngine - SDK --> LLMEngine - - LLMEngine --> Processor - LLMEngine --> OutputProcessor - LLMEngine --> EngineCore - - EngineCore --> Scheduler - EngineCore --> KVManager - EngineCore --> StructuredOutput - - Scheduler --> Executor - Executor --> GPUWorker - Executor --> CPUWorker - Executor --> TPUWorker - - GPUWorker --> ModelRunner - CPUWorker --> ModelRunner - TPUWorker --> ModelRunner - - KVManager --> KVCache - KVManager --> EncoderCache - KVManager --> PrefixCache - - ModelRunner --> HF - Executor --> Distributed - LLMEngine --> Monitoring -``` - -### Component Explanations - -- **LLM Engine**: Main orchestrator that coordinates all components and provides the public API -- **Engine Core**: Core execution engine that manages request lifecycle and coordinates scheduling -- **Scheduler**: Intelligent request scheduler that manages resource allocation and batching decisions -- **KV Cache Manager**: Sophisticated memory manager using PagedAttention for efficient key-value storage -- **Workers**: Hardware-specific execution units that run the actual model inference -- **Executor**: Coordinates distributed execution across multiple workers/nodes - ---- - -## 3. 
🔎 Component Breakdown - -### Component: LLM Engine (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/llm_engine.py`) - -**Purpose**: -Main entry point and orchestrator for the entire vLLM system. Provides backward compatibility with v0 API while leveraging v1 architecture improvements. - -**Key Elements**: -- `LLMEngine.__init__()`: Initializes all core components and establishes communication channels -- `add_request()`: Processes and queues new inference requests with validation -- `step()`: Executes one inference iteration, coordinating scheduling and execution -- `abort_request()`: Handles request cancellation and resource cleanup -- `get_tokenizer_group()`: Provides access to tokenization services - -**Depends On**: -- Internal: `EngineCoreClient`, `Processor`, `OutputProcessor`, `Executor` -- External: PyTorch, Hugging Face Transformers, Ray (optional) - ---- - -### Component: Engine Core (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/core.py`) - -**Purpose**: -Core execution engine that manages the request lifecycle, coordinates between scheduler and workers, and handles distributed execution. - -**Key Elements**: -- `EngineCore.add_request()`: Validates and queues requests for scheduling -- `EngineCore.get_output()`: Retrieves completed inference results -- `EngineCore.abort_requests()`: Handles request cancellation -- `EngineCoreClient`: Client interface for multiprocess communication - -**Depends On**: -- Internal: `Scheduler`, `Executor`, `ModelRunner` -- External: Multiprocessing, asyncio - ---- - -### Component: Scheduler (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/scheduler.py`) - -**Purpose**: -Intelligent request scheduler that makes optimal batching decisions, manages resource allocation, and handles advanced features like speculative decoding and prefix caching. - -**Key Elements**: -- `Scheduler.schedule()`: Core scheduling algorithm that batches requests optimally -- `_try_schedule_encoder_inputs()`: Handles multi-modal input scheduling -- `update_from_output()`: Processes model outputs and updates request states -- `_make_cached_request_data()`: Optimizes data structures for cached requests - -**Depends On**: -- Internal: `KVCacheManager`, `StructuredOutputManager`, `RequestQueue` -- External: None (pure Python logic) - ---- - -### Component: KV Cache Manager (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/kv_cache_manager.py`) - -**Purpose**: -Sophisticated memory management system implementing PagedAttention for efficient key-value cache storage and retrieval. - -**Key Elements**: -- `KVCacheManager.allocate_slots()`: Allocates memory blocks for new requests -- `get_computed_blocks()`: Retrieves cached computation results -- `free()`: Releases memory blocks when requests complete -- `cache_blocks()`: Implements prefix caching for repeated prompts - -**Depends On**: -- Internal: `BlockPool`, `KVCacheUtils` -- External: PyTorch tensors - ---- - -### Component: Workers (`/data/users/yeq/gitrepos/vllm/vllm/v1/worker/`) - -**Purpose**: -Hardware-specific execution units that perform the actual model inference on different accelerators. 
- -**Key Elements**: -- `GPUWorker`: NVIDIA GPU-optimized execution with CUDA kernels -- `CPUWorker`: CPU-based inference for cost-effective serving -- `TPUWorker`: Google TPU integration for specialized workloads -- `ModelRunner`: Coordinates model execution and batch processing - -**Depends On**: -- Internal: `InputBatch`, `BlockTable`, model loading utilities -- External: PyTorch, hardware-specific libraries (CUDA, TPU) - ---- - -### Component: Executors (`/data/users/yeq/gitrepos/vllm/vllm/v1/executor/`) - -**Purpose**: -Coordinates distributed execution across multiple workers and handles different parallelism strategies. - -**Key Elements**: -- `MultiprocessExecutor`: Local multi-GPU execution -- `RayDistributedExecutor`: Multi-node distributed execution via Ray -- `AbstractExecutor`: Base interface for all execution strategies - -**Depends On**: -- Internal: `Worker` implementations -- External: Ray (for distributed), multiprocessing - ---- - -### Component: Request Processing (`/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`, `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py`) - -**Purpose**: -Handles request lifecycle management, input validation, and output formatting. - -**Key Elements**: -- `Request`: Core request data structure with state management -- `RequestStatus`: Enum tracking request lifecycle states -- `ModelRunnerOutput`: Structured output from model execution -- `SamplerOutput`: Token sampling results with logprobs - -**Depends On**: -- Internal: Sampling/pooling parameters, multi-modal inputs -- External: PyTorch tensors - ---- - -## 4. 🔁 Data Flow & Call Flow Examples - -### Example Flow: Single Request Processing - -**Description**: -A client submits a text generation request that goes through the complete vLLM pipeline from input processing to response generation. - -**Sequence Diagram**: - -```mermaid -sequenceDiagram - participant Client - participant LLMEngine - participant Processor - participant EngineCore - participant Scheduler - participant KVManager - participant Executor - participant Worker - participant ModelRunner - - Client->>LLMEngine: add_request(prompt, sampling_params) - LLMEngine->>Processor: process_inputs(prompt, params) - Processor-->>LLMEngine: EngineCoreRequest - LLMEngine->>EngineCore: add_request(core_request) - EngineCore->>Scheduler: add_request(request) - - Note over Scheduler: Request queued in WAITING state - - Client->>LLMEngine: step() - Execute inference - LLMEngine->>EngineCore: get_output() - EngineCore->>Scheduler: schedule() - - Scheduler->>KVManager: allocate_slots(request, num_tokens) - KVManager-->>Scheduler: allocated_blocks - Scheduler-->>EngineCore: SchedulerOutput - - EngineCore->>Executor: execute_model(scheduler_output) - Executor->>Worker: execute_model_async(model_input) - Worker->>ModelRunner: execute_model(model_input) - - Note over ModelRunner: Forward pass through transformer - - ModelRunner-->>Worker: ModelRunnerOutput - Worker-->>Executor: ModelRunnerOutput - Executor-->>EngineCore: ModelRunnerOutput - - EngineCore->>Scheduler: update_from_output(output) - Scheduler-->>EngineCore: EngineCoreOutputs - EngineCore-->>LLMEngine: EngineCoreOutputs - - LLMEngine->>LLMEngine: output_processor.process_outputs() - LLMEngine-->>Client: RequestOutput -``` - ---- - -### Example Flow: Batched Request Processing - -**Description**: -Multiple requests are intelligently batched together for efficient GPU utilization, demonstrating vLLM's continuous batching capabilities. 
-
---
-
-### Example Flow: Batched Request Processing
-
-**Description**:
-Multiple requests are intelligently batched together for efficient GPU utilization, demonstrating vLLM's continuous batching capabilities.
-
-**Sequence Diagram**:
-
-```mermaid
-sequenceDiagram
-    participant Client1
-    participant Client2
-    participant Client3
-    participant LLMEngine
-    participant Scheduler
-    participant KVManager
-    participant Worker
-
-    Client1->>LLMEngine: add_request(req1)
-    Client2->>LLMEngine: add_request(req2)
-    Client3->>LLMEngine: add_request(req3)
-
-    Note over LLMEngine: Multiple requests queued
-
-    LLMEngine->>Scheduler: schedule()
-
-    Note over Scheduler: Batch optimization logic
-    Scheduler->>Scheduler: calculate_token_budget()
-    Scheduler->>Scheduler: select_requests_for_batch()
-
-    loop For each selected request
-        Scheduler->>KVManager: allocate_slots(request)
-        KVManager-->>Scheduler: blocks_allocated
-    end
-
-    Scheduler-->>LLMEngine: SchedulerOutput(batched_requests)
-
-    LLMEngine->>Worker: execute_model(batch)
-
-    Note over Worker: Single forward pass for all requests
-
-    Worker-->>LLMEngine: ModelRunnerOutput(batch_results)
-
-    LLMEngine->>LLMEngine: split_batch_outputs()
-    LLMEngine-->>Client1: RequestOutput(req1_result)
-    LLMEngine-->>Client2: RequestOutput(req2_result)
-    LLMEngine-->>Client3: RequestOutput(req3_result)
-```
-
---
-
-### Example Flow: Prefix Caching Hit
-
-**Description**:
-A request benefits from prefix caching when its prompt shares a common prefix with a previously processed request.
-
-**Sequence Diagram**:
-
-```mermaid
-sequenceDiagram
-    participant Client
-    participant LLMEngine
-    participant Scheduler
-    participant KVManager
-    participant PrefixCache
-
-    Client->>LLMEngine: add_request("Explain quantum physics...")
-    LLMEngine->>Scheduler: schedule()
-
-    Scheduler->>KVManager: get_computed_blocks(request)
-    KVManager->>PrefixCache: lookup_prefix_hash(prompt_tokens)
-
-    alt Cache Hit
-        PrefixCache-->>KVManager: cached_blocks(num_cached_tokens=50)
-        KVManager-->>Scheduler: computed_blocks + cache_info
-
-        Note over Scheduler: Skip computation for cached tokens
-        Scheduler->>Scheduler: schedule_remaining_tokens(total-cached)
-
-    else Cache Miss
-        PrefixCache-->>KVManager: no_cache_found
-        KVManager-->>Scheduler: empty_blocks
-
-        Note over Scheduler: Full computation required
-        Scheduler->>Scheduler: schedule_all_tokens()
-    end
-
-    Scheduler-->>LLMEngine: SchedulerOutput
-    Note over LLMEngine: Execution continues with optimized token count
-```
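
One hedged way to exercise the cache-hit path above is to enable prefix caching on the engine and send prompts that share a long prefix; a minimal sketch (the model name is again a placeholder):

```python
from vllm import LLM, SamplingParams

# The second prompt should reuse KV blocks computed for the shared prefix.
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enable_prefix_caching=True)

shared_prefix = "You are a physics tutor. Explain the following concept simply:\n"
prompts = [shared_prefix + "quantum entanglement", shared_prefix + "wave-particle duality"]

for out in llm.generate(prompts, SamplingParams(max_tokens=64)):
    print(out.outputs[0].text)
```
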
-
---
-
-### Example Flow: Multi-Modal Request Processing
-
-**Description**:
-Processing a request that includes both text and image inputs, demonstrating vLLM's multi-modal capabilities.
-
-**Sequence Diagram**:
-
-```mermaid
-sequenceDiagram
-    participant Client
-    participant LLMEngine
-    participant Processor
-    participant Scheduler
-    participant EncoderCache
-    participant Worker
-    participant VisionEncoder
-
-    Client->>LLMEngine: add_request(text="Describe image", image=img_data)
-    LLMEngine->>Processor: process_inputs(multimodal_input)
-
-    Processor->>Processor: tokenize_text()
-    Processor->>Processor: process_image_placeholders()
-    Processor-->>LLMEngine: Request(mm_inputs, mm_positions)
-
-    LLMEngine->>Scheduler: schedule()
-    Scheduler->>Scheduler: _try_schedule_encoder_inputs()
-
-    alt Encoder Input Needed
-        Scheduler->>EncoderCache: can_allocate(request, input_id)
-        EncoderCache-->>Scheduler: cache_available
-
-        Scheduler->>EncoderCache: allocate(request, input_id)
-        Scheduler-->>LLMEngine: SchedulerOutput(encoder_inputs=[0])
-
-        LLMEngine->>Worker: execute_model(scheduler_output)
-        Worker->>VisionEncoder: encode_image(image_data)
-        VisionEncoder-->>Worker: image_embeddings
-
-        Worker->>Worker: merge_text_image_embeddings()
-        Worker-->>LLMEngine: ModelRunnerOutput
-
-    else Encoder Cached
-        Scheduler->>EncoderCache: get_cached_embeddings()
-        EncoderCache-->>Scheduler: cached_embeddings
-        Note over Scheduler: Skip encoder computation
-    end
-
-    LLMEngine-->>Client: RequestOutput(generated_text)
-```
-
---
-
-## 5. 🗃️ Data Models (Entities)
-
-### Entity: Request
-
-- **Class**: `Request` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`
-- **Fields**:
-  - `request_id: str` – unique identifier for the request
-  - `prompt_token_ids: list[int]` – tokenized input prompt
-  - `sampling_params: SamplingParams` – generation parameters (temperature, top_p, etc.)
-  - `pooling_params: PoolingParams` – for embedding/pooling requests
-  - `status: RequestStatus` – current lifecycle state (WAITING, RUNNING, FINISHED_*)
-  - `num_computed_tokens: int` – number of tokens already processed
-  - `max_tokens: int` – maximum tokens to generate
-  - `arrival_time: float` – timestamp when request was received
-  - `priority: int` – scheduling priority (higher = more important)
-
-- **Relations**:
-  - Contains `MultiModalKwargs` for vision/audio inputs
-  - References `LoRARequest` for adapter-specific inference
-  - Links to `StructuredOutputRequest` for guided generation
-
-- **Notes**:
-  - Immutable token lists use `ConstantList` wrapper for safety
-  - Supports speculative decoding with `spec_token_ids`
-  - Tracks prefix cache hits via `num_cached_tokens`
-
---
-
-### Entity: RequestStatus
-
-- **Enum**: `RequestStatus` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`
-- **Values**:
-  - `WAITING` – queued for scheduling
-  - `WAITING_FOR_FSM` – waiting for structured output compilation
-  - `WAITING_FOR_REMOTE_KVS` – waiting for distributed KV transfer
-  - `RUNNING` – actively being processed
-  - `PREEMPTED` – temporarily paused for higher priority requests
-  - `FINISHED_STOPPED` – completed normally (stop token/string)
-  - `FINISHED_LENGTH_CAPPED` – completed due to max length
-  - `FINISHED_ABORTED` – cancelled by client
-  - `FINISHED_IGNORED` – rejected due to constraints
-
-- **Relations**:
-  - Maps to `FinishReason` enum for API compatibility
-  - Used by scheduler for state transitions
-
-- **Notes**:
-  - States > PREEMPTED are considered finished
-  - Supports graceful degradation and error handling
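
The ordering-based terminal check described in the notes can be sketched as follows; the numeric values below only mirror the description above and are not the actual definitions in `request.py`:

```python
from enum import IntEnum

class RequestStatus(IntEnum):
    # Illustrative ordering: everything after PREEMPTED is a terminal state.
    WAITING = 0
    WAITING_FOR_FSM = 1
    WAITING_FOR_REMOTE_KVS = 2
    RUNNING = 3
    PREEMPTED = 4
    FINISHED_STOPPED = 5
    FINISHED_LENGTH_CAPPED = 6
    FINISHED_ABORTED = 7
    FINISHED_IGNORED = 8

def is_finished(status: RequestStatus) -> bool:
    # Any state ordered after PREEMPTED is considered finished.
    return status > RequestStatus.PREEMPTED

assert is_finished(RequestStatus.FINISHED_ABORTED)
assert not is_finished(RequestStatus.RUNNING)
```
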
-
---
-
-### Entity: ModelRunnerOutput
-
-- **Class**: `ModelRunnerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py`
-- **Fields**:
-  - `req_ids: list[str]` – request identifiers in batch order
-  - `req_id_to_index: dict[str, int]` – mapping for efficient lookup
-  - `sampled_token_ids: list[list[int]]` – generated tokens per request
-  - `spec_token_ids: list[list[int]]` – speculative tokens (if enabled)
-  - `logprobs: LogprobsLists` – token probabilities for each request
-  - `prompt_logprobs_dict: dict[str, LogprobsTensors]` – prompt token probabilities
-  - `pooler_output: list[torch.Tensor]` – embeddings for pooling requests
-
-- **Relations**:
-  - Consumed by `Scheduler.update_from_output()`
-  - Converted to `RequestOutput` by `OutputProcessor`
-
-- **Notes**:
-  - Uses lists instead of tensors for efficient serialization
-  - Supports variable-length outputs per request in batch
-
---
-
-### Entity: SchedulerOutput
-
-- **Class**: `SchedulerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/output.py`
-- **Fields**:
-  - `scheduled_new_reqs: list[NewRequestData]` – first-time scheduled requests
-  - `scheduled_cached_reqs: CachedRequestData` – continuing requests
-  - `num_scheduled_tokens: dict[str, int]` – tokens per request this step
-  - `total_num_scheduled_tokens: int` – total batch size
-  - `scheduled_encoder_inputs: dict[str, list[int]]` – multi-modal inputs to process
-  - `num_common_prefix_blocks: list[int]` – shared prefix optimization data
-
-- **Relations**:
-  - Produced by `Scheduler.schedule()`
-  - Consumed by `Executor.execute_model()`
-
-- **Notes**:
-  - Optimizes memory layout for different request types
-  - Includes metadata for advanced features (speculative decoding, prefix caching)
-
---
-
-### Entity: KVCacheConfig
-
-- **Class**: `KVCacheConfig` in `/data/users/yeq/gitrepos/vllm/vllm/v1/kv_cache_interface.py`
-- **Fields**:
-  - `block_size: int` – tokens per memory block (typically 16)
-  - `num_gpu_blocks: int` – total GPU memory blocks available
-  - `num_cpu_blocks: int` – CPU memory blocks for offloading
-  - `cache_dtype: torch.dtype` – data type for cache storage
-  - `kv_cache_groups: list[KVCacheGroup]` – cache organization
-
-- **Relations**:
-  - Used by `KVCacheManager` for memory allocation
-  - Configured based on model and hardware constraints
-
-- **Notes**:
-  - Block-based design enables efficient memory management
-  - Supports heterogeneous memory hierarchies (GPU/CPU)
-
---
-
-### Entity: SamplingParams
-
-- **Class**: `SamplingParams` in `vllm/sampling_params.py`
-- **Fields**:
-  - `n: int` – number of output sequences to generate
-  - `max_tokens: int` – maximum tokens to generate
-  - `temperature: float` – sampling randomness (0.0 = deterministic)
-  - `top_p: float` – nucleus sampling threshold
-  - `top_k: int` – top-k sampling limit
-  - `stop: list[str]` – stop strings to terminate generation
-  - `logprobs: int` – number of log probabilities to return
-
-- **Relations**:
-  - Embedded in `Request` objects
-  - Used by sampling kernels during generation
-
-- **Notes**:
-  - Supports advanced sampling strategies (beam search, parallel sampling)
-  - Extensible for custom sampling algorithms# vLLM Developer Onboarding Guide
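
The fields above map onto the public `SamplingParams` constructor; a short, hedged example of building one (the values are arbitrary):

```python
from vllm import SamplingParams

params = SamplingParams(
    n=1,              # one output sequence
    temperature=0.7,  # 0.0 would make decoding deterministic
    top_p=0.95,       # nucleus sampling threshold
    max_tokens=128,   # generation cap
    stop=["</s>"],    # optional stop string(s)
)
```
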

From 3e7fd6aab9b8804cb620a6504538d259d685ac95 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Mon, 21 Jul 2025 23:53:21 -0700
Subject: [PATCH 3/6] revert some wrong changes

Signed-off-by: Ye (Charlotte) Qi
---
 .../scripts/run-nightly-benchmarks.sh |  5 ++---
 benchmarks/README.md                  |  2 +-
 benchmarks/auto_tune/auto_tune.sh     | 14 +++++++-------
 docs/contributing/profiling.md        |  2 +-
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 86153e8408cf..c8835643e24f 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -235,10 +235,9 @@ run_serving_tests() {
             --port $port \
             --save-result \
             --result-dir $RESULTS_FOLDER \
-            --result-filename ${test_name}.json \
+            --result-filename ${new_test_name}.json \
            --request-rate $qps \
-            --metadata "tensor_parallel_size=$tp" \
-            $common_params_str"
+            --ignore-eos \
             $client_args"

         elif [[ "$dataset_name" = "sonnet" ]]; then
diff --git a/benchmarks/README.md b/benchmarks/README.md
index ef2c57a7c079..ba17c31b2945 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -104,7 +104,7 @@ vllm bench serve \
   --endpoint /v1/completions \
   --dataset-name sharegpt \
   --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 1000 \
+  --num-prompts 10 \
   --request-rate 3 # By default is inf
 ```
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index ef7c7c94c883..d73860799325 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -133,15 +133,15 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
         --dataset-name random \
         --random-input-len $adjusted_input_len \
         --random-output-len $OUTPUT_LEN \
+        --ignore-eos \
+        --disable-tqdm \
+        --request-rate inf \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
+        --num-prompts 1000 \
         --random-prefix-len $prefix_len \
-        --num-prompts $NUM_PROMPTS \
         --port 8004 \
-        --save-result \
-        --result-dir $LOG_FOLDER \
-        --result-filename bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.json \
-        --request-rate inf \
-        --ignore-eos \
-        2>&1 | tee $bm_log
+        --profile &> "$bm_log"

     throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
     goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index bd450fb61e00..aa3de617e072 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -44,7 +44,7 @@ vllm bench serve \
   --dataset-name sharegpt \
   --dataset-path sharegpt.json \
   --profile \
-  --num-prompts 10
+  --num-prompts 2
 ```

 ## Profile with NVIDIA Nsight Systems

From 3feab9cf239cf18a36e39ec1531c086a5f6c46cf Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Mon, 21 Jul 2025 23:57:04 -0700
Subject: [PATCH 4/6] fix

Signed-off-by: Ye (Charlotte) Qi
---
 benchmarks/README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ba17c31b2945..3b10963c3e01 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -104,8 +104,7 @@ vllm bench serve \
   --endpoint /v1/completions \
   --dataset-name sharegpt \
   --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10 \
-  --request-rate 3 # By default is inf
+  --num-prompts 10
 ```

 If successful, you will see the following output
@@ -232,7 +231,7 @@ vllm bench serve \
   --dataset-name hf \
   --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
   --hf-split train \
-  --num-prompts 1000
+  --num-prompts 10
 ```

 **`AI-MO/aimo-validation-aime`**

From c926f64a633dc202b156f33ad444d04403f2e1a3 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Tue, 22 Jul 2025 00:12:16 -0700
Subject: [PATCH 5/6] add deprecation warnings

Signed-off-by: Ye (Charlotte) Qi
---
 benchmarks/benchmark_latency.py    | 5 +++++
 benchmarks/benchmark_serving.py    | 5 +++++
 benchmarks/benchmark_throughput.py | 5 +++++
 3 files changed, 15 insertions(+)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 4d2ea126b24a..d8b960edaa46 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -11,6 +11,7 @@
 import numpy as np
 from tqdm import tqdm
+from typing_extensions import deprecated

 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_latency.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench latency' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c597fb1068ab..a97fa280f37c 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -38,6 +38,7 @@
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_serving.py is deprecated and will be removed in a future "
+    "version. Please use 'vllm bench serve' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 14461121fece..d4fc730d14f1 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -15,6 +15,7 @@
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from benchmark_dataset import (
     AIMODataset,
@@ -381,6 +382,10 @@ def get_requests(args, tokenizer):
     return dataset_cls(**common_kwargs).sample(**sample_kwargs)


+@deprecated(
+    "benchmark_throughput.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench throughput' instead.",
+)
 def main(args: argparse.Namespace):
     if args.seed is None:
         args.seed = 0

From ef5a8ff15ab53ac8c4071e21de207d76d0f05baa Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Thu, 24 Jul 2025 10:12:04 -0700
Subject: [PATCH 6/6] vllm3 -> vllm

Signed-off-by: Ye (Charlotte) Qi
---
 benchmarks/auto_tune/auto_tune.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index d73860799325..e0396b6ef5af 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -127,7 +127,7 @@ run_benchmark() {
     bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
     prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
     adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-    vllm3 bench serve \
+    vllm bench serve \
         --backend vllm \
         --model $MODEL \
         --dataset-name random \