From 72714f32e1004c854149812d9997ac5eecc37244 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Mon, 21 Jul 2025 23:42:53 -0700 Subject: [PATCH 1/6] resolve merge conflict Signed-off-by: Ye (Charlotte) Qi --- .../scripts/run-nightly-benchmarks.sh | 29 +- .../scripts/run-performance-benchmarks.sh | 6 +- .../scripts/hardware_ci/run-cpu-test.sh | 10 +- .buildkite/scripts/run-benchmarks.sh | 6 +- .buildkite/scripts/tpu/run_bm.sh | 2 +- benchmarks/README.md | 71 +-- benchmarks/auto_tune/auto_tune.sh | 33 +- docs/contributing/profiling.md | 12 +- docs/design/v1/p2p_nccl_connector.md | 18 +- .../disagg_example_p2p_nccl_xpyd.sh | 10 +- .../disagg_example_nixl.sh | 4 +- vllm_onboarding_guide.md | 568 ++++++++++++++++++ 12 files changed, 669 insertions(+), 100 deletions(-) create mode 100644 vllm_onboarding_guide.md diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 4d01a314adc4..86153e8408cf 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -73,7 +73,7 @@ get_current_llm_serving_engine() { echo "Container: vllm" # move to a completely irrelevant directory, to avoid import vllm from current folder export CURRENT_LLM_SERVING_ENGINE=vllm - + return fi } @@ -225,7 +225,7 @@ run_serving_tests() { if [[ "$dataset_name" = "sharegpt" ]]; then - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -235,9 +235,10 @@ run_serving_tests() { --port $port \ --save-result \ --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ + --result-filename ${test_name}.json \ --request-rate $qps \ - --ignore-eos \ + --metadata "tensor_parallel_size=$tp" \ + $common_params_str" $client_args" elif [[ "$dataset_name" = "sonnet" ]]; then @@ -246,7 +247,7 @@ run_serving_tests() { sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -265,13 +266,13 @@ run_serving_tests() { $client_args" else - + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." exit 1 fi - + echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" @@ -302,7 +303,7 @@ run_serving_tests() { } run_genai_perf_tests() { - # run genai-perf tests + # run genai-perf tests # $1: a json file specifying genai-perf test cases local genai_perf_test_file @@ -311,14 +312,14 @@ run_genai_perf_tests() { # Iterate over genai-perf tests jq -c '.[]' "$genai_perf_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - + test_name=$(echo "$params" | jq -r '.test_name') + # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." 
continue fi - + # prepend the current serving engine to the test name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} @@ -369,10 +370,10 @@ run_genai_perf_tests() { qps=$num_prompts echo "now qps is $qps" fi - + new_test_name=$test_name"_qps_"$qps backend=$CURRENT_LLM_SERVING_ENGINE - + if [[ "$backend" == *"vllm"* ]]; then backend="vllm" fi @@ -413,7 +414,7 @@ prepare_dataset() { do cat sonnet.txt >> sonnet_4x.txt done - + } main() { diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index f05040618981..4eafe435f87f 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -205,7 +205,7 @@ run_latency_tests() { fi fi - latency_command=" $latency_envs python3 benchmark_latency.py \ + latency_command=" $latency_envs vllm bench latency \ --output-json $RESULTS_FOLDER/${test_name}.json \ $latency_args" @@ -272,7 +272,7 @@ run_throughput_tests() { fi fi - throughput_command=" $throughput_envs python3 benchmark_throughput.py \ + throughput_command=" $throughput_envs vllm bench throughput \ --output-json $RESULTS_FOLDER/${test_name}.json \ $throughput_args" @@ -393,7 +393,7 @@ run_serving_tests() { # pass the tensor parallel size to the client so that it can be displayed # on the benchmark dashboard - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 90cc9c844622..7c7dbb461ce0 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1} export CMAKE_BUILD_PARALLEL_LEVEL=32 # Setup cleanup -remove_docker_container() { - set -e; - docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container @@ -69,7 +69,7 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" # Note: disable it until supports V1 # Run AWQ test @@ -83,7 +83,7 @@ function cpu_tests() { set -e VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --dataset-name random \ --model meta-llama/Llama-3.2-3B-Instruct \ diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh index 195a8063fd74..72812218cb66 100644 --- a/.buildkite/scripts/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) # run python-based benchmarks and upload the result to buildkite -python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt +vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt bench_latency_exit_code=$? -python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt +vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt bench_throughput_exit_code=$? # run server-based benchmarks and upload the result to buildkite @@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r # wait for server to start, timeout after 600 seconds timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --dataset-name sharegpt \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index 877669cd956a..beecaf7a740a 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -77,7 +77,7 @@ done echo "run benchmark test..." echo "logging to $BM_LOG" echo -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name sonnet \ diff --git a/benchmarks/README.md b/benchmarks/README.md index fb8690d42db9..ef2c57a7c079 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -98,38 +98,39 @@ Then run the benchmarking script ```bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ --dataset-name sharegpt \ --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --num-prompts 10 + --num-prompts 1000 \ + --request-rate 3 # By default is inf ``` If successful, you will see the following output ``` ============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 5.78 -Total input tokens: 1369 -Total generated tokens: 2212 -Request throughput (req/s): 1.73 -Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 ---------------Time to First Token---------------- -Mean TTFT (ms): 71.54 -Median TTFT (ms): 73.88 -P99 TTFT (ms): 79.49 +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 -----Time per Output Token (excl. 
1st token)------ -Mean TPOT (ms): 7.91 -Median TPOT (ms): 7.96 -P99 TPOT (ms): 8.03 +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 ---------------Inter-token Latency---------------- -Mean ITL (ms): 7.74 -Median ITL (ms): 7.70 -P99 ITL (ms): 8.39 +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 ================================================== ``` @@ -141,7 +142,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you {"prompt": "What is the capital of India?"} {"prompt": "What is the capital of Iran?"} {"prompt": "What is the capital of China?"} -``` +``` ```bash # start server @@ -150,7 +151,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests ```bash # run benchmarking script -python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ +vllm bench serve --port 9001 --save-result --save-detailed \ --backend vllm \ --model meta-llama/Llama-3.1-8B-Instruct \ --endpoint /v1/completions \ @@ -174,7 +175,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -194,7 +195,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ ``` ``` bash -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --model meta-llama/Meta-Llama-3-8B-Instruct \ --dataset-name hf \ --dataset-path likaixin/InstructCoder \ @@ -210,7 +211,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests **`lmms-lab/LLaVA-OneVision-Data`** ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ @@ -224,20 +225,20 @@ python3 vllm/benchmarks/benchmark_serving.py \ **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ --hf-split train \ - --num-prompts 10 + --num-prompts 1000 ``` **`AI-MO/aimo-validation-aime`** ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path AI-MO/aimo-validation-aime \ @@ -248,7 +249,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ **`philschmid/mt-bench`** ``` bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --model Qwen/QwQ-32B \ --dataset-name hf \ --dataset-path philschmid/mt-bench \ @@ -261,7 +262,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command: ```bash -python3 vllm/benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --endpoint /v1/completions \ @@ -296,7 +297,7 @@ The following arguments can be used to control the ramp-up:
```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model NousResearch/Hermes-3-Llama-3.1-8B \ --dataset-name sonnet \ --dataset-path vllm/benchmarks/sonnet.txt \ @@ -314,7 +315,7 @@ Total num output tokens: 1500 **VisionArena Benchmark for Vision Language Models** ``` bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -336,7 +337,7 @@ Total num output tokens: 1280 ``` bash VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_USE_V1=1 \ -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --dataset-name=hf \ --dataset-path=likaixin/InstructCoder \ --model=meta-llama/Meta-Llama-3-8B-Instruct \ @@ -360,7 +361,7 @@ Total num output tokens: 204800 **`lmms-lab/LLaVA-OneVision-Data`** ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -373,7 +374,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ --dataset-name hf \ @@ -385,7 +386,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ **`AI-MO/aimo-validation-aime`** ```bash -python3 benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model Qwen/QwQ-32B \ --backend vllm \ --dataset-name hf \ @@ -399,7 +400,7 @@ python3 benchmarks/benchmark_throughput.py \ ``` bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_throughput.py \ +vllm bench throughput \ --model meta-llama/Llama-2-7b-hf \ --backend vllm \ --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index eaa28ea5c92b..ef7c7c94c883 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. # See details in README (benchmarks/auto_tune/README.md). TAG=$(date +"%Y_%m_%d_%H_%M") @@ -47,7 +47,7 @@ start_server() { local max_num_batched_tokens=$3 local vllm_log=$4 local profile_dir=$5 - + pkill -f vllm VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ @@ -64,9 +64,9 @@ start_server() { # wait for 10 minutes... server_started=0 - for i in {1..60}; do + for i in {1..60}; do RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) - STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break @@ -89,10 +89,10 @@ update_best_profile() { selected_profile_file= if [[ "$SYSTEM" == "TPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" - fi + fi if [[ "$SYSTEM" == "GPU" ]]; then selected_profile_file="${sorted_paths[$profile_index]}" - fi + fi rm -f $PROFILE_PATH/* cp $selected_profile_file $PROFILE_PATH } @@ -120,28 +120,28 @@ run_benchmark() { echo "server started." fi echo - + echo "run benchmark test..." 
meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) adjusted_input_len=$(( INPUT_LEN - prefix_len )) - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ - --ignore-eos \ - --disable-tqdm \ - --request-rate inf \ - --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 1000 \ --random-prefix-len $prefix_len \ + --num-prompts $NUM_PROMPTS \ --port 8004 \ - --profile &> "$bm_log" + --save-result \ + --result-dir $LOG_FOLDER \ + --result-filename bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.json \ + --request-rate inf \ + --ignore-eos \ + 2>&1 | tee $bm_log throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') @@ -160,7 +160,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ @@ -245,4 +245,3 @@ done echo "finish permutations" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" - diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index a5851cfe963d..bd450fb61e00 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -38,13 +38,13 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \ benchmark_serving.py: ```bash -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model meta-llama/Meta-Llama-3-70B \ --dataset-name sharegpt \ --dataset-path sharegpt.json \ --profile \ - --num-prompts 2 + --num-prompts 10 ``` ## Profile with NVIDIA Nsight Systems @@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script: nsys profile -o report.nsys-rep \ --trace-fork-before-exec=true \ --cuda-graph-trace=node \ - python benchmarks/benchmark_latency.py \ +vllm bench latency \ --model meta-llama/Llama-3.1-8B-Instruct \ --num-iters-warmup 5 \ --num-iters 1 \ @@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \ vllm serve meta-llama/Llama-3.1-8B-Instruct # client -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model meta-llama/Llama-3.1-8B-Instruct \ --num-prompts 1 \ @@ -132,7 +132,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p ...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): - Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- ----------- ----------- -------- --------- ----------- ---------------------------------------------------------------------------------------------------- 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of… 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of… @@ -143,7 +143,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p 2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kern… 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, in… 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0… - ... + ... ``` GUI example: diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index 9f6acf3291dd..9d334f8873d9 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica # Detailed Design ## Overall Process -As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: - -1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. -2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. -3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. -4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. -5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. -6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. +As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: + +1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. +2. 
The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. +3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. +4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. +5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. +6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. 7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**. ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7) @@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ ??? console "Command" ```shell - python3 benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model base_model \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \ diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh index 2966f386c93a..8dbadae41007 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh @@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001} PREFILL_GPUS=${PREFILL_GPUS:-0} DECODE_GPUS=${DECODE_GPUS:-1,2,3} PREFILL_PORTS=${PREFILL_PORTS:-20003} -DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} +DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change." 
echo "" @@ -163,7 +163,7 @@ main() { local gpu_id=${PREFILL_GPU_ARRAY[$i]} local port=${PREFILL_PORT_ARRAY[$i]} local kv_port=$((21001 + i)) - + echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \ --enforce-eager \ @@ -192,7 +192,7 @@ main() { local gpu_id=${DECODE_GPU_ARRAY[$i]} local port=${DECODE_PORT_ARRAY[$i]} local kv_port=$((22001 + i)) - + echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ --enforce-eager \ @@ -232,7 +232,7 @@ main() { # Run Benchmark # ============================================================================= cd ../../../benchmarks/ - python3 benchmark_serving.py --port 10001 --seed $(date +%s) \ + vllm bench serve --port 10001 --seed $(date +%s) \ --model $MODEL \ --dataset-name random --random-input-len 7500 --random-output-len 200 \ --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log @@ -242,4 +242,4 @@ main() { cleanup } -main \ No newline at end of file +main diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh index 0b6c9213ebff..1178681f1533 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh @@ -122,7 +122,7 @@ main() { # begin benchmark cd ../../../../benchmarks/ - python3 benchmark_serving.py --port 9000 --seed $(date +%s) \ + vllm bench serve --port 9000 --seed $(date +%s) \ --model meta-llama/Llama-3.1-8B-Instruct \ --dataset-name random --random-input-len 7500 --random-output-len 200 \ --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log @@ -133,4 +133,4 @@ main() { } -main \ No newline at end of file +main diff --git a/vllm_onboarding_guide.md b/vllm_onboarding_guide.md new file mode 100644 index 000000000000..86254182ba57 --- /dev/null +++ b/vllm_onboarding_guide.md @@ -0,0 +1,568 @@ +# vLLM v1 Developer Onboarding Guide + +## 1. 🧭 Overview + +vLLM v1 is a high-performance large language model serving framework designed for **easy, fast, and cheap LLM serving**. It represents a major architectural upgrade from v0 with significant performance improvements and cleaner code organization. + +### Key Features +- **High-throughput serving** with state-of-the-art performance +- **PagedAttention** for efficient memory management of attention key-value pairs +- **Continuous batching** of incoming requests for optimal resource utilization +- **Speculative decoding** and **chunked prefill** for faster inference +- **Multi-modal support** (text, vision, audio) with unified processing +- **Distributed inference** with tensor and pipeline parallelism +- **Prefix caching** for improved efficiency on repeated prompts +- **Multiple hardware support** (NVIDIA GPUs, AMD, Intel, TPU, AWS Neuron) + +### Technologies Used +- **Language**: Python 3.8+ with C++/CUDA kernels +- **Framework**: PyTorch with custom CUDA kernels +- **Distributed**: Ray for multi-node coordination, multiprocessing for local parallelism +- **Memory Management**: Custom block-based KV cache with PagedAttention +- **API**: OpenAI-compatible REST API server +- **Build System**: CMake for C++/CUDA components, setuptools for Python + +--- + +## 2. 
🧱 High-Level Architecture Diagram + +```mermaid +graph TB + subgraph "Client Layer" + API[API Server] + CLI[CLI Interface] + SDK[Python SDK] + end + + subgraph "Engine Layer" + LLMEngine[LLM Engine] + Processor[Input Processor] + OutputProcessor[Output Processor] + end + + subgraph "Core Execution" + EngineCore[Engine Core] + Scheduler[Scheduler] + KVManager[KV Cache Manager] + StructuredOutput[Structured Output Manager] + end + + subgraph "Execution Layer" + Executor[Executor] + subgraph "Workers" + GPUWorker[GPU Worker] + CPUWorker[CPU Worker] + TPUWorker[TPU Worker] + end + ModelRunner[Model Runner] + end + + subgraph "Storage & Cache" + KVCache[(KV Cache Blocks)] + EncoderCache[(Encoder Cache)] + PrefixCache[(Prefix Cache)] + end + + subgraph "External Systems" + HF[Hugging Face Models] + Distributed[Ray Cluster] + Monitoring[Prometheus Metrics] + end + + API --> LLMEngine + CLI --> LLMEngine + SDK --> LLMEngine + + LLMEngine --> Processor + LLMEngine --> OutputProcessor + LLMEngine --> EngineCore + + EngineCore --> Scheduler + EngineCore --> KVManager + EngineCore --> StructuredOutput + + Scheduler --> Executor + Executor --> GPUWorker + Executor --> CPUWorker + Executor --> TPUWorker + + GPUWorker --> ModelRunner + CPUWorker --> ModelRunner + TPUWorker --> ModelRunner + + KVManager --> KVCache + KVManager --> EncoderCache + KVManager --> PrefixCache + + ModelRunner --> HF + Executor --> Distributed + LLMEngine --> Monitoring +``` + +### Component Explanations + +- **LLM Engine**: Main orchestrator that coordinates all components and provides the public API +- **Engine Core**: Core execution engine that manages request lifecycle and coordinates scheduling +- **Scheduler**: Intelligent request scheduler that manages resource allocation and batching decisions +- **KV Cache Manager**: Sophisticated memory manager using PagedAttention for efficient key-value storage +- **Workers**: Hardware-specific execution units that run the actual model inference +- **Executor**: Coordinates distributed execution across multiple workers/nodes + +--- + +## 3. 🔎 Component Breakdown + +### Component: LLM Engine (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/llm_engine.py`) + +**Purpose**: +Main entry point and orchestrator for the entire vLLM system. Provides backward compatibility with v0 API while leveraging v1 architecture improvements. + +**Key Elements**: +- `LLMEngine.__init__()`: Initializes all core components and establishes communication channels +- `add_request()`: Processes and queues new inference requests with validation +- `step()`: Executes one inference iteration, coordinating scheduling and execution +- `abort_request()`: Handles request cancellation and resource cleanup +- `get_tokenizer_group()`: Provides access to tokenization services + +**Depends On**: +- Internal: `EngineCoreClient`, `Processor`, `OutputProcessor`, `Executor` +- External: PyTorch, Hugging Face Transformers, Ray (optional) + +--- + +### Component: Engine Core (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/core.py`) + +**Purpose**: +Core execution engine that manages the request lifecycle, coordinates between scheduler and workers, and handles distributed execution. 
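To make the `add_request()`/`step()` loop that the LLM Engine exposes (and that Engine Core drives internally) concrete, here is a minimal offline driver sketch. The model name is illustrative and exact signatures may differ between vLLM releases, so treat this as a sketch rather than canonical usage:

```python
# Minimal driver loop for the add_request()/step() API described above.
# Model name is illustrative; exact signatures can differ between vLLM releases.
from vllm import EngineArgs, LLMEngine, SamplingParams


def main() -> None:
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(temperature=0.8, max_tokens=32)

    # Queue a couple of requests; the engine batches them internally.
    engine.add_request("req-0", "Hello, my name is", params)
    engine.add_request("req-1", "The capital of France is", params)

    # Each step() schedules one iteration and returns per-request outputs.
    while engine.has_unfinished_requests():
        for output in engine.step():
            if output.finished:
                print(output.request_id, output.outputs[0].text)


if __name__ == "__main__":
    main()
```
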
+ +**Key Elements**: +- `EngineCore.add_request()`: Validates and queues requests for scheduling +- `EngineCore.get_output()`: Retrieves completed inference results +- `EngineCore.abort_requests()`: Handles request cancellation +- `EngineCoreClient`: Client interface for multiprocess communication + +**Depends On**: +- Internal: `Scheduler`, `Executor`, `ModelRunner` +- External: Multiprocessing, asyncio + +--- + +### Component: Scheduler (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/scheduler.py`) + +**Purpose**: +Intelligent request scheduler that makes optimal batching decisions, manages resource allocation, and handles advanced features like speculative decoding and prefix caching. + +**Key Elements**: +- `Scheduler.schedule()`: Core scheduling algorithm that batches requests optimally +- `_try_schedule_encoder_inputs()`: Handles multi-modal input scheduling +- `update_from_output()`: Processes model outputs and updates request states +- `_make_cached_request_data()`: Optimizes data structures for cached requests + +**Depends On**: +- Internal: `KVCacheManager`, `StructuredOutputManager`, `RequestQueue` +- External: None (pure Python logic) + +--- + +### Component: KV Cache Manager (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/kv_cache_manager.py`) + +**Purpose**: +Sophisticated memory management system implementing PagedAttention for efficient key-value cache storage and retrieval. + +**Key Elements**: +- `KVCacheManager.allocate_slots()`: Allocates memory blocks for new requests +- `get_computed_blocks()`: Retrieves cached computation results +- `free()`: Releases memory blocks when requests complete +- `cache_blocks()`: Implements prefix caching for repeated prompts + +**Depends On**: +- Internal: `BlockPool`, `KVCacheUtils` +- External: PyTorch tensors + +--- + +### Component: Workers (`/data/users/yeq/gitrepos/vllm/vllm/v1/worker/`) + +**Purpose**: +Hardware-specific execution units that perform the actual model inference on different accelerators. + +**Key Elements**: +- `GPUWorker`: NVIDIA GPU-optimized execution with CUDA kernels +- `CPUWorker`: CPU-based inference for cost-effective serving +- `TPUWorker`: Google TPU integration for specialized workloads +- `ModelRunner`: Coordinates model execution and batch processing + +**Depends On**: +- Internal: `InputBatch`, `BlockTable`, model loading utilities +- External: PyTorch, hardware-specific libraries (CUDA, TPU) + +--- + +### Component: Executors (`/data/users/yeq/gitrepos/vllm/vllm/v1/executor/`) + +**Purpose**: +Coordinates distributed execution across multiple workers and handles different parallelism strategies. + +**Key Elements**: +- `MultiprocessExecutor`: Local multi-GPU execution +- `RayDistributedExecutor`: Multi-node distributed execution via Ray +- `AbstractExecutor`: Base interface for all execution strategies + +**Depends On**: +- Internal: `Worker` implementations +- External: Ray (for distributed), multiprocessing + +--- + +### Component: Request Processing (`/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`, `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py`) + +**Purpose**: +Handles request lifecycle management, input validation, and output formatting. 
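As a mental model for the lifecycle bookkeeping that `Request` and `RequestStatus` provide (detailed in the data-model section below), the following deliberately simplified sketch shows the kind of state a request carries; it is not the real vLLM classes, which track many more fields:

```python
# Deliberately simplified sketch of the lifecycle bookkeeping described above;
# the real vllm.v1.request.Request / RequestStatus have more states and fields
# (speculative tokens, multi-modal inputs, prefix-cache counters, ...).
from __future__ import annotations

import enum
import time
from dataclasses import dataclass, field


class Status(enum.Enum):
    WAITING = enum.auto()                 # queued for scheduling
    RUNNING = enum.auto()                 # actively being processed
    PREEMPTED = enum.auto()               # paused to make room for other requests
    FINISHED_STOPPED = enum.auto()        # hit a stop token/string
    FINISHED_LENGTH_CAPPED = enum.auto()  # hit max_tokens


@dataclass
class MiniRequest:
    request_id: str
    prompt_token_ids: list[int]
    max_tokens: int
    status: Status = Status.WAITING
    num_computed_tokens: int = 0
    output_token_ids: list[int] = field(default_factory=list)
    arrival_time: float = field(default_factory=time.time)

    def append_output(self, token_id: int, stop_token_id: int) -> None:
        """Record one generated token and update the lifecycle state."""
        self.output_token_ids.append(token_id)
        self.num_computed_tokens += 1
        if token_id == stop_token_id:
            self.status = Status.FINISHED_STOPPED
        elif len(self.output_token_ids) >= self.max_tokens:
            self.status = Status.FINISHED_LENGTH_CAPPED
```
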
+ +**Key Elements**: +- `Request`: Core request data structure with state management +- `RequestStatus`: Enum tracking request lifecycle states +- `ModelRunnerOutput`: Structured output from model execution +- `SamplerOutput`: Token sampling results with logprobs + +**Depends On**: +- Internal: Sampling/pooling parameters, multi-modal inputs +- External: PyTorch tensors + +--- + +## 4. 🔁 Data Flow & Call Flow Examples + +### Example Flow: Single Request Processing + +**Description**: +A client submits a text generation request that goes through the complete vLLM pipeline from input processing to response generation. + +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client + participant LLMEngine + participant Processor + participant EngineCore + participant Scheduler + participant KVManager + participant Executor + participant Worker + participant ModelRunner + + Client->>LLMEngine: add_request(prompt, sampling_params) + LLMEngine->>Processor: process_inputs(prompt, params) + Processor-->>LLMEngine: EngineCoreRequest + LLMEngine->>EngineCore: add_request(core_request) + EngineCore->>Scheduler: add_request(request) + + Note over Scheduler: Request queued in WAITING state + + Client->>LLMEngine: step() - Execute inference + LLMEngine->>EngineCore: get_output() + EngineCore->>Scheduler: schedule() + + Scheduler->>KVManager: allocate_slots(request, num_tokens) + KVManager-->>Scheduler: allocated_blocks + Scheduler-->>EngineCore: SchedulerOutput + + EngineCore->>Executor: execute_model(scheduler_output) + Executor->>Worker: execute_model_async(model_input) + Worker->>ModelRunner: execute_model(model_input) + + Note over ModelRunner: Forward pass through transformer + + ModelRunner-->>Worker: ModelRunnerOutput + Worker-->>Executor: ModelRunnerOutput + Executor-->>EngineCore: ModelRunnerOutput + + EngineCore->>Scheduler: update_from_output(output) + Scheduler-->>EngineCore: EngineCoreOutputs + EngineCore-->>LLMEngine: EngineCoreOutputs + + LLMEngine->>LLMEngine: output_processor.process_outputs() + LLMEngine-->>Client: RequestOutput +``` + +--- + +### Example Flow: Batched Request Processing + +**Description**: +Multiple requests are intelligently batched together for efficient GPU utilization, demonstrating vLLM's continuous batching capabilities. 
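Before the sequence diagram, here is a deliberately simplified illustration of the batching decision: admit waiting requests in arrival order until a per-step token budget or the sequence limit is exhausted. This helper is a sketch only; the real `Scheduler.schedule()` additionally handles preemption, KV-cache block allocation, chunked prefill, and prefix-cache hits.

```python
# Simplified illustration of the batching decision shown in the diagram below:
# admit waiting requests in arrival order until the per-step token budget or the
# sequence limit is exhausted.
from collections import deque


def select_requests_for_batch(
    waiting: deque,          # (request_id, num_prompt_tokens) awaiting prefill
    running: list,           # request ids currently decoding (1 token each)
    max_num_batched_tokens: int,
    max_num_seqs: int,
):
    # Running requests each need one decode token this step.
    scheduled = {req_id: 1 for req_id in running}
    budget = max_num_batched_tokens - len(running)

    while waiting and len(scheduled) < max_num_seqs:
        req_id, prompt_tokens = waiting[0]
        if prompt_tokens > budget:
            break  # keep FCFS order: do not skip ahead of this request
        waiting.popleft()
        scheduled[req_id] = prompt_tokens
        budget -= prompt_tokens
    return scheduled


# Two decodes plus the 1500-token prefill fit in a 4096-token budget;
# the 3000-token prefill waits for a later step.
print(select_requests_for_batch(
    deque([("req-2", 1500), ("req-3", 3000)]), ["req-0", "req-1"], 4096, 8))
```
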
+ +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client1 + participant Client2 + participant Client3 + participant LLMEngine + participant Scheduler + participant KVManager + participant Worker + + Client1->>LLMEngine: add_request(req1) + Client2->>LLMEngine: add_request(req2) + Client3->>LLMEngine: add_request(req3) + + Note over LLMEngine: Multiple requests queued + + LLMEngine->>Scheduler: schedule() + + Note over Scheduler: Batch optimization logic + Scheduler->>Scheduler: calculate_token_budget() + Scheduler->>Scheduler: select_requests_for_batch() + + loop For each selected request + Scheduler->>KVManager: allocate_slots(request) + KVManager-->>Scheduler: blocks_allocated + end + + Scheduler-->>LLMEngine: SchedulerOutput(batched_requests) + + LLMEngine->>Worker: execute_model(batch) + + Note over Worker: Single forward pass for all requests + + Worker-->>LLMEngine: ModelRunnerOutput(batch_results) + + LLMEngine->>LLMEngine: split_batch_outputs() + LLMEngine-->>Client1: RequestOutput(req1_result) + LLMEngine-->>Client2: RequestOutput(req2_result) + LLMEngine-->>Client3: RequestOutput(req3_result) +``` + +--- + +### Example Flow: Prefix Caching Hit + +**Description**: +A request benefits from prefix caching when its prompt shares a common prefix with a previously processed request. + +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client + participant LLMEngine + participant Scheduler + participant KVManager + participant PrefixCache + + Client->>LLMEngine: add_request("Explain quantum physics...") + LLMEngine->>Scheduler: schedule() + + Scheduler->>KVManager: get_computed_blocks(request) + KVManager->>PrefixCache: lookup_prefix_hash(prompt_tokens) + + alt Cache Hit + PrefixCache-->>KVManager: cached_blocks(num_cached_tokens=50) + KVManager-->>Scheduler: computed_blocks + cache_info + + Note over Scheduler: Skip computation for cached tokens + Scheduler->>Scheduler: schedule_remaining_tokens(total-cached) + + else Cache Miss + PrefixCache-->>KVManager: no_cache_found + KVManager-->>Scheduler: empty_blocks + + Note over Scheduler: Full computation required + Scheduler->>Scheduler: schedule_all_tokens() + end + + Scheduler-->>LLMEngine: SchedulerOutput + Note over LLMEngine: Execution continues with optimized token count +``` + +--- + +### Example Flow: Multi-Modal Request Processing + +**Description**: +Processing a request that includes both text and image inputs, demonstrating vLLM's multi-modal capabilities. 
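From the client's perspective, this flow is triggered by an ordinary OpenAI-style chat request that mixes text and image parts. A hedged sketch using the `openai` client, where the server URL, model name, and image URL are placeholders and a vision model is assumed to be already serving:

```python
# Client-side view of the multi-modal flow below, assuming a vLLM
# OpenAI-compatible server is already running a vision model, e.g.:
#   vllm serve Qwen/Qwen2-VL-7B-Instruct
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in one sentence."},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }],
    max_tokens=64,
)
print(response.choices[0].message.content)
```
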
+ +**Sequence Diagram**: + +```mermaid +sequenceDiagram + participant Client + participant LLMEngine + participant Processor + participant Scheduler + participant EncoderCache + participant Worker + participant VisionEncoder + + Client->>LLMEngine: add_request(text="Describe image", image=img_data) + LLMEngine->>Processor: process_inputs(multimodal_input) + + Processor->>Processor: tokenize_text() + Processor->>Processor: process_image_placeholders() + Processor-->>LLMEngine: Request(mm_inputs, mm_positions) + + LLMEngine->>Scheduler: schedule() + Scheduler->>Scheduler: _try_schedule_encoder_inputs() + + alt Encoder Input Needed + Scheduler->>EncoderCache: can_allocate(request, input_id) + EncoderCache-->>Scheduler: cache_available + + Scheduler->>EncoderCache: allocate(request, input_id) + Scheduler-->>LLMEngine: SchedulerOutput(encoder_inputs=[0]) + + LLMEngine->>Worker: execute_model(scheduler_output) + Worker->>VisionEncoder: encode_image(image_data) + VisionEncoder-->>Worker: image_embeddings + + Worker->>Worker: merge_text_image_embeddings() + Worker-->>LLMEngine: ModelRunnerOutput + + else Encoder Cached + Scheduler->>EncoderCache: get_cached_embeddings() + EncoderCache-->>Scheduler: cached_embeddings + Note over Scheduler: Skip encoder computation + end + + LLMEngine-->>Client: RequestOutput(generated_text) +``` + +--- + +## 5. 🗃️ Data Models (Entities) + +### Entity: Request + +- **Class**: `Request` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py` +- **Fields**: + - `request_id: str` – unique identifier for the request + - `prompt_token_ids: list[int]` – tokenized input prompt + - `sampling_params: SamplingParams` – generation parameters (temperature, top_p, etc.) + - `pooling_params: PoolingParams` – for embedding/pooling requests + - `status: RequestStatus` – current lifecycle state (WAITING, RUNNING, FINISHED_*) + - `num_computed_tokens: int` – number of tokens already processed + - `max_tokens: int` – maximum tokens to generate + - `arrival_time: float` – timestamp when request was received + - `priority: int` – scheduling priority (higher = more important) + +- **Relations**: + - Contains `MultiModalKwargs` for vision/audio inputs + - References `LoRARequest` for adapter-specific inference + - Links to `StructuredOutputRequest` for guided generation + +- **Notes**: + - Immutable token lists use `ConstantList` wrapper for safety + - Supports speculative decoding with `spec_token_ids` + - Tracks prefix cache hits via `num_cached_tokens` + +--- + +### Entity: RequestStatus + +- **Enum**: `RequestStatus` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py` +- **Values**: + - `WAITING` – queued for scheduling + - `WAITING_FOR_FSM` – waiting for structured output compilation + - `WAITING_FOR_REMOTE_KVS` – waiting for distributed KV transfer + - `RUNNING` – actively being processed + - `PREEMPTED` – temporarily paused for higher priority requests + - `FINISHED_STOPPED` – completed normally (stop token/string) + - `FINISHED_LENGTH_CAPPED` – completed due to max length + - `FINISHED_ABORTED` – cancelled by client + - `FINISHED_IGNORED` – rejected due to constraints + +- **Relations**: + - Maps to `FinishReason` enum for API compatibility + - Used by scheduler for state transitions + +- **Notes**: + - States > PREEMPTED are considered finished + - Supports graceful degradation and error handling + +--- + +### Entity: ModelRunnerOutput + +- **Class**: `ModelRunnerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py` +- **Fields**: + - `req_ids: list[str]` – request 
identifiers in batch order + - `req_id_to_index: dict[str, int]` – mapping for efficient lookup + - `sampled_token_ids: list[list[int]]` – generated tokens per request + - `spec_token_ids: list[list[int]]` – speculative tokens (if enabled) + - `logprobs: LogprobsLists` – token probabilities for each request + - `prompt_logprobs_dict: dict[str, LogprobsTensors]` – prompt token probabilities + - `pooler_output: list[torch.Tensor]` – embeddings for pooling requests + +- **Relations**: + - Consumed by `Scheduler.update_from_output()` + - Converted to `RequestOutput` by `OutputProcessor` + +- **Notes**: + - Uses lists instead of tensors for efficient serialization + - Supports variable-length outputs per request in batch + +--- + +### Entity: SchedulerOutput + +- **Class**: `SchedulerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/output.py` +- **Fields**: + - `scheduled_new_reqs: list[NewRequestData]` – first-time scheduled requests + - `scheduled_cached_reqs: CachedRequestData` – continuing requests + - `num_scheduled_tokens: dict[str, int]` – tokens per request this step + - `total_num_scheduled_tokens: int` – total batch size + - `scheduled_encoder_inputs: dict[str, list[int]]` – multi-modal inputs to process + - `num_common_prefix_blocks: list[int]` – shared prefix optimization data + +- **Relations**: + - Produced by `Scheduler.schedule()` + - Consumed by `Executor.execute_model()` + +- **Notes**: + - Optimizes memory layout for different request types + - Includes metadata for advanced features (speculative decoding, prefix caching) + +--- + +### Entity: KVCacheConfig + +- **Class**: `KVCacheConfig` in `/data/users/yeq/gitrepos/vllm/vllm/v1/kv_cache_interface.py` +- **Fields**: + - `block_size: int` – tokens per memory block (typically 16) + - `num_gpu_blocks: int` – total GPU memory blocks available + - `num_cpu_blocks: int` – CPU memory blocks for offloading + - `cache_dtype: torch.dtype` – data type for cache storage + - `kv_cache_groups: list[KVCacheGroup]` – cache organization + +- **Relations**: + - Used by `KVCacheManager` for memory allocation + - Configured based on model and hardware constraints + +- **Notes**: + - Block-based design enables efficient memory management + - Supports heterogeneous memory hierarchies (GPU/CPU) + +--- + +### Entity: SamplingParams + +- **Class**: `SamplingParams` in `vllm/sampling_params.py` +- **Fields**: + - `n: int` – number of output sequences to generate + - `max_tokens: int` – maximum tokens to generate + - `temperature: float` – sampling randomness (0.0 = deterministic) + - `top_p: float` – nucleus sampling threshold + - `top_k: int` – top-k sampling limit + - `stop: list[str]` – stop strings to terminate generation + - `logprobs: int` – number of log probabilities to return + +- **Relations**: + - Embedded in `Request` objects + - Used by sampling kernels during generation + +- **Notes**: + - Supports advanced sampling strategies (beam search, parallel sampling) + - Extensible for custom sampling algorithms# vLLM Developer Onboarding Guide From cd501306f964501394c5ab5a4574fa6a1cd09a8b Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Mon, 21 Jul 2025 23:44:27 -0700 Subject: [PATCH 2/6] remove unneeded files Signed-off-by: Ye (Charlotte) Qi --- vllm_onboarding_guide.md | 568 --------------------------------------- 1 file changed, 568 deletions(-) delete mode 100644 vllm_onboarding_guide.md diff --git a/vllm_onboarding_guide.md b/vllm_onboarding_guide.md deleted file mode 100644 index 86254182ba57..000000000000 --- 
a/vllm_onboarding_guide.md +++ /dev/null @@ -1,568 +0,0 @@ -# vLLM v1 Developer Onboarding Guide - -## 1. 🧭 Overview - -vLLM v1 is a high-performance large language model serving framework designed for **easy, fast, and cheap LLM serving**. It represents a major architectural upgrade from v0 with significant performance improvements and cleaner code organization. - -### Key Features -- **High-throughput serving** with state-of-the-art performance -- **PagedAttention** for efficient memory management of attention key-value pairs -- **Continuous batching** of incoming requests for optimal resource utilization -- **Speculative decoding** and **chunked prefill** for faster inference -- **Multi-modal support** (text, vision, audio) with unified processing -- **Distributed inference** with tensor and pipeline parallelism -- **Prefix caching** for improved efficiency on repeated prompts -- **Multiple hardware support** (NVIDIA GPUs, AMD, Intel, TPU, AWS Neuron) - -### Technologies Used -- **Language**: Python 3.8+ with C++/CUDA kernels -- **Framework**: PyTorch with custom CUDA kernels -- **Distributed**: Ray for multi-node coordination, multiprocessing for local parallelism -- **Memory Management**: Custom block-based KV cache with PagedAttention -- **API**: OpenAI-compatible REST API server -- **Build System**: CMake for C++/CUDA components, setuptools for Python - ---- - -## 2. 🧱 High-Level Architecture Diagram - -```mermaid -graph TB - subgraph "Client Layer" - API[API Server] - CLI[CLI Interface] - SDK[Python SDK] - end - - subgraph "Engine Layer" - LLMEngine[LLM Engine] - Processor[Input Processor] - OutputProcessor[Output Processor] - end - - subgraph "Core Execution" - EngineCore[Engine Core] - Scheduler[Scheduler] - KVManager[KV Cache Manager] - StructuredOutput[Structured Output Manager] - end - - subgraph "Execution Layer" - Executor[Executor] - subgraph "Workers" - GPUWorker[GPU Worker] - CPUWorker[CPU Worker] - TPUWorker[TPU Worker] - end - ModelRunner[Model Runner] - end - - subgraph "Storage & Cache" - KVCache[(KV Cache Blocks)] - EncoderCache[(Encoder Cache)] - PrefixCache[(Prefix Cache)] - end - - subgraph "External Systems" - HF[Hugging Face Models] - Distributed[Ray Cluster] - Monitoring[Prometheus Metrics] - end - - API --> LLMEngine - CLI --> LLMEngine - SDK --> LLMEngine - - LLMEngine --> Processor - LLMEngine --> OutputProcessor - LLMEngine --> EngineCore - - EngineCore --> Scheduler - EngineCore --> KVManager - EngineCore --> StructuredOutput - - Scheduler --> Executor - Executor --> GPUWorker - Executor --> CPUWorker - Executor --> TPUWorker - - GPUWorker --> ModelRunner - CPUWorker --> ModelRunner - TPUWorker --> ModelRunner - - KVManager --> KVCache - KVManager --> EncoderCache - KVManager --> PrefixCache - - ModelRunner --> HF - Executor --> Distributed - LLMEngine --> Monitoring -``` - -### Component Explanations - -- **LLM Engine**: Main orchestrator that coordinates all components and provides the public API -- **Engine Core**: Core execution engine that manages request lifecycle and coordinates scheduling -- **Scheduler**: Intelligent request scheduler that manages resource allocation and batching decisions -- **KV Cache Manager**: Sophisticated memory manager using PagedAttention for efficient key-value storage -- **Workers**: Hardware-specific execution units that run the actual model inference -- **Executor**: Coordinates distributed execution across multiple workers/nodes - ---- - -## 3. 
🔎 Component Breakdown - -### Component: LLM Engine (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/llm_engine.py`) - -**Purpose**: -Main entry point and orchestrator for the entire vLLM system. Provides backward compatibility with v0 API while leveraging v1 architecture improvements. - -**Key Elements**: -- `LLMEngine.__init__()`: Initializes all core components and establishes communication channels -- `add_request()`: Processes and queues new inference requests with validation -- `step()`: Executes one inference iteration, coordinating scheduling and execution -- `abort_request()`: Handles request cancellation and resource cleanup -- `get_tokenizer_group()`: Provides access to tokenization services - -**Depends On**: -- Internal: `EngineCoreClient`, `Processor`, `OutputProcessor`, `Executor` -- External: PyTorch, Hugging Face Transformers, Ray (optional) - ---- - -### Component: Engine Core (`/data/users/yeq/gitrepos/vllm/vllm/v1/engine/core.py`) - -**Purpose**: -Core execution engine that manages the request lifecycle, coordinates between scheduler and workers, and handles distributed execution. - -**Key Elements**: -- `EngineCore.add_request()`: Validates and queues requests for scheduling -- `EngineCore.get_output()`: Retrieves completed inference results -- `EngineCore.abort_requests()`: Handles request cancellation -- `EngineCoreClient`: Client interface for multiprocess communication - -**Depends On**: -- Internal: `Scheduler`, `Executor`, `ModelRunner` -- External: Multiprocessing, asyncio - ---- - -### Component: Scheduler (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/scheduler.py`) - -**Purpose**: -Intelligent request scheduler that makes optimal batching decisions, manages resource allocation, and handles advanced features like speculative decoding and prefix caching. - -**Key Elements**: -- `Scheduler.schedule()`: Core scheduling algorithm that batches requests optimally -- `_try_schedule_encoder_inputs()`: Handles multi-modal input scheduling -- `update_from_output()`: Processes model outputs and updates request states -- `_make_cached_request_data()`: Optimizes data structures for cached requests - -**Depends On**: -- Internal: `KVCacheManager`, `StructuredOutputManager`, `RequestQueue` -- External: None (pure Python logic) - ---- - -### Component: KV Cache Manager (`/data/users/yeq/gitrepos/vllm/vllm/v1/core/kv_cache_manager.py`) - -**Purpose**: -Sophisticated memory management system implementing PagedAttention for efficient key-value cache storage and retrieval. - -**Key Elements**: -- `KVCacheManager.allocate_slots()`: Allocates memory blocks for new requests -- `get_computed_blocks()`: Retrieves cached computation results -- `free()`: Releases memory blocks when requests complete -- `cache_blocks()`: Implements prefix caching for repeated prompts - -**Depends On**: -- Internal: `BlockPool`, `KVCacheUtils` -- External: PyTorch tensors - ---- - -### Component: Workers (`/data/users/yeq/gitrepos/vllm/vllm/v1/worker/`) - -**Purpose**: -Hardware-specific execution units that perform the actual model inference on different accelerators. 
- -**Key Elements**: -- `GPUWorker`: NVIDIA GPU-optimized execution with CUDA kernels -- `CPUWorker`: CPU-based inference for cost-effective serving -- `TPUWorker`: Google TPU integration for specialized workloads -- `ModelRunner`: Coordinates model execution and batch processing - -**Depends On**: -- Internal: `InputBatch`, `BlockTable`, model loading utilities -- External: PyTorch, hardware-specific libraries (CUDA, TPU) - ---- - -### Component: Executors (`/data/users/yeq/gitrepos/vllm/vllm/v1/executor/`) - -**Purpose**: -Coordinates distributed execution across multiple workers and handles different parallelism strategies. - -**Key Elements**: -- `MultiprocessExecutor`: Local multi-GPU execution -- `RayDistributedExecutor`: Multi-node distributed execution via Ray -- `AbstractExecutor`: Base interface for all execution strategies - -**Depends On**: -- Internal: `Worker` implementations -- External: Ray (for distributed), multiprocessing - ---- - -### Component: Request Processing (`/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`, `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py`) - -**Purpose**: -Handles request lifecycle management, input validation, and output formatting. - -**Key Elements**: -- `Request`: Core request data structure with state management -- `RequestStatus`: Enum tracking request lifecycle states -- `ModelRunnerOutput`: Structured output from model execution -- `SamplerOutput`: Token sampling results with logprobs - -**Depends On**: -- Internal: Sampling/pooling parameters, multi-modal inputs -- External: PyTorch tensors - ---- - -## 4. 🔁 Data Flow & Call Flow Examples - -### Example Flow: Single Request Processing - -**Description**: -A client submits a text generation request that goes through the complete vLLM pipeline from input processing to response generation. - -**Sequence Diagram**: - -```mermaid -sequenceDiagram - participant Client - participant LLMEngine - participant Processor - participant EngineCore - participant Scheduler - participant KVManager - participant Executor - participant Worker - participant ModelRunner - - Client->>LLMEngine: add_request(prompt, sampling_params) - LLMEngine->>Processor: process_inputs(prompt, params) - Processor-->>LLMEngine: EngineCoreRequest - LLMEngine->>EngineCore: add_request(core_request) - EngineCore->>Scheduler: add_request(request) - - Note over Scheduler: Request queued in WAITING state - - Client->>LLMEngine: step() - Execute inference - LLMEngine->>EngineCore: get_output() - EngineCore->>Scheduler: schedule() - - Scheduler->>KVManager: allocate_slots(request, num_tokens) - KVManager-->>Scheduler: allocated_blocks - Scheduler-->>EngineCore: SchedulerOutput - - EngineCore->>Executor: execute_model(scheduler_output) - Executor->>Worker: execute_model_async(model_input) - Worker->>ModelRunner: execute_model(model_input) - - Note over ModelRunner: Forward pass through transformer - - ModelRunner-->>Worker: ModelRunnerOutput - Worker-->>Executor: ModelRunnerOutput - Executor-->>EngineCore: ModelRunnerOutput - - EngineCore->>Scheduler: update_from_output(output) - Scheduler-->>EngineCore: EngineCoreOutputs - EngineCore-->>LLMEngine: EngineCoreOutputs - - LLMEngine->>LLMEngine: output_processor.process_outputs() - LLMEngine-->>Client: RequestOutput -``` - ---- - -### Example Flow: Batched Request Processing - -**Description**: -Multiple requests are intelligently batched together for efficient GPU utilization, demonstrating vLLM's continuous batching capabilities. 
-
---
-
-### Example Flow: Batched Request Processing
-
-**Description**:
-Multiple requests are intelligently batched together for efficient GPU utilization, demonstrating vLLM's continuous batching capabilities.
-
-**Sequence Diagram**:
-
-```mermaid
-sequenceDiagram
-    participant Client1
-    participant Client2
-    participant Client3
-    participant LLMEngine
-    participant Scheduler
-    participant KVManager
-    participant Worker
-
-    Client1->>LLMEngine: add_request(req1)
-    Client2->>LLMEngine: add_request(req2)
-    Client3->>LLMEngine: add_request(req3)
-
-    Note over LLMEngine: Multiple requests queued
-
-    LLMEngine->>Scheduler: schedule()
-
-    Note over Scheduler: Batch optimization logic
-    Scheduler->>Scheduler: calculate_token_budget()
-    Scheduler->>Scheduler: select_requests_for_batch()
-
-    loop For each selected request
-        Scheduler->>KVManager: allocate_slots(request)
-        KVManager-->>Scheduler: blocks_allocated
-    end
-
-    Scheduler-->>LLMEngine: SchedulerOutput(batched_requests)
-
-    LLMEngine->>Worker: execute_model(batch)
-
-    Note over Worker: Single forward pass for all requests
-
-    Worker-->>LLMEngine: ModelRunnerOutput(batch_results)
-
-    LLMEngine->>LLMEngine: split_batch_outputs()
-    LLMEngine-->>Client1: RequestOutput(req1_result)
-    LLMEngine-->>Client2: RequestOutput(req2_result)
-    LLMEngine-->>Client3: RequestOutput(req3_result)
-```
-
---
-
-### Example Flow: Prefix Caching Hit
-
-**Description**:
-A request benefits from prefix caching when its prompt shares a common prefix with a previously processed request.
-
-**Sequence Diagram**:
-
-```mermaid
-sequenceDiagram
-    participant Client
-    participant LLMEngine
-    participant Scheduler
-    participant KVManager
-    participant PrefixCache
-
-    Client->>LLMEngine: add_request("Explain quantum physics...")
-    LLMEngine->>Scheduler: schedule()
-
-    Scheduler->>KVManager: get_computed_blocks(request)
-    KVManager->>PrefixCache: lookup_prefix_hash(prompt_tokens)
-
-    alt Cache Hit
-        PrefixCache-->>KVManager: cached_blocks(num_cached_tokens=50)
-        KVManager-->>Scheduler: computed_blocks + cache_info
-
-        Note over Scheduler: Skip computation for cached tokens
-        Scheduler->>Scheduler: schedule_remaining_tokens(total-cached)
-
-    else Cache Miss
-        PrefixCache-->>KVManager: no_cache_found
-        KVManager-->>Scheduler: empty_blocks
-
-        Note over Scheduler: Full computation required
-        Scheduler->>Scheduler: schedule_all_tokens()
-    end
-
-    Scheduler-->>LLMEngine: SchedulerOutput
-    Note over LLMEngine: Execution continues with optimized token count
-```
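
One hedged way to exercise the cache-hit path above is to enable prefix caching on the engine and send prompts that share a long prefix; a minimal sketch (the model name is again a placeholder):

```python
from vllm import LLM, SamplingParams

# The second prompt should reuse KV blocks computed for the shared prefix.
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enable_prefix_caching=True)

shared_prefix = "You are a physics tutor. Explain the following concept simply:\n"
prompts = [shared_prefix + "quantum entanglement", shared_prefix + "wave-particle duality"]

for out in llm.generate(prompts, SamplingParams(max_tokens=64)):
    print(out.outputs[0].text)
```
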
-
---
-
-### Example Flow: Multi-Modal Request Processing
-
-**Description**:
-Processing a request that includes both text and image inputs, demonstrating vLLM's multi-modal capabilities.
-
-**Sequence Diagram**:
-
-```mermaid
-sequenceDiagram
-    participant Client
-    participant LLMEngine
-    participant Processor
-    participant Scheduler
-    participant EncoderCache
-    participant Worker
-    participant VisionEncoder
-
-    Client->>LLMEngine: add_request(text="Describe image", image=img_data)
-    LLMEngine->>Processor: process_inputs(multimodal_input)
-
-    Processor->>Processor: tokenize_text()
-    Processor->>Processor: process_image_placeholders()
-    Processor-->>LLMEngine: Request(mm_inputs, mm_positions)
-
-    LLMEngine->>Scheduler: schedule()
-    Scheduler->>Scheduler: _try_schedule_encoder_inputs()
-
-    alt Encoder Input Needed
-        Scheduler->>EncoderCache: can_allocate(request, input_id)
-        EncoderCache-->>Scheduler: cache_available
-
-        Scheduler->>EncoderCache: allocate(request, input_id)
-        Scheduler-->>LLMEngine: SchedulerOutput(encoder_inputs=[0])
-
-        LLMEngine->>Worker: execute_model(scheduler_output)
-        Worker->>VisionEncoder: encode_image(image_data)
-        VisionEncoder-->>Worker: image_embeddings
-
-        Worker->>Worker: merge_text_image_embeddings()
-        Worker-->>LLMEngine: ModelRunnerOutput
-
-    else Encoder Cached
-        Scheduler->>EncoderCache: get_cached_embeddings()
-        EncoderCache-->>Scheduler: cached_embeddings
-        Note over Scheduler: Skip encoder computation
-    end
-
-    LLMEngine-->>Client: RequestOutput(generated_text)
-```
-
---
-
-## 5. 🗃️ Data Models (Entities)
-
-### Entity: Request
-
-- **Class**: `Request` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`
-- **Fields**:
-  - `request_id: str` – unique identifier for the request
-  - `prompt_token_ids: list[int]` – tokenized input prompt
-  - `sampling_params: SamplingParams` – generation parameters (temperature, top_p, etc.)
-  - `pooling_params: PoolingParams` – for embedding/pooling requests
-  - `status: RequestStatus` – current lifecycle state (WAITING, RUNNING, FINISHED_*)
-  - `num_computed_tokens: int` – number of tokens already processed
-  - `max_tokens: int` – maximum tokens to generate
-  - `arrival_time: float` – timestamp when request was received
-  - `priority: int` – scheduling priority (higher = more important)
-
-- **Relations**:
-  - Contains `MultiModalKwargs` for vision/audio inputs
-  - References `LoRARequest` for adapter-specific inference
-  - Links to `StructuredOutputRequest` for guided generation
-
-- **Notes**:
-  - Immutable token lists use `ConstantList` wrapper for safety
-  - Supports speculative decoding with `spec_token_ids`
-  - Tracks prefix cache hits via `num_cached_tokens`
-
---
-
-### Entity: RequestStatus
-
-- **Enum**: `RequestStatus` in `/data/users/yeq/gitrepos/vllm/vllm/v1/request.py`
-- **Values**:
-  - `WAITING` – queued for scheduling
-  - `WAITING_FOR_FSM` – waiting for structured output compilation
-  - `WAITING_FOR_REMOTE_KVS` – waiting for distributed KV transfer
-  - `RUNNING` – actively being processed
-  - `PREEMPTED` – temporarily paused for higher priority requests
-  - `FINISHED_STOPPED` – completed normally (stop token/string)
-  - `FINISHED_LENGTH_CAPPED` – completed due to max length
-  - `FINISHED_ABORTED` – cancelled by client
-  - `FINISHED_IGNORED` – rejected due to constraints
-
-- **Relations**:
-  - Maps to `FinishReason` enum for API compatibility
-  - Used by scheduler for state transitions
-
-- **Notes**:
-  - States > PREEMPTED are considered finished
-  - Supports graceful degradation and error handling
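
The ordering-based terminal check described in the notes can be sketched as follows; the numeric values below only mirror the description above and are not the actual definitions in `request.py`:

```python
from enum import IntEnum

class RequestStatus(IntEnum):
    # Illustrative ordering: everything after PREEMPTED is a terminal state.
    WAITING = 0
    WAITING_FOR_FSM = 1
    WAITING_FOR_REMOTE_KVS = 2
    RUNNING = 3
    PREEMPTED = 4
    FINISHED_STOPPED = 5
    FINISHED_LENGTH_CAPPED = 6
    FINISHED_ABORTED = 7
    FINISHED_IGNORED = 8

def is_finished(status: RequestStatus) -> bool:
    # Any state ordered after PREEMPTED is considered finished.
    return status > RequestStatus.PREEMPTED

assert is_finished(RequestStatus.FINISHED_ABORTED)
assert not is_finished(RequestStatus.RUNNING)
```
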
-
---
-
-### Entity: ModelRunnerOutput
-
-- **Class**: `ModelRunnerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/outputs.py`
-- **Fields**:
-  - `req_ids: list[str]` – request identifiers in batch order
-  - `req_id_to_index: dict[str, int]` – mapping for efficient lookup
-  - `sampled_token_ids: list[list[int]]` – generated tokens per request
-  - `spec_token_ids: list[list[int]]` – speculative tokens (if enabled)
-  - `logprobs: LogprobsLists` – token probabilities for each request
-  - `prompt_logprobs_dict: dict[str, LogprobsTensors]` – prompt token probabilities
-  - `pooler_output: list[torch.Tensor]` – embeddings for pooling requests
-
-- **Relations**:
-  - Consumed by `Scheduler.update_from_output()`
-  - Converted to `RequestOutput` by `OutputProcessor`
-
-- **Notes**:
-  - Uses lists instead of tensors for efficient serialization
-  - Supports variable-length outputs per request in batch
-
---
-
-### Entity: SchedulerOutput
-
-- **Class**: `SchedulerOutput` in `/data/users/yeq/gitrepos/vllm/vllm/v1/core/sched/output.py`
-- **Fields**:
-  - `scheduled_new_reqs: list[NewRequestData]` – first-time scheduled requests
-  - `scheduled_cached_reqs: CachedRequestData` – continuing requests
-  - `num_scheduled_tokens: dict[str, int]` – tokens per request this step
-  - `total_num_scheduled_tokens: int` – total batch size
-  - `scheduled_encoder_inputs: dict[str, list[int]]` – multi-modal inputs to process
-  - `num_common_prefix_blocks: list[int]` – shared prefix optimization data
-
-- **Relations**:
-  - Produced by `Scheduler.schedule()`
-  - Consumed by `Executor.execute_model()`
-
-- **Notes**:
-  - Optimizes memory layout for different request types
-  - Includes metadata for advanced features (speculative decoding, prefix caching)
-
---
-
-### Entity: KVCacheConfig
-
-- **Class**: `KVCacheConfig` in `/data/users/yeq/gitrepos/vllm/vllm/v1/kv_cache_interface.py`
-- **Fields**:
-  - `block_size: int` – tokens per memory block (typically 16)
-  - `num_gpu_blocks: int` – total GPU memory blocks available
-  - `num_cpu_blocks: int` – CPU memory blocks for offloading
-  - `cache_dtype: torch.dtype` – data type for cache storage
-  - `kv_cache_groups: list[KVCacheGroup]` – cache organization
-
-- **Relations**:
-  - Used by `KVCacheManager` for memory allocation
-  - Configured based on model and hardware constraints
-
-- **Notes**:
-  - Block-based design enables efficient memory management
-  - Supports heterogeneous memory hierarchies (GPU/CPU)
-
---
-
-### Entity: SamplingParams
-
-- **Class**: `SamplingParams` in `vllm/sampling_params.py`
-- **Fields**:
-  - `n: int` – number of output sequences to generate
-  - `max_tokens: int` – maximum tokens to generate
-  - `temperature: float` – sampling randomness (0.0 = deterministic)
-  - `top_p: float` – nucleus sampling threshold
-  - `top_k: int` – top-k sampling limit
-  - `stop: list[str]` – stop strings to terminate generation
-  - `logprobs: int` – number of log probabilities to return
-
-- **Relations**:
-  - Embedded in `Request` objects
-  - Used by sampling kernels during generation
-
-- **Notes**:
-  - Supports advanced sampling strategies (beam search, parallel sampling)
-  - Extensible for custom sampling algorithms# vLLM Developer Onboarding Guide
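
The fields above map onto the public `SamplingParams` constructor; a short, hedged example of building one (the values are arbitrary):

```python
from vllm import SamplingParams

params = SamplingParams(
    n=1,              # one output sequence
    temperature=0.7,  # 0.0 would make decoding deterministic
    top_p=0.95,       # nucleus sampling threshold
    max_tokens=128,   # generation cap
    stop=["</s>"],    # optional stop string(s)
)
```
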

From 3e7fd6aab9b8804cb620a6504538d259d685ac95 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Mon, 21 Jul 2025 23:53:21 -0700
Subject: [PATCH 3/6] revert some wrong changes

Signed-off-by: Ye (Charlotte) Qi
---
 .../scripts/run-nightly-benchmarks.sh |  5 ++---
 benchmarks/README.md                  |  2 +-
 benchmarks/auto_tune/auto_tune.sh     | 14 +++++++-------
 docs/contributing/profiling.md        |  2 +-
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 86153e8408cf..c8835643e24f 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -235,10 +235,9 @@ run_serving_tests() {
             --port $port \
             --save-result \
             --result-dir $RESULTS_FOLDER \
-            --result-filename ${test_name}.json \
+            --result-filename ${new_test_name}.json \
            --request-rate $qps \
-            --metadata "tensor_parallel_size=$tp" \
-            $common_params_str"
+            --ignore-eos \
             $client_args"

         elif [[ "$dataset_name" = "sonnet" ]]; then
diff --git a/benchmarks/README.md b/benchmarks/README.md
index ef2c57a7c079..ba17c31b2945 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -104,7 +104,7 @@ vllm bench serve \
   --endpoint /v1/completions \
   --dataset-name sharegpt \
   --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 1000 \
+  --num-prompts 10 \
   --request-rate 3 # By default is inf
 ```
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index ef7c7c94c883..d73860799325 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -133,15 +133,15 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
         --dataset-name random \
         --random-input-len $adjusted_input_len \
         --random-output-len $OUTPUT_LEN \
+        --ignore-eos \
+        --disable-tqdm \
+        --request-rate inf \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
+        --num-prompts 1000 \
         --random-prefix-len $prefix_len \
-        --num-prompts $NUM_PROMPTS \
         --port 8004 \
-        --save-result \
-        --result-dir $LOG_FOLDER \
-        --result-filename bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.json \
-        --request-rate inf \
-        --ignore-eos \
-        2>&1 | tee $bm_log
+        --profile &> "$bm_log"

     throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
     goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index bd450fb61e00..aa3de617e072 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -44,7 +44,7 @@ vllm bench serve \
   --dataset-name sharegpt \
   --dataset-path sharegpt.json \
   --profile \
-  --num-prompts 10
+  --num-prompts 2
 ```

 ## Profile with NVIDIA Nsight Systems

From 3feab9cf239cf18a36e39ec1531c086a5f6c46cf Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Mon, 21 Jul 2025 23:57:04 -0700
Subject: [PATCH 4/6] fix

Signed-off-by: Ye (Charlotte) Qi
---
 benchmarks/README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ba17c31b2945..3b10963c3e01 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -104,8 +104,7 @@ vllm bench serve \
   --endpoint /v1/completions \
   --dataset-name sharegpt \
   --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10 \
-  --request-rate 3 # By default is inf
+  --num-prompts 10
 ```

 If successful, you will see the following output
@@ -232,7 +231,7 @@ vllm bench serve \
   --dataset-name hf \
   --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
   --hf-split train \
-  --num-prompts 1000
+  --num-prompts 10
 ```

 **`AI-MO/aimo-validation-aime`**

From c926f64a633dc202b156f33ad444d04403f2e1a3 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Tue, 22 Jul 2025 00:12:16 -0700
Subject: [PATCH 5/6] add deprecation warnings

Signed-off-by: Ye (Charlotte) Qi
---
 benchmarks/benchmark_latency.py    | 5 +++++
 benchmarks/benchmark_serving.py    | 5 +++++
 benchmarks/benchmark_throughput.py | 5 +++++
 3 files changed, 15 insertions(+)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 4d2ea126b24a..d8b960edaa46 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -11,6 +11,7 @@
 import numpy as np
 from tqdm import tqdm
+from typing_extensions import deprecated

 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_latency.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench latency' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c597fb1068ab..a97fa280f37c 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -38,6 +38,7 @@
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_serving.py is deprecated and will be removed in a future "
+    "version. Please use 'vllm bench serve' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 14461121fece..d4fc730d14f1 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -15,6 +15,7 @@
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from benchmark_dataset import (
     AIMODataset,
@@ -381,6 +382,10 @@ def get_requests(args, tokenizer):
     return dataset_cls(**common_kwargs).sample(**sample_kwargs)


+@deprecated(
+    "benchmark_throughput.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench throughput' instead.",
+)
 def main(args: argparse.Namespace):
     if args.seed is None:
         args.seed = 0

From ef5a8ff15ab53ac8c4071e21de207d76d0f05baa Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Thu, 24 Jul 2025 10:12:04 -0700
Subject: [PATCH 6/6] vllm3 -> vllm

Signed-off-by: Ye (Charlotte) Qi
---
 benchmarks/auto_tune/auto_tune.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index d73860799325..e0396b6ef5af 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -127,7 +127,7 @@ run_benchmark() {
     bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
     prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
     adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-    vllm3 bench serve \
+    vllm bench serve \
         --backend vllm \
         --model $MODEL \
         --dataset-name random \