From d34cbda3b5545366f7cc76edb1bed13761fc2965 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Tue, 22 Apr 2025 01:29:13 +0000
Subject: [PATCH] Make port and benchmarks folder configurable; append results

Signed-off-by: Lucas Wilkinson
---
 README.md                    | 11 ++++++-----
 benchmark_1000_in_100_out.sh | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 3e7fd0d..9d98c30 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,9 @@ uv pip install vllm==0.8.3
 ### Launch
 
 ```bash
-MODEL=meta-llama/Llama-3.1-8B-Instruct
-vllm serve $MODEL --disable-log-requests
+export PORT=8000
+export MODEL=meta-llama/Llama-3.1-8B-Instruct
+vllm serve $MODEL --port $PORT --disable-log-requests --no-enable-prefix-caching --max-model-len 65536
 ```
 
 > When inspecting logs, make sure prefix cache hit rate is low!
@@ -33,7 +34,7 @@ uv pip install "sglang[all]==0.4.4.post1" --find-links https://flashinfer.ai/whl
 
 ```bash
 MODEL=meta-llama/Llama-3.1-8B-Instruct
-python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port 8000 # --enable-mixed-chunk --enable-torch-compile
+python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT # --enable-mixed-chunk --enable-torch-compile
 ```
 
 > When inspecting logs, make sure cached-tokens is small!
@@ -55,8 +56,8 @@ cd ..
 ### Run Benchmark
 
 ```bash
-FRAMEWORK=vllm bash ./benchmark_1000_in_100_out.sh
-FRAMEWORK=sgl bash ./benchmark_1000_in_100_out.sh
+VLLM_BENCHMARK_FOLDER=../vllm/benchmarks FRAMEWORK=vllm bash ./benchmark_1000_in_100_out.sh
+VLLM_BENCHMARK_FOLDER=../vllm/benchmarks FRAMEWORK=sgl bash ./benchmark_1000_in_100_out.sh
 python3 convert_to_csv.py --input-path results.json --output-path results.csv
 ```
 
diff --git a/benchmark_1000_in_100_out.sh b/benchmark_1000_in_100_out.sh
index d25176e..8a55fad 100755
--- a/benchmark_1000_in_100_out.sh
+++ b/benchmark_1000_in_100_out.sh
@@ -5,6 +5,7 @@ TOTAL_SECONDS=120
 PORT=${PORT:-8000}
 MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
 FRAMEWORK=${FRAMEWORK:-vllm}
+VLLM_BENCHMARK_FOLDER=${VLLM_BENCHMARK_FOLDER:-../vllm/benchmarks}
 
 for REQUEST_RATE in "${REQUEST_RATES[@]}";
 do
@@ -14,7 +15,7 @@ do
     echo "===== $FRAMEWORK - RUNNING $MODEL FOR $NUM_PROMPTS PROMPTS WITH $REQUEST_RATE QPS ====="
     echo ""
 
-    python3 vllm/benchmarks/benchmark_serving.py \
+    python3 $VLLM_BENCHMARK_FOLDER/benchmark_serving.py \
         --model $MODEL \
         --dataset-name random \
         --random-input-len $INPUT_LEN \
@@ -25,13 +26,13 @@ do
         --ignore-eos \
         --result-filename "results.json" \
         --metadata "framework=$FRAMEWORK" \
-        --port ${PORT} \
-        --save-result
+        --port ${PORT} \
+        --append-result
 done
 
 
 # inf request rate
-python3 vllm/benchmarks/benchmark_serving.py \
+python3 $VLLM_BENCHMARK_FOLDER/benchmark_serving.py \
     --model $MODEL \
     --dataset-name random \
     --random-input-len $INPUT_LEN \
@@ -42,4 +43,4 @@ python3 vllm/benchmarks/benchmark_serving.py \
     --result-filename "results.json" \
     --metadata "framework=$FRAMEWORK" \
     --port ${PORT} \
-    --save-result
+    --append-result