Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ uv pip install vllm==0.8.3
### Launch

```bash
MODEL=meta-llama/Llama-3.1-8B-Instruct
vllm serve $MODEL --disable-log-requests
export PORT=8000
export MODEL=meta-llama/Llama-3.1-8B-Instruct
vllm serve $MODEL --port $PORT --disable-log-requests --no-enable-prefix-caching --max-model-len 65536
```

> When inspecting logs, make sure prefix cache hit rate is low!
Expand All @@ -33,7 +34,7 @@ uv pip install "sglang[all]==0.4.4.post1" --find-links https://flashinfer.ai/whl

```bash
MODEL=meta-llama/Llama-3.1-8B-Instruct
python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port 8000 # --enable-mixed-chunk --enable-torch-compile
python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT # --enable-mixed-chunk --enable-torch-compile
```

> When inspecting logs, make sure cached-tokens is small!
Expand All @@ -55,8 +56,8 @@ cd ..
### Run Benchmark

```bash
FRAMEWORK=vllm bash ./benchmark_1000_in_100_out.sh
FRAMEWORK=sgl bash ./benchmark_1000_in_100_out.sh
VLLM_BENCHMARK_FOLDER=../vllm/benchmarks FRAMEWORK=vllm bash ./benchmark_1000_in_100_out.sh
VLLM_BENCHMARK_FOLDER=../vllm/benchmarks FRAMEWORK=sgl bash ./benchmark_1000_in_100_out.sh
python3 convert_to_csv.py --input-path results.json --output-path results.csv
```

Expand Down
11 changes: 6 additions & 5 deletions benchmark_1000_in_100_out.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ TOTAL_SECONDS=120
PORT=${PORT:-8000}
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
FRAMEWORK=${FRAMEWORK:-vllm}
VLLM_BENCHMARK_FOLDER=${VLLM_BENCHMARK_FOLDER:-../vllm/benchmarks}

for REQUEST_RATE in "${REQUEST_RATES[@]}";
do
Expand All @@ -14,7 +15,7 @@ do
echo "===== $FRAMEWORK - RUNNING $MODEL FOR $NUM_PROMPTS PROMPTS WITH $REQUEST_RATE QPS ====="
echo ""

python3 vllm/benchmarks/benchmark_serving.py \
python3 $VLLM_BENCHMARK_FOLDER/benchmark_serving.py \
--model $MODEL \
--dataset-name random \
--random-input-len $INPUT_LEN \
Expand All @@ -25,13 +26,13 @@ do
--ignore-eos \
--result-filename "results.json" \
--metadata "framework=$FRAMEWORK" \
--port ${PORT} \
--save-result
--port ${PORT} \
--append-result

done

# inf request rate
python3 vllm/benchmarks/benchmark_serving.py \
python3 $VLLM_BENCHMARK_FOLDER/benchmark_serving.py \
--model $MODEL \
--dataset-name random \
--random-input-len $INPUT_LEN \
Expand All @@ -42,4 +43,4 @@ python3 vllm/benchmarks/benchmark_serving.py \
--result-filename "results.json" \
--metadata "framework=$FRAMEWORK" \
--port ${PORT} \
--save-result
--append-result