diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000000..d8b98db499
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,54 @@
+# Introduction
+This document outlines the benchmarking process for vllm-ascend, designed to evaluate its performance under various workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance. To maintain consistency with the vllm community, we reuse the vllm community [benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) scripts.
+# Overview
+**Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas 800I A2 (see [quick_start](../docs/source/quick_start.md) for the list of supported devices), with different models (coming soon).
+- Latency tests
+  - Input length: 32 tokens.
+  - Output length: 128 tokens.
+  - Batch size: fixed (8).
+  - Models: Llama-3.1 8B.
+  - Evaluation metrics: end-to-end latency (mean, median, p99).
+
+- Throughput tests
+  - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
+  - Output length: the corresponding output lengths of these 200 prompts.
+  - Batch size: dynamically determined by vllm to achieve maximum throughput.
+  - Models: Llama-3.1 8B.
+  - Evaluation metrics: throughput.
+- Serving tests
+  - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
+  - Output length: the corresponding output lengths of these 200 prompts.
+  - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+  - **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests arrive at once. For other QPS values, the arrival time of each query is determined by a random Poisson process (with a fixed random seed).
+  - Models: Llama-3.1 8B.
+  - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+**Benchmarking Duration**: about 800 seconds for a single model.
+
+
+# Quick Use
+## Prerequisites
+Before running the benchmarks, ensure the following:
+- vllm and vllm-ascend are installed and properly set up in an NPU environment, as these scripts are specifically designed for NPU devices.
+- Install the necessary dependencies for benchmarks:
+  ```
+  pip install -r benchmarks/requirements-bench.txt
+  ```
+
+- Models and datasets are cached locally to accelerate execution. Modify the paths in the JSON files located in benchmarks/tests accordingly. Feel free to add your own models and parameters to these JSON files to run customized benchmarks; a sample test entry is shown at the end of this document.
+
+## Run benchmarks
+The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run the following command in the vllm-ascend root directory:
+```
+bash benchmarks/scripts/run-performance-benchmarks.sh
+```
+Once the script completes, you can find the results in the benchmarks/results folder. The output files may resemble the following:
+```
+|-- latency_llama8B_tp1.json
+|-- serving_llama8B_tp1_sharegpt_qps_1.json
+|-- serving_llama8B_tp1_sharegpt_qps_16.json
+|-- serving_llama8B_tp1_sharegpt_qps_4.json
+|-- serving_llama8B_tp1_sharegpt_qps_inf.json
+|-- throughput_llama8B_tp1.json
+```
+These files contain detailed benchmarking results for further analysis.
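+
+## Customize benchmarks
+Each entry in the benchmarks/tests JSON files is converted into command-line flags by the run script: every key under `parameters` (or `server_parameters`/`client_parameters` for serving tests) becomes a `--flag`, with underscores replaced by dashes, and the `test_name` must keep the `latency_`, `throughput_`, or `serving_` prefix expected by the script. As a minimal sketch, the shipped latency case looks like the entry below; to benchmark your own model, you could copy such an entry and adjust `model` (for example, pointing it to a local path) and `tensor_parallel_size`:
+```
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
+```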
diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt
new file mode 100644
index 0000000000..e9af75e220
--- /dev/null
+++ b/benchmarks/requirements-bench.txt
@@ -0,0 +1,2 @@
+pandas
+datasets
\ No newline at end of file
diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh
new file mode 100644
index 0000000000..489c59e8b1
--- /dev/null
+++ b/benchmarks/scripts/run-performance-benchmarks.sh
@@ -0,0 +1,289 @@
+#!/bin/bash
+
+
+check_npus() {
+  # shellcheck disable=SC2155
+  declare -g npu_count=$(npu-smi info -l | grep "Total Count" | awk -F ':' '{print $2}' | tr -d ' ')
+
+  if [[ -z "$npu_count" || "$npu_count" -eq 0 ]]; then
+    echo "Need at least 1 NPU to run benchmarking."
+    exit 1
+  else
+    echo "Found NPU count: $npu_count"
+  fi
+
+  npu_type=$(npu-smi info | grep -E "^\| [0-9]+" | awk -F '|' '{print $2}' | awk '{$1=$1;print}' | awk '{print $2}')
+
+  echo "NPU type is: $npu_type"
+}
+
+ensure_sharegpt_downloaded() {
+  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
+  if [ ! -f "$FILE" ]; then
+    echo "$FILE not found, downloading from hf-mirror ..."
+    wget https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
+  else
+    echo "$FILE already exists."
+  fi
+}
+
+json2args() {
+  # transforms the JSON string to command line args, replacing '_' with '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args
+  args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for the vllm server to start
+  # return 1 if the server does not come up within the timeout
+  timeout 1200 bash -c '
+    until curl -X POST localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+get_cur_npu_id() {
+  npu-smi info -l | awk -F ':' '/NPU ID/ {print $2+0; exit}'
+}
+
+kill_npu_processes() {
+  ps -aux
+  lsof -t -i:8000 | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
+
+  sleep 4
+  rm -rf ~/.config/vllm
+}
+
+
+run_latency_tests() {
+  # run latency tests using `benchmark_latency.py`
+  # $1: a json file specifying latency test cases
+
+  local latency_test_file
+  latency_test_file=$1
+
+  # Iterate over latency tests
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In latency-tests.json, test_name must start with \"latency_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
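+      # TEST_SELECTOR is read from the environment; a hypothetical invocation that
+      # runs only the llama8B latency case would be:
+      #   TEST_SELECTOR="latency_llama8B" bash benchmarks/scripts/run-performance-benchmarks.sh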
+      continue
+    fi
+
+    # get arguments
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+
+    latency_command="python3 vllm_benchmarks/benchmark_latency.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $latency_args"
+
+    echo "Running test case $test_name"
+    echo "Latency command: $latency_command"
+
+    # run the benchmark
+    eval "$latency_command"
+
+    kill_npu_processes
+
+  done
+}
+
+run_throughput_tests() {
+  # run throughput tests using `benchmark_throughput.py`
+  # $1: a json file specifying throughput test cases
+
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In throughput-tests.json, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+
+    throughput_command="python3 vllm_benchmarks/benchmark_throughput.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_npu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^serving_ ]]; then
+      echo "In serving-tests.json, test_name must start with \"serving_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.server_parameters')
+    client_params=$(echo "$params" | jq -r '.client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if server model and client model are aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
+    client_model=$(echo "$client_params" | jq -r '.model')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip test case $test_name."
+      continue
+    fi
+
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+    server_pid=$!
+
+    # wait until the server is alive
+    if wait_for_server; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
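+      # NOTE: the script does not abort here; if the server never came up, the
+      # client runs below will fail to connect, so check the server output above.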
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 vllm_benchmarks/benchmark_serving.py \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      bash -c "$client_command"
+    done
+
+    # clean up
+    kill -9 $server_pid
+    kill_npu_processes
+  done
+}
+
+cleanup() {
+  rm -rf ./vllm_benchmarks
+}
+
+get_benchmarks_scripts() {
+  git clone -b main --depth=1 git@github.com:vllm-project/vllm.git && \
+    mv vllm/benchmarks vllm_benchmarks
+  rm -rf ./vllm
+}
+
+main() {
+
+  START_TIME=$(date +%s)
+  check_npus
+
+  # dependencies
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
+
+  # get the current IP address, required by benchmark_serving.py
+  # shellcheck disable=SC2155
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+  # turn off the reporting of the status of each request, to clean up the terminal output
+  export VLLM_LOG_LEVEL="WARNING"
+
+  # prepare for benchmarking
+  cd benchmarks || exit 1
+  get_benchmarks_scripts
+  trap cleanup EXIT
+
+  QUICK_BENCHMARK_ROOT=./
+
+  declare -g RESULTS_FOLDER=results
+  mkdir -p $RESULTS_FOLDER
+
+  ensure_sharegpt_downloaded
+  # benchmarks
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
+
+  END_TIME=$(date +%s)
+  ELAPSED_TIME=$((END_TIME - START_TIME))
+  echo "Total execution time: $ELAPSED_TIME seconds"
+
+}
+
+main "$@"
diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json
new file mode 100644
index 0000000000..0033bf5292
--- /dev/null
+++ b/benchmarks/tests/latency-tests.json
@@ -0,0 +1,12 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
new file mode 100644
index 0000000000..5eb9ac0a88
--- /dev/null
+++ b/benchmarks/tests/serving-tests.json
@@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1",
+        "qps_list": [
+            1,
+            4,
+            16,
+            "inf"
+        ],
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json
new file mode 100644
index 0000000000..16a8cd132b
--- /dev/null
+++ b/benchmarks/tests/throughput-tests.json
@@ -0,0 +1,14 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]