
Commit 6fb2788

[CI/Build][Doc] Fully deprecate old bench scripts for serving / throughput / latency (#24411)
Signed-off-by: Ye (Charlotte) Qi <[email protected]>
1 parent 3d2a2de commit 6fb2788
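
For reference, the sketch below summarizes the migration this commit completes. The `vllm bench serve` and `vllm bench latency` forms appear in the diff itself; `vllm bench throughput` and the old script names not shown in this diff are inferred from the commit title, so treat them as assumptions and confirm the available flags with `--help` on your installed vLLM.

```bash
# Deprecated standalone scripts              ->  vLLM CLI replacements
# python benchmarks/benchmark_serving.py     ->  vllm bench serve
# python benchmarks/benchmark_throughput.py  ->  vllm bench throughput   (inferred from the commit title)
# python benchmarks/benchmark_latency.py     ->  vllm bench latency

# List the flags supported by the new latency benchmark:
vllm bench latency --help
```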

4 files changed (+35, -2240 lines)

benchmarks/README.md

Lines changed: 2 additions & 2 deletions
@@ -694,7 +694,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with images:

 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2.5-VL-7B-Instruct \
   --dataset-name sharegpt \
@@ -721,7 +721,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with videos:

 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2.5-VL-7B-Instruct \
   --dataset-name sharegpt \
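
To make the updated README snippet concrete, here is a minimal sketch of a complete `vllm bench serve` invocation. Only `--backend`, `--model`, and `--dataset-name` come from the hunk above; `--endpoint`, `--dataset-path`, and `--num-prompts` are assumptions carried over from the old serving benchmark's options, so verify them with `vllm bench serve --help`.

```bash
# Minimal sketch; flags beyond --backend/--model/--dataset-name are assumptions.
vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name sharegpt \
  --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 100
```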

benchmarks/benchmark_latency.py

Lines changed: 11 additions & 185 deletions
@@ -1,191 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
+import sys

-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={"latency": results["latencies"]},
-        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
-    )
-    if pt_records:
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-
-
-@deprecated(
-    "benchmark_latency.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
-    print(args)
-
-    engine_args = EngineArgs.from_cli_args(args)
-
-    # NOTE(woosuk): If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(**dataclasses.asdict(engine_args))
-    assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len + args.output_len
-    ), (
-        "Please ensure that max_model_len is greater than"
-        " the sum of input_len and output_len."
-    )
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=1.0,
-        top_p=1.0,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-        detokenize=not args.disable_detokenize,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(
-        10000, size=(args.batch_size, args.input_len)
-    )
-    dummy_prompts: list[PromptType] = [
-        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    ]
-
-    def llm_generate():
-        if not args.use_beam_search:
-            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
-        else:
-            llm.beam_search(
-                dummy_prompts,
-                BeamSearchParams(
-                    beam_width=args.n,
-                    max_tokens=args.output_len,
-                    ignore_eos=True,
-                ),
-            )
-
-    def run_to_completion(profile_dir: Optional[str] = None):
-        if profile_dir:
-            llm.start_profile()
-            llm_generate()
-            llm.stop_profile()
-        else:
-            start_time = time.perf_counter()
-            llm_generate()
-            end_time = time.perf_counter()
-            latency = end_time - start_time
-            return latency
-
-    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
-
-    if args.profile:
-        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
-        return
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
-    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90, 99]
-    percentiles = np.percentile(latencies, percentages)
-    print(f"Avg latency: {np.mean(latencies)} seconds")
-    for percentage, percentile in zip(percentages, percentiles):
-        print(f"{percentage}% percentile latency: {percentile} seconds")
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "avg_latency": np.mean(latencies),
-            "latencies": latencies.tolist(),
-            "percentiles": dict(zip(percentages, percentiles.tolist())),
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
-    parser = FlexibleArgumentParser(
-        description="Benchmark the latency of processing a single batch of "
-        "requests till completion."
-    )
-    parser.add_argument("--input-len", type=int, default=32)
-    parser.add_argument("--output-len", type=int, default=128)
-    parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument(
-        "--n",
-        type=int,
-        default=1,
-        help="Number of generated sequences per prompt.",
-    )
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument(
-        "--num-iters-warmup",
-        type=int,
-        default=10,
-        help="Number of iterations to run for warmup.",
-    )
-    parser.add_argument(
-        "--num-iters", type=int, default=30, help="Number of iterations to run."
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="profile the generation process of a single batch",
-    )
-    parser.add_argument(
-        "--output-json",
-        type=str,
-        default=None,
-        help="Path to save the latency results in JSON format.",
-    )
-    parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
-    )
-
-    parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default which skews the latency
-    # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=False)
+if __name__ == "__main__":
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.

-    return parser
+Please use the following command instead:
+    vllm bench latency

+For help with the new command, run:
+    vllm bench latency --help

-if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
-    main(args)
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench latency --help
+""")
+    sys.exit(1)
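
Since the new stub only points at the CLI entry point, a migration sketch may help. The flag names below are taken from the argument parser removed above, on the assumption that `vllm bench latency` keeps the same names; the model ID is an illustrative placeholder. Confirm with `vllm bench latency --help`.

```bash
# Before (deprecated):
#   python benchmarks/benchmark_latency.py --model <model> \
#       --input-len 32 --output-len 128 --batch-size 8 --num-iters 30
# After (assuming the CLI keeps the removed script's flag names):
vllm bench latency \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --input-len 32 \
  --output-len 128 \
  --batch-size 8 \
  --num-iters-warmup 10 \
  --num-iters 30 \
  --output-json latency_results.json
```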
