# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
+import sys

-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={"latency": results["latencies"]},
-        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
-    )
-    if pt_records:
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-
-
-@deprecated(
-    "benchmark_latency.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
-    print(args)
-
-    engine_args = EngineArgs.from_cli_args(args)
-
-    # NOTE(woosuk): If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(**dataclasses.asdict(engine_args))
-    assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len + args.output_len
-    ), (
-        "Please ensure that max_model_len is greater than"
-        " the sum of input_len and output_len."
-    )
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=1.0,
-        top_p=1.0,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-        detokenize=not args.disable_detokenize,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(
-        10000, size=(args.batch_size, args.input_len)
-    )
-    dummy_prompts: list[PromptType] = [
-        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    ]
-
-    def llm_generate():
-        if not args.use_beam_search:
-            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
-        else:
-            llm.beam_search(
-                dummy_prompts,
-                BeamSearchParams(
-                    beam_width=args.n,
-                    max_tokens=args.output_len,
-                    ignore_eos=True,
-                ),
-            )
-
-    def run_to_completion(profile_dir: Optional[str] = None):
-        if profile_dir:
-            llm.start_profile()
-            llm_generate()
-            llm.stop_profile()
-        else:
-            start_time = time.perf_counter()
-            llm_generate()
-            end_time = time.perf_counter()
-            latency = end_time - start_time
-            return latency
-
-    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
-
-    if args.profile:
-        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
-        return
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
-    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90, 99]
-    percentiles = np.percentile(latencies, percentages)
-    print(f"Avg latency: {np.mean(latencies)} seconds")
-    for percentage, percentile in zip(percentages, percentiles):
-        print(f"{percentage}% percentile latency: {percentile} seconds")
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "avg_latency": np.mean(latencies),
-            "latencies": latencies.tolist(),
-            "percentiles": dict(zip(percentages, percentiles.tolist())),
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
-    parser = FlexibleArgumentParser(
-        description="Benchmark the latency of processing a single batch of "
-        "requests till completion."
-    )
-    parser.add_argument("--input-len", type=int, default=32)
-    parser.add_argument("--output-len", type=int, default=128)
-    parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument(
-        "--n",
-        type=int,
-        default=1,
-        help="Number of generated sequences per prompt.",
-    )
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument(
-        "--num-iters-warmup",
-        type=int,
-        default=10,
-        help="Number of iterations to run for warmup.",
-    )
-    parser.add_argument(
-        "--num-iters", type=int, default=30, help="Number of iterations to run."
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="profile the generation process of a single batch",
-    )
-    parser.add_argument(
-        "--output-json",
-        type=str,
-        default=None,
-        help="Path to save the latency results in JSON format.",
-    )
-    parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
-    )
-
-    parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default which skews the latency
-    # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=False)
+if __name__ == "__main__":
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.

-    return parser
+Please use the following command instead:
+    vllm bench latency

+For help with the new command, run:
+    vllm bench latency --help

-if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
-    main(args)
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench latency --help
+""")
+    sys.exit(1)
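
Migration note: the stub above only names the new entry point. As a rough sketch of an equivalent run, and assuming the "vllm bench latency" subcommand keeps the flags defined by the removed script's argument parser (--input-len, --output-len, --batch-size, --num-iters-warmup, --num-iters, --output-json), the old and new invocations would look like this; confirm the flag names with "vllm bench latency --help" before relying on them.

    # Old invocation of the removed script, using its documented defaults.
    python benchmark_latency.py --input-len 32 --output-len 128 --batch-size 8 --num-iters 30 --output-json latency.json

    # Assumed equivalent via the vLLM CLI; flag names are carried over from the
    # old parser and may differ in the new subcommand, so verify with --help.
    vllm bench latency --input-len 32 --output-len 128 --batch-size 8 --num-iters 30 --output-json latency.json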