@@ -1,45 +1,82 @@
-import time
+import argparse
+import dataclasses
 
+# from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
 
 
-def main():
-    llm = LLM(
-        '/data/AI-ModelScope/Mixtral-8x7B-Instruct-v0___1/',
-        tensor_parallel_size=1,
-        #quantization="serenity",
-        dtype='float16',
-        #swap_space=16,
-        #enforce_eager=True,
-        #kv_cache_dtype="fp8",
-        #quantization="fp8",
-        #quantized_weights_path="/quantized/quark/llama.safetensors",
-        #worker_use_ray=True,
-        #trust_remote_code=True,
-        #distributed_executor_backend="mp",
-    )
-    batch_size = 5
-    max_tokens = 256
-    prompt = """The sun is a"""
-    sampling_params = SamplingParams(temperature=0,
-                                     top_p=0.95,
-                                     max_tokens=max_tokens)
-
-    start_time = time.perf_counter()
-    outs = llm.generate([prompt] * batch_size, sampling_params=sampling_params)
-    end_time = time.perf_counter()
-    elapsed_time = end_time - start_time
-
-    out_lengths = [len(x.token_ids) for out in outs for x in out.outputs]
-    num_tokens = sum(out_lengths)
-
-    print(
-        f"{num_tokens} tokens. {num_tokens / batch_size} on average. {num_tokens / elapsed_time:.2f} tokens/s. {elapsed_time} seconds"  # noqa: E501
+def main(args: argparse.Namespace):
+    print(args)
+
+    engine_args = EngineArgs.from_cli_args(args)
+
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=1.0,
+        top_p=1.0,
+        ignore_eos=True,
+        max_tokens=args.output_len,
     )
-    for out in outs:
-        print("===========")
-        print(out.outputs[0].text)
+    print(sampling_params)
+
+    # tokenizer = AutoTokenizer.from_pretrained(engine_args.model)
+    # inputs = tokenizer('Hello, world!', return_tensors='pt').input_ids
+    inputs = [
+        'Where is the capital of China?',
+        'The capital of Russia is ',
+        'The CEO of DeepSeek is ',
+        'The future of AI is',
+    ] * 32
+    outputs = llm.generate(inputs, sampling_params)
+    for i, output in enumerate(outputs):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt {i}: {prompt!r}, Generated text: {generated_text!r}")
+    # print(tokenizer.decode(outputs[0]))
+
 
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
+                        help='Number of generated sequences per prompt.')
+    parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=30,
+                        help='Number of iterations to run.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
 
-if __name__ == "__main__":
-    main()
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)
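A minimal sketch of how the rewritten script might be invoked, assuming it is saved as benchmark_latency.py (the file name and model are placeholder examples). The --output-len and --n flags are defined by the script above; engine options such as --model and --tensor-parallel-size are registered by EngineArgs.add_cli_args:

    python benchmark_latency.py --model facebook/opt-125m --output-len 128 --n 1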