Skip to content

Commit cb1ed47

Browse files
ApostaCKuntaiDu
authored andcommitted
[Benchmark] Add benchmark script for CPU offloading (vllm-project#11533)
Signed-off-by: ApostaC <[email protected]> Co-authored-by: KuntaiDu <[email protected]>
1 parent b000b09 commit cb1ed47

File tree

1 file changed

+184
-0
lines changed

1 file changed

+184
-0
lines changed
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
Offline benchmark to test the long document QA throughput.
3+
4+
Example usage:
5+
# This command runs vLLM with 50 GB of CPU memory for offloading
6+
# The workload samples 8 different prompts with a default input
7+
# length of 20000 tokens, then replicates each prompt 2 times
8+
# in random order.
9+
python benchmark_long_document_qa_throughput.py \
10+
--model meta-llama/Llama-2-7b-chat-hf \
11+
--enable-prefix-caching \
12+
--num-documents 8 \
13+
--repeat-count 2
14+
15+
Commandline arguments:
16+
--num-documents: The number of documents to sample prompts from.
17+
18+
--document-length: The length of each document in tokens.
19+
(Optional, default: 20000)
20+
21+
--output-len: The number of tokens to generate for each prompt.
22+
(Optional, default: 10)
23+
24+
--repeat-count: The number of times to repeat each prompt.
25+
(Optional, default: 2)
26+
27+
--repeat-mode: The mode to repeat prompts. The supported modes are:
28+
- 'random': shuffle the prompts randomly. (Default)
29+
- 'tile': the entire prompt list is repeated in sequence. (Potentially
30+
lowest cache hit)
31+
- 'interleave': each prompt is repeated consecutively before
32+
moving to the next element. (Highest cache hit)
33+
34+
--shuffle-seed: Random seed when the repeat mode is "random".
35+
(Optional, default: 0)
36+
37+
In the meantime, it also supports all the vLLM engine args to initialize the
38+
LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
39+
details.
40+
"""
41+
42+
import dataclasses
43+
import random
44+
import time
45+
46+
from vllm import LLM, SamplingParams
47+
from vllm.engine.arg_utils import EngineArgs
48+
from vllm.utils import FlexibleArgumentParser
49+
50+
51+
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
52+
"""
53+
Test long document QA with the given prompts and sampling parameters.
54+
Print the time spent in processing all the prompts.
55+
56+
Args:
57+
llm: The language model used for generating responses.
58+
sampling_params: Sampling parameter used to generate the response.
59+
prompts: A list of prompt strings to be processed by the LLM.
60+
"""
61+
start_time = time.time()
62+
llm.generate(prompts, sampling_params=sampling_params)
63+
end_time = time.time()
64+
print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
65+
66+
67+
def repeat_prompts(prompts, repeat_count, mode: str):
    """
    Build a list in which every prompt appears ``repeat_count`` times.

    The ordering of the output depends on ``mode``.

    Args:
        prompts: A list of prompts to be repeated.
        repeat_count: How many copies of each prompt to produce.
        mode: Ordering of the repeated prompts. One of:
            - 'random': shuffle the repeated prompts randomly.
            - 'tile': the whole list repeated back-to-back.
              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
            - 'interleave': copies of each prompt kept adjacent.
              Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].

    Returns:
        The repeated prompts, ordered according to ``mode``.

    Raises:
        ValueError: If an invalid mode is provided.
    """
    print("Repeat mode: ", mode)
    if mode == 'tile':
        return prompts * repeat_count
    if mode == 'random':
        shuffled = prompts * repeat_count
        random.shuffle(shuffled)
        return shuffled
    if mode == 'interleave':
        # Keep all copies of one prompt adjacent before moving on.
        return [prompt for prompt in prompts for _ in range(repeat_count)]
    raise ValueError(f"Invalid mode: {mode}, only support "
                     "'random', 'tile', 'interleave'")
103+
104+
105+
def main(args):
    """Run the long-document-QA throughput benchmark with the parsed args."""
    random.seed(args.shuffle_seed)

    # Every document is the same long run of 'hi' tokens; prefixing each with
    # its index guarantees no document is a prefix of another document.
    doc_body = ' '.join(['hi'] * args.document_length)
    prompts = [f"{i}{doc_body}" for i in range(args.num_documents)]

    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)

    warmup_prompts = [
        f"This is warm up request {i}{doc_body}"
        for i in range(args.num_documents)
    ]

    # Create the LLM engine from the remaining vLLM CLI arguments.
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))
    # Greedy decoding with a fixed number of generated tokens per prompt.
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

    print("------warm up------")
    test_long_document_qa(llm=llm,
                          prompts=warmup_prompts,
                          sampling_params=sampling_params)

    print("------start generating------")
    test_long_document_qa(llm=llm,
                          prompts=prompts,
                          sampling_params=sampling_params)
141+
142+
143+
if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description=
        'Benchmark the performance with or without automatic prefix caching.')

    # NOTE: the original help text for --document-length and --num-documents
    # was copy-pasted from a min:max range option; both are plain integers.
    parser.add_argument(
        '--document-length',
        type=int,
        # Roughly the number of tokens for a system paper,
        # excluding images
        default=20000,
        help='Length of each sampled document in tokens '
        '(default: 20000).')

    parser.add_argument('--num-documents',
                        type=int,
                        default=8,
                        help='Number of documents to sample prompts from '
                        '(default: 8).')

    parser.add_argument('--output-len',
                        type=int,
                        default=10,
                        help='Number of tokens to generate for each prompt '
                        '(default: 10).')

    parser.add_argument('--repeat-count',
                        type=int,
                        default=2,
                        help='Number of times to repeat each prompt')

    parser.add_argument("--repeat-mode",
                        type=str,
                        default='random',
                        help='The mode to repeat prompts. The supported '
                        'modes are "random", "tile", and "interleave". '
                        'See repeat_prompts() in the source code for details.')

    parser.add_argument("--shuffle-seed",
                        type=int,
                        default=0,
                        help='Random seed when the repeat mode is "random"')

    # Pull in all vLLM engine args so the script accepts standard engine flags.
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)

0 commit comments

Comments
 (0)