
Commit 42b1b9a

Add distributed executor backend to benchmark scripts (#118)
1 parent 3e480e9 commit 42b1b9a

File tree

4 files changed (+47 -25 lines changed)

  benchmarks/benchmark_latency.py
  benchmarks/benchmark_throughput.py
  vllm/config.py
  vllm/engine/arg_utils.py

benchmarks/benchmark_latency.py

Lines changed: 32 additions & 21 deletions
@@ -19,27 +19,30 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              quantized_weights_path=args.quantized_weights_path,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              worker_use_ray=args.worker_use_ray,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              disable_custom_all_reduce=args.disable_custom_all_reduce,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        quantized_weights_path=args.quantized_weights_path,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        worker_use_ray=args.worker_use_ray,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        disable_custom_all_reduce=args.disable_custom_all_reduce,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend,
+    )
 
     sampling_params = SamplingParams(
         n=args.n,

@@ -237,5 +240,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     main(args)
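In effect, the latency benchmark now simply forwards the parsed flag into the engine constructor. As a rough illustration (not part of the commit), the equivalent direct construction looks like the sketch below; the model name is a placeholder, and the backend value must be one of the three choices the new flag accepts.

# Illustrative sketch only; "facebook/opt-125m" is a placeholder model.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,              # the backend only matters with more than 1 GPU
    distributed_executor_backend="mp",   # 'ray', 'mp', or 'torchrun'; None picks a platform default
)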

benchmarks/benchmark_throughput.py

Lines changed: 13 additions & 2 deletions
@@ -79,6 +79,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     worker_use_ray: bool = False,
     download_dir: Optional[str] = None,

@@ -104,6 +105,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.

@@ -229,8 +231,9 @@ def main(args: argparse.Namespace):
             args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.worker_use_ray, args.download_dir)
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.worker_use_ray,
+            args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,

@@ -384,6 +387,14 @@ def main(args: argparse.Namespace):
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
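Note that run_vllm receives the new value positionally, slotted between max_num_batched_tokens and gpu_memory_utilization, which is why the call site in main() is reflowed above. A minimal sketch with a heavily simplified signature (the real function takes many more parameters and returns the elapsed time) shows the ordering constraint:

from typing import Optional

# Simplified stand-in for run_vllm, keeping only the parameters relevant
# to the ordering change introduced by this commit.
def run_vllm(max_num_batched_tokens: int,
             distributed_executor_backend: Optional[str],
             gpu_memory_utilization: float = 0.9,
             worker_use_ray: bool = False) -> None:
    print(max_num_batched_tokens, distributed_executor_backend,
          gpu_memory_utilization, worker_use_ray)

# Positional callers must place the backend before gpu_memory_utilization,
# exactly as main() now does; None defers to the platform default.
run_vllm(8192, "mp", 0.9, False)
run_vllm(8192, None)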

vllm/config.py

Lines changed: 1 addition & 1 deletion
@@ -599,7 +599,7 @@ def __init__(
         if self.distributed_executor_backend is None and self.world_size > 1:
             if is_hip():
                 logger.info("Using torchrun for multi-GPU on "
-                            "ROCM platform. Use --worker-use-ray or "
+                            "ROCm platform. Use --worker-use-ray or "
                             "--distributed-executor-backend={ray, mp} to "
                             "override")
                 if not os.environ.get("RANK"):
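Only the log message changes here, but this is the code path that gives the flag's None default its meaning. A rough sketch of the documented selection behavior follows; it is written independently of the actual implementation in vllm/config.py, and the helper name is hypothetical.

from typing import Optional

def pick_default_backend(world_size: int, on_rocm: bool,
                         ray_installed: bool) -> Optional[str]:
    # Approximates the default described in the CLI help text, not the real code.
    if world_size <= 1:
        return None                       # single GPU: no distributed executor needed
    if on_rocm:
        return "torchrun"                 # ROCm defaults to torchrun
    return "ray" if ray_installed else "mp"   # CUDA: Ray if installed, else multiprocessing

assert pick_default_backend(4, on_rocm=False, ray_installed=False) == "mp"
assert pick_default_backend(8, on_rocm=True, ray_installed=True) == "torchrun"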

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion
@@ -233,7 +233,7 @@ def add_cli_args(
             help='Backend to use for distributed serving. When more than 1 GPU '
             'is used, on CUDA this will be automatically set to "ray" if '
             'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
-            'instead automatically set to torchrun.')
+            'instead set to torchrun by default.')
         parser.add_argument(
             '--worker-use-ray',
             action='store_true',
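Because the flag defaults to None, omitting it on the command line is what triggers the platform-specific fallback above. Below is a small, self-contained sketch of how the argparse definition behaves; it mirrors the definition added to the benchmark scripts and is not the actual EngineArgs code.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--distributed-executor-backend',
                    choices=['ray', 'mp', 'torchrun'],
                    default=None,
                    help='Backend to use for distributed serving.')

assert parser.parse_args([]).distributed_executor_backend is None
assert parser.parse_args(
    ['--distributed-executor-backend', 'torchrun']
).distributed_executor_backend == 'torchrun'
# Any value outside the three choices is rejected with a usage error.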
