@@ -19,27 +19,30 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              quantized_weights_path=args.quantized_weights_path,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              worker_use_ray=args.worker_use_ray,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              disable_custom_all_reduce=args.disable_custom_all_reduce,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        quantized_weights_path=args.quantized_weights_path,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        worker_use_ray=args.worker_use_ray,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        disable_custom_all_reduce=args.disable_custom_all_reduce,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend,
+    )
 
     sampling_params = SamplingParams(
         n=args.n,
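
For a quick smoke test of the new keyword argument, the constructor can be exercised directly. This is a minimal sketch under assumptions: the model name and the backend value are illustrative placeholders, not part of this change.

```python
from vllm import LLM, SamplingParams

# Hypothetical values for illustration only.
llm = LLM(
    model="facebook/opt-125m",          # placeholder model
    tensor_parallel_size=2,
    distributed_executor_backend="mp",  # one of 'ray', 'mp', 'torchrun'
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)
```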
@@ -237,5 +240,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     main(args)
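
To make the help text concrete, here is a minimal sketch of the default-selection rule it describes. This is not the engine's actual resolution code: `resolve_backend` is a hypothetical helper, and detecting ROCm via `torch.version.hip` is an assumption about how the platform would be identified.

```python
import importlib.util
from typing import Optional

import torch


def resolve_backend(choice: Optional[str], world_size: int) -> Optional[str]:
    """Hypothetical mirror of the documented default selection."""
    if choice is not None or world_size <= 1:
        return choice
    if torch.version.hip is not None:
        # ROCm builds default to torchrun per the help text.
        return "torchrun"
    # On CUDA, prefer Ray when installed, otherwise multiprocessing.
    return "ray" if importlib.util.find_spec("ray") else "mp"
```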