@@ -19,27 +19,30 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              quantized_weights_path=args.quantized_weights_path,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              worker_use_ray=args.worker_use_ray,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              disable_custom_all_reduce=args.disable_custom_all_reduce,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        quantized_weights_path=args.quantized_weights_path,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        worker_use_ray=args.worker_use_ray,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        disable_custom_all_reduce=args.disable_custom_all_reduce,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend,
+    )
 
     sampling_params = SamplingParams(
         n=args.n,
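
For a quick smoke test of the new keyword argument, the constructor can be exercised directly. This is a minimal sketch under assumptions: the model name and the backend value are illustrative placeholders, not part of this change.

```python
from vllm import LLM, SamplingParams

# Hypothetical values for illustration only.
llm = LLM(
    model="facebook/opt-125m",          # placeholder model
    tensor_parallel_size=2,
    distributed_executor_backend="mp",  # one of 'ray', 'mp', 'torchrun'
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)
```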
@@ -237,5 +240,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     main(args)
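
To make the help text concrete, here is a minimal sketch of the default-selection rule it describes. This is not the engine's actual resolution code: `resolve_backend` is a hypothetical helper, and detecting ROCm via `torch.version.hip` is an assumption about how the platform would be identified.

```python
import importlib.util
from typing import Optional

import torch


def resolve_backend(choice: Optional[str], world_size: int) -> Optional[str]:
    """Hypothetical mirror of the documented default selection."""
    if choice is not None or world_size <= 1:
        return choice
    if torch.version.hip is not None:
        # ROCm builds default to torchrun per the help text.
        return "torchrun"
    # On CUDA, prefer Ray when installed, otherwise multiprocessing.
    return "ray" if importlib.util.find_spec("ray") else "mp"
```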