diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 7382782f1165..2a042802d0d5 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -531,18 +531,22 @@ async def benchmark(
         extra_body=extra_body,
     )

-    test_output = await wait_for_endpoint(
-        request_func,
-        test_input,
-        session,
-        timeout_seconds=ready_check_timeout_sec,
-    )
-    if not test_output.success:
-        raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}")
+    if ready_check_timeout_sec > 0:
+        test_output = await wait_for_endpoint(
+            request_func,
+            test_input,
+            session,
+            timeout_seconds=ready_check_timeout_sec,
+        )
+        if not test_output.success:
+            raise ValueError(
+                "Initial test run failed - Please make sure benchmark "
+                "arguments are correctly specified. "
+                f"Error: {test_output.error}")
+        else:
+            print("Initial test run completed. Starting main benchmark run...")
     else:
-        print("Initial test run completed. Starting main benchmark run...")
+        print("Skipping endpoint ready check.")

     if lora_modules:
         # For each input request, choose a LoRA module at random.
@@ -1151,7 +1155,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
         type=int,
         default=600,
         help="Maximum time to wait for the endpoint to become ready "
-        "in seconds (default: 600 seconds / 10 minutes).",
+        "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
+        "the ready check will be skipped."
     )
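
Usage sketch (hedged): the second hunk only shows the option's type, default, and help text, so the flag spelling is inferred from the ready_check_timeout_sec variable and assumed to be --ready-check-timeout-sec. Under that assumption, passing 0 skips the endpoint ready check entirely, e.g.:

    vllm bench serve --model <model> --ready-check-timeout-sec 0

Any positive value keeps the existing behavior of polling the endpoint for up to that many seconds before the main benchmark run starts.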