diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index d04b1d1136a1..2a03ce1dffd6 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -156,7 +156,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -245,7 +243,6 @@ vllm bench serve \ ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 5471d6b8e4a5..fafbef5f3718 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -68,7 +68,7 @@ def test_bench_serve_chat(server): "5", "--endpoint", "/v1/chat/completions", - "--endpoint-type", + "--backend", "openai-chat", ] result = subprocess.run(command, capture_output=True, text=True) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1831539a6adb..47e87d1eb9e9 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1357,7 +1357,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: elif args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. - if args.endpoint_type == "openai-chat": + if args.backend == "openai-chat": input_requests = dataset.sample( num_requests=args.num_prompts, input_len=args.sonnet_input_len, @@ -1461,7 +1461,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: "Please consider contributing if you would " "like to add support for additional dataset formats.") - if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [ + if dataset_class.IS_MULTIMODAL and args.backend not in [ "openai-chat", "openai-audio", ]: @@ -1469,7 +1469,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: # endpoint-type. raise ValueError( "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' endpoint-type.") + "'openai-audio' backends.") input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, @@ -1562,7 +1562,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: try: # Enforce endpoint compatibility for multimodal datasets. - if args.dataset_name == "random-mm" and args.endpoint_type not in [ + if args.dataset_name == "random-mm" and args.backend not in [ "openai-chat"]: raise ValueError( "Multi-modal content (images) is only supported on " diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index d8784340eba1..7382782f1165 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -8,8 +8,8 @@ On the client side, run: vllm bench serve \ - --endpoint-type \ - --label \ + --backend \ + --label \ --model \ --dataset-name \ --request-rate \ @@ -52,6 +52,21 @@ and (shutil.which("gnuplot") is not None)) +# TODO: Remove this in v0.11.0 +class DeprecatedEndpointTypeAction(argparse.Action): + """Argparse action for the deprecated --endpoint-type flag. + """ + + def __call__(self, _, namespace, values, option_string=None): + warnings.warn( + "'--endpoint-type' is deprecated and will be removed in v0.11.0. " + "Please use '--backend' instead or remove this argument if you " + "have already set it.", + stacklevel=1, + ) + setattr(namespace, self.dest, values) + + class TaskType(Enum): GENERATION = "generation" EMBEDDING = "embedding" @@ -470,7 +485,7 @@ async def benchmark( else: request_func = ASYNC_REQUEST_FUNCS[endpoint_type] else: - raise ValueError(f"Unknown endpoint_type: {endpoint_type}") + raise ValueError(f"Unknown backend: {endpoint_type}") # Reuses connections across requests to reduce TLS handshake overhead. connector = aiohttp.TCPConnector( @@ -850,24 +865,28 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, def add_cli_args(parser: argparse.ArgumentParser): add_dataset_parser(parser) - parser.add_argument( - "--endpoint-type", - type=str, - default="openai", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) parser.add_argument( "--label", type=str, default=None, help="The label (prefix) of the benchmark results. If not specified, " - "the endpoint type will be used as the label.", + "the value of '--backend' will be used as the label.", ) parser.add_argument( "--backend", type=str, - default="vllm", + default="openai", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + help="The type of backend or endpoint to use for the benchmark." + ) + parser.add_argument( + "--endpoint-type", + type=str, + default=None, choices=list(ASYNC_REQUEST_FUNCS.keys()), + action=DeprecatedEndpointTypeAction, + help="'--endpoint-type' is deprecated and will be removed in v0.11.0. " + "Please use '--backend' instead.", ) parser.add_argument( "--base-url", @@ -1165,7 +1184,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: raise ValueError( "For exponential ramp-up, the start RPS cannot be 0.") - endpoint_type = args.endpoint_type label = args.label model_id = args.model model_name = args.served_model_name @@ -1228,7 +1246,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: gc.freeze() benchmark_result = await benchmark( - endpoint_type=args.endpoint_type, + endpoint_type=args.backend, api_url=api_url, base_url=base_url, model_id=model_id, @@ -1262,7 +1280,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") result_json["date"] = current_dt - result_json["endpoint_type"] = args.endpoint_type + result_json["endpoint_type"] = args.backend # for backward compatibility + result_json["backend"] = args.backend result_json["label"] = label result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id @@ -1312,7 +1331,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "") - label = label or endpoint_type + label = label or args.backend if args.ramp_up_strategy is not None: file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa else: