diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index cfd7bc2a4057..67466bdb9807 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -180,7 +180,3 @@ def get_device_communicator_cls(cls) -> str:
         Get device specific communicator class for distributed communication.
         """
         return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator"  # noqa
-
-    @classmethod
-    def supports_structured_output(cls) -> bool:
-        return True
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 053cf74ebceb..0576022be448 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -308,10 +308,6 @@ def supports_fp8(cls) -> bool:
     def supports_v1(cls, model_config: ModelConfig) -> bool:
         return True
 
-    @classmethod
-    def supports_structured_output(cls) -> bool:
-        return True
-
     @classmethod
     def use_custom_allreduce(cls) -> bool:
         return True
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index f011f14029a3..4c842b525110 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -92,7 +92,3 @@ def get_punica_wrapper(cls) -> str:
     @classmethod
     def get_device_communicator_cls(cls) -> str:
         return "vllm.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa
-
-    @classmethod
-    def supports_structured_output(cls) -> bool:
-        return True
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 2bb543bd73f7..9799d4cb2ea7 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-
 import enum
 import platform
 import random
@@ -9,14 +8,21 @@
 import numpy as np
 import torch
 
+from vllm.inputs import PromptType
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
+    from vllm.lora.request import LoRARequest
+    from vllm.pooling_params import PoolingParams
+    from vllm.sampling_params import SamplingParams
     from vllm.utils import FlexibleArgumentParser
 else:
     ModelConfig = None
     VllmConfig = None
+    LoRARequest = None
+    PoolingParams = None
+    SamplingParams = None
     FlexibleArgumentParser = None
 
 logger = init_logger(__name__)
@@ -379,13 +385,6 @@ def supports_v1(cls, model_config: ModelConfig) -> bool:
         """
         return False
 
-    @classmethod
-    def supports_structured_output(cls) -> bool:
-        """
-        Returns whether the current platform can support structured output.
-        """
-        return False
-
     @classmethod
     def use_custom_allreduce(cls) -> bool:
         """
@@ -393,6 +392,14 @@ def use_custom_allreduce(cls) -> bool:
         """
         return False
 
+    @classmethod
+    def validate_request(
+        cls,
+        prompt: PromptType,
+        params: Union[SamplingParams, PoolingParams],
+    ) -> None:
+        """Raises if this request is unsupported on this platform"""
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 93657881cbdd..c1f426e5b880 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -67,7 +67,3 @@ def get_device_communicator_cls(cls) -> str:
     @classmethod
     def use_all_gather(cls) -> bool:
         return True
-
-    @classmethod
-    def supports_structured_output(cls) -> bool:
-        return True
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index a2fbf416ecf2..d18b7c26f7ec 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -303,10 +303,6 @@ def supports_v1(cls, model_config: ModelConfig) -> bool:
         # V1 support on AMD gpus is experimental
         return True
 
-    @classmethod
-    def supports_structured_output(cls) -> bool:
-        return True
-
     @classmethod
     def use_custom_allreduce(cls) -> bool:
         # We only enable custom allreduce for MI300 series
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index eeadb4a71e5e..d5848424b332 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,19 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 
 import vllm.envs as envs
+from vllm.inputs import PromptType
 from vllm.logger import init_logger
 
 from .interface import Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
+    from vllm.lora.request import LoRARequest
+    from vllm.pooling_params import PoolingParams
+    from vllm.sampling_params import SamplingParams
 else:
     ModelConfig = None
     VllmConfig = None
+    LoRARequest = None
+    PoolingParams = None
+    SamplingParams = None
 
 logger = init_logger(__name__)
 
@@ -135,6 +142,13 @@ def supports_v1(cls, model_config: ModelConfig) -> bool:
         return True
 
     @classmethod
-    def supports_structured_output(cls) -> bool:
-        # Structured output is not supported on TPU.
-        return False
+    def validate_request(
+        cls,
+        prompt: PromptType,
+        params: Union[SamplingParams, PoolingParams],
+    ) -> None:
+        """Raises if this request is unsupported on this platform"""
+        if isinstance(params,
+                      SamplingParams) and params.guided_decoding is not None:
+            raise ValueError("Structured output is not supported on "
+                             f"{cls.device_name}.")
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index c4bd639384a4..225e756cd7ce 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -140,7 +140,3 @@ def device_support_bf16(cls) -> bool:
     @classmethod
     def get_device_communicator_cls(cls) -> str:
         return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator"  # noqa
-
-    @classmethod
-    def supports_structured_output(cls) -> bool:
-        return True
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 403edddfcbee..6c1a54bca9f8 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -137,11 +137,6 @@ def _validate_structured_output(self, params: SamplingParams) -> None:
         else:
             params.guided_decoding.backend = engine_level_backend
 
-        from vllm.platforms import current_platform
-        if not current_platform.supports_structured_output():
-            raise ValueError("Structured output is not supported on "
-                             f"{current_platform.device_name}.")
-
         # Request content validation
         if engine_level_backend.startswith("xgrammar"):
             # xgrammar with no fallback
@@ -183,6 +178,11 @@ def process_inputs(
         # TODO(woosuk): Support pooling models.
         # TODO(woosuk): Support encoder-decoder models.
 
+        from vllm.platforms import current_platform
+        current_platform.validate_request(
+            prompt=prompt,
+            params=params,
+        )
         self._validate_lora(lora_request)
         self._validate_params(params)
         if priority != 0:
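
Other platforms can override the new hook in the same way the TPU backend does above. The snippet below is a minimal, hypothetical sketch and is not part of this diff: the class MyPlatform, its device_name, and the logits_processors check are illustrative assumptions; only the validate_request signature and the fact that the V1 Processor calls it from process_inputs come from the changes above.

# Hypothetical out-of-tree platform; names and the rejected feature are
# illustrative only, not part of this change.
from typing import Union

from vllm.inputs import PromptType
from vllm.platforms.interface import Platform, PlatformEnum
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams


class MyPlatform(Platform):
    _enum = PlatformEnum.OOT
    device_name: str = "my_accelerator"

    @classmethod
    def validate_request(
        cls,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
    ) -> None:
        """Raises if this request is unsupported on this platform"""
        # The V1 Processor invokes this hook once per request in
        # process_inputs(), before LoRA/params validation.
        if isinstance(params, SamplingParams) and params.logits_processors:
            raise ValueError("Logits processors are not supported on "
                             f"{cls.device_name}.")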