From 09133e9833811778240b3c2cc4de2390fd08e470 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 06:36:20 +0000 Subject: [PATCH 01/35] integrate aiter kernels: Linear, Norm, MOE Signed-off-by: vllmellm --- Dockerfile.rocm | 6 + vllm/envs.py | 29 +++++ .../layers/fused_moe/fused_moe.py | 113 ++++++++++++++++-- vllm/model_executor/layers/fused_moe/layer.py | 14 +++ vllm/model_executor/layers/layernorm.py | 30 ++++- vllm/model_executor/layers/linear.py | 9 ++ .../model_executor/layers/quantization/fp8.py | 19 +++ .../layers/quantization/utils/w8a8_utils.py | 26 +++- 8 files changed, 223 insertions(+), 23 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 14c522afd7f9..932661dba135 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -109,11 +109,17 @@ ARG COMMON_WORKDIR COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +RUN git clone --recursive https://github.com/ROCm/aiter.git +RUN cd /app/aiter && GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 +# Enable Aiter. Make sure this only exists on the aiter branch. +# ENV VLLM_USE_AITER=1 + CMD ["/bin/bash"] diff --git a/vllm/envs.py b/vllm/envs.py index 84426cb5bb22..a88db87bf824 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,6 +75,10 @@ VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False VLLM_ROCM_FP8_PADDING: bool = True + VLLM_ROCM_USE_AITER: bool = False + VLLM_ROCM_USE_AITER_LINEAR: bool = True + VLLM_ROCM_USE_AITER_MOE: bool = True + VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 VLLM_DISABLE_COMPILE_CACHE: bool = False @@ -510,6 +514,31 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + # use aiter ops unless specifically disabled + "VLLM_ROCM_USE_AITER": + lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in ("true", "1")), + + # use aiter moe op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_MOE": + lambda: + (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_USE_AITER_MOE", "True").lower() in + ("true", "1")), + + # use aiter linear op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_LINEAR": + lambda: + (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_USE_AITER_LINEAR", "True").lower() in + ("true", "1")), + + # use aiter rms norm op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_NORM": + lambda: + (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_USE_AITER_NORM", "True").lower() in + ("true", "1")), + # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index bc9573b36df7..6524084c494c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -17,6 +17,12 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( +) +if USE_ROCM_AITER_FMOE: + import aiter + import aiter.fused_moe_bf16_asm as aiter_fmoe_asm + logger 
= init_logger(__name__) @@ -946,17 +952,22 @@ def fused_topk( dtype=torch.int32, device=hidden_states.device) - ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) - del token_expert_indicies # Not used. Will be used in the future. + if USE_ROCM_AITER_FMOE: + aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, + gating_output.float(), renormalize) + else: + ops.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, + keepdim=True) + del token_expert_indicies # Not used. Will be used in the future. return topk_weights, topk_ids @@ -1141,6 +1152,81 @@ def outplace_fused_experts_fake( ) +def rocm_aiter_fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_fp8_blockscale: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + expert_mask: Optional[torch.Tensor] = None): + + if use_fp8_blockscale: + local_E = E = w1.shape[0] + if expert_mask is not None: + E = expert_mask.numel() + + topk = topk_ids.shape[1] + model_dim = w1.shape[-1] + dtype = hidden_states.dtype + scale_blk_k = block_shape[1] + + ( + sorted_token_ids, + sorted_weight_buf, + sorted_expert_ids, + num_valid_ids, + out_asm, + ) = aiter_fmoe_asm.moe_sorting_ck(topk_ids, + topk_weights, + E, + model_dim, + dtype, + expert_mask=expert_mask) + + a1, a1_scale = per_token_group_quant_fp8(hidden_states, scale_blk_k) + aiter.fmoe_fp8_blockscale_g1u1( + out_asm, + a1, + w1, + w2, + sorted_token_ids, + sorted_weight_buf, + sorted_expert_ids, + num_valid_ids, + topk, + w1_scale.view(local_E, -1), + w2_scale.view(local_E, -1), + a1_scale.t().contiguous(), + block_shape[0], + block_shape[1], + None, + ) + return out_asm + + if use_fp8_w8a8: + return aiter_fmoe_asm.asm_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weight=topk_weights, + topk_ids=topk_ids, + fc1_scale=w1_scale, + fc2_scale=w2_scale, + fc1_smooth_scale=None, + fc2_smooth_scale=None, + a16=False) + else: + return aiter.ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + expert_mask=expert_mask) + + def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -1150,6 +1236,7 @@ def fused_experts(hidden_states: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, + use_fp8_blockscale: bool = False, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, w1_scale: Optional[torch.Tensor] = None, @@ -1158,8 +1245,12 @@ def fused_experts(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> torch.Tensor: - + block_shape: Optional[List[int]] = None, + expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + if USE_ROCM_AITER_FMOE: + rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, + use_fp8_w8a8, use_fp8_blockscale, w1_scale, + w2_scale, block_shape, expert_mask) if inplace: torch.ops.vllm.inplace_fused_experts( hidden_states, w1, w2, 
topk_weights, topk_ids, use_fp8_w8a8, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 452f390f4987..3d3fc51723b1 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -27,6 +27,12 @@ from .moe_torch_iterative import fused_moe as fused_moe_pallas else: fused_moe_pallas = None # type: ignore + +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( +) +if USE_ROCM_AITER_FMOE: + import aiter.ops as aiter_ops + logger = init_logger(__name__) @@ -95,6 +101,14 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) + if USE_ROCM_AITER_FMOE: + layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight.data), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight.data), + requires_grad=False) + if current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: import intel_extension_for_pytorch as ipex diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index b476fb0dbc7e..236cbbbe4a50 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -5,7 +5,14 @@ import torch import torch.nn as nn +from vllm.envs import VLLM_ROCM_USE_AITER_NORM from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform + +USE_ROCM_AITER_NORM = VLLM_ROCM_USE_AITER_NORM \ + and current_platform.is_rocm() +if USE_ROCM_AITER_NORM: + import aiter @CustomOp.register("rms_norm") @@ -84,13 +91,24 @@ def forward_cuda( from vllm import _custom_ops as ops if residual is not None: - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) + if USE_ROCM_AITER_NORM: + aiter.rmsnorm2d_fwd_with_add( + x, + x, + residual, + residual, + self.weight.data, + self.variance_epsilon, + ) + else: + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) return x, residual + out = torch.empty_like(x) ops.rms_norm( out, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 521724765beb..0ffb7ec52932 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,7 @@ split_tensor_along_last_dim, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) +from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) @@ -25,6 +26,12 @@ RowvLLMParameter) # yapf: enable from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + +USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ + and current_platform.is_rocm() +if USE_ROCM_AITER_LINEAR: + from aiter.tuned_gemm import tgemm as aiter_tgemm logger = init_logger(__name__) @@ -138,6 +145,8 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + if USE_ROCM_AITER_LINEAR: + return aiter_tgemm.mm(x, layer.weigt, bias) return F.linear(x, layer.weight, bias) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 5e1bec0bb4be..1c4e1c67a34e 100644 --- 
a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -33,6 +33,11 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( +) +if USE_ROCM_AITER_FMOE: + import aiter.ops as aiter_ops + ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) @@ -656,6 +661,20 @@ def process_weights_after_loading(self, layer: Module) -> None: dq_weight, max_w13_scales[expert_id]) start += shard_size + if USE_ROCM_AITER_FMOE: + max_w13_scales = max_w13_scales.unsqueeze(-1).unsqueeze( + -1).expand((-1, layer.w13_weight.shape[1], -1)) + w2_scales = layer.w2_weight_scale.data.unsqueeze(-1).unsqueeze( + -1).expand((-1, layer.w2_weight.shape[1], -1)) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales.contiguous(), requires_grad=False) + layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight), + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False) return diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 0f93b7f6c45b..4276ac5f92be 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -5,8 +5,14 @@ import torch from vllm import _custom_ops as ops +from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.platforms import current_platform +USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ + and current_platform.is_rocm() +if USE_ROCM_AITER_LINEAR: + from aiter.tuned_gemm import tgemm as aiter_tgemm + # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = None @@ -172,12 +178,20 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + if USE_ROCM_AITER_LINEAR: + output = aiter_tgemm.mm(qinput, + weight.t(), + otype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) + else: + output = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) # A fix for discrepancy in scaled_mm which returns tuple # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: From ead17c7182015d9608d60e416376f90314bec786 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 07:17:39 +0000 Subject: [PATCH 02/35] maintain a consistent import pattern Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 ++-- vllm/model_executor/layers/layernorm.py | 4 ++-- vllm/model_executor/layers/linear.py | 4 ++-- vllm/model_executor/layers/quantization/fp8.py | 4 ++-- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6524084c494c..7794bc6002eb 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -17,8 +17,8 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( -) +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ + and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: import aiter import aiter.fused_moe_bf16_asm as aiter_fmoe_asm diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 236cbbbe4a50..b08c8e85e830 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -5,11 +5,11 @@ import torch import torch.nn as nn -from vllm.envs import VLLM_ROCM_USE_AITER_NORM +import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform -USE_ROCM_AITER_NORM = VLLM_ROCM_USE_AITER_NORM \ +USE_ROCM_AITER_NORM = envs.VLLM_ROCM_USE_AITER_NORM \ and current_platform.is_rocm() if USE_ROCM_AITER_NORM: import aiter diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0ffb7ec52932..84c438ebb75a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -8,12 +8,12 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter, UninitializedParameter +import vllm.envs as envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) @@ -28,7 +28,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ +USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ and 
current_platform.is_rocm() if USE_ROCM_AITER_LINEAR: from aiter.tuned_gemm import tgemm as aiter_tgemm diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 1c4e1c67a34e..396f544deaec 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -33,8 +33,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( -) +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ + and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: import aiter.ops as aiter_ops diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 4276ac5f92be..a4a3a1d6162b 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -4,11 +4,11 @@ import torch +import vllm.envs as envs from vllm import _custom_ops as ops -from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.platforms import current_platform -USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ +USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ and current_platform.is_rocm() if USE_ROCM_AITER_LINEAR: from aiter.tuned_gemm import tgemm as aiter_tgemm From 2527956512f22a3b6ac870f874ee015203bb57b8 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 09:03:37 +0000 Subject: [PATCH 03/35] add aiter fp8 block scaled moe kernel Signed-off-by: vllmellm --- vllm/envs.py | 7 +++++++ vllm/model_executor/layers/fused_moe/fused_moe.py | 8 +++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index a88db87bf824..b1a746a05895 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -78,6 +78,7 @@ VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True + VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -525,6 +526,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1") and os.getenv("VLLM_USE_AITER_MOE", "True").lower() in ("true", "1")), + # use aiter block scaled moe op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_BSCALED_MOE": + lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", + "True").lower() in ("true", "1")), + # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": lambda: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 7794bc6002eb..35b87e7d4eb1 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1158,13 +1158,12 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, - use_fp8_blockscale: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if use_fp8_blockscale: + if envs.VLLM_ROCM_USE_AITER_BSCALED_MOE and use_fp8_w8a8: local_E = E = w1.shape[0] if expert_mask is not None: E = expert_mask.numel() @@ -1236,7 +1235,6 @@ def fused_experts(hidden_states: 
torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, - use_fp8_blockscale: bool = False, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, w1_scale: Optional[torch.Tensor] = None, @@ -1249,8 +1247,8 @@ def fused_experts(hidden_states: torch.Tensor, expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_FMOE: rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, - use_fp8_w8a8, use_fp8_blockscale, w1_scale, - w2_scale, block_shape, expert_mask) + use_fp8_w8a8, w1_scale, w2_scale, block_shape, + expert_mask) if inplace: torch.ops.vllm.inplace_fused_experts( hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, From 814702a2909e1644f1120f2e2b6b73d064aec997 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 11:09:16 +0000 Subject: [PATCH 04/35] bugfix: fix import paths and wrong env variables Signed-off-by: vllmellm --- vllm/envs.py | 18 +++++++++--------- .../layers/fused_moe/fused_moe.py | 15 +++++++-------- vllm/model_executor/layers/fused_moe/layer.py | 6 +++--- vllm/model_executor/layers/quantization/fp8.py | 6 +++--- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index b1a746a05895..023ab0e54f49 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -517,33 +517,33 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter ops unless specifically disabled "VLLM_ROCM_USE_AITER": - lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in ("true", "1")), + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1")), # use aiter moe op if aiter ops are enabled "VLLM_ROCM_USE_AITER_MOE": lambda: - (os.getenv("VLLM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_USE_AITER_MOE", "True").lower() in + (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in ("true", "1")), # use aiter block scaled moe op if aiter ops are enabled "VLLM_ROCM_USE_AITER_BSCALED_MOE": - lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in + lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", "True").lower() in ("true", "1")), # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": - lambda: - (os.getenv("VLLM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_USE_AITER_LINEAR", "True").lower() in - ("true", "1")), + lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True" + ).lower() in ("true", "1")), # use aiter rms norm op if aiter ops are enabled "VLLM_ROCM_USE_AITER_NORM": lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_USE_AITER_NORM", "True").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 35b87e7d4eb1..58bd36be2f0e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -21,7 +21,7 @@ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: import aiter - import aiter.fused_moe_bf16_asm as aiter_fmoe_asm + import aiter.fused_moe_bf16_asm as aiter_asm_fmoe logger = init_logger(__name__) @@ -1179,7 +1179,7 @@ 
def rocm_aiter_fused_experts(hidden_states: torch.Tensor, sorted_expert_ids, num_valid_ids, out_asm, - ) = aiter_fmoe_asm.moe_sorting_ck(topk_ids, + ) = aiter_asm_fmoe.moe_sorting_ck(topk_ids, topk_weights, E, model_dim, @@ -1207,7 +1207,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, return out_asm if use_fp8_w8a8: - return aiter_fmoe_asm.asm_moe(hidden_states=hidden_states, + return aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, w1=w1, w2=w2, topk_weight=topk_weights, @@ -1222,8 +1222,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w1=w1, w2=w2, topk_weights=topk_weights, - topk_ids=topk_ids, - expert_mask=expert_mask) + topk_ids=topk_ids) def fused_experts(hidden_states: torch.Tensor, @@ -1246,9 +1245,9 @@ def fused_experts(hidden_states: torch.Tensor, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_FMOE: - rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, - use_fp8_w8a8, w1_scale, w2_scale, block_shape, - expert_mask) + return rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, + topk_ids, use_fp8_w8a8, w1_scale, + w2_scale, block_shape, expert_mask) if inplace: torch.ops.vllm.inplace_fused_experts( hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3d3fc51723b1..ef9aaa83cb61 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -31,7 +31,7 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( ) if USE_ROCM_AITER_FMOE: - import aiter.ops as aiter_ops + from aiter.ops.shuffle import shuffle_weight as aiter_shuffle_weight logger = init_logger(__name__) @@ -102,10 +102,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) if USE_ROCM_AITER_FMOE: - layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w13_weight.data), requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w2_weight.data), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 396f544deaec..16ed848ef742 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -36,7 +36,7 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: - import aiter.ops as aiter_ops + from aiter.ops.shuffle import aiter_shuffle_weight ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -668,10 +668,10 @@ def process_weights_after_loading(self, layer: Module) -> None: -1).expand((-1, layer.w2_weight.shape[1], -1)) layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w13_weight), requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w2_weight), requires_grad=False) From 024cfc57313d9538f7b317aee421ab9e578e9b44 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 28 Feb 2025 14:50:23 +0000 Subject: [PATCH 05/35] 
rename importing module names from amd/rocm aiter package to avoid confusion withpython builtin aiter function. Signed-off-by: vllmellm --- .../layers/fused_moe/fused_moe.py | 52 +++++++++---------- vllm/model_executor/layers/layernorm.py | 4 +- vllm/model_executor/layers/linear.py | 4 +- .../model_executor/layers/quantization/fp8.py | 10 ++-- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 58bd36be2f0e..b2b2dc6bd0bb 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -20,8 +20,8 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: - import aiter - import aiter.fused_moe_bf16_asm as aiter_asm_fmoe + import aiter as rocm_aiter + import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe logger = init_logger(__name__) @@ -953,8 +953,8 @@ def fused_topk( device=hidden_states.device) if USE_ROCM_AITER_FMOE: - aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, - gating_output.float(), renormalize) + rocm_aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, + gating_output.float(), renormalize) else: ops.topk_softmax( topk_weights, @@ -1179,15 +1179,15 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, sorted_expert_ids, num_valid_ids, out_asm, - ) = aiter_asm_fmoe.moe_sorting_ck(topk_ids, - topk_weights, - E, - model_dim, - dtype, - expert_mask=expert_mask) + ) = rocm_aiter_asm_fmoe.moe_sorting_ck(topk_ids, + topk_weights, + E, + model_dim, + dtype, + expert_mask=expert_mask) a1, a1_scale = per_token_group_quant_fp8(hidden_states, scale_blk_k) - aiter.fmoe_fp8_blockscale_g1u1( + rocm_aiter.fmoe_fp8_blockscale_g1u1( out_asm, a1, w1, @@ -1207,22 +1207,22 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, return out_asm if use_fp8_w8a8: - return aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False) + return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weight=topk_weights, + topk_ids=topk_ids, + fc1_scale=w1_scale, + fc2_scale=w2_scale, + fc1_smooth_scale=None, + fc2_smooth_scale=None, + a16=False) else: - return aiter.ck_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) + return rocm_aiter.ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids) def fused_experts(hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index b08c8e85e830..0d5d1a454594 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -12,7 +12,7 @@ USE_ROCM_AITER_NORM = envs.VLLM_ROCM_USE_AITER_NORM \ and current_platform.is_rocm() if USE_ROCM_AITER_NORM: - import aiter + import aiter as rocm_aiter @CustomOp.register("rms_norm") @@ -92,7 +92,7 @@ def forward_cuda( if residual is not None: if USE_ROCM_AITER_NORM: - aiter.rmsnorm2d_fwd_with_add( + rocm_aiter.rmsnorm2d_fwd_with_add( x, x, residual, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 84c438ebb75a..dddf19e4c617 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -31,7 +31,7 
@@ USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ and current_platform.is_rocm() if USE_ROCM_AITER_LINEAR: - from aiter.tuned_gemm import tgemm as aiter_tgemm + from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm logger = init_logger(__name__) @@ -146,7 +146,7 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_LINEAR: - return aiter_tgemm.mm(x, layer.weigt, bias) + return rocm_aiter_tgemm.mm(x, layer.weigt, bias) return F.linear(x, layer.weight, bias) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 16ed848ef742..2edccd0984a3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -36,7 +36,7 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: - from aiter.ops.shuffle import aiter_shuffle_weight + from aiter.ops.shuffle import shuffle_weight as rocm_aiter_shuffle_weight ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -668,10 +668,10 @@ def process_weights_after_loading(self, layer: Module) -> None: -1).expand((-1, layer.w2_weight.shape[1], -1)) layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( - layer.w13_weight), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter( + rocm_aiter_shuffle_weight(layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( layer.w2_weight), requires_grad=False) From 7cfe429744b7447fb60a97636a5daf5bae4ff74d Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 1 Mar 2025 10:50:37 +0000 Subject: [PATCH 06/35] bugfixe on wrong env variable spelling an add missing statment condition in Fp8MoEMethod Signed-off-by: vllmellm --- vllm/envs.py | 4 ++-- .../layers/fused_moe/fused_moe.py | 10 ++++++++-- vllm/model_executor/layers/quantization/fp8.py | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 023ab0e54f49..49b21a687736 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -529,9 +529,9 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter block scaled moe op if aiter ops are enabled "VLLM_ROCM_USE_AITER_BSCALED_MOE": - lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", - "True").lower() in ("true", "1")), + "true").lower() in ("true", "1")), # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b2b2dc6bd0bb..0abd2c58d0e8 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1162,8 +1162,10 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if envs.VLLM_ROCM_USE_AITER_BSCALED_MOE and use_fp8_w8a8: + assert w1_scale is not None + assert w2_scale is not None + local_E = E = w1.shape[0] if expert_mask is not None: E = expert_mask.numel() @@ -1171,6 +1173,10 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, topk = topk_ids.shape[1] 
model_dim = w1.shape[-1] dtype = hidden_states.dtype + # The default block sizes are 128 in AITER. + if block_shape is None: + block_shape = [128, 128] + scale_blk_k = block_shape[1] ( @@ -1206,7 +1212,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, ) return out_asm - if use_fp8_w8a8: + elif use_fp8_w8a8: return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, w1=w1, w2=w2, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 2edccd0984a3..e57d05cf2f28 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -593,6 +593,23 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + + if USE_ROCM_AITER_FMOE: + w13_scales = layer.w13_weight_scale.data.unsqueeze( + -1).unsqueeze(-1).expand( + (-1, layer.w13_weight.shape[1], -1)) + w2_scales = layer.w2_weight_scale.data.unsqueeze(-1).unsqueeze( + -1).expand((-1, layer.w2_weight.shape[1], -1)) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales.contiguous(), requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter( + w13_scales.contiguous(), requires_grad=False) + layer.w13_weight = torch.nn.Parameter( + rocm_aiter_shuffle_weight(layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w2_weight), + requires_grad=False) return # If checkpoint is fp8, we need to handle that the From 41e7e4fa8ce5da4475046ecac6ab426bc3509fb6 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 1 Mar 2025 10:58:04 +0000 Subject: [PATCH 07/35] enabled VLLM_ROCM_USE_AITER in unit-tests Signed-off-by: vllmellm --- tests/kernels/test_moe.py | 22 ++++++++++++++----- .../decoder_only/language/test_granite.py | 19 ++++++++-------- .../decoder_only/language/test_mistral.py | 18 +++++++-------- .../decoder_only/language/test_models.py | 18 +++++++-------- .../decoder_only/language/test_phimoe.py | 17 +++++++------- 5 files changed, 53 insertions(+), 41 deletions(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2f5c69046f48..b48978721069 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -202,11 +202,15 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) @torch.inference_mode() -def test_mixtral_moe(dtype: torch.dtype): +def test_mixtral_moe(dtype: torch.dtype, use_rocm_aiter: bool, monkeypatch): """Make sure our Mixtral MoE implementation agrees with the one from huggingface.""" + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") # Instantiate our and huggingface's MoE blocks config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") @@ -242,10 +246,18 @@ def test_mixtral_moe(dtype: torch.dtype): torch.bfloat16: 1e-2, } - torch.testing.assert_close(hf_states.flatten(0, 1), - vllm_states, - rtol=mixtral_moe_tol[dtype], - atol=mixtral_moe_tol[dtype]) + if use_rocm_aiter: + # The values of rtol and atol are set based on the tests in ROCM AITER package. 
# noqa: E501 + # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=0.01, + atol=100) + else: + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype]) @pytest.mark.parametrize("m", [1, 33, 64, 222]) diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index 119b79d64c96..f34b67ca56d5 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -5,6 +5,8 @@ """ import pytest +from vllm.platforms import current_platform + from ...utils import check_logprobs_close MODELS = [ @@ -18,15 +20,14 @@ @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 17923673023f..ce9af7faa410 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -12,6 +12,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa MistralToolParser) +from vllm.platforms import current_platform from vllm.sampling_params import GuidedDecodingParams, SamplingParams from ...utils import check_logprobs_close @@ -174,15 +175,14 @@ @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + # TODO(sang): Sliding window should be tested separately. 
with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 71e4a9f11ab8..7c21ed0fecda 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -5,6 +5,8 @@ """ import pytest +from vllm.platforms import current_platform + from ...utils import check_logprobs_close @@ -63,15 +65,13 @@ @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: if model.startswith("THUDM/chatglm3"): diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index f9757d6ac295..b8948976a03e 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -79,15 +79,14 @@ def test_phimoe_routing_function(): @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) From 5f668ea2b9546f73bdb12f3002bc590c0b38f92c Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 1 Mar 2025 10:59:24 +0000 Subject: [PATCH 08/35] include the AMD AITER package in rocm_base docker file Signed-off-by: vllmellm --- Dockerfile.rocm | 6 ------ Dockerfile.rocm_base | 27 ++++++++++++++++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 932661dba135..14c522afd7f9 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -109,17 +109,11 @@ ARG COMMON_WORKDIR COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -RUN git clone --recursive https://github.com/ROCm/aiter.git -RUN cd /app/aiter && GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter - ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 -# Enable Aiter. Make sure this only exists on the aiter branch. 
-# ENV VLLM_USE_AITER=1 - CMD ["/bin/bash"] diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index e33e73b30309..494dc17c13ae 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -1,17 +1,19 @@ ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete -ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLASLT_BRANCH="db8e93b4" ARG HIPBLAS_COMMON_BRANCH="7c1566b" ARG LEGACY_HIPBLASLT_OPTION= ARG RCCL_BRANCH="648a58d" ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="3a585126" -ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_BRANCH="6c0e7463" +ARG PYTORCH_VISION_BRANCH="v0.21.0" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -ARG FA_BRANCH="b7d29fb" -ARG FA_REPO="https://github.com/ROCm/flash-attention.git" +ARG FA_BRANCH="1a7f4dfa" +ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" +ARG AITER_BRANCH="dfed377" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -108,7 +110,7 @@ RUN git clone ${FA_REPO} RUN cd flash-attention \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ - && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist + && GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \ && cp /app/flash-attention/dist/*.whl /app/install @@ -129,7 +131,17 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl +ARG AITER_REPO +ARG AITER_BRANCH +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter + ARG BASE_IMAGE +ARG HIPBLAS_COMMON_BRANCH ARG HIPBLASLT_BRANCH ARG LEGACY_HIPBLASLT_OPTION ARG RCCL_BRANCH @@ -155,4 +167,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt + && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file From 8c5eb52771d81f052218e8f685d3f047a3032a57 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 3 Mar 2025 04:07:28 +0000 Subject: [PATCH 09/35] integrate AITER paged attention Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 33 +++- vllm/attention/ops/rocm_aiter_paged_attn.py | 195 ++++++++++++++++++++ vllm/envs.py | 17 +- 3 files changed, 239 insertions(+), 6 deletions(-) create mode 100644 vllm/attention/ops/rocm_aiter_paged_attn.py diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 3f40686ee2fd..2a696b7147d0 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -12,20 +12,27 @@ AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) -from 
vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) from vllm.logger import init_logger from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +USE_ROCM_AITER_PA = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN +if USE_ROCM_AITER_PA: + from vllm.attention.ops.rocm_aiter_paged_attn import ( + PagedAttention, PagedAttentionMetadata) +else: + from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) + logger = init_logger(__name__) _PARTITION_SIZE_ROCM = 512 _GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName _ON_NAVI = "gfx1" in _GPU_ARCH _ON_MI250_MI300 = any(arch in _GPU_ARCH for arch in ["gfx90a", "gfx942"]) +USE_ROCM_CUSTOM_PA = envs.VLLM_ROCM_USE_CUSTOM_PAGED_ATTN class ROCmFlashAttentionBackend(AttentionBackend): @@ -463,6 +470,7 @@ def __init__( if blocksparse_params is not None: raise ValueError( "ROCmFlashAttention does not support blocksparse attention.") + self.aiter_kv_scales_initialized = False if logits_soft_cap is None: # In flash-attn, setting logits_soft_cap as 0 means no soft cap. @@ -608,6 +616,24 @@ def forward( else: assert value is None + if (USE_ROCM_AITER_PA and kv_cache.dtype.itemsize == 1 + and not self.aiter_kv_scales_initialized + and kv_cache.shape != torch.Size([0])): + num_blocks = kv_cache.shape[1] + block_size = kv_cache.shape[2] // (self.num_kv_heads * + self.head_size) + k_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), + dtype=torch.float32, + device=kv_cache.device) + v_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), + dtype=torch.float32, + device=kv_cache.device) + self.aiter_kv_scales_initialized = True + k_scale.fill_(layer._k_scale.item()) + v_scale.fill_(layer._v_scale.item()) + layer._k_scale = k_scale + layer._v_scale = v_scale + if self.attn_type != AttentionType.ENCODER and kv_cache.numel() > 0: key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) @@ -885,4 +911,5 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) + and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 + and USE_ROCM_CUSTOM_PA) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py new file mode 100644 index 000000000000..847d39077494 --- /dev/null +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import aiter as rocm_aiter +import torch + +from vllm import _custom_ops as ops +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.attention.ops.prefix_prefill import context_attention_fwd + + +@dataclass +class PagedAttentionMetadata: + """Metadata for PagedAttention.""" + # (batch_size,). The length of sequences (entire tokens seen so far) per + # sequence. + seq_lens_tensor: Optional[torch.Tensor] + # Maximum sequence length in the batch. 0 if it is prefill-only batch. + max_decode_seq_len: int + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. 
Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + + +class PagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [32, 64, 80, 96, 112, 120, 128, 192, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size * num_kv_heads * head_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 16 // kv_cache.element_size() + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + ) -> None: + if key_cache.dtype.itemsize == 1: + if "fp8" in kv_cache_dtype: + key_cache = key_cache.view(torch.float8_e4m3fnuz) + value_cache = value_cache.view(torch.float8_e4m3fnuz) + else: + key_cache = key_cache.view(torch.int8) + value_cache = value_cache.view(torch.int8) + rocm_aiter.reshape_and_cache_with_pertoken_quant( + key, value, key_cache, value_cache, k_scale, v_scale, + slot_mapping.flatten(), True) + else: + rocm_aiter.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping.flatten(), + kv_cache_dtype, + k_scale.view(-1)[0].item(), + v_scale.view(-1)[0].item(), True) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + max_seq_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> torch.Tensor: + if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: + # use blocksparse paged attention + block_size = value_cache.size(-1) + assert (blocksparse_block_size > 0 and + blocksparse_block_size % block_size == 0), \ + (f"{blocksparse_block_size=} needs to be a multiple of" + f"{block_size=} used in block_tables.") + + output = torch.empty_like(query) + block_size = value_cache.shape[3] + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + + if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: + k_scale, v_scale = (None, None) + query = query.contiguous() + elif "fp8" in kv_cache_dtype: + key_cache = key_cache.view(torch.float8_e4m3fnuz) + value_cache = value_cache.view(torch.float8_e4m3fnuz) + rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables, + seq_lens, max_num_blocks_per_seq, k_scale, + v_scale, output) + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache_dtype: str, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + query_start_loc: torch.Tensor, + seq_lens_tensor: torch.Tensor, + 
context_lens: torch.Tensor, + max_query_len: int, + alibi_slopes: Optional[torch.Tensor], + sliding_window: Optional[int], + k_scale: float, + v_scale: float, + ) -> torch.Tensor: + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + kv_cache_dtype, + key_cache, + value_cache, + block_tables, + # query_start_loc is (batch_size + 1,) + query_start_loc[:-1], + seq_lens_tensor, + context_lens, + max_query_len, + k_scale, + v_scale, + alibi_slopes, + sliding_window, + ) + return output + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) diff --git a/vllm/envs.py b/vllm/envs.py index 49b21a687736..ab39cfe805dd 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,12 +74,14 @@ VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False - VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True + VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = True + VLLM_ROCM_USE_CUSTOM_PAGED_ATTN: bool = True + VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 VLLM_DISABLE_COMPILE_CACHE: bool = False @@ -535,16 +537,25 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": - lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True" ).lower() in ("true", "1")), # use aiter rms norm op if aiter ops are enabled "VLLM_ROCM_USE_AITER_NORM": lambda: - (os.getenv("VLLM_USE_AITER", "False").lower() in + (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in ("true", "1")), + "VLLM_ROCM_USE_AITER_PAGED_ATTN": + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", + "True").lower() in ("true", "1")), + + # use rocm custom paged attention. 
+ "VLLM_ROCM_USE_CUSTOM_PAGED_ATTN": + lambda: (os.getenv("VLLM_ROCM_USE_CUSTOM_PAGED_ATTN", "False").lower() in + ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": From 77cb4360f9f78c69fb3ce7b9b7e5bd2c31d1129d Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 3 Mar 2025 05:18:47 +0000 Subject: [PATCH 10/35] bugfixes and disable rocm aiter paged attention Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 3 +-- vllm/attention/ops/rocm_aiter_paged_attn.py | 1 + vllm/envs.py | 10 +++------- vllm/model_executor/layers/linear.py | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 2a696b7147d0..17578069e5d3 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -32,7 +32,6 @@ _GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName _ON_NAVI = "gfx1" in _GPU_ARCH _ON_MI250_MI300 = any(arch in _GPU_ARCH for arch in ["gfx90a", "gfx942"]) -USE_ROCM_CUSTOM_PA = envs.VLLM_ROCM_USE_CUSTOM_PAGED_ATTN class ROCmFlashAttentionBackend(AttentionBackend): @@ -912,4 +911,4 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and USE_ROCM_CUSTOM_PA) + and not USE_ROCM_AITER_PA) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 847d39077494..8a425988290c 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -126,6 +126,7 @@ def forward_decode( elif "fp8" in kv_cache_dtype: key_cache = key_cache.view(torch.float8_e4m3fnuz) value_cache = value_cache.view(torch.float8_e4m3fnuz) + rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables, seq_lens, max_num_blocks_per_seq, k_scale, v_scale, output) diff --git a/vllm/envs.py b/vllm/envs.py index ab39cfe805dd..689cab23a946 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -80,7 +80,6 @@ VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = True - VLLM_ROCM_USE_CUSTOM_PAGED_ATTN: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -547,15 +546,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in ("true", "1")), + + # use aiter paged attention if aiter ops are enabled. "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", - "True").lower() in ("true", "1")), - - # use rocm custom paged attention. 
- "VLLM_ROCM_USE_CUSTOM_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_USE_CUSTOM_PAGED_ATTN", "False").lower() in - ("true", "1")), + "False").lower() in ("false", "0")), # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dddf19e4c617..efad68f63c69 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -146,7 +146,7 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_LINEAR: - return rocm_aiter_tgemm.mm(x, layer.weigt, bias) + return rocm_aiter_tgemm.mm(x, layer.weight, bias) return F.linear(x, layer.weight, bias) From 4c4178154fba53539fe4fcf25b3ac90f1db21ad2 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 4 Mar 2025 07:41:12 +0000 Subject: [PATCH 11/35] revert back the custom pa condition Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 122556d656c6..dd7e3a67f6a9 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -910,5 +910,4 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and not USE_ROCM_AITER_PA) + and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) From c09a740cef157ed6322deaa4706bf6a07820a499 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 4 Mar 2025 08:48:09 +0000 Subject: [PATCH 12/35] enable AITER tgemm.mm per tensor scaled mm unittest Signed-off-by: tjtanaa --- tests/quantization/test_fp8.py | 24 +++++++++++++++++++++--- vllm/envs.py | 5 +++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 3a7f0a196b5b..7081dd3cae0b 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -23,8 +23,13 @@ reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", MODELS) @pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, - monkeypatch) -> None: + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") @@ -47,7 +52,14 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", KV_CACHE_MODELS) -def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, + use_rocm_aiter: bool, monkeypatch): + + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: def check_model(model): @@ -84,8 +96,14 @@ def check_model(model): reason="FP8 is not 
supported on this GPU type.") @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) @pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, - monkeypatch) -> None: + use_rocm_aiter: bool, monkeypatch) -> None: + + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") diff --git a/vllm/envs.py b/vllm/envs.py index c1e3dede8248..60a0a9eb9163 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -79,7 +79,7 @@ VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True - VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = True + VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -555,9 +555,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1")), # use aiter paged attention if aiter ops are enabled. + # this is disabled by default "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in - ("false", "0")), + ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": From e19b7f59bc74e6b2cdf043d2ee4b3cbeef150a00 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 4 Mar 2025 09:27:04 +0000 Subject: [PATCH 13/35] bugfix: shuffle the weights when using aiter fmoe block scaled kernel Signed-off-by: vllmellm --- vllm/envs.py | 9 +++++---- vllm/model_executor/layers/fused_moe/fused_moe.py | 7 +++++-- vllm/model_executor/layers/quantization/fp8.py | 11 ++++++++++- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 60a0a9eb9163..1fef5141fd58 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -77,7 +77,7 @@ VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True - VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True + VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False VLLM_ROCM_FP8_PADDING: bool = True @@ -536,10 +536,11 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1")), # use aiter block scaled moe op if aiter ops are enabled - "VLLM_ROCM_USE_AITER_BSCALED_MOE": + "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", - "true").lower() in ("true", "1")), + ("true", "1") and os.getenv( + "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "true").lower() in + ("true", "1")), # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f982b8288885..4d5c56874485 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -19,7 +19,10 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() -if USE_ROCM_AITER_FMOE: +USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ + and current_platform.is_rocm() # noqa: E501 + +if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: import aiter as rocm_aiter 
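At this point the aiter import still happens at module import time whenever either flag is set; patch 15 below replaces these module-level constants with predicates in vllm/utils.py and moves the imports into the functions that need them. A reduced sketch of that function-local pattern, using a stand-in predicate rather than the real helper:

    import os

    def rocm_aiter_moe_enabled() -> bool:
        # Stand-in for the helper later added to vllm/utils.py; the real one
        # also checks current_platform.is_rocm().
        return os.getenv("VLLM_ROCM_USE_AITER_MOE", "0").lower() in ("true", "1")

    def get_topk_softmax():
        if rocm_aiter_moe_enabled():
            # Imported lazily, so importing this module never requires the
            # aiter wheel; only the ROCm/AITER code path pays for the import.
            import aiter as rocm_aiter
            return rocm_aiter.topk_softmax
        from vllm import _custom_ops as ops
        return ops.topk_softmax

The later dispatch_* helpers in this series follow the same shape: resolve the backend once, return a plain callable, and keep aiter strictly optional.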
import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe @@ -1167,7 +1170,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if envs.VLLM_ROCM_USE_AITER_BSCALED_MOE and use_fp8_w8a8: + if USE_ROCM_AITER_FMOE and use_fp8_w8a8: assert w1_scale is not None assert w2_scale is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ff0574477ff5..6d035bbf4a6b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -35,7 +35,9 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() -if USE_ROCM_AITER_FMOE: +USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ + and current_platform.is_rocm() # noqa: E501 +if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: from aiter.ops.shuffle import shuffle_weight as rocm_aiter_shuffle_weight ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -561,6 +563,13 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = Parameter(w2_weight, requires_grad=False) layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) + if USE_ROCM_AITER_FMOE and USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: + layer.w13_weight = torch.nn.Parameter( + rocm_aiter_shuffle_weight(layer.w13_weight.data), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w2_weight.data), + requires_grad=False) return # If checkpoint is fp16, quantize in place. From 11ac580243225d3480315ce490082903de060f14 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 4 Mar 2025 15:27:05 +0000 Subject: [PATCH 14/35] fix environment wrong variable in unit tests Signed-off-by: vllmellm --- tests/kernels/test_moe.py | 2 +- .../decoder_only/language/test_granite.py | 2 +- .../decoder_only/language/test_mistral.py | 54 ++++++++++--------- .../decoder_only/language/test_models.py | 2 +- .../decoder_only/language/test_phimoe.py | 2 +- tests/quantization/test_fp8.py | 6 +-- 6 files changed, 37 insertions(+), 31 deletions(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index b48978721069..0c122b07d2d6 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -210,7 +210,7 @@ def test_mixtral_moe(dtype: torch.dtype, use_rocm_aiter: bool, monkeypatch): huggingface.""" if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # Instantiate our and huggingface's MoE blocks config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index f34b67ca56d5..7dec7bbd0ca1 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -26,7 +26,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( diff --git a/tests/models/decoder_only/language/test_mistral.py 
b/tests/models/decoder_only/language/test_mistral.py index ce9af7faa410..68b27a8e7087 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -181,7 +181,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # TODO(sang): Sliding window should be tested separately. with hf_runner(model, dtype=dtype) as hf_model: @@ -205,14 +205,14 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_mistral_format( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, + max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner( model, dtype=dtype, @@ -243,11 +243,13 @@ def test_mistral_format( @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_mistral_symbolic_languages( - vllm_runner, - model: str, - dtype: str, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner(model, dtype=dtype, max_model_len=8192, @@ -264,11 +266,13 @@ def test_mistral_symbolic_languages( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) # v1 can't do func calling -def test_mistral_function_calling( - vllm_runner, - model: str, - dtype: str, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_function_calling(vllm_runner, model: str, dtype: str, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral", @@ -299,11 +303,13 @@ def test_mistral_function_calling( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding( - vllm_runner, - model: str, - guided_backend: str, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_guided_decoding(vllm_runner, model: str, guided_backend: str, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner(model, dtype='bfloat16', tokenizer_mode="mistral") as vllm_model: diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 7c21ed0fecda..eca2ca18dcb7 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -71,7 
+71,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: if model.startswith("THUDM/chatglm3"): diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index b8948976a03e..7e42124d7b2c 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -85,7 +85,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7081dd3cae0b..f279a8e9f664 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -28,7 +28,7 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") @@ -58,7 +58,7 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, use_rocm_aiter: bool, monkeypatch): if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: @@ -102,7 +102,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") From 08651240cf4edc4c2c685da48a47bbf5926ea931 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 5 Mar 2025 07:42:05 +0000 Subject: [PATCH 15/35] add aiter block gemm kernel and refactor aiter envs conditions Signed-off-by: tjtanaa --- vllm/envs.py | 7 +++++ .../layers/fused_moe/fused_moe.py | 24 +++++++-------- vllm/model_executor/layers/fused_moe/layer.py | 23 +++++++------- vllm/model_executor/layers/layernorm.py | 12 +++----- vllm/model_executor/layers/linear.py | 12 +++----- .../model_executor/layers/quantization/fp8.py | 25 +++++++++------- .../layers/quantization/utils/fp8_utils.py | 24 ++++++++++----- vllm/utils.py | 30 +++++++++++++++++++ 8 files changed, 101 insertions(+), 56 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 1fef5141fd58..46c806e89310 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -80,6 +80,7 @@ VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False + VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -561,6 +562,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in ("true", "1")), + # use aiter w8a8 block 
gemm kerner if aiter ops are enabled. + "VLLM_ROCM_USE_AITER_BLOCK_GEMM": + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + "True").lower() in ("true", "1")), + # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 4d5c56874485..6470dbaa555a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -15,16 +15,9 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) from vllm.platforms import current_platform -from vllm.utils import direct_register_custom_op - -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ - and current_platform.is_rocm() -USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ - and current_platform.is_rocm() # noqa: E501 - -if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: - import aiter as rocm_aiter - import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe +from vllm.utils import (direct_register_custom_op, + rocm_aiter_fp8_block_scaled_moe_enabled, + rocm_aiter_moe_enabled) logger = init_logger(__name__) @@ -955,7 +948,9 @@ def fused_topk( dtype=torch.int32, device=hidden_states.device) - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): + import aiter as rocm_aiter + rocm_aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output.float(), renormalize) else: @@ -1170,7 +1165,10 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if USE_ROCM_AITER_FMOE and use_fp8_w8a8: + import aiter as rocm_aiter + import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe + + if rocm_aiter_fp8_block_scaled_moe_enabled() and use_fp8_w8a8: assert w1_scale is not None assert w2_scale is not None @@ -1259,7 +1257,7 @@ def fused_experts(hidden_states: torch.Tensor, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): return rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, w1_scale, w2_scale, block_shape, expert_mask) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f74137cbab0a..61924c68e393 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# isort: skip_file from abc import abstractmethod from enum import Enum @@ -18,6 +19,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum +from vllm.utils import rocm_aiter_moe_enabled if current_platform.is_cuda_alike(): from .fused_moe import fused_experts @@ -29,11 +31,6 @@ else: fused_moe_pallas = None # type: ignore -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( -) -if USE_ROCM_AITER_FMOE: - from aiter.ops.shuffle import shuffle_weight as aiter_shuffle_weight - logger = init_logger(__name__) @@ -102,12 +99,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: 
int, def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) - if USE_ROCM_AITER_FMOE: - layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( - layer.w13_weight.data), + if rocm_aiter_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + + shuffled_w13_weight = rocm_aiter_shuffle_weight( + layer.w13_weight.data) + layer.w13_weight = torch.nn.Parameter(shuffled_w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( - layer.w2_weight.data), + + shuffled_w2_weight = rocm_aiter_shuffle_weight( + layer.w2_weight.data) + layer.w2_weight = torch.nn.Parameter(shuffled_w2_weight, requires_grad=False) if current_platform.is_cpu(): diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 0d5d1a454594..5976d9e38253 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -5,14 +5,8 @@ import torch import torch.nn as nn -import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform - -USE_ROCM_AITER_NORM = envs.VLLM_ROCM_USE_AITER_NORM \ - and current_platform.is_rocm() -if USE_ROCM_AITER_NORM: - import aiter as rocm_aiter +from vllm.utils import rocm_aiter_norm_enabled @CustomOp.register("rms_norm") @@ -91,7 +85,9 @@ def forward_cuda( from vllm import _custom_ops as ops if residual is not None: - if USE_ROCM_AITER_NORM: + if rocm_aiter_norm_enabled(): + import aiter as rocm_aiter + rocm_aiter.rmsnorm2d_fwd_with_add( x, x, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 36085d18ea42..72c2289bb063 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -8,7 +8,6 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter, UninitializedParameter -import vllm.envs as envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -26,12 +25,7 @@ RowvLLMParameter) # yapf: enable from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform - -USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ - and current_platform.is_rocm() -if USE_ROCM_AITER_LINEAR: - from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm +from vllm.utils import rocm_aiter_linear_enabled logger = init_logger(__name__) @@ -145,7 +139,9 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if USE_ROCM_AITER_LINEAR: + if rocm_aiter_linear_enabled(): + from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm + return rocm_aiter_tgemm.mm(x, layer.weight, bias) return F.linear(x, layer.weight, bias) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 6d035bbf4a6b..f6aa35d44cb2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# isort: skip_file from typing import Any, Callable, Dict, List, Optional @@ -32,13 +33,8 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform - -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ - and current_platform.is_rocm() -USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = 
envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ - and current_platform.is_rocm() # noqa: E501 -if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: - from aiter.ops.shuffle import shuffle_weight as rocm_aiter_shuffle_weight +from vllm.utils import (rocm_aiter_fp8_block_scaled_moe_enabled, + rocm_aiter_moe_enabled) ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -563,7 +559,10 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = Parameter(w2_weight, requires_grad=False) layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) - if USE_ROCM_AITER_FMOE and USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: + if rocm_aiter_fp8_block_scaled_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + layer.w13_weight = torch.nn.Parameter( rocm_aiter_shuffle_weight(layer.w13_weight.data), requires_grad=False) @@ -600,7 +599,10 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + w13_scales = layer.w13_weight_scale.data.unsqueeze( -1).unsqueeze(-1).expand( (-1, layer.w13_weight.shape[1], -1)) @@ -684,7 +686,10 @@ def process_weights_after_loading(self, layer: Module) -> None: dq_weight, max_w13_scales[expert_id]) start += shard_size - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + max_w13_scales = max_w13_scales.unsqueeze(-1).unsqueeze( -1).expand((-1, layer.w13_weight.shape[1], -1)) w2_scales = layer.w2_weight_scale.data.unsqueeze(-1).unsqueeze( diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 7d91d2cf1c6e..894fe775a85a 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -17,7 +17,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform -from vllm.utils import direct_register_custom_op +from vllm.utils import (direct_register_custom_op, + rocm_aiter_fp8_block_scaled_moe_enabled) logger = init_logger(__name__) @@ -71,12 +72,21 @@ def apply_w8a8_block_fp8_linear( q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1], column_major_scales=False) - output = w8a8_block_fp8_matmul(q_input, - weight, - x_scale, - weight_scale, - block_size, - output_dtype=input.dtype) + if rocm_aiter_fp8_block_scaled_moe_enabled(): + import aiter as rocm_aiter + + output = torch.zeros([q_input.shape[0], weight.shape[0]], + dtype=input.dtype, + device=q_input.device) + output = rocm_aiter.gemm_a8w8_blockscale(q_input, weight, x_scale, + weight_scale, output) + else: + output = w8a8_block_fp8_matmul(q_input, + weight, + x_scale, + weight_scale, + block_size, + output_dtype=input.dtype) if bias is not None: output = output + bias return output.to(dtype=input.dtype).view(*output_shape) diff --git a/vllm/utils.py b/vllm/utils.py index 26c9e1a90837..1b87a21ed966 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -997,6 +997,36 @@ def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None: enable_trace_function_call(log_path) +def is_hip() -> bool: + from vllm.platforms import current_platform + 
return current_platform.is_rocm() + + +def rocm_aiter_moe_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_MOE + + +def rocm_aiter_paged_attn_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_PAGED_ATTN + + +def rocm_aiter_linear_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_LINEAR + + +def rocm_aiter_norm_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_NORM + + +def rocm_aiter_fp8_block_scaled_moe_enabled() -> bool: + return rocm_aiter_moe_enabled( + ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE + + +def rocm_aiter_w8a8_block_gemm_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_BLOCK_GEMM + + # `functools` helpers def identity(value: T, **kwargs) -> T: """Returns the first provided value.""" From 623dadbea8056b0b0d222ecd5117379f3459a2d4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 10:37:31 +0000 Subject: [PATCH 16/35] add dispatch tests Signed-off-by: vllmellm --- requirements-test.txt | 22 ++- .../layers/fused_moe/fused_moe.py | 179 +++++++++++------- 2 files changed, 130 insertions(+), 71 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index f5722c82e201..e5bf67e099e4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -23,6 +23,10 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator +async-timeout==4.0.3 + # via + # aiohttp + # redis attrs==24.2.0 # via # aiohttp @@ -116,6 +120,10 @@ encodec==0.1.1 # via vocos evaluate==0.4.3 # via lm-eval +exceptiongroup==1.2.2 + # via + # anyio + # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -544,9 +552,7 @@ sentence-transformers==3.2.1 sentencepiece==0.2.0 # via mistral-common setuptools==75.8.0 - # via - # pytablewriter - # torch + # via pytablewriter six==1.16.0 # via # python-dateutil @@ -591,6 +597,12 @@ timm==1.0.11 # via -r requirements-test.in tokenizers==0.21.0 # via transformers +toml==0.10.2 + # via datamodel-code-generator +tomli==2.2.1 + # via + # black + # pytest torch==2.5.1 # via # -r requirements-test.in @@ -651,13 +663,17 @@ typepy==1.3.2 # tabledata typing-extensions==4.12.2 # via + # anyio # bitsandbytes + # black # huggingface-hub # librosa # mistral-common + # multidict # pqdm # pydantic # pydantic-core + # rich # torch tzdata==2024.2 # via pandas diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6470dbaa555a..6b5073d3b913 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -924,6 +924,35 @@ def try_get_optimal_moe_config( return config +def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool) -> None: + import aiter as rocm_aiter + rocm_aiter.topk_softmax(topk_weights, topk_indices, token_expert_indices, + gating_output, renormalize) + + +def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, renormalize: bool) -> None: + ops.topk_softmax( + topk_weights, + topk_indices, + token_expert_indices, + gating_output, + ) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + +def dispatch_topk_func() -> Callable[..., torch.Tensor]: + if rocm_aiter_moe_enabled(): + return rocm_aiter_topk_softmax + return vllm_topk_softmax + + def fused_topk( 
hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -948,22 +977,10 @@ def fused_topk( dtype=torch.int32, device=hidden_states.device) - if rocm_aiter_moe_enabled(): - import aiter as rocm_aiter - - rocm_aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, - gating_output.float(), renormalize) - else: - ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) + gating_output_float = gating_output.float() # TODO(woosuk): Optimize this. - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, - keepdim=True) + dispatch_topk_func()(topk_weights, topk_ids, token_expert_indicies, + gating_output_float, renormalize) del token_expert_indicies # Not used. Will be used in the future. return topk_weights, topk_ids @@ -1155,16 +1172,21 @@ def outplace_fused_experts_fake( ) -def rocm_aiter_fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - use_fp8_w8a8: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, - expert_mask: Optional[torch.Tensor] = None): +def rocm_aiter_fused_experts( + *, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + expert_mask: Optional[torch.Tensor] = None, + **kwagrs # Ignore additional keyword arguments +) -> torch.Tensor: + import aiter as rocm_aiter import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe @@ -1229,51 +1251,72 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, fc1_smooth_scale=None, fc2_smooth_scale=None, a16=False) - else: - return rocm_aiter.ck_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) - - -def fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - use_fp8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, - expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + + return rocm_aiter.ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids) + + +def torch_vllm_inplace_fused_experts(**kwargs) -> torch.Tensor: + hidden_states = kwargs['hidden_states'] + torch.ops.vllm.inplace_fused_experts(**kwargs) + return hidden_states + + +def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor: + return torch.ops.vllm.outplace_fused_experts(**kwargs) + + +def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: if rocm_aiter_moe_enabled(): - return rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, - topk_ids, use_fp8_w8a8, w1_scale, - w2_scale, block_shape, expert_mask) + return rocm_aiter_fused_experts if inplace: - torch.ops.vllm.inplace_fused_experts( - 
hidden_states, w1, w2, topk_weights, topk_ids, activation, - use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, - expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) - return hidden_states - else: - return torch.ops.vllm.outplace_fused_experts( - hidden_states, w1, w2, topk_weights, topk_ids, activation, - use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, - expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) + return torch_vllm_inplace_fused_experts + return torch_vllm_outplace_fused_experts + + +def fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, +) -> torch.Tensor: + return dispatch_fused_experts_func(inplace)( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape) def fused_experts_impl(hidden_states: torch.Tensor, From 459bb02fa4b1f52ef8aff0c8f8b0b427dd7b0cdd Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 10:38:56 +0000 Subject: [PATCH 17/35] add dispatch tests Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 54 ++++++++++ .../layers/quantization/utils/fp8_utils.py | 102 ++++++++++-------- 2 files changed, 114 insertions(+), 42 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 4a6a766b8ca0..4d2a61d76160 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -7,7 +7,15 @@ from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + dispatch_fused_experts_func, dispatch_topk_func, rocm_aiter_fused_experts, + rocm_aiter_topk_softmax, torch_vllm_inplace_fused_experts, + torch_vllm_outplace_fused_experts, vllm_topk_softmax) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + cutlass_scaled_mm, dispatch_blockscale_func, + rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) +from vllm.platforms import current_platform # Registered subclass for test @@ -87,3 +95,49 @@ def test_enabled_ops_invalid(env: str): custom_ops=env.split(","))) with set_current_vllm_config(vllm_config): RMSNorm(1024).enabled() + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + topk_func = dispatch_topk_func() + + if current_platform.is_rocm() and int(use_rocm_aiter): + assert 
topk_func == rocm_aiter_topk_softmax + else: + assert topk_func == vllm_topk_softmax + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool, + monkeypatch): + + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + fused_experts_func = dispatch_fused_experts_func(inplace) + if current_platform.is_rocm() and int(use_rocm_aiter): + assert fused_experts_func == rocm_aiter_fused_experts + elif inplace: + assert fused_experts_func == torch_vllm_inplace_fused_experts + else: + assert fused_experts_func == torch_vllm_outplace_fused_experts + + +@pytest.mark.parametrize("use_cutlass", [True, False]) +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_block_gemm", ["0", "1"]) +def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, + use_rocm_aiter_block_gemm: str, monkeypatch): + + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + use_rocm_aiter_block_gemm) + block_scale_func = dispatch_blockscale_func(use_cutlass) + + if use_cutlass: + assert block_scale_func == cutlass_scaled_mm + elif current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_block_gemm): + assert block_scale_func == rocm_aiter_gemm_a8w8_blockscale + else: + assert block_scale_func == w8a8_block_fp8_matmul diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 894fe775a85a..339fd8c00b95 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -4,7 +4,7 @@ import functools import json import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import triton @@ -18,7 +18,7 @@ CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform from vllm.utils import (direct_register_custom_op, - rocm_aiter_fp8_block_scaled_moe_enabled) + rocm_aiter_w8a8_block_gemm_enabled) logger = init_logger(__name__) @@ -33,6 +33,54 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz +def shape_supported_by_cutlass(weight: torch.Tensor, block_size: List[int], + weight_scale: torch.Tensor, + input_2d: torch.Tensor) -> bool: + if current_platform.is_rocm(): + scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + + input_2d.shape[:-1])[::-1] + scale_b_shape = (weight_scale.view(-1, 1) + if weight_scale.dim() <= 1 else weight_scale.T).shape + ar, ac = scale_a_shape + br, bc = scale_b_shape + return ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0]) \ + or br not in (1, weight.shape[0]) + + return weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0 + + +def cutlass_scaled_mm(A: torch.Tensor, B: torch.Tensor, As: torch.Tensor, + Bs: torch.Tensor, output_dtype: torch.dtype, + **kwargs) -> torch.Tensor: + return ops.cutlass_scaled_mm(A, + B.T, + out_dtype=output_dtype, + scale_a=As, + scale_b=Bs.T) + + +def rocm_aiter_gemm_a8w8_blockscale(A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + output_dtype: torch.dtype = torch.float16, + **kwargs) -> torch.Tensor: + import aiter as rocm_aiter + + output = torch.zeros([A.shape[0], B.shape[0]], + dtype=output_dtype, + 
device=A.device) + return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, output) + + +def dispatch_blockscale_func(use_cutlass: bool) -> Callable[..., torch.Tensor]: + if use_cutlass: + return cutlass_scaled_mm + if rocm_aiter_w8a8_block_gemm_enabled(): + return rocm_aiter_gemm_a8w8_blockscale + return w8a8_block_fp8_matmul + + def apply_w8a8_block_fp8_linear( input: torch.Tensor, weight: torch.Tensor, @@ -46,47 +94,17 @@ def apply_w8a8_block_fp8_linear( # View input as 2D matrix for fp8 methods input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[0]] + use_cutlass = cutlass_block_fp8_supported and shape_supported_by_cutlass() + + q_input, x_scale = per_token_group_quant_fp8( + input_2d, block_size[1], column_major_scales=use_cutlass) + output = dispatch_blockscale_func()(A=q_input, + B=weight, + As=x_scale, + Bs=weight_scale, + block_size=block_size, + output_dtype=input.dtype) - shape_supported_by_cutlass = (weight.shape[0] % 128 == 0 - and weight.shape[1] % 128 == 0) - if current_platform.is_rocm(): - scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + - input_2d.shape[:-1])[::-1] - scale_b_shape = (weight_scale.view(-1, 1) - if weight_scale.dim() <= 1 else weight_scale.T).shape - ar, ac = scale_a_shape - br, bc = scale_b_shape - if (ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0]) - or br not in (1, weight.shape[0])): - shape_supported_by_cutlass = False - if cutlass_block_fp8_supported and shape_supported_by_cutlass: - q_input, x_scale = per_token_group_quant_fp8(input_2d, - block_size[1], - column_major_scales=True) - output = ops.cutlass_scaled_mm(q_input, - weight.T, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale.T) - else: - q_input, x_scale = per_token_group_quant_fp8(input_2d, - block_size[1], - column_major_scales=False) - if rocm_aiter_fp8_block_scaled_moe_enabled(): - import aiter as rocm_aiter - - output = torch.zeros([q_input.shape[0], weight.shape[0]], - dtype=input.dtype, - device=q_input.device) - output = rocm_aiter.gemm_a8w8_blockscale(q_input, weight, x_scale, - weight_scale, output) - else: - output = w8a8_block_fp8_matmul(q_input, - weight, - x_scale, - weight_scale, - block_size, - output_dtype=input.dtype) if bias is not None: output = output + bias return output.to(dtype=input.dtype).view(*output_shape) From acc27ffa94e677b8f6fce0f5b593430ce6acbfe4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 11:48:26 +0000 Subject: [PATCH 18/35] add dispatch tests Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 39 ++++++++- vllm/model_executor/layers/layernorm.py | 82 ++++++++++++------- vllm/model_executor/layers/linear.py | 21 +++-- 3 files changed, 106 insertions(+), 36 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 4d2a61d76160..9796b2e494d2 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +import torch.nn.functional as F from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.custom_op import CustomOp @@ -11,7 +12,11 @@ dispatch_fused_experts_func, dispatch_topk_func, rocm_aiter_fused_experts, rocm_aiter_topk_softmax, torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, vllm_topk_softmax) -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm 
import ( + RMSNorm, dispatch_rmsnorm_func, fused_add_rms_norm, rms_norm, + rocm_aiter_rmsnorm2d_fwd_with_add) +from vllm.model_executor.layers.linear import ( + dipsatch_unquantized_linear_func, rocm_aiter_tgemm_mm) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( cutlass_scaled_mm, dispatch_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) @@ -141,3 +146,35 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, assert block_scale_func == rocm_aiter_gemm_a8w8_blockscale else: assert block_scale_func == w8a8_block_fp8_matmul + + +@pytest.mark.parametrize("add_residual", [True, False]) +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"]) +def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, + use_rocm_aiter_norm: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_NORM", use_rocm_aiter_norm) + rms_norm_func = dispatch_rmsnorm_func(add_residual) + + if not add_residual: + assert rms_norm_func == rms_norm + elif current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_norm): + assert rms_norm_func == rocm_aiter_rmsnorm2d_fwd_with_add + else: + assert rms_norm_func == fused_add_rms_norm + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_linear", ["0", "1"]) +def test_unquantized_linear_dispatch(use_rocm_aiter: str, + use_rocm_aiter_linear: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_linear) + linear_func = dipsatch_unquantized_linear_func() + if current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_linear): + assert linear_func == rocm_aiter_tgemm_mm + else: + assert linear_func == F.linear diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 5976d9e38253..d31b690a8c0d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Custom normalization layers.""" -from typing import Optional, Tuple, Union +from typing import Callable, Optional, Tuple, Union import torch import torch.nn as nn @@ -9,6 +9,56 @@ from vllm.utils import rocm_aiter_norm_enabled +def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, + **kwargs) -> torch.Tensor: + from vllm import _custom_ops as ops + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + weight, + variance_epsilon, + ) + return out + + +def fused_add_rms_norm(*, x: torch.Tensor, residual: torch.Tensor, + weight: torch.Tensor, variance_epsilon: float): + from vllm import _custom_ops as ops + return ops.fused_add_rms_norm( + x, + residual, + weight, + variance_epsilon, + ) + + +def rocm_aiter_rmsnorm2d_fwd_with_add( + *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + import aiter as rocm_aiter + + rocm_aiter.rmsnorm2d_fwd_with_add( + x, + x, + residual, + residual, + weight, + variance_epsilon, + ) + return x, residual + + +def dispatch_rmsnorm_func( + add_residual: bool +) -> Callable[..., Tuple[torch.Tensor, torch.Tensor]]: + if not add_residual: + return rms_norm + if rocm_aiter_norm_enabled(): + return rocm_aiter_rmsnorm2d_fwd_with_add + return fused_add_rms_norm + + @CustomOp.register("rms_norm") class 
RMSNorm(CustomOp): """Root mean square normalization. @@ -82,37 +132,13 @@ def forward_cuda( if self.variance_size_override is not None: return self.forward_native(x, residual) - from vllm import _custom_ops as ops - - if residual is not None: - if rocm_aiter_norm_enabled(): - import aiter as rocm_aiter - - rocm_aiter.rmsnorm2d_fwd_with_add( - x, - x, - residual, - residual, - self.weight.data, - self.variance_epsilon, - ) - else: - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) - return x, residual - - out = torch.empty_like(x) - ops.rms_norm( - out, + add_residual = residual is not None + return dispatch_rmsnorm_func(add_residual)( x, + residual, self.weight.data, self.variance_epsilon, ) - return out def forward_hpu( self, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 72c2289bb063..f484eb140f53 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -2,7 +2,7 @@ import itertools from abc import abstractmethod -from typing import Optional +from typing import Callable, Optional import torch import torch.nn.functional as F @@ -39,6 +39,18 @@ ] +def rocm_aiter_tgemm_mm(x: torch.Tensor, weight: torch.Tensor, + bias: torch.Tensor) -> torch.Tensor: + from aiter.tuned_gemm import tgemm + return tgemm.mm(x, weight, bias) + + +def dipsatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: + if rocm_aiter_linear_enabled(): + return rocm_aiter_tgemm_mm + return F.linear + + def adjust_marlin_shard(param, shard_size, shard_offset): marlin_tile_size = getattr(param, "marlin_tile_size", None) if marlin_tile_size is None: @@ -139,12 +151,7 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if rocm_aiter_linear_enabled(): - from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm - - return rocm_aiter_tgemm.mm(x, layer.weight, bias) - - return F.linear(x, layer.weight, bias) + return dipsatch_unquantized_linear_func()(x, layer.weight, bias) class LinearBase(torch.nn.Module): From 11b6aba777b6f265cd35d1fcda1ce287c7c413ff Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 15:23:18 +0000 Subject: [PATCH 19/35] bugfixes in layernorm and fix spelling mistakes Signed-off-by: vllmellm --- tests/model_executor/test_enabled_custom_ops.py | 4 ++-- vllm/model_executor/layers/layernorm.py | 12 +++++++----- vllm/model_executor/layers/linear.py | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 9796b2e494d2..b6d0248b8ab7 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -16,7 +16,7 @@ RMSNorm, dispatch_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_rmsnorm2d_fwd_with_add) from vllm.model_executor.layers.linear import ( - dipsatch_unquantized_linear_func, rocm_aiter_tgemm_mm) + dispatch_unquantized_linear_func, rocm_aiter_tgemm_mm) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( cutlass_scaled_mm, dispatch_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) @@ -172,7 +172,7 @@ def test_unquantized_linear_dispatch(use_rocm_aiter: str, use_rocm_aiter_linear: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_linear) - linear_func = dipsatch_unquantized_linear_func() + linear_func = 
dispatch_unquantized_linear_func() if current_platform.is_rocm() and int(use_rocm_aiter) and int( use_rocm_aiter_linear): assert linear_func == rocm_aiter_tgemm_mm diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index d31b690a8c0d..ffd2187a1da3 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -25,12 +25,14 @@ def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, def fused_add_rms_norm(*, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, variance_epsilon: float): from vllm import _custom_ops as ops - return ops.fused_add_rms_norm( + + ops.fused_add_rms_norm( x, residual, weight, variance_epsilon, ) + return x, residual def rocm_aiter_rmsnorm2d_fwd_with_add( @@ -134,10 +136,10 @@ def forward_cuda( add_residual = residual is not None return dispatch_rmsnorm_func(add_residual)( - x, - residual, - self.weight.data, - self.variance_epsilon, + x=x, + residual=residual, + weight=self.weight.data, + variance_epsilon=self.variance_epsilon, ) def forward_hpu( diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f484eb140f53..dcf28237c85b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -45,7 +45,7 @@ def rocm_aiter_tgemm_mm(x: torch.Tensor, weight: torch.Tensor, return tgemm.mm(x, weight, bias) -def dipsatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: +def dispatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: if rocm_aiter_linear_enabled(): return rocm_aiter_tgemm_mm return F.linear @@ -151,7 +151,7 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return dipsatch_unquantized_linear_func()(x, layer.weight, bias) + return dispatch_unquantized_linear_func()(x, layer.weight, bias) class LinearBase(torch.nn.Module): From 0a6b8a0c0a20d2c10f301fb452737aa309a8f31d Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 15:27:40 +0000 Subject: [PATCH 20/35] enable rocm aiter paged attention Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 9 +++++---- vllm/attention/ops/rocm_aiter_paged_attn.py | 21 ++++++++++++++------- vllm/envs.py | 5 +++-- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index dd7e3a67f6a9..28b3c914b3f6 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -14,12 +14,12 @@ CommonMetadataBuilder) from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils import rocm_aiter_paged_attn_enabled if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -USE_ROCM_AITER_PA = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN -if USE_ROCM_AITER_PA: +if rocm_aiter_paged_attn_enabled(): from vllm.attention.ops.rocm_aiter_paged_attn import ( PagedAttention, PagedAttentionMetadata) else: @@ -615,7 +615,7 @@ def forward( else: assert value is None - if (USE_ROCM_AITER_PA and kv_cache.dtype.itemsize == 1 + if (rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1 and not self.aiter_kv_scales_initialized and kv_cache.shape != torch.Size([0])): num_blocks = kv_cache.shape[1] @@ -910,4 +910,5 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 
or head_size == 128) and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) + and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 + and not rocm_aiter_paged_attn_enabled()) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 8a425988290c..ee48ad928501 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -108,6 +108,20 @@ def forward_decode( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> torch.Tensor: + if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: + if num_kv_heads == 1: + k_scale, v_scale = (None, None) + query = query.contiguous() + else: + raise NotImplementedError( + f"ROCM AITER paged attention does not \ + support num_kv_heads > 1 \ + for kv_cache_dtype: {kv_cache_dtype}") + + elif "fp8" in kv_cache_dtype: + key_cache = key_cache.view(torch.float8_e4m3fnuz) + value_cache = value_cache.view(torch.float8_e4m3fnuz) + if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: # use blocksparse paged attention block_size = value_cache.size(-1) @@ -120,13 +134,6 @@ def forward_decode( block_size = value_cache.shape[3] max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: - k_scale, v_scale = (None, None) - query = query.contiguous() - elif "fp8" in kv_cache_dtype: - key_cache = key_cache.view(torch.float8_e4m3fnuz) - value_cache = value_cache.view(torch.float8_e4m3fnuz) - rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables, seq_lens, max_num_blocks_per_seq, k_scale, v_scale, output) diff --git a/vllm/envs.py b/vllm/envs.py index 46c806e89310..f661bc479765 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -559,8 +559,9 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter paged attention if aiter ops are enabled. # this is disabled by default "VLLM_ROCM_USE_AITER_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in - ("true", "1")), + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", + "False").lower() in ("true", "1")), # use aiter w8a8 block gemm kerner if aiter ops are enabled. 
"VLLM_ROCM_USE_AITER_BLOCK_GEMM": From 1474828da678efb959776c81332cc960dcd1048a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 16:07:29 +0000 Subject: [PATCH 21/35] bugfix: add the missing argument in dispatch Signed-off-by: vllmellm --- .../layers/quantization/utils/fp8_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 339fd8c00b95..b8f7a8ca7636 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -98,12 +98,12 @@ def apply_w8a8_block_fp8_linear( q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) - output = dispatch_blockscale_func()(A=q_input, - B=weight, - As=x_scale, - Bs=weight_scale, - block_size=block_size, - output_dtype=input.dtype) + output = dispatch_blockscale_func(use_cutlass)(A=q_input, + B=weight, + As=x_scale, + Bs=weight_scale, + block_size=block_size, + output_dtype=input.dtype) if bias is not None: output = output + bias From b78114a8c38bb5f1098aa0f2535a9f0eae9f8a32 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 6 Mar 2025 08:10:05 +0000 Subject: [PATCH 22/35] update rocm AITER commit version Signed-off-by: vllmellm --- Dockerfile.rocm_base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index 494dc17c13ae..8f423d8f1492 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="dfed377" +ARG AITER_BRANCH="e1ec015" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base From d20d7571128bf5927e0e04c918bbb8992edb7aa2 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 6 Mar 2025 09:37:29 +0000 Subject: [PATCH 23/35] bug fix Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 8 ++++---- vllm/model_executor/layers/layernorm.py | 13 +++++++------ .../layers/quantization/utils/fp8_utils.py | 19 +++++++++++-------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index b6d0248b8ab7..340376d783c8 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -13,12 +13,12 @@ rocm_aiter_topk_softmax, torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, vllm_topk_softmax) from vllm.model_executor.layers.layernorm import ( - RMSNorm, dispatch_rmsnorm_func, fused_add_rms_norm, rms_norm, + RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_rmsnorm2d_fwd_with_add) from vllm.model_executor.layers.linear import ( dispatch_unquantized_linear_func, rocm_aiter_tgemm_mm) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - cutlass_scaled_mm, dispatch_blockscale_func, + cutlass_scaled_mm, dispatch_w8a8_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) from vllm.platforms import current_platform @@ -137,7 +137,7 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) 
monkeypatch.setenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", use_rocm_aiter_block_gemm) - block_scale_func = dispatch_blockscale_func(use_cutlass) + block_scale_func = dispatch_w8a8_blockscale_func(use_cutlass) if use_cutlass: assert block_scale_func == cutlass_scaled_mm @@ -155,7 +155,7 @@ def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, use_rocm_aiter_norm: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) monkeypatch.setenv("VLLM_ROCM_USE_AITER_NORM", use_rocm_aiter_norm) - rms_norm_func = dispatch_rmsnorm_func(add_residual) + rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual) if not add_residual: assert rms_norm_func == rms_norm diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index ffd2187a1da3..8fb55802ca38 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -22,8 +22,9 @@ def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, return out -def fused_add_rms_norm(*, x: torch.Tensor, residual: torch.Tensor, - weight: torch.Tensor, variance_epsilon: float): +def fused_add_rms_norm( + *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops ops.fused_add_rms_norm( @@ -51,9 +52,9 @@ def rocm_aiter_rmsnorm2d_fwd_with_add( return x, residual -def dispatch_rmsnorm_func( - add_residual: bool -) -> Callable[..., Tuple[torch.Tensor, torch.Tensor]]: +def dispatch_cuda_rmsnorm_func( + add_residual: bool +) -> Callable[..., Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]: if not add_residual: return rms_norm if rocm_aiter_norm_enabled(): @@ -135,7 +136,7 @@ def forward_cuda( return self.forward_native(x, residual) add_residual = residual is not None - return dispatch_rmsnorm_func(add_residual)( + return dispatch_cuda_rmsnorm_func(add_residual)( x=x, residual=residual, weight=self.weight.data, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b8f7a8ca7636..33ad52a54334 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -73,7 +73,8 @@ def rocm_aiter_gemm_a8w8_blockscale(A: torch.Tensor, return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, output) -def dispatch_blockscale_func(use_cutlass: bool) -> Callable[..., torch.Tensor]: +def dispatch_w8a8_blockscale_func( + use_cutlass: bool) -> Callable[..., torch.Tensor]: if use_cutlass: return cutlass_scaled_mm if rocm_aiter_w8a8_block_gemm_enabled(): @@ -94,16 +95,18 @@ def apply_w8a8_block_fp8_linear( # View input as 2D matrix for fp8 methods input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[0]] - use_cutlass = cutlass_block_fp8_supported and shape_supported_by_cutlass() + use_cutlass = cutlass_block_fp8_supported and shape_supported_by_cutlass( + weight, block_size, weight_scale, input_2d) q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) - output = dispatch_blockscale_func(use_cutlass)(A=q_input, - B=weight, - As=x_scale, - Bs=weight_scale, - block_size=block_size, - output_dtype=input.dtype) + output = dispatch_w8a8_blockscale_func(use_cutlass)( + A=q_input, + B=weight, + As=x_scale, + Bs=weight_scale, + block_size=block_size, + output_dtype=input.dtype) if bias is not None: output = output + bias From 
7754c2e0260f18690aa893dc504c1cc3d5adc2c3 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 6 Mar 2025 10:03:09 +0000 Subject: [PATCH 24/35] add more comments for code documentation Signed-off-by: vllmellm --- vllm/envs.py | 32 +++++++++++-------- vllm/model_executor/layers/fused_moe/layer.py | 1 + .../model_executor/layers/quantization/fp8.py | 3 ++ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index f661bc479765..79baee7b5bf1 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -80,7 +80,7 @@ VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False - VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = True + VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -524,32 +524,34 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), - # use aiter ops unless specifically disabled + # use aiter ops unless specifically disabled. + # Acts as a parent switch to enable the rest of the other operations. "VLLM_ROCM_USE_AITER": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")), - # use aiter moe op if aiter ops are enabled + # use aiter moe op if aiter ops are enabled. "VLLM_ROCM_USE_AITER_MOE": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in ("true", "1")), - # use aiter block scaled moe op if aiter ops are enabled + # use aiter block scaled moe op if aiter ops are enabled. + # by default this is disabled. "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE": - lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv( - "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "true").lower() in - ("true", "1")), + lambda: + (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", + "false").lower() in ("true", "1")), - # use aiter linear op if aiter ops are enabled + # use aiter linear op if aiter ops are enabled. "VLLM_ROCM_USE_AITER_LINEAR": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True" ).lower() in ("true", "1")), - # use aiter rms norm op if aiter ops are enabled + # use aiter rms norm op if aiter ops are enabled. "VLLM_ROCM_USE_AITER_NORM": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in @@ -557,19 +559,21 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1")), # use aiter paged attention if aiter ops are enabled. - # this is disabled by default + # this is disabled by default. "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in ("true", "1")), - # use aiter w8a8 block gemm kerner if aiter ops are enabled. + # use aiter w8a8 block gemm kernel if aiter ops are enabled. + # this is disabled by default. "VLLM_ROCM_USE_AITER_BLOCK_GEMM": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", - "True").lower() in ("true", "1")), + "False").lower() in ("true", "1")), - # Pad the fp8 weights to 256 bytes for ROCm + # Pad the fp8 weights to 256 bytes for ROCm. 
+ # Used only in Fp8LinearMethod "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), # Divisor for dynamic key scale factor calculation for FP8 KV Cache diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 61924c68e393..08da5a42d171 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -100,6 +100,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) if rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f6aa35d44cb2..e9e73a0aa2e1 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -560,6 +560,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) if rocm_aiter_fp8_block_scaled_moe_enabled(): + # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -600,6 +601,7 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) if rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -687,6 +689,7 @@ def process_weights_after_loading(self, layer: Module) -> None: start += shard_size if rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. 
from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) From 5e31c3e129e756b87a8886557b58665a0c72df8b Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 04:47:54 +0000 Subject: [PATCH 25/35] disable some model tests Signed-off-by: vllmellm --- .buildkite/run-amd-test.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 96fcafc9dc1c..a18dd1abd41a 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -96,6 +96,15 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_mamba_mixer2.py" fi +#ignore certain Model tests +if [[ $commands == *" models/decoder_only/language "* ]]; then + commands=${commands//" models/decoder_only/language "/" models/decoder_only/language \ + --ignore=models/decoder_only/language/test_mistral.py \ + --ignore=models/decoder_only/language/test_phimoe.py \ + --ignore=models/decoder_only/language/test_granite.py \ + --ignore=models/decoder_only/language/test_models.py "} +fi + #ignore certain Entrypoints tests if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ From d21c912f544f16d59297e23aa70d0232760be203 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 04:55:58 +0000 Subject: [PATCH 26/35] move rocm-aiter env flag checks to vllm.platforms.current_platform Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 4 +-- vllm/envs.py | 17 +++++------ .../layers/fused_moe/fused_moe.py | 11 ++++--- vllm/model_executor/layers/fused_moe/layer.py | 3 +- vllm/model_executor/layers/layernorm.py | 4 +-- vllm/model_executor/layers/linear.py | 4 +-- .../model_executor/layers/quantization/fp8.py | 8 ++--- .../layers/quantization/utils/fp8_utils.py | 5 ++-- vllm/platforms/interface.py | 18 +++++++++++ vllm/platforms/rocm.py | 19 ++++++++++++ vllm/utils.py | 30 ------------------- 11 files changed, 62 insertions(+), 61 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 340376d783c8..7c64ee2b03d9 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -135,7 +135,7 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, use_rocm_aiter_block_gemm: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) - monkeypatch.setenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + monkeypatch.setenv("VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM", use_rocm_aiter_block_gemm) block_scale_func = dispatch_w8a8_blockscale_func(use_cutlass) @@ -154,7 +154,7 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, use_rocm_aiter_norm: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) - monkeypatch.setenv("VLLM_ROCM_USE_AITER_NORM", use_rocm_aiter_norm) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm) rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual) if not add_residual: diff --git a/vllm/envs.py b/vllm/envs.py index 79baee7b5bf1..824b2730388f 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -78,9 +78,9 @@ VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True - VLLM_ROCM_USE_AITER_NORM: bool = True + VLLM_ROCM_USE_AITER_RMSNORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False - 
VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = False + VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -552,11 +552,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ).lower() in ("true", "1")), # use aiter rms norm op if aiter ops are enabled. - "VLLM_ROCM_USE_AITER_NORM": - lambda: - (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in - ("true", "1")), + "VLLM_ROCM_USE_AITER_RMSNORM": + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True" + ).lower() in ("true", "1")), # use aiter paged attention if aiter ops are enabled. # this is disabled by default. @@ -567,9 +566,9 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter w8a8 block gemm kernel if aiter ops are enabled. # this is disabled by default. - "VLLM_ROCM_USE_AITER_BLOCK_GEMM": + "VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM", "False").lower() in ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm. diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6b5073d3b913..6b8048edf32c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -15,9 +15,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) from vllm.platforms import current_platform -from vllm.utils import (direct_register_custom_op, - rocm_aiter_fp8_block_scaled_moe_enabled, - rocm_aiter_moe_enabled) +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -948,7 +946,7 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, def dispatch_topk_func() -> Callable[..., torch.Tensor]: - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): return rocm_aiter_topk_softmax return vllm_topk_softmax @@ -1190,7 +1188,8 @@ def rocm_aiter_fused_experts( import aiter as rocm_aiter import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe - if rocm_aiter_fp8_block_scaled_moe_enabled() and use_fp8_w8a8: + if current_platform.is_rocm_aiter_fp8_block_scaled_moe_enabled( + ) and use_fp8_w8a8: assert w1_scale is not None assert w2_scale is not None @@ -1270,7 +1269,7 @@ def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor: def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): return rocm_aiter_fused_experts if inplace: return torch_vllm_inplace_fused_experts diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 08da5a42d171..42aa6252ca1c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -19,7 +19,6 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum -from vllm.utils import rocm_aiter_moe_enabled if current_platform.is_cuda_alike(): from .fused_moe import fused_experts @@ -99,7 +98,7 @@ def create_weights(self, 
layer: torch.nn.Module, num_experts: int, def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8fb55802ca38..a3e12bf49d0f 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,7 +6,7 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.utils import rocm_aiter_norm_enabled +from vllm.platforms import current_platform def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, @@ -57,7 +57,7 @@ def dispatch_cuda_rmsnorm_func( ) -> Callable[..., Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]: if not add_residual: return rms_norm - if rocm_aiter_norm_enabled(): + if current_platform.is_rocm_aiter_rmsnorm_enabled(): return rocm_aiter_rmsnorm2d_fwd_with_add return fused_add_rms_norm diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dcf28237c85b..9b07a5262443 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -25,7 +25,7 @@ RowvLLMParameter) # yapf: enable from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import rocm_aiter_linear_enabled +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -46,7 +46,7 @@ def rocm_aiter_tgemm_mm(x: torch.Tensor, weight: torch.Tensor, def dispatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: - if rocm_aiter_linear_enabled(): + if current_platform.is_rocm_aiter_linear_enabled(): return rocm_aiter_tgemm_mm return F.linear diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e9e73a0aa2e1..9db95d37d4ef 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -33,8 +33,6 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import (rocm_aiter_fp8_block_scaled_moe_enabled, - rocm_aiter_moe_enabled) ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -559,7 +557,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = Parameter(w2_weight, requires_grad=False) layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) - if rocm_aiter_fp8_block_scaled_moe_enabled(): + if current_platform.is_rocm_aiter_fp8_block_scaled_moe_enabled(): # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -600,7 +598,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -688,7 +686,7 @@ def process_weights_after_loading(self, layer: Module) -> None: dq_weight, max_w13_scales[expert_id]) start += shard_size - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. 
from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 33ad52a54334..be7978a323c5 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -17,8 +17,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform -from vllm.utils import (direct_register_custom_op, - rocm_aiter_w8a8_block_gemm_enabled) +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -77,7 +76,7 @@ def dispatch_w8a8_blockscale_func( use_cutlass: bool) -> Callable[..., torch.Tensor]: if use_cutlass: return cutlass_scaled_mm - if rocm_aiter_w8a8_block_gemm_enabled(): + if current_platform.is_rocm_aiter_w8a8_block_gemm_enabled(): return rocm_aiter_gemm_a8w8_blockscale return w8a8_block_fp8_matmul diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index e7e55e11775c..4305223e497f 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -143,6 +143,24 @@ def is_cuda_alike(self) -> bool: """Stateless version of :func:`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) + def is_rocm_aiter_moe_enabled(self) -> bool: + return False + + def is_rocm_aiter_paged_attn_enabled(self) -> bool: + return False + + def is_rocm_aiter_linear_enabled(self) -> bool: + return False + + def is_rocm_aiter_rmsnorm_enabled(self) -> bool: + return False + + def is_rocm_aiter_fp8_block_scaled_moe_enabled(self) -> bool: + return False + + def is_rocm_aiter_w8a8_block_gemm_enabled(self) -> bool: + return False + @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index a4f18cbfc587..5fc08950f0af 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -231,3 +231,22 @@ def get_current_memory_usage(cls, @classmethod def get_device_communicator_cls(cls) -> str: return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator" # noqa + + def is_rocm_aiter_moe_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_MOE + + def is_rocm_aiter_paged_attn_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_PAGED_ATTN + + def is_rocm_aiter_linear_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_LINEAR + + def is_rocm_aiter_rmsnorm_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_RMSNORM + + def is_rocm_aiter_fp8_block_scaled_moe_enabled(self) -> bool: + return self.rocm_aiter_moe_enabled( + ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE + + def is_rocm_aiter_w8a8_block_gemm_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM diff --git a/vllm/utils.py b/vllm/utils.py index 1b87a21ed966..26c9e1a90837 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -997,36 +997,6 @@ def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None: enable_trace_function_call(log_path) -def is_hip() -> bool: - from vllm.platforms import current_platform - return current_platform.is_rocm() - - -def rocm_aiter_moe_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_MOE - - -def rocm_aiter_paged_attn_enabled() -> bool: - return is_hip() and 
envs.VLLM_ROCM_USE_AITER_PAGED_ATTN - - -def rocm_aiter_linear_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_LINEAR - - -def rocm_aiter_norm_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_NORM - - -def rocm_aiter_fp8_block_scaled_moe_enabled() -> bool: - return rocm_aiter_moe_enabled( - ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE - - -def rocm_aiter_w8a8_block_gemm_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_BLOCK_GEMM - - # `functools` helpers def identity(value: T, **kwargs) -> T: """Returns the first provided value.""" From 59f0208ef021cb56be1d1838e5bb7923b182e8a6 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 08:28:59 +0000 Subject: [PATCH 27/35] bugfixes after refactoring the aiter modules enablility in current platform Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 8 ++++---- vllm/platforms/rocm.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 28b3c914b3f6..8d2868f8fded 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -14,12 +14,11 @@ CommonMetadataBuilder) from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import rocm_aiter_paged_attn_enabled if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -if rocm_aiter_paged_attn_enabled(): +if current_platform.is_rocm_aiter_paged_attn_enabled(): from vllm.attention.ops.rocm_aiter_paged_attn import ( PagedAttention, PagedAttentionMetadata) else: @@ -615,7 +614,8 @@ def forward( else: assert value is None - if (rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1 + if (current_platform.is_rocm_aiter_paged_attn_enabled() + and kv_cache.dtype.itemsize == 1 and not self.aiter_kv_scales_initialized and kv_cache.shape != torch.Size([0])): num_blocks = kv_cache.shape[1] @@ -911,4 +911,4 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and not rocm_aiter_paged_attn_enabled()) + and not current_platform.is_rocm_aiter_paged_attn_enabled()) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 5fc08950f0af..03c2d1b0a254 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -245,7 +245,7 @@ def is_rocm_aiter_rmsnorm_enabled(self) -> bool: return envs.VLLM_ROCM_USE_AITER_RMSNORM def is_rocm_aiter_fp8_block_scaled_moe_enabled(self) -> bool: - return self.rocm_aiter_moe_enabled( + return self.is_rocm_aiter_moe_enabled( ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE def is_rocm_aiter_w8a8_block_gemm_enabled(self) -> bool: From 17b4d6adc78a547a6f032851cbb37d2c78122a7e Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 15:53:11 +0000 Subject: [PATCH 28/35] update AMD CI to skip certain test cases Signed-off-by: vllmellm --- .buildkite/run-amd-test.sh | 13 ++++--------- tests/models/decoder_only/language/test_granite.py | 2 ++ tests/models/decoder_only/language/test_mistral.py | 10 ++++++++++ tests/models/decoder_only/language/test_models.py | 2 ++ tests/models/decoder_only/language/test_phimoe.py | 2 ++ 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index a18dd1abd41a..7a145de9671f 100755 --- 
a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -72,6 +72,10 @@ HF_CACHE="$(realpath ~)/huggingface" mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" +# environment variables +SKIP_ROCM_ATIER_MODEL_TEST_CASES="True" +echo $SKIP_ROCM_ATIER_MODEL_TEST_CASES + commands=$@ echo "Commands:$commands" #ignore certain kernels tests @@ -96,15 +100,6 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_mamba_mixer2.py" fi -#ignore certain Model tests -if [[ $commands == *" models/decoder_only/language "* ]]; then - commands=${commands//" models/decoder_only/language "/" models/decoder_only/language \ - --ignore=models/decoder_only/language/test_mistral.py \ - --ignore=models/decoder_only/language/test_phimoe.py \ - --ignore=models/decoder_only/language/test_granite.py \ - --ignore=models/decoder_only/language/test_models.py "} -fi - #ignore certain Entrypoints tests if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index 7dec7bbd0ca1..01ca96fd5d33 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -26,6 +26,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 68b27a8e7087..ef240b606bd7 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -181,6 +181,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # TODO(sang): Sliding window should be tested separately. 
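The same four-line AITER guard is repeated in every parametrized test this
patch touches. A minimal sketch of a shared helper for it (the helper name is
an assumption; it reads the CI flag with os.getenv, since pytest's MonkeyPatch
has no getenv method, and lowercases the value because the CI script above
sets the variable to "True"):

import os

import pytest


def maybe_skip_rocm_aiter(monkeypatch: pytest.MonkeyPatch,
                          use_rocm_aiter: bool) -> None:
    """Skip when the CI opts out of AITER model tests, else enable AITER."""
    if not use_rocm_aiter:
        return
    # CI flag name follows the spelling used in run-amd-test.sh above.
    if os.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES",
                 "False").lower() == "true":
        pytest.skip("Skipping test suite for ROCM AITER")
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

Each test body would then begin with
maybe_skip_rocm_aiter(monkeypatch, use_rocm_aiter) instead of repeating the
guard inline.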
@@ -211,6 +213,8 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner( @@ -248,6 +252,8 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model, @@ -271,6 +277,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str, def test_mistral_function_calling(vllm_runner, model: str, dtype: str, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model, @@ -308,6 +316,8 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str, def test_mistral_guided_decoding(vllm_runner, model: str, guided_backend: str, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model, dtype='bfloat16', diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index eca2ca18dcb7..b6b3d4b315f9 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -71,6 +71,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index 7e42124d7b2c..2badcaf104bd 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -85,6 +85,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: From c32c31f37f40c8481e24255f46aa26897993a3e4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 17:47:32 +0000 Subject: [PATCH 29/35] refactor dispatching for w8a8 scaled-mm Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 44 +++ .../layers/quantization/utils/w8a8_utils.py | 270 +++++++++++------- 2 files changed, 206 insertions(+), 108 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 
7c64ee2b03d9..43c8bfe37b74 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -20,6 +20,10 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( cutlass_scaled_mm, dispatch_w8a8_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + cutlass_w8a8_scaled_mm, dispatch_w8a8_scaled_mm, + rocm_aiter_per_tensor_w8a8_scaled_mm, torch_channelwise_w8a8_scaled_mm, + torch_per_tensor_w8a8_scaled_mm, torch_per_token_w8a8_scaled_mm) from vllm.platforms import current_platform @@ -178,3 +182,43 @@ def test_unquantized_linear_dispatch(use_rocm_aiter: str, assert linear_func == rocm_aiter_tgemm_mm else: assert linear_func == F.linear + + +@pytest.mark.parametrize("cutlass_fp8_supported", [True, False]) +@pytest.mark.parametrize("per_tensor_weights", [True, False]) +@pytest.mark.parametrize("per_tensor_activations", [True, False]) +@pytest.mark.parametrize("use_per_token_if_dynamic", [True, False]) +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_linear", ["0", "1"]) +def test_scaled_mm_dispatch(cutlass_fp8_supported: bool, + per_tensor_weights: bool, + per_tensor_activations: bool, + use_per_token_if_dynamic: bool, + use_rocm_aiter: str, use_rocm_aiter_linear: str, + monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_linear) + + w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(cutlass_fp8_supported, + per_tensor_weights, + per_tensor_activations, + use_per_token_if_dynamic) + + if cutlass_fp8_supported: + assert w8a8_scaled_mm_func == cutlass_w8a8_scaled_mm + + elif per_tensor_weights and per_tensor_activations: + + if current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_linear): + assert w8a8_scaled_mm_func == rocm_aiter_per_tensor_w8a8_scaled_mm + else: + assert w8a8_scaled_mm_func == torch_per_tensor_w8a8_scaled_mm + + elif (current_platform.is_rocm() + and current_platform.has_device_capability(94) + and use_per_token_if_dynamic and not per_tensor_weights + and not per_tensor_activations): + assert w8a8_scaled_mm_func == torch_per_token_w8a8_scaled_mm + else: + assert w8a8_scaled_mm_func == torch_channelwise_w8a8_scaled_mm diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 32a0415c4294..60d40e12b1ba 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,19 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import torch -import vllm.envs as envs from vllm import _custom_ops as ops from vllm.config import CompilationLevel, get_current_vllm_config from vllm.platforms import current_platform -USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ - and current_platform.is_rocm() -if USE_ROCM_AITER_LINEAR: - from aiter.tuned_gemm import tgemm as aiter_tgemm - # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = None @@ -127,6 +121,151 @@ def maybe_create_device_identity(): TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32) +def cutlass_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor, + out_dtype: torch.dtype, scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + output_shape: List, **kwargs) -> torch.Tensor: + + # Fused GEMM_DQ + output = ops.cutlass_scaled_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + return output.view(*output_shape) + + +def rocm_aiter_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List) -> torch.Tensor: + from aiter.tuned_gemm import tgemm as aiter_tgemm + + output = aiter_tgemm.mm(qinput, + weight.t(), + otype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + if type(output) is tuple and len(output) == 2: + output = output[0] + + return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape) + + +def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List) -> torch.Tensor: + output = torch._scaled_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + if type(output) is tuple and len(output) == 2: + output = output[0] + + return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape) + + +def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List) -> torch.Tensor: + # For now validated on ROCm platform + # fp8 rowwise scaling in torch._scaled_mm is introduced in + # https://github.com/pytorch/pytorch/pull/144432 using + # hipBLASLt and ROCm 6.3, which only exists in torch 2.7 and above. + # For CUDA platform please validate if the + # torch._scaled_mm support rowwise scaled GEMM + # Fused GEMM_DQ Rowwise GEMM + output = torch._scaled_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b.t(), + bias=bias) + + output = torch.narrow(output, 0, 0, input_2d.shape[0]) + output = output.view(*output_shape) + return output + + +def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List, + **kwargs) -> torch.Tensor: + # use unfused DQ due to limitations with scaled_mm + + # Symmetric quantized GEMM by definition computes the following: + # C = (s_x * X) (s_w * W) + bias + # This is equivalent to dequantizing the weights and activations + # before applying a GEMM. + # + # In order to compute quantized operands, a quantized kernel + # will rewrite the above like so: + # C = s_w * s_x * (X * W) + bias + # + # For the scaled_mm fallback case, we break this down, since it + # does not support s_w being a vector. + + # GEMM + # This computes C = (X * W). 
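    # Worked instance of the identity above, with per-token scales s_x
    # (shape [T, 1]) and per-channel scales s_w (shape [1, N]):
    #   (s_x * X) @ (W * s_w) == s_x * ((X @ W) * s_w)
    # e.g. X = [[2.]], W = [[3.]], s_x = 0.5, s_w = 0.25:
    #   fused path:   (0.5 * 2) * (0.25 * 3) = 0.75
    #   unfused path:  0.5 * 0.25 * (2 * 3)  = 0.75
    # This is why the fp32 GEMM below runs with identity scales and the
    # per-token / per-channel scales (and bias) are applied afterwards.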
+ # Output in fp32 to allow subsequent ops to happen in-place + output = torch._scaled_mm(qinput, + weight, + scale_a=TORCH_DEVICE_IDENTITY, + scale_b=TORCH_DEVICE_IDENTITY, + out_dtype=torch.float32) + # A fix for discrepancy in scaled_mm which returns tuple + # for torch < 2.5 and a single value in torch >= 2.5 + if type(output) is tuple and len(output) == 2: + output = output[0] + # Unpad (undo num_token_padding) + output = torch.narrow(output, 0, 0, input_2d.shape[0]) + x_scale = torch.narrow(scale_a, 0, 0, input_2d.shape[0]) + + # DQ + # C = sw * sx * (X * W) + bias + output = output * x_scale * scale_b.t() + if bias is not None: + output = output + bias + return output.to(out_dtype).view(*output_shape) + + +def dispatch_w8a8_scaled_mm( + cutlass_fp8_supported: bool, per_tensor_weights: bool, + per_tensor_activations: bool, + use_per_token_if_dynamic: bool) -> Callable[..., torch.Tensor]: + + if cutlass_fp8_supported: + return cutlass_w8a8_scaled_mm + if per_tensor_weights and per_tensor_activations: + if current_platform.is_rocm_aiter_linear_enabled(): + return rocm_aiter_per_tensor_w8a8_scaled_mm + return torch_per_tensor_w8a8_scaled_mm + if (use_per_token_if_dynamic and not per_tensor_weights + and not per_tensor_activations and USE_ROWWISE_TORCH_SCALED_MM): + return torch_per_token_w8a8_scaled_mm + return torch_channelwise_w8a8_scaled_mm + + def apply_fp8_linear( input: torch.Tensor, weight: torch.Tensor, @@ -153,23 +292,7 @@ def apply_fp8_linear( scale_ub=input_scale_ub, use_per_token_if_dynamic=use_per_token_if_dynamic) - # Fused GEMM_DQ - output = ops.cutlass_scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - return output.view(*output_shape) - - # torch.scaled_mm supports per tensor weights + activations only - # so fallback to naive if per channel or per token else: - # Note: we pad the input because torch._scaled_mm is more performant - # for matrices with batch dimension > 16. - # This could change in the future. - # We also don't pad when using torch.compile, - # as it breaks with dynamic shapes. config = get_current_vllm_config().compilation_config do_pad = config.level < CompilationLevel.PIECEWISE qinput, x_scale = ops.scaled_fp8_quant( @@ -178,91 +301,22 @@ def apply_fp8_linear( num_token_padding=17 if do_pad else None, use_per_token_if_dynamic=use_per_token_if_dynamic) - per_tensor_weights = (weight_scale.numel() == 1) - per_tensor_activations = (x_scale.numel() == 1) - - if per_tensor_weights and per_tensor_activations: - # Fused GEMM_DQ - if USE_ROCM_AITER_LINEAR: - output = aiter_tgemm.mm(qinput, - weight.t(), - otype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - else: - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - # A fix for discrepancy in scaled_mm which returns tuple - # for torch < 2.5 and a single value in torch >= 2.5 - if type(output) is tuple and len(output) == 2: - output = output[0] - - return torch.narrow(output, 0, 0, - input_2d.shape[0]).view(*output_shape) - - elif (use_per_token_if_dynamic and not per_tensor_weights - and not per_tensor_activations and USE_ROWWISE_TORCH_SCALED_MM): - # For now validated on ROCm platform - # fp8 rowwise scaling in torch._scaled_mm is introduced in - # https://github.com/pytorch/pytorch/pull/144432 using - # hipBLASLt and ROCm 6.3, which only exists in torch 2.7 and above. 
- # For CUDA platform please validate if the - # torch._scaled_mm support rowwise scaled GEMM - # Fused GEMM_DQ Rowwise GEMM - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale.t(), - bias=bias) - - output = torch.narrow(output, 0, 0, input_2d.shape[0]) - output = output.view(*output_shape) - return output - - else: - # Fallback for channelwise case, where we use unfused DQ - # due to limitations with scaled_mm - - # Symmetric quantized GEMM by definition computes the following: - # C = (s_x * X) (s_w * W) + bias - # This is equivalent to dequantizing the weights and activations - # before applying a GEMM. - # - # In order to compute quantized operands, a quantized kernel - # will rewrite the above like so: - # C = s_w * s_x * (X * W) + bias - # - # For the scaled_mm fallback case, we break this down, since it - # does not support s_w being a vector. - - # GEMM - # This computes C = (X * W). - # Output in fp32 to allow subsequent ops to happen in-place - output = torch._scaled_mm(qinput, - weight, - scale_a=TORCH_DEVICE_IDENTITY, - scale_b=TORCH_DEVICE_IDENTITY, - out_dtype=torch.float32) - # A fix for discrepancy in scaled_mm which returns tuple - # for torch < 2.5 and a single value in torch >= 2.5 - if type(output) is tuple and len(output) == 2: - output = output[0] - # Unpad (undo num_token_padding) - output = torch.narrow(output, 0, 0, input_2d.shape[0]) - x_scale = torch.narrow(x_scale, 0, 0, input_2d.shape[0]) - - # DQ - # C = sw * sx * (X * W) + bias - output = output * x_scale * weight_scale.t() - if bias is not None: - output = output + bias - return output.to(dtype=input.dtype).view(*output_shape) + per_tensor_weights = (weight_scale.numel() == 1) + per_tensor_activations = (x_scale.numel() == 1) + + w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(cutlass_fp8_supported, + per_tensor_weights, + per_tensor_activations, + use_per_token_if_dynamic) + + return w8a8_scaled_mm_func(qinput=qinput, + weight=weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias, + input_2d=input_2d, + output_shape=output_shape) def normalize_e4m3fn_to_e4m3fnuz( From a5d73393f7f2162460089042374e2ebf6abad476 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:08:25 +0000 Subject: [PATCH 30/35] fix cutlass flag bug Signed-off-by: vllmellm --- .../layers/quantization/utils/w8a8_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 0ae255415b38..75ee0203f8fd 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -251,8 +251,8 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, def dispatch_w8a8_scaled_mm( cutlass_fp8_supported: bool, per_tensor_weights: bool, - per_tensor_activations: bool, - use_per_token_if_dynamic: bool) -> Callable[..., torch.Tensor]: + per_tensor_activations: bool, use_per_token_if_dynamic: Optional[bool] +) -> Callable[..., torch.Tensor]: if cutlass_fp8_supported: return cutlass_w8a8_scaled_mm @@ -306,7 +306,7 @@ def apply( ) -> torch.Tensor: input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[1]] - if cutlass_fp8_supported: + if self.cutlass_fp8_supported: qinput, x_scale = ops.scaled_fp8_quant( input_2d, input_scale, @@ -326,8 +326,8 @@ def apply( per_tensor_activations = (x_scale.numel() 
== 1) w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm( - cutlass_fp8_supported, per_tensor_weights, per_tensor_activations, - use_per_token_if_dynamic) + self.cutlass_fp8_supported, per_tensor_weights, + per_tensor_activations, use_per_token_if_dynamic) return w8a8_scaled_mm_func(qinput=qinput, weight=weight, From ce30f63a54ead25ded4429ece2e750776316c2a0 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:26:07 +0000 Subject: [PATCH 31/35] revert test requirements Signed-off-by: vllmellm --- requirements-test.in | 718 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 680 insertions(+), 38 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index de33f92b37b9..cfc000a1b09f 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -1,41 +1,683 @@ -# testing -pytest -tensorizer>=2.9.0 -pytest-forked -pytest-asyncio -pytest-rerunfailures -pytest-shard - -# testing utils -awscli -decord # required for video tests -einops # required for MPT, qwen-vl and Mamba -httpx -librosa # required for audio tests -vector_quantize_pytorch # required for minicpmo_26 test -vocos # required for minicpmo_26 test -peft -pqdm -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests -sentence-transformers # required for embedding tests -soundfile # required for audio tests -jiwer # required for audio tests -timm # required for internvl test +# This file was autogenerated by uv via the following command: +# uv pip compile requirements-test.in -o requirements-test.txt +absl-py==2.1.0 + # via rouge-score +accelerate==1.0.1 + # via + # lm-eval + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # datasets + # fsspec + # lm-eval +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via httpx +argcomplete==3.5.1 + # via datamodel-code-generator +attrs==24.2.0 + # via + # aiohttp + # jsonlines + # jsonschema + # referencing +audioread==3.0.1 + # via librosa +awscli==1.35.23 + # via -r requirements-test.in +bitsandbytes==0.45.0 + # via -r requirements-test.in +black==24.10.0 + # via datamodel-code-generator +boto3==1.35.57 + # via tensorizer +botocore==1.35.57 + # via + # awscli + # boto3 + # s3transfer +bounded-pool-executor==0.0.3 + # via pqdm +buildkite-test-collector==0.1.9 + # via -r requirements-test.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via soundfile +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # black + # jiwer + # nltk + # ray +colorama==0.4.6 + # via + # awscli + # sacrebleu + # tqdm-multiprocess +contourpy==1.3.0 + # via matplotlib +cramjam==2.9.0 + # via fastparquet +cupy-cuda12x==13.3.0 + # via ray +cycler==0.12.1 + # via matplotlib +datamodel-code-generator==0.26.3 + # via -r requirements-test.in +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +datasets==3.0.2 + # via + # evaluate + # lm-eval +decorator==5.1.1 + # via librosa +decord==0.6.0 + # via -r requirements-test.in +dill==0.3.8 + # via + # datasets + # evaluate + # lm-eval + # multiprocess +dnspython==2.7.0 + # via email-validator +docutils==0.16 + # via awscli +einops==0.8.0 + # via + # -r requirements-test.in + # encodec + # vector-quantize-pytorch + # vocos +einx==0.3.0 + # via vector-quantize-pytorch +email-validator==2.2.0 + # via pydantic +encodec==0.1.1 + # via vocos +evaluate==0.4.3 + # via lm-eval +fastparquet==2024.11.0 + # via genai-perf +fastrlock==0.8.2 + # via 
cupy-cuda12x +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers + # triton +fonttools==4.54.1 + # via matplotlib +frozendict==2.4.6 + # via einx +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal + # ray +fsspec==2024.9.0 + # via + # datasets + # evaluate + # fastparquet + # huggingface-hub + # torch +genai-perf==0.0.8 + # via -r requirements-test.in +genson==1.3.0 + # via datamodel-code-generator +h11==0.14.0 + # via httpcore +hiredis==3.0.0 + # via tensorizer +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via -r requirements-test.in +huggingface-hub==0.26.2 + # via + # accelerate + # datasets + # evaluate + # peft + # sentence-transformers + # timm + # tokenizers + # transformers + # vocos +humanize==4.11.0 + # via runai-model-streamer +idna==3.10 + # via + # anyio + # email-validator + # httpx + # requests + # yarl +inflect==5.6.2 + # via datamodel-code-generator +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via datamodel-code-generator +jinja2==3.1.4 + # via + # datamodel-code-generator + # torch +jiwer==3.0.5 + # via -r requirements-test.in +jmespath==1.0.1 + # via + # boto3 + # botocore +joblib==1.4.2 + # via + # librosa + # nltk + # scikit-learn +jsonlines==4.0.0 + # via lm-eval +jsonschema==4.23.0 + # via + # mistral-common + # ray +jsonschema-specifications==2024.10.1 + # via jsonschema +kaleido==0.2.1 + # via genai-perf +kiwisolver==1.4.7 + # via matplotlib +lazy-loader==0.4 + # via librosa +libnacl==2.1.0 + # via tensorizer +librosa==0.10.2.post1 + # via -r requirements-test.in +llvmlite==0.43.0 + # via numba +lm-eval==0.4.4 + # via -r requirements-test.in +lxml==5.3.0 + # via sacrebleu +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.9.2 + # via -r requirements-test.in +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +mdurl==0.1.2 + # via markdown-it-py +mistral-common==1.5.1 + # via -r requirements-test.in +more-itertools==10.5.0 + # via lm-eval +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via + # librosa + # ray +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # datasets + # evaluate +mypy-extensions==1.0.0 + # via black +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via rouge-score +numba==0.60.0 + # via librosa +numexpr==2.10.1 + # via lm-eval +numpy==1.26.4 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # contourpy + # cupy-cuda12x + # datasets + # decord + # einx + # encodec + # evaluate + # fastparquet + # genai-perf + # librosa + # matplotlib + # mistral-common + # numba + # numexpr + # opencv-python-headless + # pandas + # patsy + # peft + # rouge-score + # runai-model-streamer + # sacrebleu + # scikit-learn + # scipy + # soxr + # statsmodels + # tensorizer + # torchvision + # transformers + # tritonclient + # vocos +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch 
+opencv-python-headless==4.10.0.84 + # via mistral-common +packaging==24.1 + # via + # accelerate + # black + # datamodel-code-generator + # datasets + # evaluate + # fastparquet + # huggingface-hub + # lazy-loader + # matplotlib + # peft + # plotly + # pooch + # pytest + # pytest-rerunfailures + # ray + # statsmodels + # transformers + # typepy +pandas==2.2.3 + # via + # datasets + # evaluate + # fastparquet + # genai-perf + # statsmodels +pathspec==0.12.1 + # via black +pathvalidate==3.2.1 + # via pytablewriter +patsy==1.0.1 + # via statsmodels +peft==0.13.2 + # via + # -r requirements-test.in + # lm-eval +pillow==10.4.0 + # via + # genai-perf + # matplotlib + # mistral-common + # sentence-transformers + # torchvision +platformdirs==4.3.6 + # via + # black + # pooch +plotly==5.24.1 + # via genai-perf +pluggy==1.5.0 + # via pytest +pooch==1.8.2 + # via librosa +portalocker==2.10.1 + # via sacrebleu +pqdm==0.2.0 + # via -r requirements-test.in +propcache==0.2.0 + # via yarl +protobuf==5.28.3 + # via + # ray + # tensorizer +psutil==6.1.0 + # via + # accelerate + # peft + # tensorizer +py==1.11.0 + # via pytest-forked +pyarrow==18.0.0 + # via + # datasets + # genai-perf +pyasn1==0.6.1 + # via rsa +pybind11==2.13.6 + # via lm-eval +pycparser==2.22 + # via cffi +pydantic==2.9.2 + # via + # datamodel-code-generator + # mistral-common +pydantic-core==2.23.4 + # via pydantic +pygments==2.18.0 + # via rich +pyparsing==3.2.0 + # via matplotlib +pytablewriter==1.2.0 + # via lm-eval +pytest==8.3.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # genai-perf + # pytest-asyncio + # pytest-forked + # pytest-mock + # pytest-rerunfailures + # pytest-shard +pytest-asyncio==0.24.0 + # via -r requirements-test.in +pytest-forked==1.6.0 + # via -r requirements-test.in +pytest-mock==3.14.0 + # via genai-perf +pytest-rerunfailures==14.0 + # via -r requirements-test.in +pytest-shard==0.1.2 + # via -r requirements-test.in +python-dateutil==2.9.0.post0 + # via + # botocore + # matplotlib + # pandas + # typepy +python-rapidjson==1.20 + # via tritonclient +pytz==2024.2 + # via + # pandas + # typepy +pyyaml==6.0.2 + # via + # accelerate + # awscli + # datamodel-code-generator + # datasets + # genai-perf + # huggingface-hub + # peft + # ray + # responses + # timm + # transformers + # vocos +rapidfuzz==3.12.1 + # via jiwer +ray==2.43.0 + # via -r requirements-test.in +redis==5.2.0 + # via tensorizer +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.9.11 + # via + # nltk + # sacrebleu + # tiktoken + # transformers +requests==2.32.3 + # via + # buildkite-test-collector + # datasets + # evaluate + # huggingface-hub + # lm-eval + # mistral-common + # pooch + # ray + # responses + # tiktoken + # transformers +responses==0.25.3 + # via genai-perf +rich==13.9.4 + # via genai-perf +rouge-score==0.1.2 + # via lm-eval +rpds-py==0.20.1 + # via + # jsonschema + # referencing +rsa==4.7.2 + # via awscli +runai-model-streamer==0.11.0 + # via -r requirements-test.in +runai-model-streamer-s3==0.11.0 + # via -r requirements-test.in +s3transfer==0.10.3 + # via + # awscli + # boto3 +sacrebleu==2.4.3 + # via lm-eval +safetensors==0.4.5 + # via + # accelerate + # peft + # timm + # transformers +scikit-learn==1.5.2 + # via + # librosa + # lm-eval + # sentence-transformers +scipy==1.13.1 + # via + # librosa + # scikit-learn + # sentence-transformers + # statsmodels + # vocos +sentence-transformers==3.2.1 + # via -r requirements-test.in +sentencepiece==0.2.0 + # via mistral-common 
+setuptools==75.8.0 + # via + # pytablewriter + # torch +six==1.16.0 + # via + # python-dateutil + # rouge-score +sniffio==1.3.1 + # via + # anyio + # httpx +soundfile==0.12.1 + # via + # -r requirements-test.in + # librosa +soxr==0.5.0.post1 + # via librosa +sqlitedict==2.1.0 + # via lm-eval +statsmodels==0.14.4 + # via genai-perf +sympy==1.13.1 + # via + # einx + # torch +tabledata==1.3.3 + # via pytablewriter +tabulate==0.9.0 + # via sacrebleu +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via + # lm-eval + # plotly +tensorizer==2.9.0 + # via -r requirements-test.in +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.7.0 + # via + # lm-eval + # mistral-common +timm==1.0.11 + # via -r requirements-test.in +tokenizers==0.21.0 + # via transformers torch==2.5.1 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # encodec + # lm-eval + # peft + # runai-model-streamer + # sentence-transformers + # tensorizer + # timm + # torchaudio + # torchvision + # vector-quantize-pytorch + # vocos torchaudio==2.5.1 -transformers_stream_generator # required for qwen-vl test -matplotlib # required for qwen-vl test -mistral_common[opencv] >= 1.5.0 # required for pixtral test -datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test -transformers==4.48.2 -# quantization -bitsandbytes>=0.45.0 -buildkite-test-collector==0.1.9 - -genai_perf==0.0.8 + # via + # -r requirements-test.in + # encodec + # vocos +torchvision==0.20.1 + # via timm +tqdm==4.66.6 + # via + # datasets + # evaluate + # huggingface-hub + # lm-eval + # nltk + # peft + # pqdm + # sentence-transformers + # tqdm-multiprocess + # transformers +tqdm-multiprocess==0.0.11 + # via lm-eval +transformers==4.48.2 + # via + # -r requirements-test.in + # genai-perf + # lm-eval + # peft + # sentence-transformers + # transformers-stream-generator +transformers-stream-generator==0.0.5 + # via -r requirements-test.in +triton==3.1.0 + # via torch tritonclient==2.51.0 - -numpy < 2.0.0 -runai-model-streamer==0.11.0 -runai-model-streamer-s3==0.11.0 \ No newline at end of file + # via + # -r requirements-test.in + # genai-perf +typepy==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +typing-extensions==4.12.2 + # via + # bitsandbytes + # huggingface-hub + # librosa + # mistral-common + # pqdm + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==2.2.3 + # via + # botocore + # requests + # responses + # tritonclient +vector-quantize-pytorch==1.21.2 + # via -r requirements-test.in +vocos==0.1.0 + # via -r requirements-test.in +word2number==1.1 + # via lm-eval +xxhash==3.5.0 + # via + # datasets + # evaluate +yarl==1.17.1 + # via aiohttp +zstandard==0.23.0 + # via lm-eval \ No newline at end of file From 375e9db5bc9b87865d829d7ba5d66ed203883a00 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:29:10 +0000 Subject: [PATCH 32/35] revert test requirements Signed-off-by: vllmellm --- requirements-test.in | 718 +++--------------------------------------- requirements-test.txt | 24 +- 2 files changed, 42 insertions(+), 700 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index cfc000a1b09f..de33f92b37b9 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -1,683 +1,41 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile requirements-test.in -o requirements-test.txt -absl-py==2.1.0 - # via rouge-score -accelerate==1.0.1 - # via - # lm-eval - # peft 
-aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.10 - # via - # datasets - # fsspec - # lm-eval -aiosignal==1.3.1 - # via - # aiohttp - # ray -annotated-types==0.7.0 - # via pydantic -anyio==4.6.2.post1 - # via httpx -argcomplete==3.5.1 - # via datamodel-code-generator -attrs==24.2.0 - # via - # aiohttp - # jsonlines - # jsonschema - # referencing -audioread==3.0.1 - # via librosa -awscli==1.35.23 - # via -r requirements-test.in -bitsandbytes==0.45.0 - # via -r requirements-test.in -black==24.10.0 - # via datamodel-code-generator -boto3==1.35.57 - # via tensorizer -botocore==1.35.57 - # via - # awscli - # boto3 - # s3transfer -bounded-pool-executor==0.0.3 - # via pqdm -buildkite-test-collector==0.1.9 - # via -r requirements-test.in -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests -cffi==1.17.1 - # via soundfile -chardet==5.2.0 - # via mbstrdecoder -charset-normalizer==3.4.0 - # via requests -click==8.1.7 - # via - # black - # jiwer - # nltk - # ray -colorama==0.4.6 - # via - # awscli - # sacrebleu - # tqdm-multiprocess -contourpy==1.3.0 - # via matplotlib -cramjam==2.9.0 - # via fastparquet -cupy-cuda12x==13.3.0 - # via ray -cycler==0.12.1 - # via matplotlib -datamodel-code-generator==0.26.3 - # via -r requirements-test.in -dataproperty==1.0.1 - # via - # pytablewriter - # tabledata -datasets==3.0.2 - # via - # evaluate - # lm-eval -decorator==5.1.1 - # via librosa -decord==0.6.0 - # via -r requirements-test.in -dill==0.3.8 - # via - # datasets - # evaluate - # lm-eval - # multiprocess -dnspython==2.7.0 - # via email-validator -docutils==0.16 - # via awscli -einops==0.8.0 - # via - # -r requirements-test.in - # encodec - # vector-quantize-pytorch - # vocos -einx==0.3.0 - # via vector-quantize-pytorch -email-validator==2.2.0 - # via pydantic -encodec==0.1.1 - # via vocos -evaluate==0.4.3 - # via lm-eval -fastparquet==2024.11.0 - # via genai-perf -fastrlock==0.8.2 - # via cupy-cuda12x -filelock==3.16.1 - # via - # datasets - # huggingface-hub - # ray - # torch - # transformers - # triton -fonttools==4.54.1 - # via matplotlib -frozendict==2.4.6 - # via einx -frozenlist==1.5.0 - # via - # aiohttp - # aiosignal - # ray -fsspec==2024.9.0 - # via - # datasets - # evaluate - # fastparquet - # huggingface-hub - # torch -genai-perf==0.0.8 - # via -r requirements-test.in -genson==1.3.0 - # via datamodel-code-generator -h11==0.14.0 - # via httpcore -hiredis==3.0.0 - # via tensorizer -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via -r requirements-test.in -huggingface-hub==0.26.2 - # via - # accelerate - # datasets - # evaluate - # peft - # sentence-transformers - # timm - # tokenizers - # transformers - # vocos -humanize==4.11.0 - # via runai-model-streamer -idna==3.10 - # via - # anyio - # email-validator - # httpx - # requests - # yarl -inflect==5.6.2 - # via datamodel-code-generator -iniconfig==2.0.0 - # via pytest -isort==5.13.2 - # via datamodel-code-generator -jinja2==3.1.4 - # via - # datamodel-code-generator - # torch -jiwer==3.0.5 - # via -r requirements-test.in -jmespath==1.0.1 - # via - # boto3 - # botocore -joblib==1.4.2 - # via - # librosa - # nltk - # scikit-learn -jsonlines==4.0.0 - # via lm-eval -jsonschema==4.23.0 - # via - # mistral-common - # ray -jsonschema-specifications==2024.10.1 - # via jsonschema -kaleido==0.2.1 - # via genai-perf -kiwisolver==1.4.7 - # via matplotlib -lazy-loader==0.4 - # via librosa -libnacl==2.1.0 - # via tensorizer -librosa==0.10.2.post1 - # via -r requirements-test.in -llvmlite==0.43.0 - # via numba -lm-eval==0.4.4 - # via -r 
requirements-test.in -lxml==5.3.0 - # via sacrebleu -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.2 - # via jinja2 -matplotlib==3.9.2 - # via -r requirements-test.in -mbstrdecoder==1.1.3 - # via - # dataproperty - # pytablewriter - # typepy -mdurl==0.1.2 - # via markdown-it-py -mistral-common==1.5.1 - # via -r requirements-test.in -more-itertools==10.5.0 - # via lm-eval -mpmath==1.3.0 - # via sympy -msgpack==1.1.0 - # via - # librosa - # ray -multidict==6.1.0 - # via - # aiohttp - # yarl -multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via black -networkx==3.2.1 - # via torch -nltk==3.9.1 - # via rouge-score -numba==0.60.0 - # via librosa -numexpr==2.10.1 - # via lm-eval -numpy==1.26.4 - # via - # -r requirements-test.in - # accelerate - # bitsandbytes - # contourpy - # cupy-cuda12x - # datasets - # decord - # einx - # encodec - # evaluate - # fastparquet - # genai-perf - # librosa - # matplotlib - # mistral-common - # numba - # numexpr - # opencv-python-headless - # pandas - # patsy - # peft - # rouge-score - # runai-model-streamer - # sacrebleu - # scikit-learn - # scipy - # soxr - # statsmodels - # tensorizer - # torchvision - # transformers - # tritonclient - # vocos -nvidia-cublas-cu12==12.4.5.8 - # via - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 - # torch -nvidia-cuda-cupti-cu12==12.4.127 - # via torch -nvidia-cuda-nvrtc-cu12==12.4.127 - # via torch -nvidia-cuda-runtime-cu12==12.4.127 - # via torch -nvidia-cudnn-cu12==9.1.0.70 - # via torch -nvidia-cufft-cu12==11.2.1.3 - # via torch -nvidia-curand-cu12==10.3.5.147 - # via torch -nvidia-cusolver-cu12==11.6.1.9 - # via torch -nvidia-cusparse-cu12==12.3.1.170 - # via - # nvidia-cusolver-cu12 - # torch -nvidia-nccl-cu12==2.21.5 - # via torch -nvidia-nvjitlink-cu12==12.4.127 - # via - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 - # torch -nvidia-nvtx-cu12==12.4.127 - # via torch -opencv-python-headless==4.10.0.84 - # via mistral-common -packaging==24.1 - # via - # accelerate - # black - # datamodel-code-generator - # datasets - # evaluate - # fastparquet - # huggingface-hub - # lazy-loader - # matplotlib - # peft - # plotly - # pooch - # pytest - # pytest-rerunfailures - # ray - # statsmodels - # transformers - # typepy -pandas==2.2.3 - # via - # datasets - # evaluate - # fastparquet - # genai-perf - # statsmodels -pathspec==0.12.1 - # via black -pathvalidate==3.2.1 - # via pytablewriter -patsy==1.0.1 - # via statsmodels -peft==0.13.2 - # via - # -r requirements-test.in - # lm-eval -pillow==10.4.0 - # via - # genai-perf - # matplotlib - # mistral-common - # sentence-transformers - # torchvision -platformdirs==4.3.6 - # via - # black - # pooch -plotly==5.24.1 - # via genai-perf -pluggy==1.5.0 - # via pytest -pooch==1.8.2 - # via librosa -portalocker==2.10.1 - # via sacrebleu -pqdm==0.2.0 - # via -r requirements-test.in -propcache==0.2.0 - # via yarl -protobuf==5.28.3 - # via - # ray - # tensorizer -psutil==6.1.0 - # via - # accelerate - # peft - # tensorizer -py==1.11.0 - # via pytest-forked -pyarrow==18.0.0 - # via - # datasets - # genai-perf -pyasn1==0.6.1 - # via rsa -pybind11==2.13.6 - # via lm-eval -pycparser==2.22 - # via cffi -pydantic==2.9.2 - # via - # datamodel-code-generator - # mistral-common -pydantic-core==2.23.4 - # via pydantic -pygments==2.18.0 - # via rich -pyparsing==3.2.0 - # via matplotlib -pytablewriter==1.2.0 - # via lm-eval -pytest==8.3.3 - # via - # -r requirements-test.in - # buildkite-test-collector - # genai-perf - # pytest-asyncio - # pytest-forked - # 
pytest-mock - # pytest-rerunfailures - # pytest-shard -pytest-asyncio==0.24.0 - # via -r requirements-test.in -pytest-forked==1.6.0 - # via -r requirements-test.in -pytest-mock==3.14.0 - # via genai-perf -pytest-rerunfailures==14.0 - # via -r requirements-test.in -pytest-shard==0.1.2 - # via -r requirements-test.in -python-dateutil==2.9.0.post0 - # via - # botocore - # matplotlib - # pandas - # typepy -python-rapidjson==1.20 - # via tritonclient -pytz==2024.2 - # via - # pandas - # typepy -pyyaml==6.0.2 - # via - # accelerate - # awscli - # datamodel-code-generator - # datasets - # genai-perf - # huggingface-hub - # peft - # ray - # responses - # timm - # transformers - # vocos -rapidfuzz==3.12.1 - # via jiwer -ray==2.43.0 - # via -r requirements-test.in -redis==5.2.0 - # via tensorizer -referencing==0.35.1 - # via - # jsonschema - # jsonschema-specifications -regex==2024.9.11 - # via - # nltk - # sacrebleu - # tiktoken - # transformers -requests==2.32.3 - # via - # buildkite-test-collector - # datasets - # evaluate - # huggingface-hub - # lm-eval - # mistral-common - # pooch - # ray - # responses - # tiktoken - # transformers -responses==0.25.3 - # via genai-perf -rich==13.9.4 - # via genai-perf -rouge-score==0.1.2 - # via lm-eval -rpds-py==0.20.1 - # via - # jsonschema - # referencing -rsa==4.7.2 - # via awscli -runai-model-streamer==0.11.0 - # via -r requirements-test.in -runai-model-streamer-s3==0.11.0 - # via -r requirements-test.in -s3transfer==0.10.3 - # via - # awscli - # boto3 -sacrebleu==2.4.3 - # via lm-eval -safetensors==0.4.5 - # via - # accelerate - # peft - # timm - # transformers -scikit-learn==1.5.2 - # via - # librosa - # lm-eval - # sentence-transformers -scipy==1.13.1 - # via - # librosa - # scikit-learn - # sentence-transformers - # statsmodels - # vocos -sentence-transformers==3.2.1 - # via -r requirements-test.in -sentencepiece==0.2.0 - # via mistral-common -setuptools==75.8.0 - # via - # pytablewriter - # torch -six==1.16.0 - # via - # python-dateutil - # rouge-score -sniffio==1.3.1 - # via - # anyio - # httpx -soundfile==0.12.1 - # via - # -r requirements-test.in - # librosa -soxr==0.5.0.post1 - # via librosa -sqlitedict==2.1.0 - # via lm-eval -statsmodels==0.14.4 - # via genai-perf -sympy==1.13.1 - # via - # einx - # torch -tabledata==1.3.3 - # via pytablewriter -tabulate==0.9.0 - # via sacrebleu -tcolorpy==0.1.6 - # via pytablewriter -tenacity==9.0.0 - # via - # lm-eval - # plotly -tensorizer==2.9.0 - # via -r requirements-test.in -threadpoolctl==3.5.0 - # via scikit-learn -tiktoken==0.7.0 - # via - # lm-eval - # mistral-common -timm==1.0.11 - # via -r requirements-test.in -tokenizers==0.21.0 - # via transformers +# testing +pytest +tensorizer>=2.9.0 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard + +# testing utils +awscli +decord # required for video tests +einops # required for MPT, qwen-vl and Mamba +httpx +librosa # required for audio tests +vector_quantize_pytorch # required for minicpmo_26 test +vocos # required for minicpmo_26 test +peft +pqdm +ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests +sentence-transformers # required for embedding tests +soundfile # required for audio tests +jiwer # required for audio tests +timm # required for internvl test torch==2.5.1 - # via - # -r requirements-test.in - # accelerate - # bitsandbytes - # encodec - # lm-eval - # peft - # runai-model-streamer - # sentence-transformers - # tensorizer - # timm - # torchaudio - # torchvision - # vector-quantize-pytorch - # vocos 
torchaudio==2.5.1 - # via - # -r requirements-test.in - # encodec - # vocos -torchvision==0.20.1 - # via timm -tqdm==4.66.6 - # via - # datasets - # evaluate - # huggingface-hub - # lm-eval - # nltk - # peft - # pqdm - # sentence-transformers - # tqdm-multiprocess - # transformers -tqdm-multiprocess==0.0.11 - # via lm-eval -transformers==4.48.2 - # via - # -r requirements-test.in - # genai-perf - # lm-eval - # peft - # sentence-transformers - # transformers-stream-generator -transformers-stream-generator==0.0.5 - # via -r requirements-test.in -triton==3.1.0 - # via torch +transformers_stream_generator # required for qwen-vl test +matplotlib # required for qwen-vl test +mistral_common[opencv] >= 1.5.0 # required for pixtral test +datamodel_code_generator # required for minicpm3 test +lm-eval[api]==0.4.4 # required for model evaluation test +transformers==4.48.2 +# quantization +bitsandbytes>=0.45.0 +buildkite-test-collector==0.1.9 + +genai_perf==0.0.8 tritonclient==2.51.0 - # via - # -r requirements-test.in - # genai-perf -typepy==1.3.2 - # via - # dataproperty - # pytablewriter - # tabledata -typing-extensions==4.12.2 - # via - # bitsandbytes - # huggingface-hub - # librosa - # mistral-common - # pqdm - # pydantic - # pydantic-core - # torch -tzdata==2024.2 - # via pandas -urllib3==2.2.3 - # via - # botocore - # requests - # responses - # tritonclient -vector-quantize-pytorch==1.21.2 - # via -r requirements-test.in -vocos==0.1.0 - # via -r requirements-test.in -word2number==1.1 - # via lm-eval -xxhash==3.5.0 - # via - # datasets - # evaluate -yarl==1.17.1 - # via aiohttp -zstandard==0.23.0 - # via lm-eval \ No newline at end of file + +numpy < 2.0.0 +runai-model-streamer==0.11.0 +runai-model-streamer-s3==0.11.0 \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt index e5bf67e099e4..cfc000a1b09f 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -23,10 +23,6 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator -async-timeout==4.0.3 - # via - # aiohttp - # redis attrs==24.2.0 # via # aiohttp @@ -120,10 +116,6 @@ encodec==0.1.1 # via vocos evaluate==0.4.3 # via lm-eval -exceptiongroup==1.2.2 - # via - # anyio - # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -552,7 +544,9 @@ sentence-transformers==3.2.1 sentencepiece==0.2.0 # via mistral-common setuptools==75.8.0 - # via pytablewriter + # via + # pytablewriter + # torch six==1.16.0 # via # python-dateutil @@ -597,12 +591,6 @@ timm==1.0.11 # via -r requirements-test.in tokenizers==0.21.0 # via transformers -toml==0.10.2 - # via datamodel-code-generator -tomli==2.2.1 - # via - # black - # pytest torch==2.5.1 # via # -r requirements-test.in @@ -663,17 +651,13 @@ typepy==1.3.2 # tabledata typing-extensions==4.12.2 # via - # anyio # bitsandbytes - # black # huggingface-hub # librosa # mistral-common - # multidict # pqdm # pydantic # pydantic-core - # rich # torch tzdata==2024.2 # via pandas @@ -696,4 +680,4 @@ xxhash==3.5.0 yarl==1.17.1 # via aiohttp zstandard==0.23.0 - # via lm-eval + # via lm-eval \ No newline at end of file From fd3f4e354fa5ff4785ce8737af4bc621ad73ace1 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:36:03 +0000 Subject: [PATCH 33/35] revert test requirements Signed-off-by: vllmellm --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index cfc000a1b09f..f5722c82e201 100644 --- a/requirements-test.txt +++ b/requirements-test.txt 
@@ -680,4 +680,4 @@ xxhash==3.5.0
 yarl==1.17.1
     # via aiohttp
 zstandard==0.23.0
-    # via lm-eval
\ No newline at end of file
+    # via lm-eval

From f64bfe06a557bf9090074b5d8c1a8b8f3af175dc Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Tue, 11 Mar 2025 05:22:31 +0000
Subject: [PATCH 34/35] addressing PR review comments: fix isort ignores,
 restore missing comments from merge conflict, code documentation

Signed-off-by: vllmellm
---
 vllm/attention/backends/rocm_flash_attn.py    |  5 +++
 vllm/model_executor/layers/fused_moe/layer.py | 10 ++----
 vllm/model_executor/layers/layernorm.py       |  4 +--
 .../model_executor/layers/quantization/fp8.py | 34 ++++++++-----------
 .../layers/quantization/utils/w8a8_utils.py   | 23 ++++++++++---
 5 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 8d2868f8fded..9759ff61205d 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -614,6 +614,11 @@ def forward(
         else:
             assert value is None
 
+        # Reshaping kv tensors is required for AITER paged attention kernel
+        # because it works on a different tensor shape,
+        # when the size of one element is one byte (int8/fp8 dtypes).
+        # This reshaping is only required on the first forward call
+        # and the kv cache must not be empty.
         if (current_platform.is_rocm_aiter_paged_attn_enabled()
                 and kv_cache.dtype.itemsize == 1
                 and not self.aiter_kv_scales_initialized
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 99d0628ef5d8..90c915ed40b7 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# isort: skip_file
 
 from abc import abstractmethod
 from enum import Enum
@@ -102,16 +101,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         if current_platform.is_rocm_aiter_moe_enabled():
             # reshaping weights is required for aiter moe kernel.
- from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight - shuffled_w13_weight = rocm_aiter_shuffle_weight( - layer.w13_weight.data) + shuffled_w13_weight = shuffle_weight(layer.w13_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13_weight, requires_grad=False) - shuffled_w2_weight = rocm_aiter_shuffle_weight( - layer.w2_weight.data) + shuffled_w2_weight = shuffle_weight(layer.w2_weight.data) layer.w2_weight = torch.nn.Parameter(shuffled_w2_weight, requires_grad=False) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index a3e12bf49d0f..8c3c0d02fafa 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -24,7 +24,7 @@ def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, def fused_add_rms_norm( *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, - variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops ops.fused_add_rms_norm( @@ -38,7 +38,7 @@ def fused_add_rms_norm( def rocm_aiter_rmsnorm2d_fwd_with_add( *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, - variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]: import aiter as rocm_aiter rocm_aiter.rmsnorm2d_fwd_with_add( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 07b71a4e97cf..0bca83d1d332 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -# isort: skip_file from typing import Any, Callable, Dict, List, Optional @@ -558,13 +557,12 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) if current_platform.is_rocm_aiter_fp8_block_scaled_moe_enabled(): # reshaping weights is required for aiter moe kernel. - from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight - layer.w13_weight = torch.nn.Parameter( - rocm_aiter_shuffle_weight(layer.w13_weight.data), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter(shuffle_weight( + layer.w13_weight.data), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffle_weight( layer.w2_weight.data), requires_grad=False) return @@ -599,8 +597,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. 
- from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight w13_scales = layer.w13_weight_scale.data.unsqueeze( -1).unsqueeze(-1).expand( @@ -611,10 +608,10 @@ def process_weights_after_loading(self, layer: Module) -> None: w2_scales.contiguous(), requires_grad=False) layer.w13_weight_scale = torch.nn.Parameter( w13_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter( - rocm_aiter_shuffle_weight(layer.w13_weight), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter(shuffle_weight( + layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffle_weight( layer.w2_weight), requires_grad=False) return @@ -687,8 +684,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. - from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight max_w13_scales = max_w13_scales.unsqueeze(-1).unsqueeze( -1).expand((-1, layer.w13_weight.shape[1], -1)) @@ -696,10 +692,10 @@ def process_weights_after_loading(self, layer: Module) -> None: -1).expand((-1, layer.w2_weight.shape[1], -1)) layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter( - rocm_aiter_shuffle_weight(layer.w13_weight), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter(shuffle_weight( + layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffle_weight( layer.w2_weight), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 75ee0203f8fd..e4a966996af2 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -171,6 +171,8 @@ def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, scale_a=scale_a, scale_b=scale_b, bias=bias) + # A fix for discrepancy in scaled_mm which returns tuple + # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: output = output[0] @@ -211,7 +213,7 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, input_2d: torch.Tensor, output_shape: List, **kwargs) -> torch.Tensor: - # use unfused DQ due to limitations with scaled_mm + # Use unfused DQ due to limitations with scaled_mm # Symmetric quantized GEMM by definition computes the following: # C = (s_x * X) (s_w * W) + bias @@ -260,6 +262,8 @@ def dispatch_w8a8_scaled_mm( if current_platform.is_rocm_aiter_linear_enabled(): return rocm_aiter_per_tensor_w8a8_scaled_mm return torch_per_tensor_w8a8_scaled_mm + # torch.scaled_mm supports per tensor weights + activations only + # so fallback to naive if per channel or per token if (use_per_token_if_dynamic and not per_tensor_weights and not per_tensor_activations and USE_ROWWISE_TORCH_SCALED_MM): return torch_per_token_w8a8_scaled_mm @@ -304,8 +308,20 @@ def apply( # TODO(luka) remove this parameter in favor of __init__ use_per_token_if_dynamic: Optional[bool] = None ) -> torch.Tensor: + # ops.scaled_fp8_quant supports both dynamic and static quant. + # If dynamic, layer.input_scale is None and x_scale computed from x. 
+ # If static, layer.input_scale is scalar and x_scale is input_scale. + + # View input as 2D matrix for fp8 methods + input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[1]] + # TODO(luka) this is here because currently MLA only decides this + # during the forward method instead of in __init__. + if use_per_token_if_dynamic is None: + use_per_token_if_dynamic = self.use_per_token_if_dynamic + + # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A if self.cutlass_fp8_supported: qinput, x_scale = ops.scaled_fp8_quant( input_2d, @@ -314,12 +330,11 @@ def apply( use_per_token_if_dynamic=use_per_token_if_dynamic) else: - config = get_current_vllm_config().compilation_config - do_pad = config.level < CompilationLevel.PIECEWISE + # Maybe apply padding to output, see comment in __init__ qinput, x_scale = ops.scaled_fp8_quant( input_2d, input_scale, - num_token_padding=17 if do_pad else None, + num_token_padding=self.output_padding, use_per_token_if_dynamic=use_per_token_if_dynamic) per_tensor_weights = (weight_scale.numel() == 1) From c1297e5d35d8018bb69122e46cbe79e160c5572b Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 11 Mar 2025 05:53:40 +0000 Subject: [PATCH 35/35] add missing comment in fp8_utils Signed-off-by: vllmellm --- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b0ab94ccfff5..929396d58b09 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -37,6 +37,7 @@ def shape_supported_by_cutlass(weight: torch.Tensor, block_size: List[int], weight_scale: torch.Tensor, input_2d: torch.Tensor) -> bool: if current_platform.is_rocm(): + # TODO this is never used, as cutlass_block_fp8_supported is False scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + input_2d.shape[:-1])[::-1] scale_b_shape = (weight_scale.view(-1, 1)
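For reference, the w8a8_utils.py hunks above select a GEMM path purely from the shapes of the scales: a scale tensor with a single element is treated as a per-tensor scale, anything larger as per-token (activations) or per-channel (weights). Below is a minimal, self-contained sketch of that selection logic only; it is not vLLM's or AITER's implementation. The names (dispatch_sketch, _per_tensor_mm, _per_token_channel_mm) and the example scale shapes are illustrative stand-ins, and the "kernels" are plain PyTorch dequantize-then-matmul fallbacks rather than fused scaled-mm calls.

from typing import Callable, Optional

import torch


def _per_tensor_mm(*, qinput: torch.Tensor, weight: torch.Tensor,
                   scale_a: torch.Tensor, scale_b: torch.Tensor,
                   bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Both scales are scalars, so they can be folded in after the matmul.
    out = qinput.float() @ weight.float() * (scale_a * scale_b)
    return out if bias is None else out + bias


def _per_token_channel_mm(*, qinput: torch.Tensor, weight: torch.Tensor,
                          scale_a: torch.Tensor, scale_b: torch.Tensor,
                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Row-wise activation scales (M, 1) and column-wise weight scales (1, N)
    # broadcast over the (M, N) output of the unfused dequant path.
    out = qinput.float() @ weight.float() * scale_a * scale_b
    return out if bias is None else out + bias


def dispatch_sketch(scale_a: torch.Tensor,
                    scale_b: torch.Tensor) -> Callable[..., torch.Tensor]:
    # Mirrors the per_tensor_weights / per_tensor_activations checks above:
    # a scale with a single element counts as a per-tensor scale.
    per_tensor_activations = scale_a.numel() == 1
    per_tensor_weights = scale_b.numel() == 1
    if per_tensor_weights and per_tensor_activations:
        return _per_tensor_mm
    return _per_token_channel_mm


if __name__ == "__main__":
    # qinput is (M, K), weight is (K, N), matching
    # output_shape = [*input.shape[:-1], weight.shape[1]] in the diff.
    qinput = torch.randint(-8, 8, (4, 16), dtype=torch.int8)
    weight = torch.randint(-8, 8, (16, 8), dtype=torch.int8)
    scale_a = torch.full((4, 1), 0.02)  # per-token activation scales
    scale_b = torch.full((1, 8), 0.01)  # per-channel weight scales
    mm = dispatch_sketch(scale_a, scale_b)
    out = mm(qinput=qinput, weight=weight, scale_a=scale_a, scale_b=scale_b)
    print(out.shape)  # torch.Size([4, 8])

The selected callable is then invoked with keyword arguments, in the same style as the dispatch_w8a8_scaled_mm call at the top of this section.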