From 09133e9833811778240b3c2cc4de2390fd08e470 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 06:36:20 +0000 Subject: [PATCH 01/35] integrate aiter kernels: Linear, Norm, MOE Signed-off-by: vllmellm --- Dockerfile.rocm | 6 + vllm/envs.py | 29 +++++ .../layers/fused_moe/fused_moe.py | 113 ++++++++++++++++-- vllm/model_executor/layers/fused_moe/layer.py | 14 +++ vllm/model_executor/layers/layernorm.py | 30 ++++- vllm/model_executor/layers/linear.py | 9 ++ .../model_executor/layers/quantization/fp8.py | 19 +++ .../layers/quantization/utils/w8a8_utils.py | 26 +++- 8 files changed, 223 insertions(+), 23 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 14c522afd7f9..932661dba135 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -109,11 +109,17 @@ ARG COMMON_WORKDIR COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +RUN git clone --recursive https://github.com/ROCm/aiter.git +RUN cd /app/aiter && GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 +# Enable Aiter. Make sure this only exists on the aiter branch. +# ENV VLLM_USE_AITER=1 + CMD ["/bin/bash"] diff --git a/vllm/envs.py b/vllm/envs.py index 84426cb5bb22..a88db87bf824 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,6 +75,10 @@ VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False VLLM_ROCM_FP8_PADDING: bool = True + VLLM_ROCM_USE_AITER: bool = False + VLLM_ROCM_USE_AITER_LINEAR: bool = True + VLLM_ROCM_USE_AITER_MOE: bool = True + VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 VLLM_DISABLE_COMPILE_CACHE: bool = False @@ -510,6 +514,31 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + # use aiter ops unless specifically disabled + "VLLM_ROCM_USE_AITER": + lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in ("true", "1")), + + # use aiter moe op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_MOE": + lambda: + (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_USE_AITER_MOE", "True").lower() in + ("true", "1")), + + # use aiter linear op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_LINEAR": + lambda: + (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_USE_AITER_LINEAR", "True").lower() in + ("true", "1")), + + # use aiter rms norm op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_NORM": + lambda: + (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_USE_AITER_NORM", "True").lower() in + ("true", "1")), + # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index bc9573b36df7..6524084c494c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -17,6 +17,12 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( +) +if USE_ROCM_AITER_FMOE: + import aiter + import aiter.fused_moe_bf16_asm as aiter_fmoe_asm + logger 
= init_logger(__name__) @@ -946,17 +952,22 @@ def fused_topk( dtype=torch.int32, device=hidden_states.device) - ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) - del token_expert_indicies # Not used. Will be used in the future. + if USE_ROCM_AITER_FMOE: + aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, + gating_output.float(), renormalize) + else: + ops.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, + keepdim=True) + del token_expert_indicies # Not used. Will be used in the future. return topk_weights, topk_ids @@ -1141,6 +1152,81 @@ def outplace_fused_experts_fake( ) +def rocm_aiter_fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_fp8_blockscale: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + expert_mask: Optional[torch.Tensor] = None): + + if use_fp8_blockscale: + local_E = E = w1.shape[0] + if expert_mask is not None: + E = expert_mask.numel() + + topk = topk_ids.shape[1] + model_dim = w1.shape[-1] + dtype = hidden_states.dtype + scale_blk_k = block_shape[1] + + ( + sorted_token_ids, + sorted_weight_buf, + sorted_expert_ids, + num_valid_ids, + out_asm, + ) = aiter_fmoe_asm.moe_sorting_ck(topk_ids, + topk_weights, + E, + model_dim, + dtype, + expert_mask=expert_mask) + + a1, a1_scale = per_token_group_quant_fp8(hidden_states, scale_blk_k) + aiter.fmoe_fp8_blockscale_g1u1( + out_asm, + a1, + w1, + w2, + sorted_token_ids, + sorted_weight_buf, + sorted_expert_ids, + num_valid_ids, + topk, + w1_scale.view(local_E, -1), + w2_scale.view(local_E, -1), + a1_scale.t().contiguous(), + block_shape[0], + block_shape[1], + None, + ) + return out_asm + + if use_fp8_w8a8: + return aiter_fmoe_asm.asm_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weight=topk_weights, + topk_ids=topk_ids, + fc1_scale=w1_scale, + fc2_scale=w2_scale, + fc1_smooth_scale=None, + fc2_smooth_scale=None, + a16=False) + else: + return aiter.ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + expert_mask=expert_mask) + + def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -1150,6 +1236,7 @@ def fused_experts(hidden_states: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, + use_fp8_blockscale: bool = False, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, w1_scale: Optional[torch.Tensor] = None, @@ -1158,8 +1245,12 @@ def fused_experts(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> torch.Tensor: - + block_shape: Optional[List[int]] = None, + expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + if USE_ROCM_AITER_FMOE: + rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, + use_fp8_w8a8, use_fp8_blockscale, w1_scale, + w2_scale, block_shape, expert_mask) if inplace: torch.ops.vllm.inplace_fused_experts( hidden_states, w1, w2, 
topk_weights, topk_ids, use_fp8_w8a8, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 452f390f4987..3d3fc51723b1 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -27,6 +27,12 @@ from .moe_torch_iterative import fused_moe as fused_moe_pallas else: fused_moe_pallas = None # type: ignore + +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( +) +if USE_ROCM_AITER_FMOE: + import aiter.ops as aiter_ops + logger = init_logger(__name__) @@ -95,6 +101,14 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) + if USE_ROCM_AITER_FMOE: + layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight.data), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight.data), + requires_grad=False) + if current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: import intel_extension_for_pytorch as ipex diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index b476fb0dbc7e..236cbbbe4a50 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -5,7 +5,14 @@ import torch import torch.nn as nn +from vllm.envs import VLLM_ROCM_USE_AITER_NORM from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform + +USE_ROCM_AITER_NORM = VLLM_ROCM_USE_AITER_NORM \ + and current_platform.is_rocm() +if USE_ROCM_AITER_NORM: + import aiter @CustomOp.register("rms_norm") @@ -84,13 +91,24 @@ def forward_cuda( from vllm import _custom_ops as ops if residual is not None: - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) + if USE_ROCM_AITER_NORM: + aiter.rmsnorm2d_fwd_with_add( + x, + x, + residual, + residual, + self.weight.data, + self.variance_epsilon, + ) + else: + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) return x, residual + out = torch.empty_like(x) ops.rms_norm( out, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 521724765beb..0ffb7ec52932 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,7 @@ split_tensor_along_last_dim, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) +from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) @@ -25,6 +26,12 @@ RowvLLMParameter) # yapf: enable from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + +USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ + and current_platform.is_rocm() +if USE_ROCM_AITER_LINEAR: + from aiter.tuned_gemm import tgemm as aiter_tgemm logger = init_logger(__name__) @@ -138,6 +145,8 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + if USE_ROCM_AITER_LINEAR: + return aiter_tgemm.mm(x, layer.weigt, bias) return F.linear(x, layer.weight, bias) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 5e1bec0bb4be..1c4e1c67a34e 100644 --- 
a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -33,6 +33,11 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( +) +if USE_ROCM_AITER_FMOE: + import aiter.ops as aiter_ops + ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) @@ -656,6 +661,20 @@ def process_weights_after_loading(self, layer: Module) -> None: dq_weight, max_w13_scales[expert_id]) start += shard_size + if USE_ROCM_AITER_FMOE: + max_w13_scales = max_w13_scales.unsqueeze(-1).unsqueeze( + -1).expand((-1, layer.w13_weight.shape[1], -1)) + w2_scales = layer.w2_weight_scale.data.unsqueeze(-1).unsqueeze( + -1).expand((-1, layer.w2_weight.shape[1], -1)) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales.contiguous(), requires_grad=False) + layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight), + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False) return diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 0f93b7f6c45b..4276ac5f92be 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -5,8 +5,14 @@ import torch from vllm import _custom_ops as ops +from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.platforms import current_platform +USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ + and current_platform.is_rocm() +if USE_ROCM_AITER_LINEAR: + from aiter.tuned_gemm import tgemm as aiter_tgemm + # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = None @@ -172,12 +178,20 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + if USE_ROCM_AITER_LINEAR: + output = aiter_tgemm.mm(qinput, + weight.t(), + otype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) + else: + output = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) # A fix for discrepancy in scaled_mm which returns tuple # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: From ead17c7182015d9608d60e416376f90314bec786 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 07:17:39 +0000 Subject: [PATCH 02/35] maintain a consistent import pattern Signed-off-by: vllmellm --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 ++-- vllm/model_executor/layers/layernorm.py | 4 ++-- vllm/model_executor/layers/linear.py | 4 ++-- vllm/model_executor/layers/quantization/fp8.py | 4 ++-- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6524084c494c..7794bc6002eb 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -17,8 +17,8 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( -) +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ + and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: import aiter import aiter.fused_moe_bf16_asm as aiter_fmoe_asm diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 236cbbbe4a50..b08c8e85e830 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -5,11 +5,11 @@ import torch import torch.nn as nn -from vllm.envs import VLLM_ROCM_USE_AITER_NORM +import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform -USE_ROCM_AITER_NORM = VLLM_ROCM_USE_AITER_NORM \ +USE_ROCM_AITER_NORM = envs.VLLM_ROCM_USE_AITER_NORM \ and current_platform.is_rocm() if USE_ROCM_AITER_NORM: import aiter diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0ffb7ec52932..84c438ebb75a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -8,12 +8,12 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter, UninitializedParameter +import vllm.envs as envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) @@ -28,7 +28,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ +USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ and 
current_platform.is_rocm() if USE_ROCM_AITER_LINEAR: from aiter.tuned_gemm import tgemm as aiter_tgemm diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 1c4e1c67a34e..396f544deaec 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -33,8 +33,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( -) +USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ + and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: import aiter.ops as aiter_ops diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 4276ac5f92be..a4a3a1d6162b 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -4,11 +4,11 @@ import torch +import vllm.envs as envs from vllm import _custom_ops as ops -from vllm.envs import VLLM_ROCM_USE_AITER_LINEAR from vllm.platforms import current_platform -USE_ROCM_AITER_LINEAR = VLLM_ROCM_USE_AITER_LINEAR \ +USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ and current_platform.is_rocm() if USE_ROCM_AITER_LINEAR: from aiter.tuned_gemm import tgemm as aiter_tgemm From 2527956512f22a3b6ac870f874ee015203bb57b8 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 09:03:37 +0000 Subject: [PATCH 03/35] add aiter fp8 block scaled moe kernel Signed-off-by: vllmellm --- vllm/envs.py | 7 +++++++ vllm/model_executor/layers/fused_moe/fused_moe.py | 8 +++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index a88db87bf824..b1a746a05895 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -78,6 +78,7 @@ VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True + VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -525,6 +526,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1") and os.getenv("VLLM_USE_AITER_MOE", "True").lower() in ("true", "1")), + # use aiter block scaled moe op if aiter ops are enabled + "VLLM_ROCM_USE_AITER_BSCALED_MOE": + lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", + "True").lower() in ("true", "1")), + # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": lambda: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 7794bc6002eb..35b87e7d4eb1 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1158,13 +1158,12 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, - use_fp8_blockscale: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if use_fp8_blockscale: + if envs.VLLM_ROCM_USE_AITER_BSCALED_MOE and use_fp8_w8a8: local_E = E = w1.shape[0] if expert_mask is not None: E = expert_mask.numel() @@ -1236,7 +1235,6 @@ def fused_experts(hidden_states: 
torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, - use_fp8_blockscale: bool = False, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, w1_scale: Optional[torch.Tensor] = None, @@ -1249,8 +1247,8 @@ def fused_experts(hidden_states: torch.Tensor, expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_FMOE: rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, - use_fp8_w8a8, use_fp8_blockscale, w1_scale, - w2_scale, block_shape, expert_mask) + use_fp8_w8a8, w1_scale, w2_scale, block_shape, + expert_mask) if inplace: torch.ops.vllm.inplace_fused_experts( hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, From 814702a2909e1644f1120f2e2b6b73d064aec997 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Feb 2025 11:09:16 +0000 Subject: [PATCH 04/35] bugfix: fix import paths and wrong env variables Signed-off-by: vllmellm --- vllm/envs.py | 18 +++++++++--------- .../layers/fused_moe/fused_moe.py | 15 +++++++-------- vllm/model_executor/layers/fused_moe/layer.py | 6 +++--- vllm/model_executor/layers/quantization/fp8.py | 6 +++--- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index b1a746a05895..023ab0e54f49 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -517,33 +517,33 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter ops unless specifically disabled "VLLM_ROCM_USE_AITER": - lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in ("true", "1")), + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1")), # use aiter moe op if aiter ops are enabled "VLLM_ROCM_USE_AITER_MOE": lambda: - (os.getenv("VLLM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_USE_AITER_MOE", "True").lower() in + (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in ("true", "1")), # use aiter block scaled moe op if aiter ops are enabled "VLLM_ROCM_USE_AITER_BSCALED_MOE": - lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in + lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", "True").lower() in ("true", "1")), # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": - lambda: - (os.getenv("VLLM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_USE_AITER_LINEAR", "True").lower() in - ("true", "1")), + lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True" + ).lower() in ("true", "1")), # use aiter rms norm op if aiter ops are enabled "VLLM_ROCM_USE_AITER_NORM": lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_USE_AITER_NORM", "True").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 35b87e7d4eb1..58bd36be2f0e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -21,7 +21,7 @@ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: import aiter - import aiter.fused_moe_bf16_asm as aiter_fmoe_asm + import aiter.fused_moe_bf16_asm as aiter_asm_fmoe logger = init_logger(__name__) @@ -1179,7 +1179,7 @@ 
def rocm_aiter_fused_experts(hidden_states: torch.Tensor, sorted_expert_ids, num_valid_ids, out_asm, - ) = aiter_fmoe_asm.moe_sorting_ck(topk_ids, + ) = aiter_asm_fmoe.moe_sorting_ck(topk_ids, topk_weights, E, model_dim, @@ -1207,7 +1207,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, return out_asm if use_fp8_w8a8: - return aiter_fmoe_asm.asm_moe(hidden_states=hidden_states, + return aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, w1=w1, w2=w2, topk_weight=topk_weights, @@ -1222,8 +1222,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w1=w1, w2=w2, topk_weights=topk_weights, - topk_ids=topk_ids, - expert_mask=expert_mask) + topk_ids=topk_ids) def fused_experts(hidden_states: torch.Tensor, @@ -1246,9 +1245,9 @@ def fused_experts(hidden_states: torch.Tensor, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_FMOE: - rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, - use_fp8_w8a8, w1_scale, w2_scale, block_shape, - expert_mask) + return rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, + topk_ids, use_fp8_w8a8, w1_scale, + w2_scale, block_shape, expert_mask) if inplace: torch.ops.vllm.inplace_fused_experts( hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3d3fc51723b1..ef9aaa83cb61 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -31,7 +31,7 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( ) if USE_ROCM_AITER_FMOE: - import aiter.ops as aiter_ops + from aiter.ops.shuffle import shuffle_weight as aiter_shuffle_weight logger = init_logger(__name__) @@ -102,10 +102,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) if USE_ROCM_AITER_FMOE: - layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w13_weight.data), requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w2_weight.data), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 396f544deaec..16ed848ef742 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -36,7 +36,7 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: - import aiter.ops as aiter_ops + from aiter.ops.shuffle import aiter_shuffle_weight ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -668,10 +668,10 @@ def process_weights_after_loading(self, layer: Module) -> None: -1).expand((-1, layer.w2_weight.shape[1], -1)) layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w13_weight), requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_ops.shuffle_weight( + layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( layer.w2_weight), requires_grad=False) From 024cfc57313d9538f7b317aee421ab9e578e9b44 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 28 Feb 2025 14:50:23 +0000 Subject: [PATCH 05/35] 
rename importing module names from amd/rocm aiter package to avoid confusion withpython builtin aiter function. Signed-off-by: vllmellm --- .../layers/fused_moe/fused_moe.py | 52 +++++++++---------- vllm/model_executor/layers/layernorm.py | 4 +- vllm/model_executor/layers/linear.py | 4 +- .../model_executor/layers/quantization/fp8.py | 10 ++-- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 58bd36be2f0e..b2b2dc6bd0bb 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -20,8 +20,8 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: - import aiter - import aiter.fused_moe_bf16_asm as aiter_asm_fmoe + import aiter as rocm_aiter + import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe logger = init_logger(__name__) @@ -953,8 +953,8 @@ def fused_topk( device=hidden_states.device) if USE_ROCM_AITER_FMOE: - aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, - gating_output.float(), renormalize) + rocm_aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, + gating_output.float(), renormalize) else: ops.topk_softmax( topk_weights, @@ -1179,15 +1179,15 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, sorted_expert_ids, num_valid_ids, out_asm, - ) = aiter_asm_fmoe.moe_sorting_ck(topk_ids, - topk_weights, - E, - model_dim, - dtype, - expert_mask=expert_mask) + ) = rocm_aiter_asm_fmoe.moe_sorting_ck(topk_ids, + topk_weights, + E, + model_dim, + dtype, + expert_mask=expert_mask) a1, a1_scale = per_token_group_quant_fp8(hidden_states, scale_blk_k) - aiter.fmoe_fp8_blockscale_g1u1( + rocm_aiter.fmoe_fp8_blockscale_g1u1( out_asm, a1, w1, @@ -1207,22 +1207,22 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, return out_asm if use_fp8_w8a8: - return aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False) + return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weight=topk_weights, + topk_ids=topk_ids, + fc1_scale=w1_scale, + fc2_scale=w2_scale, + fc1_smooth_scale=None, + fc2_smooth_scale=None, + a16=False) else: - return aiter.ck_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) + return rocm_aiter.ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids) def fused_experts(hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index b08c8e85e830..0d5d1a454594 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -12,7 +12,7 @@ USE_ROCM_AITER_NORM = envs.VLLM_ROCM_USE_AITER_NORM \ and current_platform.is_rocm() if USE_ROCM_AITER_NORM: - import aiter + import aiter as rocm_aiter @CustomOp.register("rms_norm") @@ -92,7 +92,7 @@ def forward_cuda( if residual is not None: if USE_ROCM_AITER_NORM: - aiter.rmsnorm2d_fwd_with_add( + rocm_aiter.rmsnorm2d_fwd_with_add( x, x, residual, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 84c438ebb75a..dddf19e4c617 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -31,7 +31,7 
@@ USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ and current_platform.is_rocm() if USE_ROCM_AITER_LINEAR: - from aiter.tuned_gemm import tgemm as aiter_tgemm + from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm logger = init_logger(__name__) @@ -146,7 +146,7 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_LINEAR: - return aiter_tgemm.mm(x, layer.weigt, bias) + return rocm_aiter_tgemm.mm(x, layer.weigt, bias) return F.linear(x, layer.weight, bias) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 16ed848ef742..2edccd0984a3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -36,7 +36,7 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() if USE_ROCM_AITER_FMOE: - from aiter.ops.shuffle import aiter_shuffle_weight + from aiter.ops.shuffle import shuffle_weight as rocm_aiter_shuffle_weight ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -668,10 +668,10 @@ def process_weights_after_loading(self, layer: Module) -> None: -1).expand((-1, layer.w2_weight.shape[1], -1)) layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( - layer.w13_weight), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter( + rocm_aiter_shuffle_weight(layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( layer.w2_weight), requires_grad=False) From 7cfe429744b7447fb60a97636a5daf5bae4ff74d Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 1 Mar 2025 10:50:37 +0000 Subject: [PATCH 06/35] bugfixe on wrong env variable spelling an add missing statment condition in Fp8MoEMethod Signed-off-by: vllmellm --- vllm/envs.py | 4 ++-- .../layers/fused_moe/fused_moe.py | 10 ++++++++-- vllm/model_executor/layers/quantization/fp8.py | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 023ab0e54f49..49b21a687736 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -529,9 +529,9 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter block scaled moe op if aiter ops are enabled "VLLM_ROCM_USE_AITER_BSCALED_MOE": - lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", - "True").lower() in ("true", "1")), + "true").lower() in ("true", "1")), # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b2b2dc6bd0bb..0abd2c58d0e8 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1162,8 +1162,10 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if envs.VLLM_ROCM_USE_AITER_BSCALED_MOE and use_fp8_w8a8: + assert w1_scale is not None + assert w2_scale is not None + local_E = E = w1.shape[0] if expert_mask is not None: E = expert_mask.numel() @@ -1171,6 +1173,10 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, topk = topk_ids.shape[1] 
model_dim = w1.shape[-1] dtype = hidden_states.dtype + # The default block sizes are 128 in AITER. + if block_shape is None: + block_shape = [128, 128] + scale_blk_k = block_shape[1] ( @@ -1206,7 +1212,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, ) return out_asm - if use_fp8_w8a8: + elif use_fp8_w8a8: return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, w1=w1, w2=w2, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 2edccd0984a3..e57d05cf2f28 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -593,6 +593,23 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + + if USE_ROCM_AITER_FMOE: + w13_scales = layer.w13_weight_scale.data.unsqueeze( + -1).unsqueeze(-1).expand( + (-1, layer.w13_weight.shape[1], -1)) + w2_scales = layer.w2_weight_scale.data.unsqueeze(-1).unsqueeze( + -1).expand((-1, layer.w2_weight.shape[1], -1)) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales.contiguous(), requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter( + w13_scales.contiguous(), requires_grad=False) + layer.w13_weight = torch.nn.Parameter( + rocm_aiter_shuffle_weight(layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w2_weight), + requires_grad=False) return # If checkpoint is fp8, we need to handle that the From 41e7e4fa8ce5da4475046ecac6ab426bc3509fb6 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 1 Mar 2025 10:58:04 +0000 Subject: [PATCH 07/35] enabled VLLM_ROCM_USE_AITER in unit-tests Signed-off-by: vllmellm --- tests/kernels/test_moe.py | 22 ++++++++++++++----- .../decoder_only/language/test_granite.py | 19 ++++++++-------- .../decoder_only/language/test_mistral.py | 18 +++++++-------- .../decoder_only/language/test_models.py | 18 +++++++-------- .../decoder_only/language/test_phimoe.py | 17 +++++++------- 5 files changed, 53 insertions(+), 41 deletions(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2f5c69046f48..b48978721069 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -202,11 +202,15 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) @torch.inference_mode() -def test_mixtral_moe(dtype: torch.dtype): +def test_mixtral_moe(dtype: torch.dtype, use_rocm_aiter: bool, monkeypatch): """Make sure our Mixtral MoE implementation agrees with the one from huggingface.""" + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") # Instantiate our and huggingface's MoE blocks config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") @@ -242,10 +246,18 @@ def test_mixtral_moe(dtype: torch.dtype): torch.bfloat16: 1e-2, } - torch.testing.assert_close(hf_states.flatten(0, 1), - vllm_states, - rtol=mixtral_moe_tol[dtype], - atol=mixtral_moe_tol[dtype]) + if use_rocm_aiter: + # The values of rtol and atol are set based on the tests in ROCM AITER package. 
# noqa: E501 + # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=0.01, + atol=100) + else: + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype]) @pytest.mark.parametrize("m", [1, 33, 64, 222]) diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index 119b79d64c96..f34b67ca56d5 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -5,6 +5,8 @@ """ import pytest +from vllm.platforms import current_platform + from ...utils import check_logprobs_close MODELS = [ @@ -18,15 +20,14 @@ @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 17923673023f..ce9af7faa410 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -12,6 +12,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa MistralToolParser) +from vllm.platforms import current_platform from vllm.sampling_params import GuidedDecodingParams, SamplingParams from ...utils import check_logprobs_close @@ -174,15 +175,14 @@ @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + # TODO(sang): Sliding window should be tested separately. 
with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 71e4a9f11ab8..7c21ed0fecda 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -5,6 +5,8 @@ """ import pytest +from vllm.platforms import current_platform + from ...utils import check_logprobs_close @@ -63,15 +65,13 @@ @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: if model.startswith("THUDM/chatglm3"): diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index f9757d6ac295..b8948976a03e 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -79,15 +79,14 @@ def test_phimoe_routing_function(): @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) From 5f668ea2b9546f73bdb12f3002bc590c0b38f92c Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 1 Mar 2025 10:59:24 +0000 Subject: [PATCH 08/35] include the AMD AITER package in rocm_base docker file Signed-off-by: vllmellm --- Dockerfile.rocm | 6 ------ Dockerfile.rocm_base | 27 ++++++++++++++++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 932661dba135..14c522afd7f9 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -109,17 +109,11 @@ ARG COMMON_WORKDIR COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -RUN git clone --recursive https://github.com/ROCm/aiter.git -RUN cd /app/aiter && GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter - ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 -# Enable Aiter. Make sure this only exists on the aiter branch. 
-# ENV VLLM_USE_AITER=1 - CMD ["/bin/bash"] diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index e33e73b30309..494dc17c13ae 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -1,17 +1,19 @@ ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete -ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLASLT_BRANCH="db8e93b4" ARG HIPBLAS_COMMON_BRANCH="7c1566b" ARG LEGACY_HIPBLASLT_OPTION= ARG RCCL_BRANCH="648a58d" ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="3a585126" -ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_BRANCH="6c0e7463" +ARG PYTORCH_VISION_BRANCH="v0.21.0" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -ARG FA_BRANCH="b7d29fb" -ARG FA_REPO="https://github.com/ROCm/flash-attention.git" +ARG FA_BRANCH="1a7f4dfa" +ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" +ARG AITER_BRANCH="dfed377" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -108,7 +110,7 @@ RUN git clone ${FA_REPO} RUN cd flash-attention \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ - && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist + && GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \ && cp /app/flash-attention/dist/*.whl /app/install @@ -129,7 +131,17 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl +ARG AITER_REPO +ARG AITER_BRANCH +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter + ARG BASE_IMAGE +ARG HIPBLAS_COMMON_BRANCH ARG HIPBLASLT_BRANCH ARG LEGACY_HIPBLASLT_OPTION ARG RCCL_BRANCH @@ -155,4 +167,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt + && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file From 8c5eb52771d81f052218e8f685d3f047a3032a57 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 3 Mar 2025 04:07:28 +0000 Subject: [PATCH 09/35] integrate AITER paged attention Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 33 +++- vllm/attention/ops/rocm_aiter_paged_attn.py | 195 ++++++++++++++++++++ vllm/envs.py | 17 +- 3 files changed, 239 insertions(+), 6 deletions(-) create mode 100644 vllm/attention/ops/rocm_aiter_paged_attn.py diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 3f40686ee2fd..2a696b7147d0 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -12,20 +12,27 @@ AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) -from 
vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) from vllm.logger import init_logger from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +USE_ROCM_AITER_PA = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN +if USE_ROCM_AITER_PA: + from vllm.attention.ops.rocm_aiter_paged_attn import ( + PagedAttention, PagedAttentionMetadata) +else: + from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) + logger = init_logger(__name__) _PARTITION_SIZE_ROCM = 512 _GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName _ON_NAVI = "gfx1" in _GPU_ARCH _ON_MI250_MI300 = any(arch in _GPU_ARCH for arch in ["gfx90a", "gfx942"]) +USE_ROCM_CUSTOM_PA = envs.VLLM_ROCM_USE_CUSTOM_PAGED_ATTN class ROCmFlashAttentionBackend(AttentionBackend): @@ -463,6 +470,7 @@ def __init__( if blocksparse_params is not None: raise ValueError( "ROCmFlashAttention does not support blocksparse attention.") + self.aiter_kv_scales_initialized = False if logits_soft_cap is None: # In flash-attn, setting logits_soft_cap as 0 means no soft cap. @@ -608,6 +616,24 @@ def forward( else: assert value is None + if (USE_ROCM_AITER_PA and kv_cache.dtype.itemsize == 1 + and not self.aiter_kv_scales_initialized + and kv_cache.shape != torch.Size([0])): + num_blocks = kv_cache.shape[1] + block_size = kv_cache.shape[2] // (self.num_kv_heads * + self.head_size) + k_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), + dtype=torch.float32, + device=kv_cache.device) + v_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), + dtype=torch.float32, + device=kv_cache.device) + self.aiter_kv_scales_initialized = True + k_scale.fill_(layer._k_scale.item()) + v_scale.fill_(layer._v_scale.item()) + layer._k_scale = k_scale + layer._v_scale = v_scale + if self.attn_type != AttentionType.ENCODER and kv_cache.numel() > 0: key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) @@ -885,4 +911,5 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) + and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 + and USE_ROCM_CUSTOM_PA) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py new file mode 100644 index 000000000000..847d39077494 --- /dev/null +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import aiter as rocm_aiter +import torch + +from vllm import _custom_ops as ops +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.attention.ops.prefix_prefill import context_attention_fwd + + +@dataclass +class PagedAttentionMetadata: + """Metadata for PagedAttention.""" + # (batch_size,). The length of sequences (entire tokens seen so far) per + # sequence. + seq_lens_tensor: Optional[torch.Tensor] + # Maximum sequence length in the batch. 0 if it is prefill-only batch. + max_decode_seq_len: int + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. 
Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + + +class PagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [32, 64, 80, 96, 112, 120, 128, 192, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size * num_kv_heads * head_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 16 // kv_cache.element_size() + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + ) -> None: + if key_cache.dtype.itemsize == 1: + if "fp8" in kv_cache_dtype: + key_cache = key_cache.view(torch.float8_e4m3fnuz) + value_cache = value_cache.view(torch.float8_e4m3fnuz) + else: + key_cache = key_cache.view(torch.int8) + value_cache = value_cache.view(torch.int8) + rocm_aiter.reshape_and_cache_with_pertoken_quant( + key, value, key_cache, value_cache, k_scale, v_scale, + slot_mapping.flatten(), True) + else: + rocm_aiter.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping.flatten(), + kv_cache_dtype, + k_scale.view(-1)[0].item(), + v_scale.view(-1)[0].item(), True) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + max_seq_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> torch.Tensor: + if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: + # use blocksparse paged attention + block_size = value_cache.size(-1) + assert (blocksparse_block_size > 0 and + blocksparse_block_size % block_size == 0), \ + (f"{blocksparse_block_size=} needs to be a multiple of" + f"{block_size=} used in block_tables.") + + output = torch.empty_like(query) + block_size = value_cache.shape[3] + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + + if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: + k_scale, v_scale = (None, None) + query = query.contiguous() + elif "fp8" in kv_cache_dtype: + key_cache = key_cache.view(torch.float8_e4m3fnuz) + value_cache = value_cache.view(torch.float8_e4m3fnuz) + rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables, + seq_lens, max_num_blocks_per_seq, k_scale, + v_scale, output) + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache_dtype: str, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + query_start_loc: torch.Tensor, + seq_lens_tensor: torch.Tensor, + 
context_lens: torch.Tensor, + max_query_len: int, + alibi_slopes: Optional[torch.Tensor], + sliding_window: Optional[int], + k_scale: float, + v_scale: float, + ) -> torch.Tensor: + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + kv_cache_dtype, + key_cache, + value_cache, + block_tables, + # query_start_loc is (batch_size + 1,) + query_start_loc[:-1], + seq_lens_tensor, + context_lens, + max_query_len, + k_scale, + v_scale, + alibi_slopes, + sliding_window, + ) + return output + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) diff --git a/vllm/envs.py b/vllm/envs.py index 49b21a687736..ab39cfe805dd 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,12 +74,14 @@ VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False - VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True + VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = True + VLLM_ROCM_USE_CUSTOM_PAGED_ATTN: bool = True + VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 VLLM_DISABLE_COMPILE_CACHE: bool = False @@ -535,16 +537,25 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": - lambda: (os.getenv("LLM_ROCM_USE_AITER", "False").lower() in + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True" ).lower() in ("true", "1")), # use aiter rms norm op if aiter ops are enabled "VLLM_ROCM_USE_AITER_NORM": lambda: - (os.getenv("VLLM_USE_AITER", "False").lower() in + (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in ("true", "1")), + "VLLM_ROCM_USE_AITER_PAGED_ATTN": + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", + "True").lower() in ("true", "1")), + + # use rocm custom paged attention. 
+ "VLLM_ROCM_USE_CUSTOM_PAGED_ATTN": + lambda: (os.getenv("VLLM_ROCM_USE_CUSTOM_PAGED_ATTN", "False").lower() in + ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": From 77cb4360f9f78c69fb3ce7b9b7e5bd2c31d1129d Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 3 Mar 2025 05:18:47 +0000 Subject: [PATCH 10/35] bugfixes and disable rocm aiter paged attention Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 3 +-- vllm/attention/ops/rocm_aiter_paged_attn.py | 1 + vllm/envs.py | 10 +++------- vllm/model_executor/layers/linear.py | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 2a696b7147d0..17578069e5d3 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -32,7 +32,6 @@ _GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName _ON_NAVI = "gfx1" in _GPU_ARCH _ON_MI250_MI300 = any(arch in _GPU_ARCH for arch in ["gfx90a", "gfx942"]) -USE_ROCM_CUSTOM_PA = envs.VLLM_ROCM_USE_CUSTOM_PAGED_ATTN class ROCmFlashAttentionBackend(AttentionBackend): @@ -912,4 +911,4 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and USE_ROCM_CUSTOM_PA) + and not USE_ROCM_AITER_PA) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 847d39077494..8a425988290c 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -126,6 +126,7 @@ def forward_decode( elif "fp8" in kv_cache_dtype: key_cache = key_cache.view(torch.float8_e4m3fnuz) value_cache = value_cache.view(torch.float8_e4m3fnuz) + rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables, seq_lens, max_num_blocks_per_seq, k_scale, v_scale, output) diff --git a/vllm/envs.py b/vllm/envs.py index ab39cfe805dd..689cab23a946 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -80,7 +80,6 @@ VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = True - VLLM_ROCM_USE_CUSTOM_PAGED_ATTN: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -547,15 +546,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in ("true", "1")), + + # use aiter paged attention if aiter ops are enabled. "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", - "True").lower() in ("true", "1")), - - # use rocm custom paged attention. 
- "VLLM_ROCM_USE_CUSTOM_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_USE_CUSTOM_PAGED_ATTN", "False").lower() in - ("true", "1")), + "False").lower() in ("false", "0")), # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dddf19e4c617..efad68f63c69 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -146,7 +146,7 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_ROCM_AITER_LINEAR: - return rocm_aiter_tgemm.mm(x, layer.weigt, bias) + return rocm_aiter_tgemm.mm(x, layer.weight, bias) return F.linear(x, layer.weight, bias) From 4c4178154fba53539fe4fcf25b3ac90f1db21ad2 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 4 Mar 2025 07:41:12 +0000 Subject: [PATCH 11/35] revert back the custom pa condition Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 122556d656c6..dd7e3a67f6a9 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -910,5 +910,4 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and not USE_ROCM_AITER_PA) + and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) From c09a740cef157ed6322deaa4706bf6a07820a499 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 4 Mar 2025 08:48:09 +0000 Subject: [PATCH 12/35] enable AITER tgemm.mm per tensor scaled mm unittest Signed-off-by: tjtanaa --- tests/quantization/test_fp8.py | 24 +++++++++++++++++++++--- vllm/envs.py | 5 +++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 3a7f0a196b5b..7081dd3cae0b 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -23,8 +23,13 @@ reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", MODELS) @pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, - monkeypatch) -> None: + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") @@ -47,7 +52,14 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", KV_CACHE_MODELS) -def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, + use_rocm_aiter: bool, monkeypatch): + + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: def check_model(model): @@ -84,8 +96,14 @@ def check_model(model): reason="FP8 is not 
supported on this GPU type.") @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) @pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, - monkeypatch) -> None: + use_rocm_aiter: bool, monkeypatch) -> None: + + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") diff --git a/vllm/envs.py b/vllm/envs.py index c1e3dede8248..60a0a9eb9163 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -79,7 +79,7 @@ VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True - VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = True + VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -555,9 +555,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1")), # use aiter paged attention if aiter ops are enabled. + # this is disabled by default "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in - ("false", "0")), + ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": From e19b7f59bc74e6b2cdf043d2ee4b3cbeef150a00 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 4 Mar 2025 09:27:04 +0000 Subject: [PATCH 13/35] bugfix: shuffle the weights when using aiter fmoe block scaled kernel Signed-off-by: vllmellm --- vllm/envs.py | 9 +++++---- vllm/model_executor/layers/fused_moe/fused_moe.py | 7 +++++-- vllm/model_executor/layers/quantization/fp8.py | 11 ++++++++++- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 60a0a9eb9163..1fef5141fd58 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -77,7 +77,7 @@ VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True - VLLM_ROCM_USE_AITER_BSCALED_MOE: bool = True + VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False VLLM_ROCM_FP8_PADDING: bool = True @@ -536,10 +536,11 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1")), # use aiter block scaled moe op if aiter ops are enabled - "VLLM_ROCM_USE_AITER_BSCALED_MOE": + "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BSCALED_MOE", - "true").lower() in ("true", "1")), + ("true", "1") and os.getenv( + "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "true").lower() in + ("true", "1")), # use aiter linear op if aiter ops are enabled "VLLM_ROCM_USE_AITER_LINEAR": diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f982b8288885..4d5c56874485 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -19,7 +19,10 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() -if USE_ROCM_AITER_FMOE: +USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ + and current_platform.is_rocm() # noqa: E501 + +if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: import aiter as rocm_aiter 
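At this point the aiter import still happens at module import time whenever either flag is set; patch 15 below replaces these module-level constants with predicates in vllm/utils.py and moves the imports into the functions that need them. A reduced sketch of that function-local pattern, using a stand-in predicate rather than the real helper:

    import os

    def rocm_aiter_moe_enabled() -> bool:
        # Stand-in for the helper later added to vllm/utils.py; the real one
        # also checks current_platform.is_rocm().
        return os.getenv("VLLM_ROCM_USE_AITER_MOE", "0").lower() in ("true", "1")

    def get_topk_softmax():
        if rocm_aiter_moe_enabled():
            # Imported lazily, so importing this module never requires the
            # aiter wheel; only the ROCm/AITER code path pays for the import.
            import aiter as rocm_aiter
            return rocm_aiter.topk_softmax
        from vllm import _custom_ops as ops
        return ops.topk_softmax

The later dispatch_* helpers in this series follow the same shape: resolve the backend once, return a plain callable, and keep aiter strictly optional.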
import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe @@ -1167,7 +1170,7 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if envs.VLLM_ROCM_USE_AITER_BSCALED_MOE and use_fp8_w8a8: + if USE_ROCM_AITER_FMOE and use_fp8_w8a8: assert w1_scale is not None assert w2_scale is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ff0574477ff5..6d035bbf4a6b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -35,7 +35,9 @@ USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ and current_platform.is_rocm() -if USE_ROCM_AITER_FMOE: +USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ + and current_platform.is_rocm() # noqa: E501 +if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: from aiter.ops.shuffle import shuffle_weight as rocm_aiter_shuffle_weight ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -561,6 +563,13 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = Parameter(w2_weight, requires_grad=False) layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) + if USE_ROCM_AITER_FMOE and USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: + layer.w13_weight = torch.nn.Parameter( + rocm_aiter_shuffle_weight(layer.w13_weight.data), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w2_weight.data), + requires_grad=False) return # If checkpoint is fp16, quantize in place. From 11ac580243225d3480315ce490082903de060f14 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 4 Mar 2025 15:27:05 +0000 Subject: [PATCH 14/35] fix environment wrong variable in unit tests Signed-off-by: vllmellm --- tests/kernels/test_moe.py | 2 +- .../decoder_only/language/test_granite.py | 2 +- .../decoder_only/language/test_mistral.py | 54 ++++++++++--------- .../decoder_only/language/test_models.py | 2 +- .../decoder_only/language/test_phimoe.py | 2 +- tests/quantization/test_fp8.py | 6 +-- 6 files changed, 37 insertions(+), 31 deletions(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index b48978721069..0c122b07d2d6 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -210,7 +210,7 @@ def test_mixtral_moe(dtype: torch.dtype, use_rocm_aiter: bool, monkeypatch): huggingface.""" if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # Instantiate our and huggingface's MoE blocks config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index f34b67ca56d5..7dec7bbd0ca1 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -26,7 +26,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( diff --git a/tests/models/decoder_only/language/test_mistral.py 
b/tests/models/decoder_only/language/test_mistral.py index ce9af7faa410..68b27a8e7087 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -181,7 +181,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # TODO(sang): Sliding window should be tested separately. with hf_runner(model, dtype=dtype) as hf_model: @@ -205,14 +205,14 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_mistral_format( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, + max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner( model, dtype=dtype, @@ -243,11 +243,13 @@ def test_mistral_format( @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_mistral_symbolic_languages( - vllm_runner, - model: str, - dtype: str, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner(model, dtype=dtype, max_model_len=8192, @@ -264,11 +266,13 @@ def test_mistral_symbolic_languages( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) # v1 can't do func calling -def test_mistral_function_calling( - vllm_runner, - model: str, - dtype: str, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_function_calling(vllm_runner, model: str, dtype: str, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral", @@ -299,11 +303,13 @@ def test_mistral_function_calling( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding( - vllm_runner, - model: str, - guided_backend: str, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_mistral_guided_decoding(vllm_runner, model: str, guided_backend: str, + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner(model, dtype='bfloat16', tokenizer_mode="mistral") as vllm_model: diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 7c21ed0fecda..eca2ca18dcb7 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -71,7 
+71,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: if model.startswith("THUDM/chatglm3"): diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index b8948976a03e..7e42124d7b2c 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -85,7 +85,7 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7081dd3cae0b..f279a8e9f664 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -28,7 +28,7 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") @@ -58,7 +58,7 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, use_rocm_aiter: bool, monkeypatch): if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: @@ -102,7 +102,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: - monkeypatch.setenv("VLLM_ROCM_AITER_USE_AITER", "1") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") From 08651240cf4edc4c2c685da48a47bbf5926ea931 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 5 Mar 2025 07:42:05 +0000 Subject: [PATCH 15/35] add aiter block gemm kernel and refactor aiter envs conditions Signed-off-by: tjtanaa --- vllm/envs.py | 7 +++++ .../layers/fused_moe/fused_moe.py | 24 +++++++-------- vllm/model_executor/layers/fused_moe/layer.py | 23 +++++++------- vllm/model_executor/layers/layernorm.py | 12 +++----- vllm/model_executor/layers/linear.py | 12 +++----- .../model_executor/layers/quantization/fp8.py | 25 +++++++++------- .../layers/quantization/utils/fp8_utils.py | 24 ++++++++++----- vllm/utils.py | 30 +++++++++++++++++++ 8 files changed, 101 insertions(+), 56 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 1fef5141fd58..46c806e89310 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -80,6 +80,7 @@ VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False + VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -561,6 +562,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in ("true", "1")), + # use aiter w8a8 block 
gemm kerner if aiter ops are enabled. + "VLLM_ROCM_USE_AITER_BLOCK_GEMM": + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + "True").lower() in ("true", "1")), + # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 4d5c56874485..6470dbaa555a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -15,16 +15,9 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) from vllm.platforms import current_platform -from vllm.utils import direct_register_custom_op - -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ - and current_platform.is_rocm() -USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ - and current_platform.is_rocm() # noqa: E501 - -if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: - import aiter as rocm_aiter - import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe +from vllm.utils import (direct_register_custom_op, + rocm_aiter_fp8_block_scaled_moe_enabled, + rocm_aiter_moe_enabled) logger = init_logger(__name__) @@ -955,7 +948,9 @@ def fused_topk( dtype=torch.int32, device=hidden_states.device) - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): + import aiter as rocm_aiter + rocm_aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output.float(), renormalize) else: @@ -1170,7 +1165,10 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, w2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None): - if USE_ROCM_AITER_FMOE and use_fp8_w8a8: + import aiter as rocm_aiter + import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe + + if rocm_aiter_fp8_block_scaled_moe_enabled() and use_fp8_w8a8: assert w1_scale is not None assert w2_scale is not None @@ -1259,7 +1257,7 @@ def fused_experts(hidden_states: torch.Tensor, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): return rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, w1_scale, w2_scale, block_shape, expert_mask) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f74137cbab0a..61924c68e393 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# isort: skip_file from abc import abstractmethod from enum import Enum @@ -18,6 +19,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum +from vllm.utils import rocm_aiter_moe_enabled if current_platform.is_cuda_alike(): from .fused_moe import fused_experts @@ -29,11 +31,6 @@ else: fused_moe_pallas = None # type: ignore -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE and current_platform.is_rocm( -) -if USE_ROCM_AITER_FMOE: - from aiter.ops.shuffle import shuffle_weight as aiter_shuffle_weight - logger = init_logger(__name__) @@ -102,12 +99,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: 
int, def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) - if USE_ROCM_AITER_FMOE: - layer.w13_weight = torch.nn.Parameter(aiter_shuffle_weight( - layer.w13_weight.data), + if rocm_aiter_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + + shuffled_w13_weight = rocm_aiter_shuffle_weight( + layer.w13_weight.data) + layer.w13_weight = torch.nn.Parameter(shuffled_w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(aiter_shuffle_weight( - layer.w2_weight.data), + + shuffled_w2_weight = rocm_aiter_shuffle_weight( + layer.w2_weight.data) + layer.w2_weight = torch.nn.Parameter(shuffled_w2_weight, requires_grad=False) if current_platform.is_cpu(): diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 0d5d1a454594..5976d9e38253 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -5,14 +5,8 @@ import torch import torch.nn as nn -import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform - -USE_ROCM_AITER_NORM = envs.VLLM_ROCM_USE_AITER_NORM \ - and current_platform.is_rocm() -if USE_ROCM_AITER_NORM: - import aiter as rocm_aiter +from vllm.utils import rocm_aiter_norm_enabled @CustomOp.register("rms_norm") @@ -91,7 +85,9 @@ def forward_cuda( from vllm import _custom_ops as ops if residual is not None: - if USE_ROCM_AITER_NORM: + if rocm_aiter_norm_enabled(): + import aiter as rocm_aiter + rocm_aiter.rmsnorm2d_fwd_with_add( x, x, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 36085d18ea42..72c2289bb063 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -8,7 +8,6 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter, UninitializedParameter -import vllm.envs as envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -26,12 +25,7 @@ RowvLLMParameter) # yapf: enable from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform - -USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ - and current_platform.is_rocm() -if USE_ROCM_AITER_LINEAR: - from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm +from vllm.utils import rocm_aiter_linear_enabled logger = init_logger(__name__) @@ -145,7 +139,9 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if USE_ROCM_AITER_LINEAR: + if rocm_aiter_linear_enabled(): + from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm + return rocm_aiter_tgemm.mm(x, layer.weight, bias) return F.linear(x, layer.weight, bias) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 6d035bbf4a6b..f6aa35d44cb2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# isort: skip_file from typing import Any, Callable, Dict, List, Optional @@ -32,13 +33,8 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform - -USE_ROCM_AITER_FMOE = envs.VLLM_ROCM_USE_AITER_MOE \ - and current_platform.is_rocm() -USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE = 
envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE \ - and current_platform.is_rocm() # noqa: E501 -if USE_ROCM_AITER_FMOE or USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: - from aiter.ops.shuffle import shuffle_weight as rocm_aiter_shuffle_weight +from vllm.utils import (rocm_aiter_fp8_block_scaled_moe_enabled, + rocm_aiter_moe_enabled) ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -563,7 +559,10 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = Parameter(w2_weight, requires_grad=False) layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) - if USE_ROCM_AITER_FMOE and USE_ROCM_AITER_FP8_BLOCK_SCALED_MOE: + if rocm_aiter_fp8_block_scaled_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + layer.w13_weight = torch.nn.Parameter( rocm_aiter_shuffle_weight(layer.w13_weight.data), requires_grad=False) @@ -600,7 +599,10 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + w13_scales = layer.w13_weight_scale.data.unsqueeze( -1).unsqueeze(-1).expand( (-1, layer.w13_weight.shape[1], -1)) @@ -684,7 +686,10 @@ def process_weights_after_loading(self, layer: Module) -> None: dq_weight, max_w13_scales[expert_id]) start += shard_size - if USE_ROCM_AITER_FMOE: + if rocm_aiter_moe_enabled(): + from aiter.ops.shuffle import (shuffle_weight as + rocm_aiter_shuffle_weight) + max_w13_scales = max_w13_scales.unsqueeze(-1).unsqueeze( -1).expand((-1, layer.w13_weight.shape[1], -1)) w2_scales = layer.w2_weight_scale.data.unsqueeze(-1).unsqueeze( diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 7d91d2cf1c6e..894fe775a85a 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -17,7 +17,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform -from vllm.utils import direct_register_custom_op +from vllm.utils import (direct_register_custom_op, + rocm_aiter_fp8_block_scaled_moe_enabled) logger = init_logger(__name__) @@ -71,12 +72,21 @@ def apply_w8a8_block_fp8_linear( q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1], column_major_scales=False) - output = w8a8_block_fp8_matmul(q_input, - weight, - x_scale, - weight_scale, - block_size, - output_dtype=input.dtype) + if rocm_aiter_fp8_block_scaled_moe_enabled(): + import aiter as rocm_aiter + + output = torch.zeros([q_input.shape[0], weight.shape[0]], + dtype=input.dtype, + device=q_input.device) + output = rocm_aiter.gemm_a8w8_blockscale(q_input, weight, x_scale, + weight_scale, output) + else: + output = w8a8_block_fp8_matmul(q_input, + weight, + x_scale, + weight_scale, + block_size, + output_dtype=input.dtype) if bias is not None: output = output + bias return output.to(dtype=input.dtype).view(*output_shape) diff --git a/vllm/utils.py b/vllm/utils.py index 26c9e1a90837..1b87a21ed966 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -997,6 +997,36 @@ def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None: enable_trace_function_call(log_path) +def is_hip() -> bool: + from vllm.platforms import current_platform + 
return current_platform.is_rocm() + + +def rocm_aiter_moe_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_MOE + + +def rocm_aiter_paged_attn_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_PAGED_ATTN + + +def rocm_aiter_linear_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_LINEAR + + +def rocm_aiter_norm_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_NORM + + +def rocm_aiter_fp8_block_scaled_moe_enabled() -> bool: + return rocm_aiter_moe_enabled( + ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE + + +def rocm_aiter_w8a8_block_gemm_enabled() -> bool: + return is_hip() and envs.VLLM_ROCM_USE_AITER_BLOCK_GEMM + + # `functools` helpers def identity(value: T, **kwargs) -> T: """Returns the first provided value.""" From 623dadbea8056b0b0d222ecd5117379f3459a2d4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 10:37:31 +0000 Subject: [PATCH 16/35] add dispatch tests Signed-off-by: vllmellm --- requirements-test.txt | 22 ++- .../layers/fused_moe/fused_moe.py | 179 +++++++++++------- 2 files changed, 130 insertions(+), 71 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index f5722c82e201..e5bf67e099e4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -23,6 +23,10 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator +async-timeout==4.0.3 + # via + # aiohttp + # redis attrs==24.2.0 # via # aiohttp @@ -116,6 +120,10 @@ encodec==0.1.1 # via vocos evaluate==0.4.3 # via lm-eval +exceptiongroup==1.2.2 + # via + # anyio + # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -544,9 +552,7 @@ sentence-transformers==3.2.1 sentencepiece==0.2.0 # via mistral-common setuptools==75.8.0 - # via - # pytablewriter - # torch + # via pytablewriter six==1.16.0 # via # python-dateutil @@ -591,6 +597,12 @@ timm==1.0.11 # via -r requirements-test.in tokenizers==0.21.0 # via transformers +toml==0.10.2 + # via datamodel-code-generator +tomli==2.2.1 + # via + # black + # pytest torch==2.5.1 # via # -r requirements-test.in @@ -651,13 +663,17 @@ typepy==1.3.2 # tabledata typing-extensions==4.12.2 # via + # anyio # bitsandbytes + # black # huggingface-hub # librosa # mistral-common + # multidict # pqdm # pydantic # pydantic-core + # rich # torch tzdata==2024.2 # via pandas diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6470dbaa555a..6b5073d3b913 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -924,6 +924,35 @@ def try_get_optimal_moe_config( return config +def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool) -> None: + import aiter as rocm_aiter + rocm_aiter.topk_softmax(topk_weights, topk_indices, token_expert_indices, + gating_output, renormalize) + + +def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, renormalize: bool) -> None: + ops.topk_softmax( + topk_weights, + topk_indices, + token_expert_indices, + gating_output, + ) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + +def dispatch_topk_func() -> Callable[..., torch.Tensor]: + if rocm_aiter_moe_enabled(): + return rocm_aiter_topk_softmax + return vllm_topk_softmax + + def fused_topk( 
hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -948,22 +977,10 @@ def fused_topk( dtype=torch.int32, device=hidden_states.device) - if rocm_aiter_moe_enabled(): - import aiter as rocm_aiter - - rocm_aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies, - gating_output.float(), renormalize) - else: - ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) + gating_output_float = gating_output.float() # TODO(woosuk): Optimize this. - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, - keepdim=True) + dispatch_topk_func()(topk_weights, topk_ids, token_expert_indicies, + gating_output_float, renormalize) del token_expert_indicies # Not used. Will be used in the future. return topk_weights, topk_ids @@ -1155,16 +1172,21 @@ def outplace_fused_experts_fake( ) -def rocm_aiter_fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - use_fp8_w8a8: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, - expert_mask: Optional[torch.Tensor] = None): +def rocm_aiter_fused_experts( + *, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + expert_mask: Optional[torch.Tensor] = None, + **kwagrs # Ignore additional keyword arguments +) -> torch.Tensor: + import aiter as rocm_aiter import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe @@ -1229,51 +1251,72 @@ def rocm_aiter_fused_experts(hidden_states: torch.Tensor, fc1_smooth_scale=None, fc2_smooth_scale=None, a16=False) - else: - return rocm_aiter.ck_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) - - -def fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - use_fp8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, - expert_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + + return rocm_aiter.ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids) + + +def torch_vllm_inplace_fused_experts(**kwargs) -> torch.Tensor: + hidden_states = kwargs['hidden_states'] + torch.ops.vllm.inplace_fused_experts(**kwargs) + return hidden_states + + +def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor: + return torch.ops.vllm.outplace_fused_experts(**kwargs) + + +def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: if rocm_aiter_moe_enabled(): - return rocm_aiter_fused_experts(hidden_states, w1, w2, topk_weights, - topk_ids, use_fp8_w8a8, w1_scale, - w2_scale, block_shape, expert_mask) + return rocm_aiter_fused_experts if inplace: - torch.ops.vllm.inplace_fused_experts( - 
hidden_states, w1, w2, topk_weights, topk_ids, activation, - use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, - expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) - return hidden_states - else: - return torch.ops.vllm.outplace_fused_experts( - hidden_states, w1, w2, topk_weights, topk_ids, activation, - use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, - expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) + return torch_vllm_inplace_fused_experts + return torch_vllm_outplace_fused_experts + + +def fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, +) -> torch.Tensor: + return dispatch_fused_experts_func(inplace)( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape) def fused_experts_impl(hidden_states: torch.Tensor, From 459bb02fa4b1f52ef8aff0c8f8b0b427dd7b0cdd Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 10:38:56 +0000 Subject: [PATCH 17/35] add dispatch tests Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 54 ++++++++++ .../layers/quantization/utils/fp8_utils.py | 102 ++++++++++-------- 2 files changed, 114 insertions(+), 42 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 4a6a766b8ca0..4d2a61d76160 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -7,7 +7,15 @@ from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + dispatch_fused_experts_func, dispatch_topk_func, rocm_aiter_fused_experts, + rocm_aiter_topk_softmax, torch_vllm_inplace_fused_experts, + torch_vllm_outplace_fused_experts, vllm_topk_softmax) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + cutlass_scaled_mm, dispatch_blockscale_func, + rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) +from vllm.platforms import current_platform # Registered subclass for test @@ -87,3 +95,49 @@ def test_enabled_ops_invalid(env: str): custom_ops=env.split(","))) with set_current_vllm_config(vllm_config): RMSNorm(1024).enabled() + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + topk_func = dispatch_topk_func() + + if current_platform.is_rocm() and int(use_rocm_aiter): + assert 
topk_func == rocm_aiter_topk_softmax + else: + assert topk_func == vllm_topk_softmax + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool, + monkeypatch): + + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + fused_experts_func = dispatch_fused_experts_func(inplace) + if current_platform.is_rocm() and int(use_rocm_aiter): + assert fused_experts_func == rocm_aiter_fused_experts + elif inplace: + assert fused_experts_func == torch_vllm_inplace_fused_experts + else: + assert fused_experts_func == torch_vllm_outplace_fused_experts + + +@pytest.mark.parametrize("use_cutlass", [True, False]) +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_block_gemm", ["0", "1"]) +def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, + use_rocm_aiter_block_gemm: str, monkeypatch): + + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + use_rocm_aiter_block_gemm) + block_scale_func = dispatch_blockscale_func(use_cutlass) + + if use_cutlass: + assert block_scale_func == cutlass_scaled_mm + elif current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_block_gemm): + assert block_scale_func == rocm_aiter_gemm_a8w8_blockscale + else: + assert block_scale_func == w8a8_block_fp8_matmul diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 894fe775a85a..339fd8c00b95 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -4,7 +4,7 @@ import functools import json import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import triton @@ -18,7 +18,7 @@ CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform from vllm.utils import (direct_register_custom_op, - rocm_aiter_fp8_block_scaled_moe_enabled) + rocm_aiter_w8a8_block_gemm_enabled) logger = init_logger(__name__) @@ -33,6 +33,54 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz +def shape_supported_by_cutlass(weight: torch.Tensor, block_size: List[int], + weight_scale: torch.Tensor, + input_2d: torch.Tensor) -> bool: + if current_platform.is_rocm(): + scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + + input_2d.shape[:-1])[::-1] + scale_b_shape = (weight_scale.view(-1, 1) + if weight_scale.dim() <= 1 else weight_scale.T).shape + ar, ac = scale_a_shape + br, bc = scale_b_shape + return ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0]) \ + or br not in (1, weight.shape[0]) + + return weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0 + + +def cutlass_scaled_mm(A: torch.Tensor, B: torch.Tensor, As: torch.Tensor, + Bs: torch.Tensor, output_dtype: torch.dtype, + **kwargs) -> torch.Tensor: + return ops.cutlass_scaled_mm(A, + B.T, + out_dtype=output_dtype, + scale_a=As, + scale_b=Bs.T) + + +def rocm_aiter_gemm_a8w8_blockscale(A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + output_dtype: torch.dtype = torch.float16, + **kwargs) -> torch.Tensor: + import aiter as rocm_aiter + + output = torch.zeros([A.shape[0], B.shape[0]], + dtype=output_dtype, + 
device=A.device) + return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, output) + + +def dispatch_blockscale_func(use_cutlass: bool) -> Callable[..., torch.Tensor]: + if use_cutlass: + return cutlass_scaled_mm + if rocm_aiter_w8a8_block_gemm_enabled(): + return rocm_aiter_gemm_a8w8_blockscale + return w8a8_block_fp8_matmul + + def apply_w8a8_block_fp8_linear( input: torch.Tensor, weight: torch.Tensor, @@ -46,47 +94,17 @@ def apply_w8a8_block_fp8_linear( # View input as 2D matrix for fp8 methods input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[0]] + use_cutlass = cutlass_block_fp8_supported and shape_supported_by_cutlass() + + q_input, x_scale = per_token_group_quant_fp8( + input_2d, block_size[1], column_major_scales=use_cutlass) + output = dispatch_blockscale_func()(A=q_input, + B=weight, + As=x_scale, + Bs=weight_scale, + block_size=block_size, + output_dtype=input.dtype) - shape_supported_by_cutlass = (weight.shape[0] % 128 == 0 - and weight.shape[1] % 128 == 0) - if current_platform.is_rocm(): - scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + - input_2d.shape[:-1])[::-1] - scale_b_shape = (weight_scale.view(-1, 1) - if weight_scale.dim() <= 1 else weight_scale.T).shape - ar, ac = scale_a_shape - br, bc = scale_b_shape - if (ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0]) - or br not in (1, weight.shape[0])): - shape_supported_by_cutlass = False - if cutlass_block_fp8_supported and shape_supported_by_cutlass: - q_input, x_scale = per_token_group_quant_fp8(input_2d, - block_size[1], - column_major_scales=True) - output = ops.cutlass_scaled_mm(q_input, - weight.T, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale.T) - else: - q_input, x_scale = per_token_group_quant_fp8(input_2d, - block_size[1], - column_major_scales=False) - if rocm_aiter_fp8_block_scaled_moe_enabled(): - import aiter as rocm_aiter - - output = torch.zeros([q_input.shape[0], weight.shape[0]], - dtype=input.dtype, - device=q_input.device) - output = rocm_aiter.gemm_a8w8_blockscale(q_input, weight, x_scale, - weight_scale, output) - else: - output = w8a8_block_fp8_matmul(q_input, - weight, - x_scale, - weight_scale, - block_size, - output_dtype=input.dtype) if bias is not None: output = output + bias return output.to(dtype=input.dtype).view(*output_shape) From acc27ffa94e677b8f6fce0f5b593430ce6acbfe4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 11:48:26 +0000 Subject: [PATCH 18/35] add dispatch tests Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 39 ++++++++- vllm/model_executor/layers/layernorm.py | 82 ++++++++++++------- vllm/model_executor/layers/linear.py | 21 +++-- 3 files changed, 106 insertions(+), 36 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 4d2a61d76160..9796b2e494d2 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +import torch.nn.functional as F from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.custom_op import CustomOp @@ -11,7 +12,11 @@ dispatch_fused_experts_func, dispatch_topk_func, rocm_aiter_fused_experts, rocm_aiter_topk_softmax, torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, vllm_topk_softmax) -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm 
import ( + RMSNorm, dispatch_rmsnorm_func, fused_add_rms_norm, rms_norm, + rocm_aiter_rmsnorm2d_fwd_with_add) +from vllm.model_executor.layers.linear import ( + dipsatch_unquantized_linear_func, rocm_aiter_tgemm_mm) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( cutlass_scaled_mm, dispatch_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) @@ -141,3 +146,35 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, assert block_scale_func == rocm_aiter_gemm_a8w8_blockscale else: assert block_scale_func == w8a8_block_fp8_matmul + + +@pytest.mark.parametrize("add_residual", [True, False]) +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"]) +def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, + use_rocm_aiter_norm: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_NORM", use_rocm_aiter_norm) + rms_norm_func = dispatch_rmsnorm_func(add_residual) + + if not add_residual: + assert rms_norm_func == rms_norm + elif current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_norm): + assert rms_norm_func == rocm_aiter_rmsnorm2d_fwd_with_add + else: + assert rms_norm_func == fused_add_rms_norm + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_linear", ["0", "1"]) +def test_unquantized_linear_dispatch(use_rocm_aiter: str, + use_rocm_aiter_linear: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_linear) + linear_func = dipsatch_unquantized_linear_func() + if current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_linear): + assert linear_func == rocm_aiter_tgemm_mm + else: + assert linear_func == F.linear diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 5976d9e38253..d31b690a8c0d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Custom normalization layers.""" -from typing import Optional, Tuple, Union +from typing import Callable, Optional, Tuple, Union import torch import torch.nn as nn @@ -9,6 +9,56 @@ from vllm.utils import rocm_aiter_norm_enabled +def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, + **kwargs) -> torch.Tensor: + from vllm import _custom_ops as ops + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + weight, + variance_epsilon, + ) + return out + + +def fused_add_rms_norm(*, x: torch.Tensor, residual: torch.Tensor, + weight: torch.Tensor, variance_epsilon: float): + from vllm import _custom_ops as ops + return ops.fused_add_rms_norm( + x, + residual, + weight, + variance_epsilon, + ) + + +def rocm_aiter_rmsnorm2d_fwd_with_add( + *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + import aiter as rocm_aiter + + rocm_aiter.rmsnorm2d_fwd_with_add( + x, + x, + residual, + residual, + weight, + variance_epsilon, + ) + return x, residual + + +def dispatch_rmsnorm_func( + add_residual: bool +) -> Callable[..., Tuple[torch.Tensor, torch.Tensor]]: + if not add_residual: + return rms_norm + if rocm_aiter_norm_enabled(): + return rocm_aiter_rmsnorm2d_fwd_with_add + return fused_add_rms_norm + + @CustomOp.register("rms_norm") class 
RMSNorm(CustomOp): """Root mean square normalization. @@ -82,37 +132,13 @@ def forward_cuda( if self.variance_size_override is not None: return self.forward_native(x, residual) - from vllm import _custom_ops as ops - - if residual is not None: - if rocm_aiter_norm_enabled(): - import aiter as rocm_aiter - - rocm_aiter.rmsnorm2d_fwd_with_add( - x, - x, - residual, - residual, - self.weight.data, - self.variance_epsilon, - ) - else: - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) - return x, residual - - out = torch.empty_like(x) - ops.rms_norm( - out, + add_residual = residual is not None + return dispatch_rmsnorm_func(add_residual)( x, + residual, self.weight.data, self.variance_epsilon, ) - return out def forward_hpu( self, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 72c2289bb063..f484eb140f53 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -2,7 +2,7 @@ import itertools from abc import abstractmethod -from typing import Optional +from typing import Callable, Optional import torch import torch.nn.functional as F @@ -39,6 +39,18 @@ ] +def rocm_aiter_tgemm_mm(x: torch.Tensor, weight: torch.Tensor, + bias: torch.Tensor) -> torch.Tensor: + from aiter.tuned_gemm import tgemm + return tgemm.mm(x, weight, bias) + + +def dipsatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: + if rocm_aiter_linear_enabled(): + return rocm_aiter_tgemm_mm + return F.linear + + def adjust_marlin_shard(param, shard_size, shard_offset): marlin_tile_size = getattr(param, "marlin_tile_size", None) if marlin_tile_size is None: @@ -139,12 +151,7 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if rocm_aiter_linear_enabled(): - from aiter.tuned_gemm import tgemm as rocm_aiter_tgemm - - return rocm_aiter_tgemm.mm(x, layer.weight, bias) - - return F.linear(x, layer.weight, bias) + return dipsatch_unquantized_linear_func()(x, layer.weight, bias) class LinearBase(torch.nn.Module): From 11b6aba777b6f265cd35d1fcda1ce287c7c413ff Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 15:23:18 +0000 Subject: [PATCH 19/35] bugfixes in layernorm and fix spelling mistakes Signed-off-by: vllmellm --- tests/model_executor/test_enabled_custom_ops.py | 4 ++-- vllm/model_executor/layers/layernorm.py | 12 +++++++----- vllm/model_executor/layers/linear.py | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 9796b2e494d2..b6d0248b8ab7 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -16,7 +16,7 @@ RMSNorm, dispatch_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_rmsnorm2d_fwd_with_add) from vllm.model_executor.layers.linear import ( - dipsatch_unquantized_linear_func, rocm_aiter_tgemm_mm) + dispatch_unquantized_linear_func, rocm_aiter_tgemm_mm) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( cutlass_scaled_mm, dispatch_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) @@ -172,7 +172,7 @@ def test_unquantized_linear_dispatch(use_rocm_aiter: str, use_rocm_aiter_linear: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_linear) - linear_func = dipsatch_unquantized_linear_func() + linear_func = 
dispatch_unquantized_linear_func() if current_platform.is_rocm() and int(use_rocm_aiter) and int( use_rocm_aiter_linear): assert linear_func == rocm_aiter_tgemm_mm diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index d31b690a8c0d..ffd2187a1da3 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -25,12 +25,14 @@ def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, def fused_add_rms_norm(*, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, variance_epsilon: float): from vllm import _custom_ops as ops - return ops.fused_add_rms_norm( + + ops.fused_add_rms_norm( x, residual, weight, variance_epsilon, ) + return x, residual def rocm_aiter_rmsnorm2d_fwd_with_add( @@ -134,10 +136,10 @@ def forward_cuda( add_residual = residual is not None return dispatch_rmsnorm_func(add_residual)( - x, - residual, - self.weight.data, - self.variance_epsilon, + x=x, + residual=residual, + weight=self.weight.data, + variance_epsilon=self.variance_epsilon, ) def forward_hpu( diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f484eb140f53..dcf28237c85b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -45,7 +45,7 @@ def rocm_aiter_tgemm_mm(x: torch.Tensor, weight: torch.Tensor, return tgemm.mm(x, weight, bias) -def dipsatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: +def dispatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: if rocm_aiter_linear_enabled(): return rocm_aiter_tgemm_mm return F.linear @@ -151,7 +151,7 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return dipsatch_unquantized_linear_func()(x, layer.weight, bias) + return dispatch_unquantized_linear_func()(x, layer.weight, bias) class LinearBase(torch.nn.Module): From 0a6b8a0c0a20d2c10f301fb452737aa309a8f31d Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 15:27:40 +0000 Subject: [PATCH 20/35] enable rocm aiter paged attention Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 9 +++++---- vllm/attention/ops/rocm_aiter_paged_attn.py | 21 ++++++++++++++------- vllm/envs.py | 5 +++-- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index dd7e3a67f6a9..28b3c914b3f6 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -14,12 +14,12 @@ CommonMetadataBuilder) from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils import rocm_aiter_paged_attn_enabled if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -USE_ROCM_AITER_PA = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN -if USE_ROCM_AITER_PA: +if rocm_aiter_paged_attn_enabled(): from vllm.attention.ops.rocm_aiter_paged_attn import ( PagedAttention, PagedAttentionMetadata) else: @@ -615,7 +615,7 @@ def forward( else: assert value is None - if (USE_ROCM_AITER_PA and kv_cache.dtype.itemsize == 1 + if (rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1 and not self.aiter_kv_scales_initialized and kv_cache.shape != torch.Size([0])): num_blocks = kv_cache.shape[1] @@ -910,4 +910,5 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 
or head_size == 128) and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) + and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 + and not rocm_aiter_paged_attn_enabled()) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 8a425988290c..ee48ad928501 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -108,6 +108,20 @@ def forward_decode( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> torch.Tensor: + if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: + if num_kv_heads == 1: + k_scale, v_scale = (None, None) + query = query.contiguous() + else: + raise NotImplementedError( + f"ROCM AITER paged attention does not \ + support num_kv_heads > 1 \ + for kv_cache_dtype: {kv_cache_dtype}") + + elif "fp8" in kv_cache_dtype: + key_cache = key_cache.view(torch.float8_e4m3fnuz) + value_cache = value_cache.view(torch.float8_e4m3fnuz) + if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: # use blocksparse paged attention block_size = value_cache.size(-1) @@ -120,13 +134,6 @@ def forward_decode( block_size = value_cache.shape[3] max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: - k_scale, v_scale = (None, None) - query = query.contiguous() - elif "fp8" in kv_cache_dtype: - key_cache = key_cache.view(torch.float8_e4m3fnuz) - value_cache = value_cache.view(torch.float8_e4m3fnuz) - rocm_aiter.pa_fwd_asm(query, key_cache, value_cache, block_tables, seq_lens, max_num_blocks_per_seq, k_scale, v_scale, output) diff --git a/vllm/envs.py b/vllm/envs.py index 46c806e89310..f661bc479765 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -559,8 +559,9 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter paged attention if aiter ops are enabled. # this is disabled by default "VLLM_ROCM_USE_AITER_PAGED_ATTN": - lambda: (os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in - ("true", "1")), + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", + "False").lower() in ("true", "1")), # use aiter w8a8 block gemm kerner if aiter ops are enabled. 
"VLLM_ROCM_USE_AITER_BLOCK_GEMM": From 1474828da678efb959776c81332cc960dcd1048a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 5 Mar 2025 16:07:29 +0000 Subject: [PATCH 21/35] bugfix: add the missing argument in dispatch Signed-off-by: vllmellm --- .../layers/quantization/utils/fp8_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 339fd8c00b95..b8f7a8ca7636 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -98,12 +98,12 @@ def apply_w8a8_block_fp8_linear( q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) - output = dispatch_blockscale_func()(A=q_input, - B=weight, - As=x_scale, - Bs=weight_scale, - block_size=block_size, - output_dtype=input.dtype) + output = dispatch_blockscale_func(use_cutlass)(A=q_input, + B=weight, + As=x_scale, + Bs=weight_scale, + block_size=block_size, + output_dtype=input.dtype) if bias is not None: output = output + bias From b78114a8c38bb5f1098aa0f2535a9f0eae9f8a32 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 6 Mar 2025 08:10:05 +0000 Subject: [PATCH 22/35] update rocm AITER commit version Signed-off-by: vllmellm --- Dockerfile.rocm_base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index 494dc17c13ae..8f423d8f1492 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="dfed377" +ARG AITER_BRANCH="e1ec015" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base From d20d7571128bf5927e0e04c918bbb8992edb7aa2 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 6 Mar 2025 09:37:29 +0000 Subject: [PATCH 23/35] bug fix Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 8 ++++---- vllm/model_executor/layers/layernorm.py | 13 +++++++------ .../layers/quantization/utils/fp8_utils.py | 19 +++++++++++-------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index b6d0248b8ab7..340376d783c8 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -13,12 +13,12 @@ rocm_aiter_topk_softmax, torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, vllm_topk_softmax) from vllm.model_executor.layers.layernorm import ( - RMSNorm, dispatch_rmsnorm_func, fused_add_rms_norm, rms_norm, + RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_rmsnorm2d_fwd_with_add) from vllm.model_executor.layers.linear import ( dispatch_unquantized_linear_func, rocm_aiter_tgemm_mm) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - cutlass_scaled_mm, dispatch_blockscale_func, + cutlass_scaled_mm, dispatch_w8a8_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) from vllm.platforms import current_platform @@ -137,7 +137,7 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) 
monkeypatch.setenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", use_rocm_aiter_block_gemm) - block_scale_func = dispatch_blockscale_func(use_cutlass) + block_scale_func = dispatch_w8a8_blockscale_func(use_cutlass) if use_cutlass: assert block_scale_func == cutlass_scaled_mm @@ -155,7 +155,7 @@ def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, use_rocm_aiter_norm: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) monkeypatch.setenv("VLLM_ROCM_USE_AITER_NORM", use_rocm_aiter_norm) - rms_norm_func = dispatch_rmsnorm_func(add_residual) + rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual) if not add_residual: assert rms_norm_func == rms_norm diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index ffd2187a1da3..8fb55802ca38 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -22,8 +22,9 @@ def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, return out -def fused_add_rms_norm(*, x: torch.Tensor, residual: torch.Tensor, - weight: torch.Tensor, variance_epsilon: float): +def fused_add_rms_norm( + *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops ops.fused_add_rms_norm( @@ -51,9 +52,9 @@ def rocm_aiter_rmsnorm2d_fwd_with_add( return x, residual -def dispatch_rmsnorm_func( - add_residual: bool -) -> Callable[..., Tuple[torch.Tensor, torch.Tensor]]: +def dispatch_cuda_rmsnorm_func( + add_residual: bool +) -> Callable[..., Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]: if not add_residual: return rms_norm if rocm_aiter_norm_enabled(): @@ -135,7 +136,7 @@ def forward_cuda( return self.forward_native(x, residual) add_residual = residual is not None - return dispatch_rmsnorm_func(add_residual)( + return dispatch_cuda_rmsnorm_func(add_residual)( x=x, residual=residual, weight=self.weight.data, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b8f7a8ca7636..33ad52a54334 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -73,7 +73,8 @@ def rocm_aiter_gemm_a8w8_blockscale(A: torch.Tensor, return rocm_aiter.gemm_a8w8_blockscale(A, B, As, Bs, output) -def dispatch_blockscale_func(use_cutlass: bool) -> Callable[..., torch.Tensor]: +def dispatch_w8a8_blockscale_func( + use_cutlass: bool) -> Callable[..., torch.Tensor]: if use_cutlass: return cutlass_scaled_mm if rocm_aiter_w8a8_block_gemm_enabled(): @@ -94,16 +95,18 @@ def apply_w8a8_block_fp8_linear( # View input as 2D matrix for fp8 methods input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[0]] - use_cutlass = cutlass_block_fp8_supported and shape_supported_by_cutlass() + use_cutlass = cutlass_block_fp8_supported and shape_supported_by_cutlass( + weight, block_size, weight_scale, input_2d) q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) - output = dispatch_blockscale_func(use_cutlass)(A=q_input, - B=weight, - As=x_scale, - Bs=weight_scale, - block_size=block_size, - output_dtype=input.dtype) + output = dispatch_w8a8_blockscale_func(use_cutlass)( + A=q_input, + B=weight, + As=x_scale, + Bs=weight_scale, + block_size=block_size, + output_dtype=input.dtype) if bias is not None: output = output + bias From 
7754c2e0260f18690aa893dc504c1cc3d5adc2c3 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 6 Mar 2025 10:03:09 +0000 Subject: [PATCH 24/35] add more comments for code documentation Signed-off-by: vllmellm --- vllm/envs.py | 32 +++++++++++-------- vllm/model_executor/layers/fused_moe/layer.py | 1 + .../model_executor/layers/quantization/fp8.py | 3 ++ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index f661bc479765..79baee7b5bf1 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -80,7 +80,7 @@ VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True VLLM_ROCM_USE_AITER_NORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False - VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = True + VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -524,32 +524,34 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), - # use aiter ops unless specifically disabled + # use aiter ops unless specifically disabled. + # Acts as a parent switch to enable the rest of the other operations. "VLLM_ROCM_USE_AITER": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")), - # use aiter moe op if aiter ops are enabled + # use aiter moe op if aiter ops are enabled. "VLLM_ROCM_USE_AITER_MOE": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in ("true", "1")), - # use aiter block scaled moe op if aiter ops are enabled + # use aiter block scaled moe op if aiter ops are enabled. + # by default this is disabled. "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE": - lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv( - "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "true").lower() in - ("true", "1")), + lambda: + (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", + "false").lower() in ("true", "1")), - # use aiter linear op if aiter ops are enabled + # use aiter linear op if aiter ops are enabled. "VLLM_ROCM_USE_AITER_LINEAR": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True" ).lower() in ("true", "1")), - # use aiter rms norm op if aiter ops are enabled + # use aiter rms norm op if aiter ops are enabled. "VLLM_ROCM_USE_AITER_NORM": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in @@ -557,19 +559,21 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ("true", "1")), # use aiter paged attention if aiter ops are enabled. - # this is disabled by default + # this is disabled by default. "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in ("true", "1")), - # use aiter w8a8 block gemm kerner if aiter ops are enabled. + # use aiter w8a8 block gemm kernel if aiter ops are enabled. + # this is disabled by default. "VLLM_ROCM_USE_AITER_BLOCK_GEMM": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", - "True").lower() in ("true", "1")), + "False").lower() in ("true", "1")), - # Pad the fp8 weights to 256 bytes for ROCm + # Pad the fp8 weights to 256 bytes for ROCm. 
+ # Used only in Fp8LinearMethod "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), # Divisor for dynamic key scale factor calculation for FP8 KV Cache diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 61924c68e393..08da5a42d171 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -100,6 +100,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) if rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f6aa35d44cb2..e9e73a0aa2e1 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -560,6 +560,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) if rocm_aiter_fp8_block_scaled_moe_enabled(): + # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -600,6 +601,7 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) if rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -687,6 +689,7 @@ def process_weights_after_loading(self, layer: Module) -> None: start += shard_size if rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. 
from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) From 5e31c3e129e756b87a8886557b58665a0c72df8b Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 04:47:54 +0000 Subject: [PATCH 25/35] disable some model tests Signed-off-by: vllmellm --- .buildkite/run-amd-test.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 96fcafc9dc1c..a18dd1abd41a 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -96,6 +96,15 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_mamba_mixer2.py" fi +#ignore certain Model tests +if [[ $commands == *" models/decoder_only/language "* ]]; then + commands=${commands//" models/decoder_only/language "/" models/decoder_only/language \ + --ignore=models/decoder_only/language/test_mistral.py \ + --ignore=models/decoder_only/language/test_phimoe.py \ + --ignore=models/decoder_only/language/test_granite.py \ + --ignore=models/decoder_only/language/test_models.py "} +fi + #ignore certain Entrypoints tests if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ From d21c912f544f16d59297e23aa70d0232760be203 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 04:55:58 +0000 Subject: [PATCH 26/35] move rocm-aiter env flag checks to vllm.platforms.current_platform Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 4 +-- vllm/envs.py | 17 +++++------ .../layers/fused_moe/fused_moe.py | 11 ++++--- vllm/model_executor/layers/fused_moe/layer.py | 3 +- vllm/model_executor/layers/layernorm.py | 4 +-- vllm/model_executor/layers/linear.py | 4 +-- .../model_executor/layers/quantization/fp8.py | 8 ++--- .../layers/quantization/utils/fp8_utils.py | 5 ++-- vllm/platforms/interface.py | 18 +++++++++++ vllm/platforms/rocm.py | 19 ++++++++++++ vllm/utils.py | 30 ------------------- 11 files changed, 62 insertions(+), 61 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 340376d783c8..7c64ee2b03d9 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -135,7 +135,7 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, use_rocm_aiter_block_gemm: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) - monkeypatch.setenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + monkeypatch.setenv("VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM", use_rocm_aiter_block_gemm) block_scale_func = dispatch_w8a8_blockscale_func(use_cutlass) @@ -154,7 +154,7 @@ def test_block_gemm_dispatch(use_cutlass: bool, use_rocm_aiter: str, def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, use_rocm_aiter_norm: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) - monkeypatch.setenv("VLLM_ROCM_USE_AITER_NORM", use_rocm_aiter_norm) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm) rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual) if not add_residual: diff --git a/vllm/envs.py b/vllm/envs.py index 79baee7b5bf1..824b2730388f 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -78,9 +78,9 @@ VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = True - VLLM_ROCM_USE_AITER_NORM: bool = True + VLLM_ROCM_USE_AITER_RMSNORM: bool = True VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False - 
VLLM_ROCM_USE_AITER_BLOCK_GEMM: bool = False + VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -552,11 +552,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: ).lower() in ("true", "1")), # use aiter rms norm op if aiter ops are enabled. - "VLLM_ROCM_USE_AITER_NORM": - lambda: - (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_NORM", "True").lower() in - ("true", "1")), + "VLLM_ROCM_USE_AITER_RMSNORM": + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True" + ).lower() in ("true", "1")), # use aiter paged attention if aiter ops are enabled. # this is disabled by default. @@ -567,9 +566,9 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # use aiter w8a8 block gemm kernel if aiter ops are enabled. # this is disabled by default. - "VLLM_ROCM_USE_AITER_BLOCK_GEMM": + "VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM": lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in - ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_BLOCK_GEMM", + ("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM", "False").lower() in ("true", "1")), # Pad the fp8 weights to 256 bytes for ROCm. diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6b5073d3b913..6b8048edf32c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -15,9 +15,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) from vllm.platforms import current_platform -from vllm.utils import (direct_register_custom_op, - rocm_aiter_fp8_block_scaled_moe_enabled, - rocm_aiter_moe_enabled) +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -948,7 +946,7 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, def dispatch_topk_func() -> Callable[..., torch.Tensor]: - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): return rocm_aiter_topk_softmax return vllm_topk_softmax @@ -1190,7 +1188,8 @@ def rocm_aiter_fused_experts( import aiter as rocm_aiter import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe - if rocm_aiter_fp8_block_scaled_moe_enabled() and use_fp8_w8a8: + if current_platform.is_rocm_aiter_fp8_block_scaled_moe_enabled( + ) and use_fp8_w8a8: assert w1_scale is not None assert w2_scale is not None @@ -1270,7 +1269,7 @@ def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor: def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): return rocm_aiter_fused_experts if inplace: return torch_vllm_inplace_fused_experts diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 08da5a42d171..42aa6252ca1c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -19,7 +19,6 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum -from vllm.utils import rocm_aiter_moe_enabled if current_platform.is_cuda_alike(): from .fused_moe import fused_experts @@ -99,7 +98,7 @@ def create_weights(self, 
layer: torch.nn.Module, num_experts: int, def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8fb55802ca38..a3e12bf49d0f 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,7 +6,7 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.utils import rocm_aiter_norm_enabled +from vllm.platforms import current_platform def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, @@ -57,7 +57,7 @@ def dispatch_cuda_rmsnorm_func( ) -> Callable[..., Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]: if not add_residual: return rms_norm - if rocm_aiter_norm_enabled(): + if current_platform.is_rocm_aiter_rmsnorm_enabled(): return rocm_aiter_rmsnorm2d_fwd_with_add return fused_add_rms_norm diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dcf28237c85b..9b07a5262443 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -25,7 +25,7 @@ RowvLLMParameter) # yapf: enable from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import rocm_aiter_linear_enabled +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -46,7 +46,7 @@ def rocm_aiter_tgemm_mm(x: torch.Tensor, weight: torch.Tensor, def dispatch_unquantized_linear_func() -> Callable[..., torch.Tensor]: - if rocm_aiter_linear_enabled(): + if current_platform.is_rocm_aiter_linear_enabled(): return rocm_aiter_tgemm_mm return F.linear diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e9e73a0aa2e1..9db95d37d4ef 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -33,8 +33,6 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import (rocm_aiter_fp8_block_scaled_moe_enabled, - rocm_aiter_moe_enabled) ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -559,7 +557,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = Parameter(w2_weight, requires_grad=False) layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) - if rocm_aiter_fp8_block_scaled_moe_enabled(): + if current_platform.is_rocm_aiter_fp8_block_scaled_moe_enabled(): # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -600,7 +598,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) @@ -688,7 +686,7 @@ def process_weights_after_loading(self, layer: Module) -> None: dq_weight, max_w13_scales[expert_id]) start += shard_size - if rocm_aiter_moe_enabled(): + if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. 
from aiter.ops.shuffle import (shuffle_weight as rocm_aiter_shuffle_weight) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 33ad52a54334..be7978a323c5 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -17,8 +17,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform -from vllm.utils import (direct_register_custom_op, - rocm_aiter_w8a8_block_gemm_enabled) +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -77,7 +76,7 @@ def dispatch_w8a8_blockscale_func( use_cutlass: bool) -> Callable[..., torch.Tensor]: if use_cutlass: return cutlass_scaled_mm - if rocm_aiter_w8a8_block_gemm_enabled(): + if current_platform.is_rocm_aiter_w8a8_block_gemm_enabled(): return rocm_aiter_gemm_a8w8_blockscale return w8a8_block_fp8_matmul diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index e7e55e11775c..4305223e497f 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -143,6 +143,24 @@ def is_cuda_alike(self) -> bool: """Stateless version of :func:`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) + def is_rocm_aiter_moe_enabled(self) -> bool: + return False + + def is_rocm_aiter_paged_attn_enabled(self) -> bool: + return False + + def is_rocm_aiter_linear_enabled(self) -> bool: + return False + + def is_rocm_aiter_rmsnorm_enabled(self) -> bool: + return False + + def is_rocm_aiter_fp8_block_scaled_moe_enabled(self) -> bool: + return False + + def is_rocm_aiter_w8a8_block_gemm_enabled(self) -> bool: + return False + @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index a4f18cbfc587..5fc08950f0af 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -231,3 +231,22 @@ def get_current_memory_usage(cls, @classmethod def get_device_communicator_cls(cls) -> str: return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator" # noqa + + def is_rocm_aiter_moe_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_MOE + + def is_rocm_aiter_paged_attn_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_PAGED_ATTN + + def is_rocm_aiter_linear_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_LINEAR + + def is_rocm_aiter_rmsnorm_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_RMSNORM + + def is_rocm_aiter_fp8_block_scaled_moe_enabled(self) -> bool: + return self.rocm_aiter_moe_enabled( + ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE + + def is_rocm_aiter_w8a8_block_gemm_enabled(self) -> bool: + return envs.VLLM_ROCM_USE_AITER_W8A8_BLOCK_GEMM diff --git a/vllm/utils.py b/vllm/utils.py index 1b87a21ed966..26c9e1a90837 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -997,36 +997,6 @@ def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None: enable_trace_function_call(log_path) -def is_hip() -> bool: - from vllm.platforms import current_platform - return current_platform.is_rocm() - - -def rocm_aiter_moe_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_MOE - - -def rocm_aiter_paged_attn_enabled() -> bool: - return is_hip() and 
envs.VLLM_ROCM_USE_AITER_PAGED_ATTN - - -def rocm_aiter_linear_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_LINEAR - - -def rocm_aiter_norm_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_NORM - - -def rocm_aiter_fp8_block_scaled_moe_enabled() -> bool: - return rocm_aiter_moe_enabled( - ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE - - -def rocm_aiter_w8a8_block_gemm_enabled() -> bool: - return is_hip() and envs.VLLM_ROCM_USE_AITER_BLOCK_GEMM - - # `functools` helpers def identity(value: T, **kwargs) -> T: """Returns the first provided value.""" From 59f0208ef021cb56be1d1838e5bb7923b182e8a6 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 08:28:59 +0000 Subject: [PATCH 27/35] bugfixes after refactoring the aiter modules enablility in current platform Signed-off-by: vllmellm --- vllm/attention/backends/rocm_flash_attn.py | 8 ++++---- vllm/platforms/rocm.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 28b3c914b3f6..8d2868f8fded 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -14,12 +14,11 @@ CommonMetadataBuilder) from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import rocm_aiter_paged_attn_enabled if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -if rocm_aiter_paged_attn_enabled(): +if current_platform.is_rocm_aiter_paged_attn_enabled(): from vllm.attention.ops.rocm_aiter_paged_attn import ( PagedAttention, PagedAttentionMetadata) else: @@ -615,7 +614,8 @@ def forward( else: assert value is None - if (rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1 + if (current_platform.is_rocm_aiter_paged_attn_enabled() + and kv_cache.dtype.itemsize == 1 and not self.aiter_kv_scales_initialized and kv_cache.shape != torch.Size([0])): num_blocks = kv_cache.shape[1] @@ -911,4 +911,4 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and not rocm_aiter_paged_attn_enabled()) + and not current_platform.is_rocm_aiter_paged_attn_enabled()) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 5fc08950f0af..03c2d1b0a254 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -245,7 +245,7 @@ def is_rocm_aiter_rmsnorm_enabled(self) -> bool: return envs.VLLM_ROCM_USE_AITER_RMSNORM def is_rocm_aiter_fp8_block_scaled_moe_enabled(self) -> bool: - return self.rocm_aiter_moe_enabled( + return self.is_rocm_aiter_moe_enabled( ) and envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE def is_rocm_aiter_w8a8_block_gemm_enabled(self) -> bool: From 17b4d6adc78a547a6f032851cbb37d2c78122a7e Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 15:53:11 +0000 Subject: [PATCH 28/35] update AMD CI to skip certain test cases Signed-off-by: vllmellm --- .buildkite/run-amd-test.sh | 13 ++++--------- tests/models/decoder_only/language/test_granite.py | 2 ++ tests/models/decoder_only/language/test_mistral.py | 10 ++++++++++ tests/models/decoder_only/language/test_models.py | 2 ++ tests/models/decoder_only/language/test_phimoe.py | 2 ++ 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index a18dd1abd41a..7a145de9671f 100755 --- 
a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -72,6 +72,10 @@ HF_CACHE="$(realpath ~)/huggingface" mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" +# environment variables +SKIP_ROCM_ATIER_MODEL_TEST_CASES="True" +echo $SKIP_ROCM_ATIER_MODEL_TEST_CASES + commands=$@ echo "Commands:$commands" #ignore certain kernels tests @@ -96,15 +100,6 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_mamba_mixer2.py" fi -#ignore certain Model tests -if [[ $commands == *" models/decoder_only/language "* ]]; then - commands=${commands//" models/decoder_only/language "/" models/decoder_only/language \ - --ignore=models/decoder_only/language/test_mistral.py \ - --ignore=models/decoder_only/language/test_phimoe.py \ - --ignore=models/decoder_only/language/test_granite.py \ - --ignore=models/decoder_only/language/test_models.py "} -fi - #ignore certain Entrypoints tests if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index 7dec7bbd0ca1..01ca96fd5d33 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -26,6 +26,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 68b27a8e7087..ef240b606bd7 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -181,6 +181,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # TODO(sang): Sliding window should be tested separately. 
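The same four-line AITER guard is repeated in every parametrized test this
patch touches. A minimal sketch of a shared helper for it (the helper name is
an assumption; it reads the CI flag with os.getenv, since pytest's MonkeyPatch
has no getenv method, and lowercases the value because the CI script above
sets the variable to "True"):

import os

import pytest


def maybe_skip_rocm_aiter(monkeypatch: pytest.MonkeyPatch,
                          use_rocm_aiter: bool) -> None:
    """Skip when the CI opts out of AITER model tests, else enable AITER."""
    if not use_rocm_aiter:
        return
    # CI flag name follows the spelling used in run-amd-test.sh above.
    if os.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES",
                 "False").lower() == "true":
        pytest.skip("Skipping test suite for ROCM AITER")
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

Each test body would then begin with
maybe_skip_rocm_aiter(monkeypatch, use_rocm_aiter) instead of repeating the
guard inline.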
@@ -211,6 +213,8 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner( @@ -248,6 +252,8 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model, @@ -271,6 +277,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str, def test_mistral_function_calling(vllm_runner, model: str, dtype: str, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model, @@ -308,6 +316,8 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str, def test_mistral_guided_decoding(vllm_runner, model: str, guided_backend: str, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with vllm_runner(model, dtype='bfloat16', diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index eca2ca18dcb7..b6b3d4b315f9 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -71,6 +71,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index 7e42124d7b2c..2badcaf104bd 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -85,6 +85,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, monkeypatch) -> None: if use_rocm_aiter: + if monkeypatch.getenv("SKIP_ROCM_ATIER_MODEL_TEST_CASES") == "true": + pytest.skip("Skipping test suite for ROCM AITER") monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") with hf_runner(model, dtype=dtype) as hf_model: From c32c31f37f40c8481e24255f46aa26897993a3e4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 7 Mar 2025 17:47:32 +0000 Subject: [PATCH 29/35] refactor dispatching for w8a8 scaled-mm Signed-off-by: vllmellm --- .../model_executor/test_enabled_custom_ops.py | 44 +++ .../layers/quantization/utils/w8a8_utils.py | 270 +++++++++++------- 2 files changed, 206 insertions(+), 108 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 
7c64ee2b03d9..43c8bfe37b74 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -20,6 +20,10 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( cutlass_scaled_mm, dispatch_w8a8_blockscale_func, rocm_aiter_gemm_a8w8_blockscale, w8a8_block_fp8_matmul) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + cutlass_w8a8_scaled_mm, dispatch_w8a8_scaled_mm, + rocm_aiter_per_tensor_w8a8_scaled_mm, torch_channelwise_w8a8_scaled_mm, + torch_per_tensor_w8a8_scaled_mm, torch_per_token_w8a8_scaled_mm) from vllm.platforms import current_platform @@ -178,3 +182,43 @@ def test_unquantized_linear_dispatch(use_rocm_aiter: str, assert linear_func == rocm_aiter_tgemm_mm else: assert linear_func == F.linear + + +@pytest.mark.parametrize("cutlass_fp8_supported", [True, False]) +@pytest.mark.parametrize("per_tensor_weights", [True, False]) +@pytest.mark.parametrize("per_tensor_activations", [True, False]) +@pytest.mark.parametrize("use_per_token_if_dynamic", [True, False]) +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_linear", ["0", "1"]) +def test_scaled_mm_dispatch(cutlass_fp8_supported: bool, + per_tensor_weights: bool, + per_tensor_activations: bool, + use_per_token_if_dynamic: bool, + use_rocm_aiter: str, use_rocm_aiter_linear: str, + monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_linear) + + w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(cutlass_fp8_supported, + per_tensor_weights, + per_tensor_activations, + use_per_token_if_dynamic) + + if cutlass_fp8_supported: + assert w8a8_scaled_mm_func == cutlass_w8a8_scaled_mm + + elif per_tensor_weights and per_tensor_activations: + + if current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_linear): + assert w8a8_scaled_mm_func == rocm_aiter_per_tensor_w8a8_scaled_mm + else: + assert w8a8_scaled_mm_func == torch_per_tensor_w8a8_scaled_mm + + elif (current_platform.is_rocm() + and current_platform.has_device_capability(94) + and use_per_token_if_dynamic and not per_tensor_weights + and not per_tensor_activations): + assert w8a8_scaled_mm_func == torch_per_token_w8a8_scaled_mm + else: + assert w8a8_scaled_mm_func == torch_channelwise_w8a8_scaled_mm diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 32a0415c4294..60d40e12b1ba 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,19 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import torch -import vllm.envs as envs from vllm import _custom_ops as ops from vllm.config import CompilationLevel, get_current_vllm_config from vllm.platforms import current_platform -USE_ROCM_AITER_LINEAR = envs.VLLM_ROCM_USE_AITER_LINEAR \ - and current_platform.is_rocm() -if USE_ROCM_AITER_LINEAR: - from aiter.tuned_gemm import tgemm as aiter_tgemm - # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = None @@ -127,6 +121,151 @@ def maybe_create_device_identity(): TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32) +def cutlass_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor, + out_dtype: torch.dtype, scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + output_shape: List, **kwargs) -> torch.Tensor: + + # Fused GEMM_DQ + output = ops.cutlass_scaled_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + return output.view(*output_shape) + + +def rocm_aiter_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List) -> torch.Tensor: + from aiter.tuned_gemm import tgemm as aiter_tgemm + + output = aiter_tgemm.mm(qinput, + weight.t(), + otype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + if type(output) is tuple and len(output) == 2: + output = output[0] + + return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape) + + +def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List) -> torch.Tensor: + output = torch._scaled_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + if type(output) is tuple and len(output) == 2: + output = output[0] + + return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape) + + +def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List) -> torch.Tensor: + # For now validated on ROCm platform + # fp8 rowwise scaling in torch._scaled_mm is introduced in + # https://github.com/pytorch/pytorch/pull/144432 using + # hipBLASLt and ROCm 6.3, which only exists in torch 2.7 and above. + # For CUDA platform please validate if the + # torch._scaled_mm support rowwise scaled GEMM + # Fused GEMM_DQ Rowwise GEMM + output = torch._scaled_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b.t(), + bias=bias) + + output = torch.narrow(output, 0, 0, input_2d.shape[0]) + output = output.view(*output_shape) + return output + + +def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + input_2d: torch.Tensor, + output_shape: List, + **kwargs) -> torch.Tensor: + # use unfused DQ due to limitations with scaled_mm + + # Symmetric quantized GEMM by definition computes the following: + # C = (s_x * X) (s_w * W) + bias + # This is equivalent to dequantizing the weights and activations + # before applying a GEMM. + # + # In order to compute quantized operands, a quantized kernel + # will rewrite the above like so: + # C = s_w * s_x * (X * W) + bias + # + # For the scaled_mm fallback case, we break this down, since it + # does not support s_w being a vector. + + # GEMM + # This computes C = (X * W). 
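    # Worked instance of the identity above, with per-token scales s_x
    # (shape [T, 1]) and per-channel scales s_w (shape [1, N]):
    #   (s_x * X) @ (W * s_w) == s_x * ((X @ W) * s_w)
    # e.g. X = [[2.]], W = [[3.]], s_x = 0.5, s_w = 0.25:
    #   fused path:   (0.5 * 2) * (0.25 * 3) = 0.75
    #   unfused path:  0.5 * 0.25 * (2 * 3)  = 0.75
    # This is why the fp32 GEMM below runs with identity scales and the
    # per-token / per-channel scales (and bias) are applied afterwards.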
+ # Output in fp32 to allow subsequent ops to happen in-place + output = torch._scaled_mm(qinput, + weight, + scale_a=TORCH_DEVICE_IDENTITY, + scale_b=TORCH_DEVICE_IDENTITY, + out_dtype=torch.float32) + # A fix for discrepancy in scaled_mm which returns tuple + # for torch < 2.5 and a single value in torch >= 2.5 + if type(output) is tuple and len(output) == 2: + output = output[0] + # Unpad (undo num_token_padding) + output = torch.narrow(output, 0, 0, input_2d.shape[0]) + x_scale = torch.narrow(scale_a, 0, 0, input_2d.shape[0]) + + # DQ + # C = sw * sx * (X * W) + bias + output = output * x_scale * scale_b.t() + if bias is not None: + output = output + bias + return output.to(out_dtype).view(*output_shape) + + +def dispatch_w8a8_scaled_mm( + cutlass_fp8_supported: bool, per_tensor_weights: bool, + per_tensor_activations: bool, + use_per_token_if_dynamic: bool) -> Callable[..., torch.Tensor]: + + if cutlass_fp8_supported: + return cutlass_w8a8_scaled_mm + if per_tensor_weights and per_tensor_activations: + if current_platform.is_rocm_aiter_linear_enabled(): + return rocm_aiter_per_tensor_w8a8_scaled_mm + return torch_per_tensor_w8a8_scaled_mm + if (use_per_token_if_dynamic and not per_tensor_weights + and not per_tensor_activations and USE_ROWWISE_TORCH_SCALED_MM): + return torch_per_token_w8a8_scaled_mm + return torch_channelwise_w8a8_scaled_mm + + def apply_fp8_linear( input: torch.Tensor, weight: torch.Tensor, @@ -153,23 +292,7 @@ def apply_fp8_linear( scale_ub=input_scale_ub, use_per_token_if_dynamic=use_per_token_if_dynamic) - # Fused GEMM_DQ - output = ops.cutlass_scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - return output.view(*output_shape) - - # torch.scaled_mm supports per tensor weights + activations only - # so fallback to naive if per channel or per token else: - # Note: we pad the input because torch._scaled_mm is more performant - # for matrices with batch dimension > 16. - # This could change in the future. - # We also don't pad when using torch.compile, - # as it breaks with dynamic shapes. config = get_current_vllm_config().compilation_config do_pad = config.level < CompilationLevel.PIECEWISE qinput, x_scale = ops.scaled_fp8_quant( @@ -178,91 +301,22 @@ def apply_fp8_linear( num_token_padding=17 if do_pad else None, use_per_token_if_dynamic=use_per_token_if_dynamic) - per_tensor_weights = (weight_scale.numel() == 1) - per_tensor_activations = (x_scale.numel() == 1) - - if per_tensor_weights and per_tensor_activations: - # Fused GEMM_DQ - if USE_ROCM_AITER_LINEAR: - output = aiter_tgemm.mm(qinput, - weight.t(), - otype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - else: - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - # A fix for discrepancy in scaled_mm which returns tuple - # for torch < 2.5 and a single value in torch >= 2.5 - if type(output) is tuple and len(output) == 2: - output = output[0] - - return torch.narrow(output, 0, 0, - input_2d.shape[0]).view(*output_shape) - - elif (use_per_token_if_dynamic and not per_tensor_weights - and not per_tensor_activations and USE_ROWWISE_TORCH_SCALED_MM): - # For now validated on ROCm platform - # fp8 rowwise scaling in torch._scaled_mm is introduced in - # https://github.com/pytorch/pytorch/pull/144432 using - # hipBLASLt and ROCm 6.3, which only exists in torch 2.7 and above. 
- # For CUDA platform please validate if the - # torch._scaled_mm support rowwise scaled GEMM - # Fused GEMM_DQ Rowwise GEMM - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale.t(), - bias=bias) - - output = torch.narrow(output, 0, 0, input_2d.shape[0]) - output = output.view(*output_shape) - return output - - else: - # Fallback for channelwise case, where we use unfused DQ - # due to limitations with scaled_mm - - # Symmetric quantized GEMM by definition computes the following: - # C = (s_x * X) (s_w * W) + bias - # This is equivalent to dequantizing the weights and activations - # before applying a GEMM. - # - # In order to compute quantized operands, a quantized kernel - # will rewrite the above like so: - # C = s_w * s_x * (X * W) + bias - # - # For the scaled_mm fallback case, we break this down, since it - # does not support s_w being a vector. - - # GEMM - # This computes C = (X * W). - # Output in fp32 to allow subsequent ops to happen in-place - output = torch._scaled_mm(qinput, - weight, - scale_a=TORCH_DEVICE_IDENTITY, - scale_b=TORCH_DEVICE_IDENTITY, - out_dtype=torch.float32) - # A fix for discrepancy in scaled_mm which returns tuple - # for torch < 2.5 and a single value in torch >= 2.5 - if type(output) is tuple and len(output) == 2: - output = output[0] - # Unpad (undo num_token_padding) - output = torch.narrow(output, 0, 0, input_2d.shape[0]) - x_scale = torch.narrow(x_scale, 0, 0, input_2d.shape[0]) - - # DQ - # C = sw * sx * (X * W) + bias - output = output * x_scale * weight_scale.t() - if bias is not None: - output = output + bias - return output.to(dtype=input.dtype).view(*output_shape) + per_tensor_weights = (weight_scale.numel() == 1) + per_tensor_activations = (x_scale.numel() == 1) + + w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(cutlass_fp8_supported, + per_tensor_weights, + per_tensor_activations, + use_per_token_if_dynamic) + + return w8a8_scaled_mm_func(qinput=qinput, + weight=weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias, + input_2d=input_2d, + output_shape=output_shape) def normalize_e4m3fn_to_e4m3fnuz( From a5d73393f7f2162460089042374e2ebf6abad476 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:08:25 +0000 Subject: [PATCH 30/35] fix cutlass flag bug Signed-off-by: vllmellm --- .../layers/quantization/utils/w8a8_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 0ae255415b38..75ee0203f8fd 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -251,8 +251,8 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, def dispatch_w8a8_scaled_mm( cutlass_fp8_supported: bool, per_tensor_weights: bool, - per_tensor_activations: bool, - use_per_token_if_dynamic: bool) -> Callable[..., torch.Tensor]: + per_tensor_activations: bool, use_per_token_if_dynamic: Optional[bool] +) -> Callable[..., torch.Tensor]: if cutlass_fp8_supported: return cutlass_w8a8_scaled_mm @@ -306,7 +306,7 @@ def apply( ) -> torch.Tensor: input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[1]] - if cutlass_fp8_supported: + if self.cutlass_fp8_supported: qinput, x_scale = ops.scaled_fp8_quant( input_2d, input_scale, @@ -326,8 +326,8 @@ def apply( per_tensor_activations = (x_scale.numel() 
== 1) w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm( - cutlass_fp8_supported, per_tensor_weights, per_tensor_activations, - use_per_token_if_dynamic) + self.cutlass_fp8_supported, per_tensor_weights, + per_tensor_activations, use_per_token_if_dynamic) return w8a8_scaled_mm_func(qinput=qinput, weight=weight, From ce30f63a54ead25ded4429ece2e750776316c2a0 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:26:07 +0000 Subject: [PATCH 31/35] revert test requirements Signed-off-by: vllmellm --- requirements-test.in | 718 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 680 insertions(+), 38 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index de33f92b37b9..cfc000a1b09f 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -1,41 +1,683 @@ -# testing -pytest -tensorizer>=2.9.0 -pytest-forked -pytest-asyncio -pytest-rerunfailures -pytest-shard - -# testing utils -awscli -decord # required for video tests -einops # required for MPT, qwen-vl and Mamba -httpx -librosa # required for audio tests -vector_quantize_pytorch # required for minicpmo_26 test -vocos # required for minicpmo_26 test -peft -pqdm -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests -sentence-transformers # required for embedding tests -soundfile # required for audio tests -jiwer # required for audio tests -timm # required for internvl test +# This file was autogenerated by uv via the following command: +# uv pip compile requirements-test.in -o requirements-test.txt +absl-py==2.1.0 + # via rouge-score +accelerate==1.0.1 + # via + # lm-eval + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # datasets + # fsspec + # lm-eval +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via httpx +argcomplete==3.5.1 + # via datamodel-code-generator +attrs==24.2.0 + # via + # aiohttp + # jsonlines + # jsonschema + # referencing +audioread==3.0.1 + # via librosa +awscli==1.35.23 + # via -r requirements-test.in +bitsandbytes==0.45.0 + # via -r requirements-test.in +black==24.10.0 + # via datamodel-code-generator +boto3==1.35.57 + # via tensorizer +botocore==1.35.57 + # via + # awscli + # boto3 + # s3transfer +bounded-pool-executor==0.0.3 + # via pqdm +buildkite-test-collector==0.1.9 + # via -r requirements-test.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via soundfile +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # black + # jiwer + # nltk + # ray +colorama==0.4.6 + # via + # awscli + # sacrebleu + # tqdm-multiprocess +contourpy==1.3.0 + # via matplotlib +cramjam==2.9.0 + # via fastparquet +cupy-cuda12x==13.3.0 + # via ray +cycler==0.12.1 + # via matplotlib +datamodel-code-generator==0.26.3 + # via -r requirements-test.in +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +datasets==3.0.2 + # via + # evaluate + # lm-eval +decorator==5.1.1 + # via librosa +decord==0.6.0 + # via -r requirements-test.in +dill==0.3.8 + # via + # datasets + # evaluate + # lm-eval + # multiprocess +dnspython==2.7.0 + # via email-validator +docutils==0.16 + # via awscli +einops==0.8.0 + # via + # -r requirements-test.in + # encodec + # vector-quantize-pytorch + # vocos +einx==0.3.0 + # via vector-quantize-pytorch +email-validator==2.2.0 + # via pydantic +encodec==0.1.1 + # via vocos +evaluate==0.4.3 + # via lm-eval +fastparquet==2024.11.0 + # via genai-perf +fastrlock==0.8.2 + # via 
cupy-cuda12x +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers + # triton +fonttools==4.54.1 + # via matplotlib +frozendict==2.4.6 + # via einx +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal + # ray +fsspec==2024.9.0 + # via + # datasets + # evaluate + # fastparquet + # huggingface-hub + # torch +genai-perf==0.0.8 + # via -r requirements-test.in +genson==1.3.0 + # via datamodel-code-generator +h11==0.14.0 + # via httpcore +hiredis==3.0.0 + # via tensorizer +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via -r requirements-test.in +huggingface-hub==0.26.2 + # via + # accelerate + # datasets + # evaluate + # peft + # sentence-transformers + # timm + # tokenizers + # transformers + # vocos +humanize==4.11.0 + # via runai-model-streamer +idna==3.10 + # via + # anyio + # email-validator + # httpx + # requests + # yarl +inflect==5.6.2 + # via datamodel-code-generator +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via datamodel-code-generator +jinja2==3.1.4 + # via + # datamodel-code-generator + # torch +jiwer==3.0.5 + # via -r requirements-test.in +jmespath==1.0.1 + # via + # boto3 + # botocore +joblib==1.4.2 + # via + # librosa + # nltk + # scikit-learn +jsonlines==4.0.0 + # via lm-eval +jsonschema==4.23.0 + # via + # mistral-common + # ray +jsonschema-specifications==2024.10.1 + # via jsonschema +kaleido==0.2.1 + # via genai-perf +kiwisolver==1.4.7 + # via matplotlib +lazy-loader==0.4 + # via librosa +libnacl==2.1.0 + # via tensorizer +librosa==0.10.2.post1 + # via -r requirements-test.in +llvmlite==0.43.0 + # via numba +lm-eval==0.4.4 + # via -r requirements-test.in +lxml==5.3.0 + # via sacrebleu +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.9.2 + # via -r requirements-test.in +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +mdurl==0.1.2 + # via markdown-it-py +mistral-common==1.5.1 + # via -r requirements-test.in +more-itertools==10.5.0 + # via lm-eval +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via + # librosa + # ray +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # datasets + # evaluate +mypy-extensions==1.0.0 + # via black +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via rouge-score +numba==0.60.0 + # via librosa +numexpr==2.10.1 + # via lm-eval +numpy==1.26.4 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # contourpy + # cupy-cuda12x + # datasets + # decord + # einx + # encodec + # evaluate + # fastparquet + # genai-perf + # librosa + # matplotlib + # mistral-common + # numba + # numexpr + # opencv-python-headless + # pandas + # patsy + # peft + # rouge-score + # runai-model-streamer + # sacrebleu + # scikit-learn + # scipy + # soxr + # statsmodels + # tensorizer + # torchvision + # transformers + # tritonclient + # vocos +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch 
+opencv-python-headless==4.10.0.84 + # via mistral-common +packaging==24.1 + # via + # accelerate + # black + # datamodel-code-generator + # datasets + # evaluate + # fastparquet + # huggingface-hub + # lazy-loader + # matplotlib + # peft + # plotly + # pooch + # pytest + # pytest-rerunfailures + # ray + # statsmodels + # transformers + # typepy +pandas==2.2.3 + # via + # datasets + # evaluate + # fastparquet + # genai-perf + # statsmodels +pathspec==0.12.1 + # via black +pathvalidate==3.2.1 + # via pytablewriter +patsy==1.0.1 + # via statsmodels +peft==0.13.2 + # via + # -r requirements-test.in + # lm-eval +pillow==10.4.0 + # via + # genai-perf + # matplotlib + # mistral-common + # sentence-transformers + # torchvision +platformdirs==4.3.6 + # via + # black + # pooch +plotly==5.24.1 + # via genai-perf +pluggy==1.5.0 + # via pytest +pooch==1.8.2 + # via librosa +portalocker==2.10.1 + # via sacrebleu +pqdm==0.2.0 + # via -r requirements-test.in +propcache==0.2.0 + # via yarl +protobuf==5.28.3 + # via + # ray + # tensorizer +psutil==6.1.0 + # via + # accelerate + # peft + # tensorizer +py==1.11.0 + # via pytest-forked +pyarrow==18.0.0 + # via + # datasets + # genai-perf +pyasn1==0.6.1 + # via rsa +pybind11==2.13.6 + # via lm-eval +pycparser==2.22 + # via cffi +pydantic==2.9.2 + # via + # datamodel-code-generator + # mistral-common +pydantic-core==2.23.4 + # via pydantic +pygments==2.18.0 + # via rich +pyparsing==3.2.0 + # via matplotlib +pytablewriter==1.2.0 + # via lm-eval +pytest==8.3.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # genai-perf + # pytest-asyncio + # pytest-forked + # pytest-mock + # pytest-rerunfailures + # pytest-shard +pytest-asyncio==0.24.0 + # via -r requirements-test.in +pytest-forked==1.6.0 + # via -r requirements-test.in +pytest-mock==3.14.0 + # via genai-perf +pytest-rerunfailures==14.0 + # via -r requirements-test.in +pytest-shard==0.1.2 + # via -r requirements-test.in +python-dateutil==2.9.0.post0 + # via + # botocore + # matplotlib + # pandas + # typepy +python-rapidjson==1.20 + # via tritonclient +pytz==2024.2 + # via + # pandas + # typepy +pyyaml==6.0.2 + # via + # accelerate + # awscli + # datamodel-code-generator + # datasets + # genai-perf + # huggingface-hub + # peft + # ray + # responses + # timm + # transformers + # vocos +rapidfuzz==3.12.1 + # via jiwer +ray==2.43.0 + # via -r requirements-test.in +redis==5.2.0 + # via tensorizer +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.9.11 + # via + # nltk + # sacrebleu + # tiktoken + # transformers +requests==2.32.3 + # via + # buildkite-test-collector + # datasets + # evaluate + # huggingface-hub + # lm-eval + # mistral-common + # pooch + # ray + # responses + # tiktoken + # transformers +responses==0.25.3 + # via genai-perf +rich==13.9.4 + # via genai-perf +rouge-score==0.1.2 + # via lm-eval +rpds-py==0.20.1 + # via + # jsonschema + # referencing +rsa==4.7.2 + # via awscli +runai-model-streamer==0.11.0 + # via -r requirements-test.in +runai-model-streamer-s3==0.11.0 + # via -r requirements-test.in +s3transfer==0.10.3 + # via + # awscli + # boto3 +sacrebleu==2.4.3 + # via lm-eval +safetensors==0.4.5 + # via + # accelerate + # peft + # timm + # transformers +scikit-learn==1.5.2 + # via + # librosa + # lm-eval + # sentence-transformers +scipy==1.13.1 + # via + # librosa + # scikit-learn + # sentence-transformers + # statsmodels + # vocos +sentence-transformers==3.2.1 + # via -r requirements-test.in +sentencepiece==0.2.0 + # via mistral-common 
+setuptools==75.8.0 + # via + # pytablewriter + # torch +six==1.16.0 + # via + # python-dateutil + # rouge-score +sniffio==1.3.1 + # via + # anyio + # httpx +soundfile==0.12.1 + # via + # -r requirements-test.in + # librosa +soxr==0.5.0.post1 + # via librosa +sqlitedict==2.1.0 + # via lm-eval +statsmodels==0.14.4 + # via genai-perf +sympy==1.13.1 + # via + # einx + # torch +tabledata==1.3.3 + # via pytablewriter +tabulate==0.9.0 + # via sacrebleu +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via + # lm-eval + # plotly +tensorizer==2.9.0 + # via -r requirements-test.in +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.7.0 + # via + # lm-eval + # mistral-common +timm==1.0.11 + # via -r requirements-test.in +tokenizers==0.21.0 + # via transformers torch==2.5.1 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # encodec + # lm-eval + # peft + # runai-model-streamer + # sentence-transformers + # tensorizer + # timm + # torchaudio + # torchvision + # vector-quantize-pytorch + # vocos torchaudio==2.5.1 -transformers_stream_generator # required for qwen-vl test -matplotlib # required for qwen-vl test -mistral_common[opencv] >= 1.5.0 # required for pixtral test -datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test -transformers==4.48.2 -# quantization -bitsandbytes>=0.45.0 -buildkite-test-collector==0.1.9 - -genai_perf==0.0.8 + # via + # -r requirements-test.in + # encodec + # vocos +torchvision==0.20.1 + # via timm +tqdm==4.66.6 + # via + # datasets + # evaluate + # huggingface-hub + # lm-eval + # nltk + # peft + # pqdm + # sentence-transformers + # tqdm-multiprocess + # transformers +tqdm-multiprocess==0.0.11 + # via lm-eval +transformers==4.48.2 + # via + # -r requirements-test.in + # genai-perf + # lm-eval + # peft + # sentence-transformers + # transformers-stream-generator +transformers-stream-generator==0.0.5 + # via -r requirements-test.in +triton==3.1.0 + # via torch tritonclient==2.51.0 - -numpy < 2.0.0 -runai-model-streamer==0.11.0 -runai-model-streamer-s3==0.11.0 \ No newline at end of file + # via + # -r requirements-test.in + # genai-perf +typepy==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +typing-extensions==4.12.2 + # via + # bitsandbytes + # huggingface-hub + # librosa + # mistral-common + # pqdm + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==2.2.3 + # via + # botocore + # requests + # responses + # tritonclient +vector-quantize-pytorch==1.21.2 + # via -r requirements-test.in +vocos==0.1.0 + # via -r requirements-test.in +word2number==1.1 + # via lm-eval +xxhash==3.5.0 + # via + # datasets + # evaluate +yarl==1.17.1 + # via aiohttp +zstandard==0.23.0 + # via lm-eval \ No newline at end of file From 375e9db5bc9b87865d829d7ba5d66ed203883a00 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:29:10 +0000 Subject: [PATCH 32/35] revert test requirements Signed-off-by: vllmellm --- requirements-test.in | 718 +++--------------------------------------- requirements-test.txt | 24 +- 2 files changed, 42 insertions(+), 700 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index cfc000a1b09f..de33f92b37b9 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -1,683 +1,41 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile requirements-test.in -o requirements-test.txt -absl-py==2.1.0 - # via rouge-score -accelerate==1.0.1 - # via - # lm-eval - # peft 
-aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.10 - # via - # datasets - # fsspec - # lm-eval -aiosignal==1.3.1 - # via - # aiohttp - # ray -annotated-types==0.7.0 - # via pydantic -anyio==4.6.2.post1 - # via httpx -argcomplete==3.5.1 - # via datamodel-code-generator -attrs==24.2.0 - # via - # aiohttp - # jsonlines - # jsonschema - # referencing -audioread==3.0.1 - # via librosa -awscli==1.35.23 - # via -r requirements-test.in -bitsandbytes==0.45.0 - # via -r requirements-test.in -black==24.10.0 - # via datamodel-code-generator -boto3==1.35.57 - # via tensorizer -botocore==1.35.57 - # via - # awscli - # boto3 - # s3transfer -bounded-pool-executor==0.0.3 - # via pqdm -buildkite-test-collector==0.1.9 - # via -r requirements-test.in -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests -cffi==1.17.1 - # via soundfile -chardet==5.2.0 - # via mbstrdecoder -charset-normalizer==3.4.0 - # via requests -click==8.1.7 - # via - # black - # jiwer - # nltk - # ray -colorama==0.4.6 - # via - # awscli - # sacrebleu - # tqdm-multiprocess -contourpy==1.3.0 - # via matplotlib -cramjam==2.9.0 - # via fastparquet -cupy-cuda12x==13.3.0 - # via ray -cycler==0.12.1 - # via matplotlib -datamodel-code-generator==0.26.3 - # via -r requirements-test.in -dataproperty==1.0.1 - # via - # pytablewriter - # tabledata -datasets==3.0.2 - # via - # evaluate - # lm-eval -decorator==5.1.1 - # via librosa -decord==0.6.0 - # via -r requirements-test.in -dill==0.3.8 - # via - # datasets - # evaluate - # lm-eval - # multiprocess -dnspython==2.7.0 - # via email-validator -docutils==0.16 - # via awscli -einops==0.8.0 - # via - # -r requirements-test.in - # encodec - # vector-quantize-pytorch - # vocos -einx==0.3.0 - # via vector-quantize-pytorch -email-validator==2.2.0 - # via pydantic -encodec==0.1.1 - # via vocos -evaluate==0.4.3 - # via lm-eval -fastparquet==2024.11.0 - # via genai-perf -fastrlock==0.8.2 - # via cupy-cuda12x -filelock==3.16.1 - # via - # datasets - # huggingface-hub - # ray - # torch - # transformers - # triton -fonttools==4.54.1 - # via matplotlib -frozendict==2.4.6 - # via einx -frozenlist==1.5.0 - # via - # aiohttp - # aiosignal - # ray -fsspec==2024.9.0 - # via - # datasets - # evaluate - # fastparquet - # huggingface-hub - # torch -genai-perf==0.0.8 - # via -r requirements-test.in -genson==1.3.0 - # via datamodel-code-generator -h11==0.14.0 - # via httpcore -hiredis==3.0.0 - # via tensorizer -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via -r requirements-test.in -huggingface-hub==0.26.2 - # via - # accelerate - # datasets - # evaluate - # peft - # sentence-transformers - # timm - # tokenizers - # transformers - # vocos -humanize==4.11.0 - # via runai-model-streamer -idna==3.10 - # via - # anyio - # email-validator - # httpx - # requests - # yarl -inflect==5.6.2 - # via datamodel-code-generator -iniconfig==2.0.0 - # via pytest -isort==5.13.2 - # via datamodel-code-generator -jinja2==3.1.4 - # via - # datamodel-code-generator - # torch -jiwer==3.0.5 - # via -r requirements-test.in -jmespath==1.0.1 - # via - # boto3 - # botocore -joblib==1.4.2 - # via - # librosa - # nltk - # scikit-learn -jsonlines==4.0.0 - # via lm-eval -jsonschema==4.23.0 - # via - # mistral-common - # ray -jsonschema-specifications==2024.10.1 - # via jsonschema -kaleido==0.2.1 - # via genai-perf -kiwisolver==1.4.7 - # via matplotlib -lazy-loader==0.4 - # via librosa -libnacl==2.1.0 - # via tensorizer -librosa==0.10.2.post1 - # via -r requirements-test.in -llvmlite==0.43.0 - # via numba -lm-eval==0.4.4 - # via -r 
requirements-test.in -lxml==5.3.0 - # via sacrebleu -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.2 - # via jinja2 -matplotlib==3.9.2 - # via -r requirements-test.in -mbstrdecoder==1.1.3 - # via - # dataproperty - # pytablewriter - # typepy -mdurl==0.1.2 - # via markdown-it-py -mistral-common==1.5.1 - # via -r requirements-test.in -more-itertools==10.5.0 - # via lm-eval -mpmath==1.3.0 - # via sympy -msgpack==1.1.0 - # via - # librosa - # ray -multidict==6.1.0 - # via - # aiohttp - # yarl -multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via black -networkx==3.2.1 - # via torch -nltk==3.9.1 - # via rouge-score -numba==0.60.0 - # via librosa -numexpr==2.10.1 - # via lm-eval -numpy==1.26.4 - # via - # -r requirements-test.in - # accelerate - # bitsandbytes - # contourpy - # cupy-cuda12x - # datasets - # decord - # einx - # encodec - # evaluate - # fastparquet - # genai-perf - # librosa - # matplotlib - # mistral-common - # numba - # numexpr - # opencv-python-headless - # pandas - # patsy - # peft - # rouge-score - # runai-model-streamer - # sacrebleu - # scikit-learn - # scipy - # soxr - # statsmodels - # tensorizer - # torchvision - # transformers - # tritonclient - # vocos -nvidia-cublas-cu12==12.4.5.8 - # via - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 - # torch -nvidia-cuda-cupti-cu12==12.4.127 - # via torch -nvidia-cuda-nvrtc-cu12==12.4.127 - # via torch -nvidia-cuda-runtime-cu12==12.4.127 - # via torch -nvidia-cudnn-cu12==9.1.0.70 - # via torch -nvidia-cufft-cu12==11.2.1.3 - # via torch -nvidia-curand-cu12==10.3.5.147 - # via torch -nvidia-cusolver-cu12==11.6.1.9 - # via torch -nvidia-cusparse-cu12==12.3.1.170 - # via - # nvidia-cusolver-cu12 - # torch -nvidia-nccl-cu12==2.21.5 - # via torch -nvidia-nvjitlink-cu12==12.4.127 - # via - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 - # torch -nvidia-nvtx-cu12==12.4.127 - # via torch -opencv-python-headless==4.10.0.84 - # via mistral-common -packaging==24.1 - # via - # accelerate - # black - # datamodel-code-generator - # datasets - # evaluate - # fastparquet - # huggingface-hub - # lazy-loader - # matplotlib - # peft - # plotly - # pooch - # pytest - # pytest-rerunfailures - # ray - # statsmodels - # transformers - # typepy -pandas==2.2.3 - # via - # datasets - # evaluate - # fastparquet - # genai-perf - # statsmodels -pathspec==0.12.1 - # via black -pathvalidate==3.2.1 - # via pytablewriter -patsy==1.0.1 - # via statsmodels -peft==0.13.2 - # via - # -r requirements-test.in - # lm-eval -pillow==10.4.0 - # via - # genai-perf - # matplotlib - # mistral-common - # sentence-transformers - # torchvision -platformdirs==4.3.6 - # via - # black - # pooch -plotly==5.24.1 - # via genai-perf -pluggy==1.5.0 - # via pytest -pooch==1.8.2 - # via librosa -portalocker==2.10.1 - # via sacrebleu -pqdm==0.2.0 - # via -r requirements-test.in -propcache==0.2.0 - # via yarl -protobuf==5.28.3 - # via - # ray - # tensorizer -psutil==6.1.0 - # via - # accelerate - # peft - # tensorizer -py==1.11.0 - # via pytest-forked -pyarrow==18.0.0 - # via - # datasets - # genai-perf -pyasn1==0.6.1 - # via rsa -pybind11==2.13.6 - # via lm-eval -pycparser==2.22 - # via cffi -pydantic==2.9.2 - # via - # datamodel-code-generator - # mistral-common -pydantic-core==2.23.4 - # via pydantic -pygments==2.18.0 - # via rich -pyparsing==3.2.0 - # via matplotlib -pytablewriter==1.2.0 - # via lm-eval -pytest==8.3.3 - # via - # -r requirements-test.in - # buildkite-test-collector - # genai-perf - # pytest-asyncio - # pytest-forked - # 
pytest-mock - # pytest-rerunfailures - # pytest-shard -pytest-asyncio==0.24.0 - # via -r requirements-test.in -pytest-forked==1.6.0 - # via -r requirements-test.in -pytest-mock==3.14.0 - # via genai-perf -pytest-rerunfailures==14.0 - # via -r requirements-test.in -pytest-shard==0.1.2 - # via -r requirements-test.in -python-dateutil==2.9.0.post0 - # via - # botocore - # matplotlib - # pandas - # typepy -python-rapidjson==1.20 - # via tritonclient -pytz==2024.2 - # via - # pandas - # typepy -pyyaml==6.0.2 - # via - # accelerate - # awscli - # datamodel-code-generator - # datasets - # genai-perf - # huggingface-hub - # peft - # ray - # responses - # timm - # transformers - # vocos -rapidfuzz==3.12.1 - # via jiwer -ray==2.43.0 - # via -r requirements-test.in -redis==5.2.0 - # via tensorizer -referencing==0.35.1 - # via - # jsonschema - # jsonschema-specifications -regex==2024.9.11 - # via - # nltk - # sacrebleu - # tiktoken - # transformers -requests==2.32.3 - # via - # buildkite-test-collector - # datasets - # evaluate - # huggingface-hub - # lm-eval - # mistral-common - # pooch - # ray - # responses - # tiktoken - # transformers -responses==0.25.3 - # via genai-perf -rich==13.9.4 - # via genai-perf -rouge-score==0.1.2 - # via lm-eval -rpds-py==0.20.1 - # via - # jsonschema - # referencing -rsa==4.7.2 - # via awscli -runai-model-streamer==0.11.0 - # via -r requirements-test.in -runai-model-streamer-s3==0.11.0 - # via -r requirements-test.in -s3transfer==0.10.3 - # via - # awscli - # boto3 -sacrebleu==2.4.3 - # via lm-eval -safetensors==0.4.5 - # via - # accelerate - # peft - # timm - # transformers -scikit-learn==1.5.2 - # via - # librosa - # lm-eval - # sentence-transformers -scipy==1.13.1 - # via - # librosa - # scikit-learn - # sentence-transformers - # statsmodels - # vocos -sentence-transformers==3.2.1 - # via -r requirements-test.in -sentencepiece==0.2.0 - # via mistral-common -setuptools==75.8.0 - # via - # pytablewriter - # torch -six==1.16.0 - # via - # python-dateutil - # rouge-score -sniffio==1.3.1 - # via - # anyio - # httpx -soundfile==0.12.1 - # via - # -r requirements-test.in - # librosa -soxr==0.5.0.post1 - # via librosa -sqlitedict==2.1.0 - # via lm-eval -statsmodels==0.14.4 - # via genai-perf -sympy==1.13.1 - # via - # einx - # torch -tabledata==1.3.3 - # via pytablewriter -tabulate==0.9.0 - # via sacrebleu -tcolorpy==0.1.6 - # via pytablewriter -tenacity==9.0.0 - # via - # lm-eval - # plotly -tensorizer==2.9.0 - # via -r requirements-test.in -threadpoolctl==3.5.0 - # via scikit-learn -tiktoken==0.7.0 - # via - # lm-eval - # mistral-common -timm==1.0.11 - # via -r requirements-test.in -tokenizers==0.21.0 - # via transformers +# testing +pytest +tensorizer>=2.9.0 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard + +# testing utils +awscli +decord # required for video tests +einops # required for MPT, qwen-vl and Mamba +httpx +librosa # required for audio tests +vector_quantize_pytorch # required for minicpmo_26 test +vocos # required for minicpmo_26 test +peft +pqdm +ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests +sentence-transformers # required for embedding tests +soundfile # required for audio tests +jiwer # required for audio tests +timm # required for internvl test torch==2.5.1 - # via - # -r requirements-test.in - # accelerate - # bitsandbytes - # encodec - # lm-eval - # peft - # runai-model-streamer - # sentence-transformers - # tensorizer - # timm - # torchaudio - # torchvision - # vector-quantize-pytorch - # vocos 
torchaudio==2.5.1 - # via - # -r requirements-test.in - # encodec - # vocos -torchvision==0.20.1 - # via timm -tqdm==4.66.6 - # via - # datasets - # evaluate - # huggingface-hub - # lm-eval - # nltk - # peft - # pqdm - # sentence-transformers - # tqdm-multiprocess - # transformers -tqdm-multiprocess==0.0.11 - # via lm-eval -transformers==4.48.2 - # via - # -r requirements-test.in - # genai-perf - # lm-eval - # peft - # sentence-transformers - # transformers-stream-generator -transformers-stream-generator==0.0.5 - # via -r requirements-test.in -triton==3.1.0 - # via torch +transformers_stream_generator # required for qwen-vl test +matplotlib # required for qwen-vl test +mistral_common[opencv] >= 1.5.0 # required for pixtral test +datamodel_code_generator # required for minicpm3 test +lm-eval[api]==0.4.4 # required for model evaluation test +transformers==4.48.2 +# quantization +bitsandbytes>=0.45.0 +buildkite-test-collector==0.1.9 + +genai_perf==0.0.8 tritonclient==2.51.0 - # via - # -r requirements-test.in - # genai-perf -typepy==1.3.2 - # via - # dataproperty - # pytablewriter - # tabledata -typing-extensions==4.12.2 - # via - # bitsandbytes - # huggingface-hub - # librosa - # mistral-common - # pqdm - # pydantic - # pydantic-core - # torch -tzdata==2024.2 - # via pandas -urllib3==2.2.3 - # via - # botocore - # requests - # responses - # tritonclient -vector-quantize-pytorch==1.21.2 - # via -r requirements-test.in -vocos==0.1.0 - # via -r requirements-test.in -word2number==1.1 - # via lm-eval -xxhash==3.5.0 - # via - # datasets - # evaluate -yarl==1.17.1 - # via aiohttp -zstandard==0.23.0 - # via lm-eval \ No newline at end of file + +numpy < 2.0.0 +runai-model-streamer==0.11.0 +runai-model-streamer-s3==0.11.0 \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt index e5bf67e099e4..cfc000a1b09f 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -23,10 +23,6 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator -async-timeout==4.0.3 - # via - # aiohttp - # redis attrs==24.2.0 # via # aiohttp @@ -120,10 +116,6 @@ encodec==0.1.1 # via vocos evaluate==0.4.3 # via lm-eval -exceptiongroup==1.2.2 - # via - # anyio - # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -552,7 +544,9 @@ sentence-transformers==3.2.1 sentencepiece==0.2.0 # via mistral-common setuptools==75.8.0 - # via pytablewriter + # via + # pytablewriter + # torch six==1.16.0 # via # python-dateutil @@ -597,12 +591,6 @@ timm==1.0.11 # via -r requirements-test.in tokenizers==0.21.0 # via transformers -toml==0.10.2 - # via datamodel-code-generator -tomli==2.2.1 - # via - # black - # pytest torch==2.5.1 # via # -r requirements-test.in @@ -663,17 +651,13 @@ typepy==1.3.2 # tabledata typing-extensions==4.12.2 # via - # anyio # bitsandbytes - # black # huggingface-hub # librosa # mistral-common - # multidict # pqdm # pydantic # pydantic-core - # rich # torch tzdata==2024.2 # via pandas @@ -696,4 +680,4 @@ xxhash==3.5.0 yarl==1.17.1 # via aiohttp zstandard==0.23.0 - # via lm-eval + # via lm-eval \ No newline at end of file From fd3f4e354fa5ff4785ce8737af4bc621ad73ace1 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 8 Mar 2025 05:36:03 +0000 Subject: [PATCH 33/35] revert test requirements Signed-off-by: vllmellm --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index cfc000a1b09f..f5722c82e201 100644 --- a/requirements-test.txt +++ b/requirements-test.txt 
@@ -680,4 +680,4 @@ xxhash==3.5.0
 yarl==1.17.1
     # via aiohttp
 zstandard==0.23.0
-    # via lm-eval
\ No newline at end of file
+    # via lm-eval

From f64bfe06a557bf9090074b5d8c1a8b8f3af175dc Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Tue, 11 Mar 2025 05:22:31 +0000
Subject: [PATCH 34/35] addressing PR review comments: fix isort ignores,
 restore missing comments from merge conflict, code documentation

Signed-off-by: vllmellm
---
 vllm/attention/backends/rocm_flash_attn.py    |  5 +++
 vllm/model_executor/layers/fused_moe/layer.py | 10 ++----
 vllm/model_executor/layers/layernorm.py       |  4 +--
 .../model_executor/layers/quantization/fp8.py | 34 ++++++++-----------
 .../layers/quantization/utils/w8a8_utils.py   | 23 ++++++++++---
 5 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 8d2868f8fded..9759ff61205d 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -614,6 +614,11 @@ def forward(
         else:
             assert value is None
 
+        # Reshaping kv tensors is required for AITER paged attention kernel
+        # because it works on a different tensor shape,
+        # when the size of one element is one byte (int8/fp8 dtypes).
+        # This reshaping is only required on the first forward call
+        # and the kv cache must not be empty.
         if (current_platform.is_rocm_aiter_paged_attn_enabled()
                 and kv_cache.dtype.itemsize == 1
                 and not self.aiter_kv_scales_initialized
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 99d0628ef5d8..90c915ed40b7 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# isort: skip_file
 
 from abc import abstractmethod
 from enum import Enum
@@ -102,16 +101,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         if current_platform.is_rocm_aiter_moe_enabled():
             # reshaping weights is required for aiter moe kernel.
- from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight - shuffled_w13_weight = rocm_aiter_shuffle_weight( - layer.w13_weight.data) + shuffled_w13_weight = shuffle_weight(layer.w13_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13_weight, requires_grad=False) - shuffled_w2_weight = rocm_aiter_shuffle_weight( - layer.w2_weight.data) + shuffled_w2_weight = shuffle_weight(layer.w2_weight.data) layer.w2_weight = torch.nn.Parameter(shuffled_w2_weight, requires_grad=False) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index a3e12bf49d0f..8c3c0d02fafa 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -24,7 +24,7 @@ def rms_norm(*, x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float, def fused_add_rms_norm( *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, - variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops ops.fused_add_rms_norm( @@ -38,7 +38,7 @@ def fused_add_rms_norm( def rocm_aiter_rmsnorm2d_fwd_with_add( *, x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, - variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]: import aiter as rocm_aiter rocm_aiter.rmsnorm2d_fwd_with_add( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 07b71a4e97cf..0bca83d1d332 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -# isort: skip_file from typing import Any, Callable, Dict, List, Optional @@ -558,13 +557,12 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) if current_platform.is_rocm_aiter_fp8_block_scaled_moe_enabled(): # reshaping weights is required for aiter moe kernel. - from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight - layer.w13_weight = torch.nn.Parameter( - rocm_aiter_shuffle_weight(layer.w13_weight.data), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter(shuffle_weight( + layer.w13_weight.data), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffle_weight( layer.w2_weight.data), requires_grad=False) return @@ -599,8 +597,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. 
- from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight w13_scales = layer.w13_weight_scale.data.unsqueeze( -1).unsqueeze(-1).expand( @@ -611,10 +608,10 @@ def process_weights_after_loading(self, layer: Module) -> None: w2_scales.contiguous(), requires_grad=False) layer.w13_weight_scale = torch.nn.Parameter( w13_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter( - rocm_aiter_shuffle_weight(layer.w13_weight), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter(shuffle_weight( + layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffle_weight( layer.w2_weight), requires_grad=False) return @@ -687,8 +684,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if current_platform.is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. - from aiter.ops.shuffle import (shuffle_weight as - rocm_aiter_shuffle_weight) + from aiter.ops.shuffle import shuffle_weight max_w13_scales = max_w13_scales.unsqueeze(-1).unsqueeze( -1).expand((-1, layer.w13_weight.shape[1], -1)) @@ -696,10 +692,10 @@ def process_weights_after_loading(self, layer: Module) -> None: -1).expand((-1, layer.w2_weight.shape[1], -1)) layer.w2_weight_scale = torch.nn.Parameter( w2_scales.contiguous(), requires_grad=False) - layer.w13_weight = torch.nn.Parameter( - rocm_aiter_shuffle_weight(layer.w13_weight), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(rocm_aiter_shuffle_weight( + layer.w13_weight = torch.nn.Parameter(shuffle_weight( + layer.w13_weight), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffle_weight( layer.w2_weight), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 75ee0203f8fd..e4a966996af2 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -171,6 +171,8 @@ def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, scale_a=scale_a, scale_b=scale_b, bias=bias) + # A fix for discrepancy in scaled_mm which returns tuple + # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: output = output[0] @@ -211,7 +213,7 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, input_2d: torch.Tensor, output_shape: List, **kwargs) -> torch.Tensor: - # use unfused DQ due to limitations with scaled_mm + # Use unfused DQ due to limitations with scaled_mm # Symmetric quantized GEMM by definition computes the following: # C = (s_x * X) (s_w * W) + bias @@ -260,6 +262,8 @@ def dispatch_w8a8_scaled_mm( if current_platform.is_rocm_aiter_linear_enabled(): return rocm_aiter_per_tensor_w8a8_scaled_mm return torch_per_tensor_w8a8_scaled_mm + # torch.scaled_mm supports per tensor weights + activations only + # so fallback to naive if per channel or per token if (use_per_token_if_dynamic and not per_tensor_weights and not per_tensor_activations and USE_ROWWISE_TORCH_SCALED_MM): return torch_per_token_w8a8_scaled_mm @@ -304,8 +308,20 @@ def apply( # TODO(luka) remove this parameter in favor of __init__ use_per_token_if_dynamic: Optional[bool] = None ) -> torch.Tensor: + # ops.scaled_fp8_quant supports both dynamic and static quant. + # If dynamic, layer.input_scale is None and x_scale computed from x. 
+ # If static, layer.input_scale is scalar and x_scale is input_scale. + + # View input as 2D matrix for fp8 methods + input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[1]] + # TODO(luka) this is here because currently MLA only decides this + # during the forward method instead of in __init__. + if use_per_token_if_dynamic is None: + use_per_token_if_dynamic = self.use_per_token_if_dynamic + + # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A if self.cutlass_fp8_supported: qinput, x_scale = ops.scaled_fp8_quant( input_2d, @@ -314,12 +330,11 @@ def apply( use_per_token_if_dynamic=use_per_token_if_dynamic) else: - config = get_current_vllm_config().compilation_config - do_pad = config.level < CompilationLevel.PIECEWISE + # Maybe apply padding to output, see comment in __init__ qinput, x_scale = ops.scaled_fp8_quant( input_2d, input_scale, - num_token_padding=17 if do_pad else None, + num_token_padding=self.output_padding, use_per_token_if_dynamic=use_per_token_if_dynamic) per_tensor_weights = (weight_scale.numel() == 1) From c1297e5d35d8018bb69122e46cbe79e160c5572b Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 11 Mar 2025 05:53:40 +0000 Subject: [PATCH 35/35] add missing comment in fp8_utils Signed-off-by: vllmellm --- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b0ab94ccfff5..929396d58b09 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -37,6 +37,7 @@ def shape_supported_by_cutlass(weight: torch.Tensor, block_size: List[int], weight_scale: torch.Tensor, input_2d: torch.Tensor) -> bool: if current_platform.is_rocm(): + # TODO this is never used, as cutlass_block_fp8_supported is False scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + input_2d.shape[:-1])[::-1] scale_b_shape = (weight_scale.view(-1, 1)
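For reference, the w8a8_utils.py hunks above select a GEMM path purely from the shapes of the scales: a scale tensor with a single element is treated as a per-tensor scale, anything larger as per-token (activations) or per-channel (weights). Below is a minimal, self-contained sketch of that selection logic only; it is not vLLM's or AITER's implementation. The names (dispatch_sketch, _per_tensor_mm, _per_token_channel_mm) and the example scale shapes are illustrative stand-ins, and the "kernels" are plain PyTorch dequantize-then-matmul fallbacks rather than fused scaled-mm calls.

from typing import Callable, Optional

import torch


def _per_tensor_mm(*, qinput: torch.Tensor, weight: torch.Tensor,
                   scale_a: torch.Tensor, scale_b: torch.Tensor,
                   bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Both scales are scalars, so they can be folded in after the matmul.
    out = qinput.float() @ weight.float() * (scale_a * scale_b)
    return out if bias is None else out + bias


def _per_token_channel_mm(*, qinput: torch.Tensor, weight: torch.Tensor,
                          scale_a: torch.Tensor, scale_b: torch.Tensor,
                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Row-wise activation scales (M, 1) and column-wise weight scales (1, N)
    # broadcast over the (M, N) output of the unfused dequant path.
    out = qinput.float() @ weight.float() * scale_a * scale_b
    return out if bias is None else out + bias


def dispatch_sketch(scale_a: torch.Tensor,
                    scale_b: torch.Tensor) -> Callable[..., torch.Tensor]:
    # Mirrors the per_tensor_weights / per_tensor_activations checks above:
    # a scale with a single element counts as a per-tensor scale.
    per_tensor_activations = scale_a.numel() == 1
    per_tensor_weights = scale_b.numel() == 1
    if per_tensor_weights and per_tensor_activations:
        return _per_tensor_mm
    return _per_token_channel_mm


if __name__ == "__main__":
    # qinput is (M, K), weight is (K, N), matching
    # output_shape = [*input.shape[:-1], weight.shape[1]] in the diff.
    qinput = torch.randint(-8, 8, (4, 16), dtype=torch.int8)
    weight = torch.randint(-8, 8, (16, 8), dtype=torch.int8)
    scale_a = torch.full((4, 1), 0.02)  # per-token activation scales
    scale_b = torch.full((1, 8), 0.01)  # per-channel weight scales
    mm = dispatch_sketch(scale_a, scale_b)
    out = mm(qinput=qinput, weight=weight, scale_a=scale_a, scale_b=scale_b)
    print(out.shape)  # torch.Size([4, 8])

The selected callable is then invoked with keyword arguments, in the same style as the dispatch_w8a8_scaled_mm call at the top of this section.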