Commit 1e97f00

[mxfp8 moe training] compute prefix sum of group sizes inside kernel instead of precomputing
stack-info: PR: #3285, branch: danielvegamyhre/stack/82
1 parent f856d36 commit 1e97f00

File tree: 4 files changed (+42, -46 lines)
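The deleted compute_blocked_scale_offsets_for_*_groups helpers precomputed, on the host, where each group's scales start in the padded blocked layout; this commit folds that prefix sum into the Triton kernels themselves. For orientation, here is a rough host-side equivalent of the math being moved (an illustrative sketch, not the library API; per the kernel code below, padding_size is 128 rows for M-groups and 4 columns for K-groups):

    import torch

    def blocked_group_starts(group_end_offsets: torch.Tensor, padding_size: int) -> torch.Tensor:
        # group_end_offsets[i] is the cumulative end index of group i (like `offs` below).
        zero = torch.zeros(1, dtype=group_end_offsets.dtype, device=group_end_offsets.device)
        group_sizes = torch.diff(group_end_offsets, prepend=zero)
        # Round each group's size up to a multiple of padding_size.
        padded_sizes = ((group_sizes + padding_size - 1) // padding_size) * padding_size
        # Exclusive prefix sum: group i starts where padded groups 0..i-1 end.
        return torch.cumsum(padded_sizes, dim=0) - padded_sizes

    # blocked_group_starts(torch.tensor([40, 72, 200]), 128) -> tensor([0, 128, 256])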

test/prototype/moe_training/test_kernels.py

Lines changed: 0 additions & 13 deletions
@@ -21,8 +21,6 @@
     triton_fp8_per_group_rowwise_scales,
 )
 from torchao.prototype.moe_training.kernels.mxfp8 import (
-    compute_blocked_scale_offsets_for_K_groups,
-    compute_blocked_scale_offsets_for_M_groups,
     torch_to_blocked_2d_K_groups,
     torch_to_blocked_2d_M_groups,
     torch_to_blocked_per_group_3d,
@@ -236,13 +234,9 @@ def test_triton_mx_block_rearrange_2d_M_groups(
     )

     # triton kernel
-    _, output_group_offsets = compute_blocked_scale_offsets_for_M_groups(
-        input_group_offsets
-    )
     triton_out_scales = triton_mx_block_rearrange_2d_M_groups(
         e8m0_scales,
         input_group_offsets,
-        output_group_offsets,
     )
     assert torch.allclose(ref_out_scales, triton_out_scales, atol=0, rtol=0), (
         "blocked scales not equal"
@@ -306,16 +300,9 @@ def test_triton_mx_block_rearrange_2d_K_groups(
     )

     # triton kernel
-    _, output_group_offsets = compute_blocked_scale_offsets_for_K_groups(
-        scale_group_offsets
-    )
-    assert torch.equal(output_group_offsets, ref_start_cols_after_padding), (
-        "output scale group start offsets not equal"
-    )
     triton_out_scales = triton_mx_block_rearrange_2d_K_groups(
         e8m0_scales,
         scale_group_offsets,
-        output_group_offsets,
     )
     assert torch.equal(ref_out_scales, triton_out_scales), "blocked scales not equal"
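With the offsets derived in-kernel, callers drop the precompute step entirely. Schematically, the before/after call pattern exercised by these tests (names as in this diff):

    # before: blocked offsets precomputed on the host, then passed in
    _, output_group_offsets = compute_blocked_scale_offsets_for_M_groups(input_group_offsets)
    out = triton_mx_block_rearrange_2d_M_groups(e8m0_scales, input_group_offsets, output_group_offsets)

    # after: the kernel computes the prefix sum internally
    out = triton_mx_block_rearrange_2d_M_groups(e8m0_scales, input_group_offsets)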

torchao/prototype/moe_training/kernels/mxfp8/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -1,6 +1,4 @@
 from torchao.prototype.moe_training.kernels.mxfp8.quant import (
-    compute_blocked_scale_offsets_for_K_groups,  # noqa: F401
-    compute_blocked_scale_offsets_for_M_groups,  # noqa: F401
     mxfp8_quantize_cuda_3d,  # noqa: F401
     torch_to_blocked_2d_K_groups,  # noqa: F401
     torch_to_blocked_2d_M_groups,  # noqa: F401

torchao/prototype/moe_training/kernels/mxfp8/quant.py

Lines changed: 40 additions & 17 deletions
@@ -223,7 +223,6 @@ def compute_blocked_scale_offsets_for_K_groups(
 def triton_mx_block_rearrange_2d_M_groups(
     scales_tensor: torch.Tensor,
     input_group_end_offsets: torch.Tensor,
-    output_group_start_offsets: torch.Tensor,
 ) -> torch.Tensor:
     """
     Rearranges an E8M0 tensor scale to block-scaled swizzle format,
@@ -275,15 +274,14 @@ def triton_mx_block_rearrange_2d_M_groups(
         scales_tensor.stride(1),
         rows,
         cols,
-        num_groups,
         # Original offsets (to read from)
         input_group_end_offsets,
         # Output scales tensor and group offsets after padding (to write to)
         output.view(torch.uint8),
         output.stride(0),
-        output_group_start_offsets,
         output_stride_per_block,
         output_stride_per_row_of_blocks,
+        num_groups=num_groups,
         BLOCK_ROWS=BLOCK_ROWS,
         BLOCK_COLS=BLOCK_COLS,
     )
@@ -297,13 +295,12 @@ def triton_scale_swizzle_M_groups(
     scales_stride_dim1,
     scale_rows,
     scale_cols,
-    num_groups,
     orig_offsets,  # (num_groups,)
     output_scales_ptr,
     output_scales_stride_dim0,
-    output_scales_group_offsets,  # (num_groups,)
     output_stride_per_block,
     output_stride_per_row_of_blocks,
+    num_groups: tl.constexpr,
     BLOCK_ROWS: tl.constexpr,
     BLOCK_COLS: tl.constexpr,
 ):
@@ -316,10 +313,13 @@ def triton_scale_swizzle_M_groups(
     input_group_end_row = tl.load(
         orig_offsets + group_pid, mask=group_pid < num_groups, other=0
     )
-    # Output scales start row we will begin writing to
-    output_group_start_row = tl.load(
-        output_scales_group_offsets + group_pid, mask=group_pid < num_groups, other=0
+
+    # Calculate this group's start row after blocked format padding, by doing a prefix sum
+    # of each previous group's padded size.
+    output_group_start_row = _blocked_group_start_idx(
+        group_pid, orig_offsets, num_groups, 128
     )
+
     # Calculate destination indices for each row and col in block swizzled layout.
     # We can reuse this swizzle transformation on each block of data we read.
     row_offs = tl.arange(0, BLOCK_ROWS)[:, None]
@@ -489,7 +489,6 @@ def triton_scale_swizzle_per_group_3d(
 def triton_mx_block_rearrange_2d_K_groups(
     scales_tensor: torch.Tensor,
     input_group_end_offsets: torch.Tensor,
-    output_group_start_offsets: torch.Tensor,
 ) -> torch.Tensor:
     """
     Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis,
@@ -538,13 +537,10 @@ def triton_mx_block_rearrange_2d_K_groups(
         rows,
         cols,
         padded_rows,
-        num_groups,
-        # Original offsets (to read from)
         input_group_end_offsets,
-        # Output scales tensor and group offsets after padding (to write to)
         output.view(torch.uint8),
-        output_group_start_offsets,
         output_stride_per_block,
+        num_groups=num_groups,
         BLOCK_ROWS=BLOCK_ROWS,
         BLOCK_COLS=BLOCK_COLS,
         DEBUG=False,
@@ -560,11 +556,10 @@ def triton_scale_swizzle_2d_K_groups(
     scale_rows,
     scale_cols,
     padded_rows,
-    num_groups,
     orig_offsets,  # (num_groups,)
     output_scales_ptr,
-    output_scales_group_offsets,  # (num_groups,)
     output_stride_per_block,
+    num_groups: tl.constexpr,
     BLOCK_ROWS: tl.constexpr,
     BLOCK_COLS: tl.constexpr,
     DEBUG: tl.constexpr = False,
@@ -578,8 +573,11 @@ def triton_scale_swizzle_2d_K_groups(
     )
     input_group_end_col = tl.load(orig_offsets + group_pid)

-    # Output scales start row we will begin writing to
-    output_group_start_col = tl.load(output_scales_group_offsets + group_pid)
+    # Calculate this group's start col after blocked format padding, by doing a prefix sum
+    # of each previous group's padded size.
+    output_group_start_col = _blocked_group_start_idx(
+        group_pid, orig_offsets, num_groups, 4
+    )

     row_offs = tl.arange(0, BLOCK_ROWS)[:, None]
     col_offs = tl.arange(0, BLOCK_COLS)[None, :]
@@ -651,6 +649,31 @@ def _dest_indices_for_block(
     return dest_indices_flat


+@triton.jit
+def _blocked_group_start_idx(
+    group_pid,
+    orig_offsets,
+    num_groups: tl.constexpr,
+    padding_size: tl.constexpr,
+):
+    """Prefix sum to compute the start index of a given group."""
+    offsets = tl.load(orig_offsets + tl.arange(0, num_groups))
+    prev_offsets = tl.load(
+        orig_offsets + tl.arange(0, num_groups) - 1,
+        mask=tl.arange(0, num_groups) > 0,
+        other=0,
+    )
+    group_sizes = tl.where(
+        tl.arange(0, num_groups) > 0,
+        offsets - prev_offsets,
+        offsets,
+    )
+    padded_sizes = tl.cdiv(group_sizes, padding_size) * padding_size
+    prefix_mask = tl.arange(0, num_groups) < group_pid
+    group_start_idx = tl.sum(tl.where(prefix_mask, padded_sizes, 0))
+    return group_start_idx
+
+
 mxfp8_cuda_extension_available = False
 if is_sm_at_least_100():
     try:
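As a sanity check on _blocked_group_start_idx: with hypothetical offsets and the M-groups padding of 128, each program instance's computation reduces to the following plain-Python arithmetic (illustrative values; the real offsets come from `offs` at runtime):

    orig_offsets = [40, 72, 200]    # cumulative group end rows
    padding_size = 128
    group_sizes = [40, 72 - 40, 200 - 72]                                  # [40, 32, 128]
    padded = [((s + padding_size - 1) // padding_size) * padding_size
              for s in group_sizes]                                        # [128, 128, 128]
    # start row for group_pid == 2: sum of padded sizes of groups 0 and 1
    assert sum(padded[:2]) == 256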

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 2 additions & 14 deletions
@@ -17,8 +17,6 @@
     triton_fp8_rowwise_3d_transpose_rhs,
 )
 from torchao.prototype.moe_training.kernels.mxfp8 import (
-    compute_blocked_scale_offsets_for_K_groups,
-    compute_blocked_scale_offsets_for_M_groups,
     mxfp8_quantize_cuda_3d,
     triton_mx_block_rearrange_2d_K_groups,
     triton_mx_block_rearrange_2d_M_groups,
@@ -329,13 +327,9 @@ def forward(
         )

         # Convert scales to blocked format for 2d-3d grouped mm
-        _, blocked_scales_group_offsets_2d3d = (
-            compute_blocked_scale_offsets_for_M_groups(offs)
-        )
         A_scales_blocked = triton_mx_block_rearrange_2d_M_groups(
             A_scale,
             offs,
-            blocked_scales_group_offsets_2d3d,
         )
         B_scales_blocked = triton_mx_block_rearrange_per_group_3d(B_scales)

@@ -350,7 +344,7 @@ def forward(
             out_dtype=out_dtype,
         )

-        ctx.save_for_backward(A, B_t, offs, blocked_scales_group_offsets_2d3d)
+        ctx.save_for_backward(A, B_t, offs)
         ctx.block_size = block_size
         ctx.out_dtype = out_dtype
         ctx.emulated = emulated
@@ -359,7 +353,7 @@ def forward(

     @staticmethod
     def backward(ctx, grad_out: torch.Tensor):
-        A, B_t, offs, blocked_scales_group_offsets_2d3d = ctx.saved_tensors
+        A, B_t, offs = ctx.saved_tensors
         block_size = ctx.block_size
         out_dtype = ctx.out_dtype
         use_triton_for_dim0_cast = ctx.use_triton_for_dim0_cast
@@ -390,7 +384,6 @@ def backward(ctx, grad_out: torch.Tensor):
         grad_out_scales_blocked = triton_mx_block_rearrange_2d_M_groups(
             grad_out_scale,
             offs,
-            blocked_scales_group_offsets_2d3d,
         )
         B_scales_blocked = triton_mx_block_rearrange_per_group_3d(B_scales)

@@ -436,18 +429,13 @@ def backward(ctx, grad_out: torch.Tensor):

         # Convert scales to blocked format for 2d-2d grouped mm
         scale_group_offsets = offs // block_size
-        _, blocked_scale_group_offsets = compute_blocked_scale_offsets_for_K_groups(
-            scale_group_offsets
-        )
         grad_out_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
             grad_out_t_scales,
             scale_group_offsets,
-            blocked_scale_group_offsets,
         )
         A_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
             A_t_scales,
             scale_group_offsets,
-            blocked_scale_group_offsets,
         )

         # grad_B_t = scaled grouped mm of (N,total_M) @ (total_M,K) = (E,N,K)
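One knock-on effect in this file: because the blocked offsets are never materialized on the host anymore, forward no longer has to stash them for backward, so ctx.save_for_backward carries one less tensor. The apparent tradeoff is that every kernel program instance now redoes the O(num_groups) prefix sum rather than reading a precomputed value; for typical MoE group counts that work is likely negligible next to the swizzle's memory traffic.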
