
Commit 9cb3d6b

rank-reduced

1 parent 4323bcc commit 9cb3d6b

File tree

2 files changed: +146 -86 lines

examples/flash_attention.py

Lines changed: 41 additions & 67 deletions
@@ -3,10 +3,11 @@
 import mlir.extras.types as T
 import numpy as np
 from hip import hip
-from mlir.ir import InsertionPoint, IntegerAttr, UnitAttr, Type
+from mlir.ir import InsertionPoint, IntegerAttr, UnitAttr
 from mlir.extras.ast.canonicalize import canonicalize
 from mlir.extras.context import RAIIMLIRContextModule
-from mlir.extras.dialects.ext import memref, scf, arith, gpu, llvm, affine
+from mlir.extras.dialects.ext import memref, scf, arith, gpu, llvm
+from mlir.dialects import math

 # noinspection PyUnresolvedReferences
 from mlir.extras.dialects.ext.gpu import (
@@ -25,12 +26,12 @@
 from util import hip_check, launch_kernel, hip_synchronize


-def init_copy_host_device():
-    q_h = np.random.randint(0, 10, (B * nh * N * d)).astype(dtype=np.float32)
-    k_h = np.random.randint(0, 10, (B * nh * N * d)).astype(dtype=np.float32)
-    v_h = np.random.randint(0, 10, (B * nh * N * d)).astype(dtype=np.float32)
-    l_h = np.zeros((B * nh * N), dtype=np.float32)
-    m_h = np.full((B * nh * N), float(np.finfo(np.float32).min), dtype=np.float32)
+def init_copy_host_device(B, nh, N, d):
+    q_h = np.random.randint(0, 10, (B, nh, N, d)).astype(dtype=np.float32)
+    k_h = np.random.randint(0, 10, (B, nh, N, d)).astype(dtype=np.float32)
+    v_h = np.random.randint(0, 10, (B, nh, N, d)).astype(dtype=np.float32)
+    l_h = np.zeros((B, nh, N), dtype=np.float32)
+    m_h = np.full((B, nh, N), float(np.finfo(np.float32).min), dtype=np.float32)
     O_h = np.zeros_like(q_h, dtype=np.float32)

     host = [q_h, k_h, v_h, l_h, m_h, O_h]
@@ -87,11 +88,7 @@ def gpu_module():
 N = 128
 d = 128

-import math
-
-Tc = math.ceil(N / Bc)
-Tr = math.ceil(N / Br)
-softmax_scale = 1.0 / math.sqrt(d)
+softmax_scale = 1.0 / float(np.sqrt(d))


 def softmax(x, axis=None):
@@ -101,20 +98,13 @@ def softmax(x, axis=None):


 def manual_attn(q, k, v):
-    # the kernel below overwrites the global math.........
-    import math
-
-    q = q.reshape(B, nh, N, d)
-    k = k.reshape(B, nh, N, d)
-    v = v.reshape(B, nh, N, d)
-
-    att = q @ k.transpose(0, 1, 3, 2) * (1.0 / math.sqrt(k.shape[-1]))
+    att = q @ k.transpose(0, 1, 3, 2) * (1.0 / float(np.sqrt(k.shape[-1])))
     att = softmax(att, axis=-1)
     y = att @ v
-    return y.flatten()
+    return y


-from mlir.dialects import math
+rank_reduce = memref.MemRef.rank_reduce


 # https://github.com/tspeterkim/flash-attention-minimal/blob/main/flash.cu
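
Side note on the reference path above: with init_copy_host_device now returning (B, nh, N, d) arrays, manual_attn no longer needs the reshape/flatten round-trip. A self-contained NumPy sketch of that reference path (the softmax body is assumed here, since only its signature appears in the diff; the shapes are illustrative):

import numpy as np


def softmax(x, axis=None):
    # numerically stable softmax along `axis` (assumed implementation)
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)


def manual_attn(q, k, v):
    # scaled dot-product attention on (B, nh, N, d) arrays, as in the diff
    att = q @ k.transpose(0, 1, 3, 2) * (1.0 / float(np.sqrt(k.shape[-1])))
    att = softmax(att, axis=-1)
    return att @ v


B, nh, N, d = 1, 1, 8, 4
q, k, v = (np.random.rand(B, nh, N, d).astype(np.float32) for _ in range(3))
out = manual_attn(q, k, v)
assert out.shape == (B, nh, N, d)
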
@@ -134,32 +124,18 @@ def flash_attention(
     # gpu.printf("bx %ld, by %ld\n", bx, by)

     # Offset into Q,K,V,O,l,m - different for each batch and head
-    K_ = K[bx, by, :, :]
-    V_ = V[bx, by, :, :]
-    Q_ = Q[bx, by, :, :]
-    O_ = O[bx, by, :, :]
-    l_ = l[bx, by, :]
-    m_ = m[bx, by, :]
+    K = K[bx, by, :, :, rank_reduce]
+    V = V[bx, by, :, :, rank_reduce]
+    Q = Q[bx, by, :, :, rank_reduce]
+    O = O[bx, by, :, :, rank_reduce]
+    l = l[bx, by, :, rank_reduce]
+    m = m[bx, by, :, rank_reduce]

     # Define SRAM for Q,K,V,S
     sram = gpu.dynamic_shared_memory()
-    Qi = memref.view(
-        sram,
-        (Br, d),
-        dtype=T.f32(),
-    )
-    Kj = memref.view(
-        sram,
-        (Bc, d),
-        dtype=T.f32(),
-        shift=Qi.n_elements,
-    )
-    Vj = memref.view(
-        sram,
-        (Bc, d),
-        dtype=T.f32(),
-        shift=Qi.n_elements + Kj.n_elements,
-    )
+    Qi = memref.view(sram, (Br, d), dtype=T.f32())
+    Kj = memref.view(sram, (Bc, d), dtype=T.f32(), shift=Qi.n_elements)
+    Vj = memref.view(sram, (Bc, d), dtype=T.f32(), shift=Qi.n_elements + Kj.n_elements)
     S = memref.view(
         sram,
         (Br, Bc),
@@ -169,22 +145,22 @@ def flash_attention(

     for bc in scf.range_(0, N, Bc):
         # Load Kj, Vj to SRAM
-        K_ = K_[:, :, bc : bc + 1, :]
-        V_ = V_[:, :, bc : bc + 1, :]
+        K_ = K[bc : bc + 1, :]
+        V_ = V[bc : bc + 1, :]
         for x in scf.range_(0, d):
-            Kj[tx, x] = K_[0, 0, tx, x]
-            Vj[tx, x] = V_[0, 0, tx, x]
+            Kj[tx, x] = K_[tx, x]
+            Vj[tx, x] = V_[tx, x]

         for br in scf.range_(0, N, Br):
             # Load Qi to SRAM, l and m to registers
-            Q_ = Q_[:, :, br : br + 1, :]
+            Q_ = Q[br : br + 1, :]
             for x in scf.range_(0, d):
-                Qi[tx, x] = Q_[0, 0, tx, x]
+                Qi[tx, x] = Q_[tx, x]

-            l_ = l_[:, :, br : br + 1]
-            m_ = m_[:, :, br : br + 1]
-            row_l_prev = l_[0, 0, tx]
-            row_m_prev = m_[0, 0, tx]
+            l_ = l[br : br + 1]
+            m_ = m[br : br + 1]
+            row_l_prev = l_[tx]
+            row_m_prev = m_[tx]

             # S = QK^T, row_m = rowmax(S)
             row_m: T.f32() = float(np.finfo(np.float32).min)
@@ -218,22 +194,21 @@ def flash_attention(
                 + math.exp(row_m - row_m_new) * row_l
             )
             div = 1.0 / row_l_new
-            c = row_l_prev * math.exp(row_m_prev - row_m_new)
+            f1 = row_l_prev * math.exp(row_m_prev - row_m_new)
+            f2 = math.exp(row_m - row_m_new)

             # Write O, l, m to HBM
-            O_ = O_[:, :, br : br + 1, :]
+            O_ = O[br : br + 1, :]
             for x in scf.range_(0, d):
                 pv: T.f32() = 0.0  # Pij * Vj
                 for y, pv, _ in scf.range_(0, Bc, iter_args=[pv]):
                     pv += S[tx, y] * Vj[y, x]
                     pv = yield pv

-                O_[0, 0, tx, x] = div * (
-                    c * O_[0, 0, tx, x] + math.exp(row_m - row_m_new) * pv
-                )
+                O_[tx, x] = div * (f1 * O_[tx, x] + f2 * pv)

-            l_[0, 0, tx] = row_l_new
-            m_[0, 0, tx] = row_m_new
+            l_[tx] = row_l_new
+            m_[tx] = row_m_new

         gpu.barrier()

@@ -305,7 +280,7 @@ def flash_attention(
 )
 hsaco = get_compile_object_bytes(lowered_module)
 if output_format in {"isa", "llvm", "offloading"}:
-    with open(Path(__file__).parent / "flashattention.amdgcn", "wb") as f:
+    with open(Path(__file__).parent / f"flashattention.{output_format}", "wb") as f:
        f.write(hsaco)
    exit()
@@ -338,7 +313,7 @@ def flash_attention(
        shared_memory,
    ) = launch_params[kernel.__name__]

-    host, device = init_copy_host_device()
+    host, device = init_copy_host_device(B, nh, N, d)
    q_h, k_h, v_h, *_ = host
    correct = manual_attn(q_h, k_h, v_h)
@@ -360,8 +335,7 @@ def flash_attention(
        with np.printoptions(threshold=np.inf, linewidth=np.inf):
            print(
                "correct - output:\n",
-                correct.round().reshape(B, nh, N, d)
-                - O_h.round().reshape(B, nh, N, d),
+                correct.round() - O_h.round(),
            )
        print(f"{kernel.__name__} failed\n")
    else:
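
What the kernel-side change buys: previously K[bx, by, :, :] produced a rank-4 subview with two leading unit dimensions (hence the K_[0, 0, tx, x] indexing), while passing the new rank_reduce sentinel drops those unit dimensions so the shared-memory loads can index the view as K_[tx, x]. A rough NumPy analogy of the before/after indexing (NumPy only, not the mlir-python-extras API):

import numpy as np

B, nh, N, d = 2, 3, 8, 4
K = np.arange(B * nh * N * d, dtype=np.float32).reshape(B, nh, N, d)
bx, by, tx, x = 1, 2, 5, 3

# before: the per-(batch, head) view keeps two unit dims, so it stays rank 4
K_keep = K[bx : bx + 1, by : by + 1, :, :]  # shape (1, 1, N, d)
old_val = K_keep[0, 0, tx, x]

# after: the rank-reduced view drops the unit dims (what MemRef.rank_reduce requests)
K_red = K[bx, by, :, :]  # shape (N, d)
new_val = K_red[tx, x]

assert old_val == new_val
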

mlir/extras/dialects/ext/memref.py

Lines changed: 105 additions & 19 deletions
@@ -1,6 +1,6 @@
 import inspect
 import operator
-from itertools import accumulate
+from itertools import accumulate, zip_longest
 from typing import Sequence, Union, Optional

 import numpy as np
@@ -24,7 +24,11 @@
     MixedValues,
     _dispatch_mixed_values,
 )
-from ....dialects.memref import _is_static_int_like, _infer_memref_subview_result_type
+from ....dialects.memref import (
+    _is_static_int_like,
+    _infer_memref_subview_result_type,
+    _generated_subview,
+)
 from ....dialects.memref import *
 from ....ir import (
     DenseElementsAttr,
@@ -175,6 +179,8 @@ def __str__(self):
     def __repr__(self):
         return str(self)

+    rank_reduce = object()
+
     def __getitem__(self, idx: tuple) -> "MemRef":
         loc = get_user_code_loc()

@@ -189,6 +195,10 @@ def __getitem__(self, idx: tuple) -> "MemRef":
             return expand_shape(self, (0,), loc=loc)

         idx = list((idx,) if isinstance(idx, (int, Scalar, slice)) else idx)
+        rank_reduce = MemRef.rank_reduce in idx
+        if rank_reduce:
+            idx.remove(MemRef.rank_reduce)
+
         for i, d in enumerate(idx):
             # TODO(max): rethink this since subview and etc probably take constant attributes?
             if isinstance(d, int):
@@ -197,7 +207,7 @@ def __getitem__(self, idx: tuple) -> "MemRef":
         if all(isinstance(d, Scalar) for d in idx) and len(idx) == len(self.shape):
             return load(self, idx, loc=loc)
         else:
-            return _subview(self, tuple(idx), loc=loc)
+            return _subview(self, tuple(idx), rank_reduce=rank_reduce, loc=loc)

     def __setitem__(self, idx, val):
         loc = get_user_code_loc()
@@ -306,10 +316,89 @@ def _maybe_compute_size(start, stop, step):
     return stop - start


+def subview(
+    source: Value,
+    offsets: MixedValues,
+    sizes: MixedValues,
+    strides: MixedValues,
+    *,
+    rank_reduce=False,
+    result_type: Optional[MemRefType] = None,
+    loc=None,
+    ip=None,
+):
+    if offsets is None:
+        offsets = []
+    if sizes is None:
+        sizes = []
+    if strides is None:
+        strides = []
+    source_strides, source_offset = source.type.get_strides_and_offset()
+    if result_type is None and all(
+        all(_is_static_int_like(i) for i in s) for s in [sizes, strides, source_strides]
+    ):
+        # If any are arith.constant results then this will canonicalize to python int
+        # (which can then be used to fully specify the subview).
+        (
+            offsets,
+            sizes,
+            strides,
+            result_type,
+        ) = _infer_memref_subview_result_type(source.type, offsets, sizes, strides)
+    elif result_type is None:
+        raise ValueError(
+            "mixed static/dynamic offset/sizes/strides requires explicit result type."
+        )
+
+    offsets, _packed_offsets, static_offsets = _dispatch_mixed_values(offsets)
+    sizes, _packed_sizes, static_sizes = _dispatch_mixed_values(sizes)
+    strides, _packed_strides, static_strides = _dispatch_mixed_values(strides)
+
+    if rank_reduce:
+        result_shape = list(result_type.shape)
+        layout_strides = None
+        if result_type.layout:
+            layout_strides = result_type.layout.strides
+        for i, (s, ss) in reversed(
+            list(enumerate(list(zip_longest(sizes, static_sizes))))
+        ):
+            if (
+                s is not None and _is_static_int_like(s) and s.literal_value == 1
+            ) or ss == 1:
+                del result_shape[i]
+                if layout_strides is not None:
+                    del layout_strides[i]
+        reduced_layout = None
+        if layout_strides is not None:
+            reduced_layout = StridedLayoutAttr.get(
+                result_type.layout.offset, layout_strides
+            )
+        result_type = MemRefType.get(
+            result_shape,
+            result_type.element_type,
+            reduced_layout,
+            result_type.memory_space,
+        )
+
+    return _generated_subview(
+        result_type,
+        source,
+        offsets,
+        sizes,
+        strides,
+        static_offsets,
+        static_sizes,
+        static_strides,
+        loc=loc,
+        ip=ip,
+    )
+
+
 def _subview(
     mem: MemRef,
     idx,
     *,
+    rank_reduce=False,
     loc=None,
     ip=None,
 ) -> MemRef:
@@ -320,14 +409,9 @@ def _subview(
     out = mem

     if indexer.is_constant():
-        out = subview(
-            out,
-            offsets=indexer.static_offsets(),
-            sizes=indexer.static_sizes(),
-            strides=indexer.static_strides(),
-            loc=loc,
-            ip=ip,
-        )
+        offsets = indexer.static_offsets()
+        sizes = indexer.static_sizes()
+        strides = indexer.static_strides()
     else:
         # special tile case
         offsets = [None] * len(indexer.in_shape)
@@ -354,14 +438,16 @@ def _subview(
         assert all(
             map(lambda x: x is not None, offsets + sizes + strides)
         ), f"not each slice is statically known: {indexer.indices}"
-        out = subview(
-            out,
-            offsets=offsets,
-            sizes=sizes,
-            strides=strides,
-            loc=loc,
-            ip=ip,
-        )
+
+    out = subview(
+        out,
+        offsets=offsets,
+        sizes=sizes,
+        strides=strides,
+        rank_reduce=rank_reduce,
+        loc=loc,
+        ip=ip,
+    )

     # This adds newaxis/None dimensions.
     return expand_shape(out, indexer.newaxis_dims, loc=loc, ip=ip)
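
The heart of the rank_reduce branch in subview above is: drop every unit-size result dimension together with its layout stride, walking from the back so earlier indices stay valid. A plain-Python sketch of that step, detached from the MLIR types (rank_reduced_shape_and_strides is a hypothetical helper for illustration, not part of the library):

def rank_reduced_shape_and_strides(shape, strides, static_sizes):
    # drop size-1 dims and their strides, mirroring the reversed() loop in subview()
    shape, strides = list(shape), list(strides)
    for i, ss in reversed(list(enumerate(static_sizes))):
        if ss == 1:
            del shape[i]
            del strides[i]
    return shape, strides


# e.g. the (1, 1, N, d) subview taken per (batch, head) in the kernel collapses to (N, d)
B, nh, N, d = 2, 3, 128, 128
shape, strides = rank_reduced_shape_and_strides(
    shape=[1, 1, N, d],
    strides=[nh * N * d, N * d, d, 1],  # strides of a contiguous (B, nh, N, d) buffer
    static_sizes=[1, 1, N, d],
)
assert shape == [N, d] and strides == [d, 1]
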
