+from pathlib import Path
+
 import mlir.extras.types as T
 import numpy as np
 from hip import hip
 from mlir.ir import InsertionPoint, IntegerAttr, UnitAttr
-
 from mlir.extras.ast.canonicalize import canonicalize
 from mlir.extras.context import RAIIMLIRContextModule
 from mlir.extras.dialects.ext import memref, scf, arith, gpu, llvm
 # noinspection PyUnresolvedReferences
 from util import hip_check, launch_kernel, hip_synchronize
 
+
+def init_copy_host_device():
+    q_h = np.random.randint(0, 10, (batch_size * n_head * seq_len * head_embd)).astype(
+        dtype=np.float32
+    )
+    k_h = np.random.randint(0, 10, (batch_size * n_head * seq_len * head_embd)).astype(
+        dtype=np.float32
+    )
+    v_h = np.random.randint(0, 10, (batch_size * n_head * seq_len * head_embd)).astype(
+        dtype=np.float32
+    )
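+    # l holds the running softmax denominators, m the running row maxima (start at the most negative f32)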
+    l_h = np.zeros((B * nh * N), dtype=np.float32)
+    m_h = np.full((B * nh * N), float(np.finfo(np.float32).min), dtype=np.float32)
+    O_h = np.zeros_like(q_h, dtype=np.float32)
+
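+    # keep this order: it matches the kernel's (Q, K, V, l, m, O) argument order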
+    host = [q_h, k_h, v_h, l_h, m_h, O_h]
+    device = [hip_check(hip.hipMalloc(h.size * h.itemsize)) for h in host]
+
+    for d, h in zip(device, host):
+        hip_check(
+            hip.hipMemcpy(
+                d, h, h.size * h.itemsize, hip.hipMemcpyKind.hipMemcpyHostToDevice
+            )
+        )
+
+    return host, device
+
+
+def copy_device_host(host, device):
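+    # copy every buffer back to the host, then free its device allocation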
+    for d, h in zip(device, host):
+        hip_check(
+            hip.hipMemcpy(
+                h, d, h.size * h.itemsize, hip.hipMemcpyKind.hipMemcpyDeviceToHost
+            )
+        )
+        hip_check(hip.hipFree(d))
+
+    return host
+
+
 # just so it doesn't get DCE'd by black/reformat
 # TypeError: 'mlir._mlir_libs._mlir.ir.BlockArgument' object is not subscriptable
 _ = memref
@@ -79,7 +120,7 @@ def manual_attn(q, k, v):
     k = k.reshape(batch_size, n_head, seq_len, head_embd)
     v = v.reshape(batch_size, n_head, seq_len, head_embd)
 
-    att = q @ k.transpose(0, 1, -2, -1) * (1.0 / math.sqrt(k.shape[-1]))
+    att = q @ k.transpose(0, 1, 3, 2) * (1.0 / math.sqrt(k.shape[-1]))
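+    # (0, 1, 3, 2) swaps the last two axes, i.e. K^T within each (batch, head)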
     att = softmax(att, axis=-1)
     y = att @ v
     return y.flatten()
@@ -100,12 +141,12 @@ def flash_attention(
     O: T.memref(batch_size * n_head * seq_len * head_embd, T.f32()),
 ):
     tx = thread_idx.x
-    bx = block_idx.x
-    by = block_idx.y  # batch and head index
+    bx, by = block_idx.x, block_idx.y
+    gy = grid_dim.y
 
     # Offset into Q,K,V,O,l,m - different for each batch and head
-    qkv_offset = bx * grid_dim.y * N * d + by * N * d  # gridDim.y = nh
-    lm_offset = bx * grid_dim.y * N + by * N  # offset for l and m
+    qkv_offset = bx * gy * N * d + by * N * d  # gridDim.y = nh
+    lm_offset = bx * gy * N + by * N  # offset for l and m
 
     # Define SRAM for Q,K,V,S
     sram = gpu.dynamic_shared_memory()
@@ -120,8 +161,6 @@ def flash_attention(
             Kj[tx * d + x] = K[qkv_offset + tile_size * j + tx * d + x]
             Vj[tx * d + x] = V[qkv_offset + tile_size * j + tx * d + x]
 
-        gpu.barrier()  # such that the inner loop can use the correct Kj, Vj
-
         for i in scf.range_(0, Tr):
             # Load Qi to SRAM, l and m to registers
             for x in scf.range_(0, d):
@@ -175,13 +214,10 @@ def flash_attention(
                 ii = qkv_offset + tile_size * i + tx * d + x
                 O[ii] = div * (c * O[ii] + math.exp(row_m - row_m_new) * pv)
 
-            gpu.barrier()  # otherwise, thread can use the wrong Kj, Vj in inner loop
-
             m[lm_offset + Br * i + tx] = row_m_new
             l[lm_offset + Br * i + tx] = row_l_new
 
-        # gpu.barrier() # otherwise, thread can use the wrong Kj, Vj in inner loop
-        # gpu.barrier() # otherwise, thread can use the wrong Kj, Vj in inner loop
+        gpu.barrier()  # otherwise, thread can use the wrong Kj, Vj in inner loop
 
 
 ip.__exit__(None, None, None)
@@ -236,68 +272,33 @@ def flash_attention(
     T.index(), np.prod(thread_dims)
 )
 
-lowered_module = run_pipeline(lowered_module, Pipeline().gpu_module_to_binary())
+output_format = "bin"
+# output_format = "isa"
+
+lowered_module = run_pipeline(
+    lowered_module, Pipeline().gpu_module_to_binary(format=output_format)
+)
 hsaco = get_compile_object_bytes(lowered_module)
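+# with "isa" the compiled object is AMDGCN assembly text, so dump it to a file and stop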
+if output_format == "isa":
+    with open(Path(__file__).parent / "flashattention.amdgcn", "w") as f:
+        f.write(hsaco.decode())
+    exit()
 
 hip_module = hip_check(hip.hipModuleLoadData(hsaco))
 
-q_h = np.random.randint(0, 10, (batch_size * n_head * seq_len * head_embd)).astype(
-    dtype=np.float32
-)
-k_h = np.random.randint(0, 10, (batch_size * n_head * seq_len * head_embd)).astype(
-    dtype=np.float32
-)
-v_h = np.random.randint(0, 10, (batch_size * n_head * seq_len * head_embd)).astype(
-    dtype=np.float32
-)
-l_h = np.zeros((B * nh * N), dtype=np.float32)
-m_h = np.full((B * nh * N), float(np.finfo(np.float32).min), dtype=np.float32)
-O_h = np.zeros_like(q_h, dtype=np.float32)
-
-q_num_bytes = q_h.size * q_h.itemsize
-k_num_bytes = k_h.size * k_h.itemsize
-v_num_bytes = v_h.size * v_h.itemsize
-l_num_bytes = l_h.size * l_h.itemsize
-m_num_bytes = m_h.size * m_h.itemsize
-O_num_bytes = O_h.size * O_h.itemsize
-
-q_d = hip_check(hip.hipMalloc(q_num_bytes))
-k_d = hip_check(hip.hipMalloc(k_num_bytes))
-v_d = hip_check(hip.hipMalloc(v_num_bytes))
-l_d = hip_check(hip.hipMalloc(l_num_bytes))
-m_d = hip_check(hip.hipMalloc(m_num_bytes))
-O_d = hip_check(hip.hipMalloc(O_num_bytes))
-
 stream = 0
 
 times = {
     flash_attention: 0,
 }
-# random.shuffle(kernels)
-runs = 16
+runs = 32
 for kernel in times:
     for i in range(runs):
         function = hip_check(
             hip.hipModuleGetFunction(hip_module, kernel.__name__.encode())
         )
         hip_check(hip.hipDeviceSynchronize())
 
-        for d, h, num_bytes in zip(
-            [q_d, k_d, v_d, l_d, m_d, O_d],
-            [q_h, k_h, v_h, l_h, m_h, O_h],
-            [
-                q_num_bytes,
-                k_num_bytes,
-                v_num_bytes,
-                l_num_bytes,
-                m_num_bytes,
-                O_num_bytes,
-            ],
-        ):
-            hip_check(
-                hip.hipMemcpy(d, h, num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice)
-            )
-
         (
             (
                 blocks_per_grid_x,
@@ -312,6 +313,10 @@ def flash_attention(
             shared_memory,
         ) = launch_params[kernel.__name__]
 
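+        # fresh host/device buffers and a fresh reference result for every run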
+        host, device = init_copy_host_device()
+        q_h, k_h, v_h, *_ = host
+        correct = manual_attn(q_h, k_h, v_h)
+
         time_compute = launch_kernel(
             function.as_c_void_p(),
             blocks_per_grid_x,
@@ -322,36 +327,16 @@ def flash_attention(
             threads_per_block_z,
             stream,
             shared_memory,
-            q_d,
-            k_d,
-            v_d,
-            l_d,
-            m_d,
-            O_d,
+            *device,
         )
 
-        hip_check(
-            hip.hipMemcpy(
-                l_h, l_d, l_num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost
-            )
-        )
-        hip_check(
-            hip.hipMemcpy(
-                m_h, m_d, m_num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost
-            )
-        )
-        hip_check(
-            hip.hipMemcpy(
-                O_h, O_d, O_num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost
-            )
-        )
-        correct = manual_attn(q_h, k_h, v_h)
+        *_, O_h = copy_device_host(host, device)
         if not np.allclose(correct, O_h):
             print("correct", correct)
-            print("l_h", l_h)
-            print("m_h", m_h)
             print("output", O_h)
             print(f"{kernel.__name__} failed")
+        else:
+            print(f"{kernel.__name__}: {time_compute:.03f} ms")
 
         times[kernel] += time_compute
 