
Commit ec6099b

Add jagged hstu attention example (i.e. ragged_attention) (#527)
1 parent 37d037d commit ec6099b

4 files changed, +438 −0 lines changed


benchmarks/run.py

Lines changed: 6 additions & 0 deletions
@@ -61,6 +61,12 @@ class RunResult:
 KERNEL_MAPPINGS: dict[str, tuple[str, ...]] = {  # pyright: ignore[reportAssignmentType]
     # <tritonbench_op_name>: (<tritonbench_module_path>, <helion_kernel_module_path>, <helion_kernel_function_name>)
     "vector_add": ("tritonbench.operators.vector_add.operator", "examples.add", "add"),
+    "ragged_attention": (
+        "tritonbench.operators.ragged_attention.operator",
+        "examples.jagged_hstu_attn",
+        "ragged_attention_tritonbench",
+        {"target_size": 0},
+    ),
     "embedding": (
         "tritonbench.operators.embedding.operator",
         "examples.embedding",

examples/jagged_hstu_attn.py

Lines changed: 266 additions & 0 deletions
@@ -0,0 +1,266 @@
"""
Simplified Jagged HSTU Attention Forward Example
================================================

This example demonstrates a simplified version of jagged HSTU attention using Helion.
"""

# %%
# Imports
# -------
from __future__ import annotations

import torch

import helion
from helion._testing import run_example
import helion.language as hl

try:
    from generative_recommenders.ops.triton.triton_hstu_attention import (  # pyright: ignore[reportMissingImports]
        triton_hstu_mha,
    )

    HAS_HAMMER = True
except ImportError:
    HAS_HAMMER = False


# %%
# Reference Implementation
# ------------------------
def reference_jagged_hstu_kernel_pytorch(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_offsets: torch.Tensor,
    num_targets: torch.Tensor | None,
    max_seq_len: int,
) -> torch.Tensor:
    """Simple PyTorch implementation of HSTU jagged attention"""
    # Initialize output
    output = torch.zeros_like(v)

    # Scale factor
    scale = 1.0 / max_seq_len
    alpha = 1.0 / v.size(2) ** 2

    # Compute per-batch sequence lengths
    seq_lens = seq_offsets[1:] - seq_offsets[:-1]

    q_split = torch.split(q, seq_lens.tolist(), dim=0)
    k_split = torch.split(k, seq_lens.tolist(), dim=0)
    v_split = torch.split(v, seq_lens.tolist(), dim=0)

    # Get the batches
    for i, (q_batch, k_batch, v_batch) in enumerate(
        zip(q_split, k_split, v_split, strict=False)
    ):
        q_batch = q_batch.transpose(0, 1)  # [heads, seq_len, head_dim]
        k_batch = k_batch.permute(1, 2, 0)  # [heads, head_dim, seq_len]
        v_batch = v_batch.transpose(0, 1)  # [heads, seq_len, head_dim]

        # Compute attention scores using batch matrix multiplication
        scores = torch.bmm(q_batch, k_batch) * alpha

        # Apply SiLU activation
        scores = (scores / (1.0 + torch.exp(-scores))) * scale

        # Apply lower triangular mask (causal attention)
        invalid_mask = torch.tril(torch.ones_like(scores, dtype=torch.bool), diagonal=0)
        scores = torch.where(invalid_mask, scores, torch.zeros_like(scores))

        # Compute and store output
        output_batch = torch.bmm(scores, v_batch)
        output[seq_offsets[i] : seq_offsets[i + 1]] = output_batch.transpose(0, 1)

    return output


# %%
# Jagged HSTU Attention Kernel
# ----------------------------
@helion.kernel()
def _helion_jagged_attention_kernel(
    max_seq_len: int,
    alpha: float,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_offsets: torch.Tensor,
) -> torch.Tensor:
    """Helion implementation of HSTU jagged attention"""
    scale = 1.0 / max_seq_len
    num_heads = hl.specialize(q.size(1))
    num_batches = hl.specialize(seq_offsets.size(0) - 1)
    dimV = hl.specialize(v.size(2))

    out = torch.zeros_like(v)

    # Tile over batch, head, sequence
    for tile_b, tile_h, tile_q in hl.tile(
        [num_batches, num_heads, max_seq_len], block_size=[1, 1, None]
    ):
        starts = seq_offsets[tile_b.begin]
        ends = seq_offsets[tile_b.begin + 1]
        seq_len = ends - starts

        if tile_q.begin < seq_len:
            mask_q = tile_q.index < seq_len
            q_blk = q[tile_q.index + starts, tile_h.begin, :]
            acc = hl.zeros([tile_q, dimV], dtype=torch.float32)

            # Causal attention: only attend to previous tokens
            for tile_kv in hl.tile(0, tile_q.end, block_size=None):
                mask_kv = tile_kv.index < seq_len
                k_blk = k[tile_kv.index + starts, tile_h.begin, :]
                v_blk = v[tile_kv.index + starts, tile_h.begin, :]

                # Compute attention scores with SiLU activation
                scores = (
                    torch.nn.functional.silu(torch.matmul(q_blk, k_blk.T) * alpha)
                    * scale
                )

                # Apply causal mask: only attend to previous positions
                scores = torch.where(
                    (tile_q.index.unsqueeze(1) > tile_kv.index.unsqueeze(0))
                    & mask_q[:, None]
                    & mask_kv[None, :],
                    scores,
                    0.0,
                )

                acc += torch.matmul(scores.to(v.dtype), v_blk)

            # Store result
            out[tile_q.index + starts, tile_h.begin, :] = acc.to(out.dtype)

    return out


# %%
# Benchmark Wrapper
# -----------------
def ragged_attention_tritonbench(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_offsets: torch.Tensor,
    num_targets: torch.Tensor | None,
    max_seq_len: int,
) -> torch.Tensor:
    """Wrapper function for jagged attention kernel"""
    return _helion_jagged_attention_kernel(
        max_seq_len=max_seq_len,
        alpha=1.0 / v.size(2) ** 2,
        q=q,
        k=k,
        v=v,
        seq_offsets=seq_offsets,
    )


# %%
# Testing Function
# ----------------
def test(
    batch_size: int,
    max_seq_len: int,
    heads: int,
    head_dim: int,
    dtype: torch.dtype = torch.bfloat16,
    device: torch.device | str = "cuda",
) -> None:
    """
    Test the jagged HSTU attention kernel implementation.

    Args:
        batch_size: Number of sequences in the batch
        max_seq_len: Maximum sequence length
        heads: Number of attention heads
        head_dim: Dimension of each attention head
        dtype: Data type for the tensors
        device: Device to run the test on
    """
    device = torch.device(device)

    # Generate random sequence lengths
    min_seq_len = max_seq_len // 2
    seq_lengths = torch.randint(
        min_seq_len, max_seq_len + 1, (batch_size,), dtype=torch.int32, device=device
    )
    seq_offsets = torch.cat(
        [
            torch.tensor([0], dtype=torch.int32, device=device),
            torch.cumsum(seq_lengths, dim=0),
        ]
    )
    total_seq_len = int(seq_offsets[-1].item())

    # q, k, v: [total_seq_len, heads, head_dim]
    q = torch.randn(
        (total_seq_len, heads, head_dim),
        dtype=dtype,
        device=device,
        requires_grad=True,
    )
    k = torch.randn(
        (total_seq_len, heads, head_dim),
        dtype=dtype,
        device=device,
        requires_grad=True,
    )
    v = torch.randn(
        (total_seq_len, heads, head_dim),
        dtype=dtype,
        device=device,
        requires_grad=True,
    )

    baselines = {
        "torch": reference_jagged_hstu_kernel_pytorch,
    }
    if HAS_HAMMER:

        def _triton_hstu_mha(
            q: torch.Tensor,
            k: torch.Tensor,
            v: torch.Tensor,
            seq_offsets: torch.Tensor,
            num_targets: torch.Tensor | None,
            max_seq_len: int,
        ) -> torch.Tensor:
            return triton_hstu_mha(  # pyright: ignore[reportPossiblyUnboundVariable,reportCallIssue]
                max_seq_len,
                alpha=1.0 / v.size(2) ** 2,
                q=q,
                k=k,
                v=v,
                seq_offsets=seq_offsets,
                num_targets=num_targets,
                max_attn_len=0,
                contextual_seq_len=0,
            )

        baselines["tritonbench"] = _triton_hstu_mha

    run_example(
        ragged_attention_tritonbench,
        baselines,
        (q, k, v, seq_offsets, None, max_seq_len),
    )


# %%
# Main Function
# -------------
def main() -> None:
    """
    Main entry point for testing the simplified jagged HSTU attention kernel.
    """
    test(batch_size=1024, max_seq_len=1024, heads=4, head_dim=128, dtype=torch.bfloat16)


if __name__ == "__main__":
    main()
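For readers new to the jagged layout, here is a small CPU-only sketch (shapes and values chosen purely for illustration, assuming the repository root is on sys.path) that packs two variable-length sequences along dim 0 and runs the PyTorch reference above:

# Illustrative only: two sequences of lengths 3 and 5 packed along dim 0,
# so seq_offsets = [0, 3, 8] and q/k/v have shape [total_seq_len, heads, head_dim].
import torch

from examples.jagged_hstu_attn import reference_jagged_hstu_kernel_pytorch

heads, head_dim, max_seq_len = 2, 4, 8
seq_offsets = torch.tensor([0, 3, 8], dtype=torch.int32)
total_seq_len = int(seq_offsets[-1])  # 8 rows in the packed (jagged) batch

q = torch.randn(total_seq_len, heads, head_dim)
k = torch.randn(total_seq_len, heads, head_dim)
v = torch.randn(total_seq_len, heads, head_dim)

out = reference_jagged_hstu_kernel_pytorch(q, k, v, seq_offsets, None, max_seq_len)
print(out.shape)  # torch.Size([8, 2, 4]); rows 0-2 belong to the first sequence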

test/test_examples.expected

Lines changed: 104 additions & 0 deletions
@@ -910,6 +910,110 @@ def jagged_dense_add_2d(x_data: torch.Tensor, x_offsets: torch.Tensor, y: torch.
    _launcher(_helion_jagged_dense_add_2d, (triton.cdiv(num_rows, _BLOCK_SIZE_0),), x_offsets, x_data, y, out, y.size(1), out.stride(0), out.stride(1), x_data.stride(0), x_offsets.stride(0), y.stride(0), y.stride(1), num_rows, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=3)
    return out

--- assertExpectedJournal(TestExamples.test_jagged_hstu_attn)
from __future__ import annotations

import torch
import triton
import triton.language as tl
from helion.runtime import default_launcher as _default_launcher

@triton.jit
def _helion__helion_jagged_attention_kernel(seq_offsets, q, k, v, out, k_stride_0, k_stride_1, k_stride_2, out_stride_0, out_stride_1, out_stride_2, q_stride_0, q_stride_1, q_stride_2, seq_offsets_stride_0, v_stride_0, v_stride_1, v_stride_2, max_seq_len, alpha, scale, _BLOCK_SIZE_2: tl.constexpr, _RDIM_SIZE_3: tl.constexpr, _BLOCK_SIZE_4: tl.constexpr):
    num_blocks_0 = 4
    num_blocks_1 = 8
    pid_0 = tl.program_id(0) % num_blocks_0
    pid_1 = tl.program_id(0) // num_blocks_0 % num_blocks_1
    pid_2 = tl.program_id(0) // (num_blocks_0 * num_blocks_1)
    offset_0 = pid_0
    offset_1 = pid_1
    offset_2 = pid_2 * _BLOCK_SIZE_2
    indices_2 = (offset_2 + tl.arange(0, _BLOCK_SIZE_2)).to(tl.int32)
    mask_2 = indices_2 < max_seq_len
    indices_5 = tl.arange(0, _RDIM_SIZE_3).to(tl.int32)
    starts = tl.load(seq_offsets + offset_0 * seq_offsets_stride_0, None)
    add = 1 + offset_0
    ends = tl.load(seq_offsets + add * seq_offsets_stride_0, None)
    v_0 = ends - starts
    v_1 = v_0 > offset_2
    if v_1:
        v_0_copy = v_0
        starts_copy = starts
        v_0_copy_0 = v_0_copy
        starts_copy_0 = starts_copy
        v_2 = v_0_copy_0[None]
        v_3 = v_2.to(tl.int32)
        v_4 = indices_2 < v_3
        v_5 = starts_copy_0[None]
        v_6 = v_5.to(tl.int32)
        v_7 = indices_2 + v_6
        q_blk = tl.load(q + (v_7[:, None] * q_stride_0 + offset_1 * q_stride_1 + indices_5[None, :] * q_stride_2), mask_2[:, None], other=0)
        acc = tl.full([_BLOCK_SIZE_2, 32], 0.0, tl.float32)
        tile_end = tl.minimum(offset_2 + _BLOCK_SIZE_2, max_seq_len)
        for offset_3 in tl.range(0, tile_end.to(tl.int32), _BLOCK_SIZE_4):
            indices_3 = offset_3 + tl.arange(0, _BLOCK_SIZE_4).to(tl.int32)
            mask_4 = indices_3 < tile_end
            v_0_copy_0_copy = v_0_copy_0
            starts_copy_0_copy = starts_copy_0
            q_blk_copy = q_blk
            v_4_copy = v_4
            acc_copy = acc
            v_0_copy_0_copy_0 = v_0_copy_0_copy
            starts_copy_0_copy_0 = starts_copy_0_copy
            q_blk_copy_0 = q_blk_copy
            v_4_copy_0 = v_4_copy
            acc_copy_0 = acc_copy
            v_8 = v_0_copy_0_copy_0[None]
            v_9 = v_8.to(tl.int32)
            v_10 = indices_3 < v_9
            v_11 = starts_copy_0_copy_0[None]
            v_12 = v_11.to(tl.int32)
            v_13 = indices_3 + v_12
            k_blk = tl.load(k + (v_13[:, None] * k_stride_0 + offset_1 * k_stride_1 + indices_5[None, :] * k_stride_2), mask_4[:, None], other=0)
            v_14 = starts_copy_0_copy_0[None]
            v_15 = v_14.to(tl.int32)
            v_16 = indices_3 + v_15
            v_blk = tl.load(v + (v_16[:, None] * v_stride_0 + offset_1 * v_stride_1 + indices_5[None, :] * v_stride_2), mask_4[:, None], other=0)
            permute = tl.permute(k_blk, [1, 0])
            mm = tl.dot(q_blk_copy_0, permute, input_precision='tf32')
            v_17 = alpha.to(tl.bfloat16)
            v_18 = mm * v_17
            v_19 = v_18.to(tl.float32)
            v_20 = tl.sigmoid(v_19)
            v_21 = v_19 * v_20
            v_22 = v_21.to(tl.bfloat16)
            v_23 = scale.to(tl.bfloat16)
            v_24 = v_22 * v_23
            unsqueeze = indices_2[:, None]
            unsqueeze_1 = indices_3[None, :]
            v_25 = unsqueeze > unsqueeze_1
            subscript = v_4_copy_0[:, None]
            v_26 = v_25 & subscript
            subscript_1 = v_10[None, :]
            v_27 = v_26 & subscript_1
            v_28 = tl.full([], 0.0, tl.bfloat16)
            v_29 = v_28[None, None]
            v_30 = tl.where(v_27, v_24, v_29)
            _mask_to_2 = tl.where(mask_2[:, None] & mask_4[None, :], v_30, 0)
            mm_1 = tl.dot(_mask_to_2, v_blk, input_precision='tf32')
            v_31 = mm_1.to(tl.float32)
            acc = acc_copy_0 + v_31
        v_33 = acc.to(tl.bfloat16)
        v_34 = starts_copy_0[None]
        v_35 = v_34.to(tl.int32)
        v_36 = indices_2 + v_35
        tl.store(out + (v_36[:, None] * out_stride_0 + offset_1 * out_stride_1 + indices_5[None, :] * out_stride_2), v_33, mask_2[:, None])

def _helion_jagged_attention_kernel(max_seq_len: int, alpha: float, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seq_offsets: torch.Tensor, *, _launcher=_default_launcher):
    """Helion implementation of HSTU jagged attention"""
    scale = 1.0 / max_seq_len
    out = torch.zeros_like(v)
    _BLOCK_SIZE_2 = 16
    _RDIM_SIZE_3 = 32
    _BLOCK_SIZE_4 = 16
    _launcher(_helion__helion_jagged_attention_kernel, (4 * q.size(1) * triton.cdiv(max_seq_len, _BLOCK_SIZE_2),), seq_offsets, q, k, v, out, k.stride(0), k.stride(1), k.stride(2), out.stride(0), out.stride(1), out.stride(2), q.stride(0), q.stride(1), q.stride(2), seq_offsets.stride(0), v.stride(0), v.stride(1), v.stride(2), max_seq_len, alpha, scale, _BLOCK_SIZE_2, _RDIM_SIZE_3, _BLOCK_SIZE_4, num_warps=4, num_stages=3)
    return out

--- assertExpectedJournal(TestExamples.test_jagged_mean)
from __future__ import annotations
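A note on the generated launch: the grid flattens the three hl.tile dimensions (batch, head, query tile) into one program id, which the kernel above unpacks with the pid_0/pid_1/pid_2 arithmetic. A plain-Python sketch of that decomposition (block size and the two specialized counts come from the journal; the sequence length is an illustrative assumption):

# Mirrors the pid decomposition in the generated kernel above.
num_blocks_0 = 4   # specialized num_batches (from the example's hl.specialize)
num_blocks_1 = 8   # specialized num_heads (from the example's hl.specialize)
block_size_2 = 16  # _BLOCK_SIZE_2
max_seq_len = 48   # illustrative value, not taken from the journal
num_q_tiles = (max_seq_len + block_size_2 - 1) // block_size_2  # triton.cdiv

for pid in range(num_blocks_0 * num_blocks_1 * num_q_tiles):
    batch = pid % num_blocks_0                     # pid_0
    head = pid // num_blocks_0 % num_blocks_1      # pid_1
    q_tile = pid // (num_blocks_0 * num_blocks_1)  # pid_2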
