Commit fc73acb

jagged hstu example
ghstack-source-id: e114db1
Pull Request resolved: #527

1 parent: 3c0348a

2 files changed: +257 -0 lines


benchmarks/run.py

Lines changed: 6 additions & 0 deletions
@@ -38,6 +38,12 @@
 KERNEL_MAPPINGS: dict[str, tuple[str, ...]] = {  # pyright: ignore[reportAssignmentType]
     # <tritonbench_op_name>: (<tritonbench_module_path>, <helion_kernel_module_path>, <helion_kernel_function_name>)
     "vector_add": ("tritonbench.operators.vector_add.operator", "examples.add", "add"),
+    "ragged_attention": (
+        "tritonbench.operators.ragged_attention.operator",
+        "examples.jagged_hstu_attn",
+        "ragged_attention_wrapper",
+        {"target_size": 0},
+    ),
     "embedding": (
         "tritonbench.operators.embedding.operator",
         "examples.embedding",

examples/jagged_hstu_attn.py

Lines changed: 251 additions & 0 deletions
@@ -0,0 +1,251 @@
"""
Simplified Jagged HSTU Attention Forward Example
================================================

This example demonstrates a simplified version of jagged HSTU attention using Helion.
"""

# %%
# Imports
# -------
from __future__ import annotations

import torch

import helion
from helion._testing import run_example
import helion.language as hl

try:
    from generative_recommenders.ops.triton.triton_hstu_attention import (  # pyright: ignore[reportMissingImports]
        triton_hstu_mha,
    )

    HAS_HAMMER = True
except ImportError:
    HAS_HAMMER = False


def reference_ragged_hstu_kernel_pytorch(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_offsets: torch.Tensor,
    num_targets: torch.Tensor | None,
    max_seq_len: int,
) -> torch.Tensor:
    """Simple PyTorch implementation of HSTU ragged attention"""
    # Initialize output
    output = torch.zeros_like(v)

    # Scale factor
    scale = 1.0 / max_seq_len
    alpha = 1.0 / v.size(2) ** 2

    # Compute per-batch sequence lengths
    seq_lens = seq_offsets[1:] - seq_offsets[:-1]

    q_split = torch.split(q, seq_lens.tolist(), dim=0)
    k_split = torch.split(k, seq_lens.tolist(), dim=0)
    v_split = torch.split(v, seq_lens.tolist(), dim=0)

    # Get the batches
    for i, (q_batch, k_batch, v_batch) in enumerate(
        zip(q_split, k_split, v_split, strict=False)
    ):
        q_batch = q_batch.transpose(0, 1)  # [heads, seq_len, head_dim]
        k_batch = k_batch.permute(1, 2, 0)  # [heads, head_dim, seq_len]
        v_batch = v_batch.transpose(0, 1)  # [heads, seq_len, head_dim]

        # Compute attention scores using batch matrix multiplication
        scores = torch.bmm(q_batch, k_batch) * alpha

        # Apply SiLU activation
        scores = (scores / (1.0 + torch.exp(-scores))) * scale

        # Apply lower triangular mask (causal attention)
        invalid_mask = torch.tril(torch.ones_like(scores, dtype=torch.bool), diagonal=0)
        scores = torch.where(invalid_mask, scores, torch.zeros_like(scores))

        # Compute and store output
        output_batch = torch.bmm(scores, v_batch)
        output[seq_offsets[i] : seq_offsets[i + 1]] = output_batch.transpose(0, 1)

    return output
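
The jagged (ragged) layout used above packs every sequence along dim 0 of q, k, and v, and seq_offsets holds the prefix sums of the per-batch lengths, so batch i occupies rows seq_offsets[i]:seq_offsets[i + 1]. A small standalone illustration with made-up lengths (not values from this example):

import torch

# Three sequences of lengths 3, 2, and 4 packed into one tensor along dim 0.
seq_lengths = torch.tensor([3, 2, 4])
seq_offsets = torch.zeros(len(seq_lengths) + 1, dtype=torch.long)
seq_offsets[1:] = torch.cumsum(seq_lengths, dim=0)
print(seq_offsets)  # tensor([0, 3, 5, 9])

heads, head_dim = 2, 4
total_seq_len = int(seq_offsets[-1])
q = torch.randn(total_seq_len, heads, head_dim)  # [total_seq_len, heads, head_dim]
q_batch1 = q[seq_offsets[1] : seq_offsets[2]]  # rows 3:5, i.e. the second sequence
print(q_batch1.shape)  # torch.Size([2, 2, 4])
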
@helion.kernel()
def _helion_ragged_attention_kernel(
    max_seq_len: int,
    alpha: float,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_offsets: torch.Tensor,
) -> torch.Tensor:
    """Helion implementation of HSTU ragged attention"""
    scale = 1.0 / max_seq_len
    num_heads = hl.specialize(q.size(1))
    num_batches = hl.specialize(seq_offsets.size(0) - 1)
    dimV = hl.specialize(v.size(2))

    out = torch.zeros_like(v)

    # Tile over batch, head, sequence
    for tile_b, tile_h, tile_q in hl.tile(
        [num_batches, num_heads, max_seq_len], block_size=[1, 1, None]
    ):
        starts = seq_offsets[tile_b.begin]
        ends = seq_offsets[tile_b.begin + 1]
        seq_len = ends - starts

        if tile_q.begin < seq_len:
            mask_q = tile_q.index < seq_len
            q_blk = q[tile_q.index + starts, tile_h.begin, :]
            acc = hl.zeros([tile_q, dimV], dtype=torch.float32)

            # Causal attention: only attend to previous tokens
            for tile_kv in hl.tile(0, tile_q.end, block_size=None):
                mask_kv = tile_kv.index < seq_len
                k_blk = k[tile_kv.index + starts, tile_h.begin, :]
                v_blk = v[tile_kv.index + starts, tile_h.begin, :]

                # Compute attention scores with SiLU activation
                scores = (
                    torch.nn.functional.silu(torch.matmul(q_blk, k_blk.T) * alpha)
                    * scale
                )

                # Apply causal mask: only attend to previous positions
                scores = torch.where(
                    (tile_q.index.unsqueeze(1) > tile_kv.index.unsqueeze(0))
                    & mask_q[:, None]
                    & mask_kv[None, :],
                    scores,
                    0.0,
                )

                acc += torch.matmul(scores.to(v.dtype), v_blk)

            # Store result
            out[tile_q.index + starts, tile_h.begin, :] = acc

    return out
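
In the inner loop above, tile_q.index.unsqueeze(1) > tile_kv.index.unsqueeze(0) broadcasts the per-tile query and key positions into a [tile_q, tile_kv] boolean matrix that keeps a score only where the query position comes strictly after the key position, and mask_q / mask_kv additionally zero out rows and columns that fall past the end of the current sequence. The same broadcast in plain PyTorch, with made-up tile positions:

import torch

# Stand-ins for tile_q.index and tile_kv.index within a single tile.
q_idx = torch.arange(4)
kv_idx = torch.arange(4)
causal = q_idx.unsqueeze(1) > kv_idx.unsqueeze(0)
print(causal)
# tensor([[False, False, False, False],
#         [ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False]])
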
def ragged_attention_wrapper(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_offsets: torch.Tensor,
    num_targets: torch.Tensor | None,
    max_seq_len: int,
) -> torch.Tensor:
    """Wrapper function for ragged attention kernel"""
    return _helion_ragged_attention_kernel(
        max_seq_len=max_seq_len,
        alpha=1.0 / v.size(2) ** 2,
        q=q,
        k=k,
        v=v,
        seq_offsets=seq_offsets,
    )


def test(
    batch_size: int,
    max_seq_len: int,
    heads: int,
    head_dim: int,
    dtype: torch.dtype = torch.bfloat16,
    device: torch.device | str = "cuda",
) -> None:
    """
    Test the ragged HSTU attention kernel implementation.

    Args:
        batch_size: Number of sequences in the batch
        max_seq_len: Maximum sequence length
        heads: Number of attention heads
        head_dim: Dimension of each attention head
        dtype: Data type for the tensors
        device: Device to run the test on
    """
    device = torch.device(device)

    # Generate random sequence lengths
    min_seq_len = max_seq_len // 2
    seq_lengths = torch.randint(
        min_seq_len, max_seq_len + 1, (batch_size,), dtype=torch.int32, device=device
    )
    seq_offsets = torch.cat(
        [
            torch.tensor([0], dtype=torch.int32, device=device),
            torch.cumsum(seq_lengths, dim=0),
        ]
    )
    total_seq_len = int(seq_offsets[-1].item())

    # q, k, v: [total_seq_len, heads, head_dim]
    q = torch.randn(
        (total_seq_len, heads, head_dim),
        dtype=dtype,
        device=device,
        requires_grad=True,
    )
    k = torch.randn(
        (total_seq_len, heads, head_dim),
        dtype=dtype,
        device=device,
        requires_grad=True,
    )
    v = torch.randn(
        (total_seq_len, heads, head_dim),
        dtype=dtype,
        device=device,
        requires_grad=True,
    )

    baselines = {
        "torch": reference_ragged_hstu_kernel_pytorch,
    }
    if HAS_HAMMER:

        def _triton_hstu_mha(
            q: torch.Tensor,
            k: torch.Tensor,
            v: torch.Tensor,
            seq_offsets: torch.Tensor,
            num_targets: torch.Tensor | None,
            max_seq_len: int,
        ) -> torch.Tensor:
            return triton_hstu_mha(  # pyright: ignore[reportPossiblyUnboundVariable,reportCallIssue]
                max_seq_len,
                alpha=1.0 / v.size(2) ** 2,
                q=q,
                k=k,
                v=v,
                seq_offsets=seq_offsets,
                num_targets=num_targets,
                max_attn_len=0,
                contextual_seq_len=0,
            )

        baselines["tritonbench"] = _triton_hstu_mha

    run_example(
        ragged_attention_wrapper,
        baselines,
        (q, k, v, seq_offsets, None, max_seq_len),
    )


def main() -> None:
    """
    Main entry point for testing the simplified ragged HSTU attention kernel.
    """
    test(batch_size=1024, max_seq_len=1024, heads=4, head_dim=128, dtype=torch.bfloat16)


if __name__ == "__main__":
    main()

0 commit comments
