
Commit 65995f5

Allow passing tritonbench operator instance into kernel benchmark wrapper; Always return lambda for timing measurement (#596)
1 parent 1ac5365 commit 65995f5
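
In short: every *_tritonbench wrapper gains the tritonbench operator instance as its leading parameter, and every wrapper now defers its computation behind a lambda so the harness times only the kernel call. A schematic before/after (illustrative op/op_tritonbench names, not a specific file in this commit):

from typing import Callable

import torch

def op(x: torch.Tensor) -> torch.Tensor:  # stand-in for any Helion kernel
    return x * 2

# Before: the wrapper ran eagerly and returned the tensor.
def op_tritonbench_old(x: torch.Tensor) -> torch.Tensor:
    return op(x)

# After: the wrapper accepts the operator instance first and returns a
# zero-argument callable; execution is deferred until measurement.
def op_tritonbench(tb_op: object, x: torch.Tensor) -> Callable[[], torch.Tensor]:
    return lambda: op(x)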

File tree

12 files changed: +92 / -54 lines

benchmarks/run.py

Lines changed: 12 additions & 17 deletions
@@ -393,16 +393,9 @@ def run_kernel_variants(
     """Run kernel variants in the same benchmark run."""

     # Import tritonbench components
-    try:
-        from tritonbench.utils.parser import (  # pyright: ignore[reportMissingImports]
-            get_parser,
-        )
-    except ImportError:
-        print(
-            "Error: Could not import tritonbench. Make sure it's in the path.",
-            file=sys.stderr,
-        )
-        sys.exit(1)
+    from tritonbench.utils.parser import (  # pyright: ignore[reportMissingImports]
+        get_parser,
+    )

     # Get the tritonbench operator name
     operator_name = kernel_name

@@ -500,14 +493,16 @@ def helion_method(
         attr.settings.force_autotune = True
         attr.settings.static_shape = True  # pyright: ignore[reportAttributeAccessIssue]

-        def _inner() -> Callable[..., Any] | object:
-            # BENCHMARK HOT PATH, do not add any new logic here
-            result = kfunc(*args, **kwargs)
-            if callable(result):
-                return result()
-            return result
+        if isinstance(kfunc, Kernel):
+            # Helion kernel - we call it in a lambda to delay execution until measurement
+            measured_func_callable = lambda: kfunc(*args, **kwargs)  # noqa: E731
+        else:
+            # tritonbench integration wrapper - pass tritonbench operator instance as first argument
+            # The wrapper must return a callable that does the actual computation, for delayed execution
+            measured_func_callable = kfunc(self, *args, **kwargs)

-        return _inner
+        assert callable(measured_func_callable)
+        return measured_func_callable

     return helion_method
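
The net effect in helion_method is a uniform contract: the harness always receives a zero-argument callable, so wrapper dispatch and argument setup happen once, outside the timed region. A minimal sketch of how a timing loop consumes such a callable (the measure helper below is a hypothetical stand-in, not tritonbench's actual API):

import time
from typing import Any, Callable

def measure(fn: Callable[[], Any], warmup: int = 3, reps: int = 100) -> float:
    # fn is the zero-argument callable returned by helion_method: all setup
    # has already happened, so only the computation lands in the timed loop.
    for _ in range(warmup):
        fn()  # warmup also triggers compilation/autotuning before timing
    start = time.perf_counter()
    for _ in range(reps):
        fn()
    # a real GPU harness would also synchronize the device around the timed region
    return (time.perf_counter() - start) / reps

Both branches satisfy this contract: a Helion Kernel is deferred as lambda: kfunc(*args, **kwargs), while a *_tritonbench wrapper is called once with the operator instance and must itself return the deferred callable.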

examples/embedding.py

Lines changed: 7 additions & 4 deletions
@@ -10,6 +10,8 @@
 # -------
 from __future__ import annotations

+from typing import Callable
+
 import torch

 import helion

@@ -49,21 +51,22 @@ def embedding(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
 # Benchmark Wrapper
 # --------------
 def embedding_tritonbench(
-    V: int, D: int, inp: torch.Tensor, shared_weight: torch.Tensor
-) -> torch.Tensor:
+    tb_op: object, V: int, D: int, inp: torch.Tensor, shared_weight: torch.Tensor
+) -> Callable[[], torch.Tensor]:
     """
     Wrapper for tritonbench that matches its interface.

     Args:
+        tb_op: TritonBench operator instance
         V: Vocabulary size (unused, provided for compatibility)
         D: Embedding dimension (unused, provided for compatibility)
         inp: Input tensor of indices
         shared_weight: Embedding weight matrix

     Returns:
-        Output tensor containing the embedding vectors
+        Callable that returns output tensor containing the embedding vectors
     """
-    return embedding(inp, shared_weight)
+    return lambda: embedding(inp, shared_weight)


 # %%
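
With the new signature, callers pass the operator instance first and invoke the returned lambda to actually run the kernel. A standalone sketch (assuming embedding_tritonbench is in scope, e.g. inside examples/embedding.py, and a CUDA device is available; since the wrapper body ignores tb_op, None suffices here and the shapes are illustrative):

import torch

V, D = 1000, 64
inp = torch.randint(0, V, (8, 128), device="cuda")
weight = torch.randn(V, D, device="cuda")

fn = embedding_tritonbench(None, V, D, inp, weight)  # nothing executes yet
out = fn()  # the lookup runs only now, matching the deferred-timing contract
# expected: out.shape == (8, 128, D) for a standard embedding lookup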

examples/exp.py

Lines changed: 8 additions & 3 deletions
@@ -10,6 +10,8 @@
 # -------
 from __future__ import annotations

+from typing import Callable
+
 import torch

 import helion

@@ -40,17 +42,20 @@ def exp(x: torch.Tensor) -> torch.Tensor:
 # %%
 # Benchmark Wrapper
 # --------------
-def exp_tritonbench(x: torch.Tensor) -> dict[str, torch.Tensor]:
+def exp_tritonbench(
+    tb_op: object, x: torch.Tensor
+) -> Callable[[], dict[str, torch.Tensor]]:
     """
     Wrapper for tritonbench that returns output in expected format.

     Args:
+        tb_op: TritonBench operator instance
         x: Input tensor

     Returns:
-        Dictionary containing the output tensor
+        Callable that returns dictionary containing the output tensor
     """
-    return {"output": exp(x)}
+    return lambda: {"output": exp(x)}


 # %%

examples/fp8_attention.py

Lines changed: 3 additions & 2 deletions
@@ -135,12 +135,13 @@ def preprocess_fp8_attention_inputs(

 # %%
 def fp8_attention_tritonbench(
-    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
+    tb_op: object, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
 ) -> Callable[[], torch.Tensor]:
     """
     Creates a callable function for benchmarking FP8 attention with tritonbench.
     Preprocesses inputs and returns a lambda function that calls the FP8 attention kernel.
     Args:
+        tb_op: TritonBench operator instance
         q: Query tensor of shape [batch, heads, seq_len, head_dim]
         k: Key tensor of shape [batch, heads, seq_len, head_dim]
         v: Value tensor of shape [batch, heads, seq_len, head_dim]

@@ -272,7 +273,7 @@ def check(batch: int, heads: int, seq_len: int, head_dim: int) -> None:
     v = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16, device="cuda")
     from helion._testing import run_example

-    helion_fn = fp8_attention_tritonbench(q, k, v)
+    helion_fn = fp8_attention_tritonbench(None, q, k, v)
     pytorch_fn = fp8_attention_pytorch(q, k, v)
     run_example(
         helion_fn,

examples/fp8_gemm.py

Lines changed: 13 additions & 3 deletions
@@ -11,6 +11,7 @@
 from __future__ import annotations

 import os
+from typing import Callable

 import torch

@@ -79,16 +80,25 @@ def reference_fp8_gemm_pytorch(


 # %%
-def fp8_gemm_tritonbench(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+def fp8_gemm_tritonbench(
+    tb_op: object,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+) -> Callable[[], torch.Tensor]:
     """
     Wrapper for TritonBench compatibility.
     Args:
+        tb_op: TritonBench operator instance
         a (torch.Tensor): Left input tensor in FP8 format.
         b (torch.Tensor): Right input tensor in FP8 format.
+        scale_a (torch.Tensor): Scale factor for tensor a (unused in our implementation).
+        scale_b (torch.Tensor): Scale factor for tensor b (unused in our implementation).
     Returns:
-        torch.Tensor: Output tensor in FP16 format.
+        Callable that returns output tensor in FP16 format.
     """
-    return fp8_gemm(a, b)
+    return lambda: fp8_gemm(a, b)


 # %%

examples/jagged_hstu_attn.py

Lines changed: 6 additions & 3 deletions
@@ -10,6 +10,8 @@
 # -------
 from __future__ import annotations

+from typing import Callable
+
 import torch

 import helion

@@ -143,15 +145,16 @@ def _helion_jagged_attention_kernel(
 # Benchmark Wrapper
 # --------------
 def ragged_attention_tritonbench(
+    tb_op: object,
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
     seq_offsets: torch.Tensor,
     num_targets: torch.Tensor | None,
     max_seq_len: int,
-) -> torch.Tensor:
+) -> Callable[[], torch.Tensor]:
     """Wrapper function for jagged attention kernel"""
-    return _helion_jagged_attention_kernel(
+    return lambda: _helion_jagged_attention_kernel(
         max_seq_len=max_seq_len,
         alpha=1.0 / v.size(2) ** 2,
         q=q,

@@ -246,7 +249,7 @@ def _triton_hstu_mha(
     baselines["tritonbench"] = _triton_hstu_mha

     run_example(
-        ragged_attention_tritonbench,
+        lambda *args: ragged_attention_tritonbench(None, *args)(),
         baselines,
         (q, k, v, seq_offsets, None, max_seq_len),
     )
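
The run_example call above adapts the wrapper to an eager interface by binding None for the operator instance and invoking the returned lambda immediately. The same adaptation works for any of these wrappers; a generic sketch (the eager helper is illustrative, not part of the codebase):

from typing import Any, Callable

def eager(wrapper: Callable[..., Callable[[], Any]]) -> Callable[..., Any]:
    # Turn a deferred tritonbench wrapper back into an eager function,
    # e.g. for accuracy checks that compare outputs directly.
    return lambda *args, **kwargs: wrapper(None, *args, **kwargs)()

# e.g.: run_example(eager(ragged_attention_tritonbench), baselines, (q, k, v, seq_offsets, None, max_seq_len))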

examples/jagged_mean.py

Lines changed: 7 additions & 4 deletions
@@ -11,6 +11,8 @@
 # -------
 from __future__ import annotations

+from typing import Callable
+
 import torch

 import helion

@@ -136,20 +138,21 @@ def reference_jagged_mean_kernel_pytorch(
 # Benchmark Wrapper
 # --------------
 def jagged_mean_tritonbench(
-    x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
-) -> torch.Tensor:
+    tb_op: object, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
+) -> Callable[[], torch.Tensor]:
     """
     Wrapper for tritonbench that matches the expected interface.

     Args:
+        tb_op: TritonBench operator instance
         x: Nested tensor in jagged format with shape (B, *, M)
         B: Batch size
         M: Number of features
         seqlen: Maximum sequence length
         sparsity: Sparsity factor (not used)

     Returns:
-        Tensor of shape (B, M) with mean values per row and feature
+        Callable that returns tensor of shape (B, M) with mean values per row and feature
     """
     x_values = x._values
     x_offsets = x._offsets  # pyright: ignore[reportAttributeAccessIssue]

@@ -160,7 +163,7 @@ def jagged_mean_tritonbench(
         dtype=torch.int32,
         device=x_values.device,  # pyright: ignore[reportAttributeAccessIssue]
     )
-    return jagged_mean_kernel(x_values, x_offsets, feature_counts, M)
+    return lambda: jagged_mean_kernel(x_values, x_offsets, feature_counts, M)


 # %%

examples/jagged_softmax.py

Lines changed: 6 additions & 4 deletions
@@ -11,6 +11,7 @@
 from __future__ import annotations

 import itertools
+from typing import Callable

 import torch

@@ -135,22 +136,23 @@ def jagged_softmax_kernel(
 # Benchmark Wrapper
 # --------------
 def jagged_softmax_tritonbench(
-    x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
-) -> torch.Tensor:
+    tb_op: object, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
+) -> Callable[[], torch.Tensor]:
     """
     Wrapper for tritonbench that matches the expected interface.

     Args:
+        tb_op: TritonBench operator instance
         x: Nested tensor in jagged format with shape (B, *, M)
         B: Batch size (unused)
         M: Number of features (unused)
         seqlen: Maximum sequence length (unused)
         sparsity: Sparsity factor (unused)

     Returns:
-        Tensor of shape (N, M), where N = total number of rows in the jagged tensor
+        Callable that returns tensor of shape (N, M), where N = total number of rows in the jagged tensor
     """
-    return jagged_softmax_kernel(x._values, x._offsets)  # pyright: ignore[reportArgumentType, reportAttributeAccessIssue]
+    return lambda: jagged_softmax_kernel(x._values, x._offsets)  # pyright: ignore[reportArgumentType, reportAttributeAccessIssue]


 # %%

examples/matmul.py

Lines changed: 5 additions & 2 deletions
@@ -132,11 +132,12 @@ def baseline_wrapper(x: Tensor, y: Tensor) -> Tensor:

 # %%
 def matmul_tritonbench(
-    a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None
+    tb_op: object, a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None
 ) -> Callable:
     """
     Wrapper for tritonbench that matches its interface.

     Args:
+        tb_op: TritonBench operator instance
         a (torch.Tensor): Left matrix.
         b (torch.Tensor): Right matrix.
         bias (torch.Tensor or None): Optional bias to add in the epilogue.

@@ -148,7 +149,9 @@ def matmul_tritonbench(
     return lambda: matmul(a, b)


-def addmm_tritonbench(bias: Tensor, mat1: Tensor, mat2: Tensor) -> Callable:
+def addmm_tritonbench(
+    tb_op: object, bias: Tensor, mat1: Tensor, mat2: Tensor
+) -> Callable:
     """
     Wrapper for tritonbench that performs a matrix multiplication of the matrices
     `mat1` and `mat2` followed by adding `bias` to the result.

examples/matmul_split_k.py

Lines changed: 2 additions & 1 deletion
@@ -97,11 +97,12 @@ def check(m: int, k: int, n: int) -> None:

 # %%
 def matmul_split_k_tritonbench(
-    a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None
+    tb_op: object, a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None
 ) -> Callable:
     """
     Wrapper for tritonbench that matches its interface.

     Args:
+        tb_op: TritonBench operator instance
         a (torch.Tensor): Left input matrix.
         b (torch.Tensor): Right input matrix.
         bias (torch.Tensor or None): Optional bias to add in the epilogue.
