Commit 5b19983

[Examples] Add matmul variants with bias support and tests
- Add wrapper functions for tritonbench dispatch in matmul.py and matmul_split_k.py
- Implement bias handling in both matmul and matmul_split_k
- Add comprehensive tests in test_examples.py for all matmul variants

stack-info: PR: #379, branch: yf225/stack/41
1 parent 6c5c4ca commit 5b19983
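
For orientation, a minimal usage sketch of the two dispatch wrappers this commit adds (assumptions: a CUDA device, fp16 inputs, and that the example modules are importable as `examples.matmul` / `examples.matmul_split_k`; the shapes are illustrative):

    import torch
    from examples.matmul import matmul
    from examples.matmul_split_k import matmul_split_k

    x = torch.randn([128, 256], device="cuda", dtype=torch.float16)
    y = torch.randn([256, 512], device="cuda", dtype=torch.float16)
    bias = torch.randn([512], device="cuda", dtype=torch.float16)

    out = matmul(x, y)                        # identity epilogue
    out_bias = matmul(x, y, bias)             # epilogue closure adds bias per output tile
    out_sk = matmul_split_k(x, y)             # dispatches to matmul_split_k_no_bias
    out_sk_bias = matmul_split_k(x, y, bias)  # dispatches to matmul_split_k_with_bias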

10 files changed: +414 −67 lines changed

10 files changed

+414
-67
lines changed

examples/matmul.py

Lines changed: 78 additions & 2 deletions
@@ -1,15 +1,56 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import torch
 
 import helion
 from helion._testing import run_example
 import helion.language as hl
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
 
 # static_shapes=True gives a performance boost for matmuls
 @helion.kernel(static_shapes=True)
-def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+def matmul_with_epilogue(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor],
+) -> torch.Tensor:
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f"size mismatch {k} != {k2}"
+    out = torch.empty(
+        [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
+    )
+    for tile_m, tile_n in hl.tile([m, n]):
+        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+        for tile_k in hl.tile(k):
+            acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+        out[tile_m, tile_n] = epilogue(acc, [tile_m, tile_n])
+    return out
+
+
+def matmul(
+    x: torch.Tensor, y: torch.Tensor, bias: torch.Tensor | None = None
+) -> torch.Tensor:
+    """Wrapper function for tritonbench that dispatches based on bias presence."""
+    if bias is None:
+        # No epilogue, just return the accumulated value
+        return matmul_with_epilogue(x, y, lambda acc, tile: acc)
+    # Create a closure that captures the bias
+
+    def epilogue_with_bias(acc: torch.Tensor, tile: list[torch.Tensor]) -> torch.Tensor:
+        # Use tile_n to index into the bias
+        return acc + bias[tile[1]]
+
+    return matmul_with_epilogue(x, y, epilogue_with_bias)
+
+
+@helion.kernel(static_shapes=True)
+def matmul_no_bias(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f"size mismatch {k} != {k2}"
@@ -24,10 +65,45 @@ def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     return out
 
 
+@helion.kernel(static_shapes=True)
+def matmul_with_bias(
+    x: torch.Tensor, y: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f"size mismatch {k} != {k2}"
+    out = torch.empty(
+        [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
+    )
+    for tile_m, tile_n in hl.tile([m, n]):
+        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+        for tile_k in hl.tile(k):
+            acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+        out[tile_m, tile_n] = acc + bias[tile_n]
+    return out
+
+
 def check(m: int, k: int, n: int) -> None:
     x = torch.randn([m, k], device="cuda", dtype=torch.float16)
     y = torch.randn([k, n], device="cuda", dtype=torch.float16)
-    run_example(matmul, torch.matmul, (x, y))
+
+    # Test without bias using closure approach
+    def kernel_no_bias(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        return matmul_with_epilogue(x, y, lambda acc, tile: acc)
+
+    run_example(kernel_no_bias, torch.matmul, (x, y))
+
+    # Test with bias using closure approach
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+
+    def kernel_with_bias(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        def epilogue(acc: torch.Tensor, tile: list[torch.Tensor]) -> torch.Tensor:
+            return acc + bias[tile[1]]
+
+        return matmul_with_epilogue(x, y, epilogue)
+
+    expected_with_bias = lambda x, y: torch.matmul(x, y) + bias  # noqa: E731
+    run_example(kernel_with_bias, expected_with_bias, (x, y))
 
 
 def main() -> None:
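
The epilogue hook is not limited to bias addition: the closure is evaluated inside the kernel at the point of the store (the `out[tile_m, tile_n] = epilogue(acc, [tile_m, tile_n])` line above), so it receives the fp32 accumulator plus the output-tile indices. A hypothetical sketch of another fusion (the ReLU variant below is illustrative and not part of this commit; it assumes the elementwise op is traceable by Helion):

    def matmul_bias_relu(x: torch.Tensor, y: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
        def epilogue(acc: torch.Tensor, tile: list[torch.Tensor]) -> torch.Tensor:
            # tile == [tile_m, tile_n]; index the 1-D bias with the n-tile,
            # then apply an elementwise activation before the cast and store.
            return torch.relu(acc + bias[tile[1]])

        return matmul_with_epilogue(x, y, epilogue)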

examples/matmul_split_k.py

Lines changed: 46 additions & 2 deletions
@@ -10,7 +10,7 @@
 
 # static_shapes=True gives a performance boost for matmuls
 @helion.kernel(static_shapes=True)
-def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+def matmul_split_k_no_bias(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f"size mismatch {k} != {k2}"
@@ -27,10 +27,54 @@ def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     return out
 
 
+@helion.kernel(static_shapes=True)
+def matmul_split_k_with_bias(
+    x: torch.Tensor, y: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f"size mismatch {k} != {k2}"
+    bias_size = bias.size(0)
+    assert bias_size == n, f"bias size mismatch, expected {n}, got {bias_size}"
+
+    # Initialize output with zeros
+    out = torch.zeros(
+        [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
+    )
+
+    split_k = hl.register_tunable("split_k", PowerOfTwoFragment(1, 256))
+    k_block = helion.next_power_of_2(helion.cdiv(k, split_k))
+    for tile_m, tile_n, outer_k in hl.tile([m, n, k], block_size=[None, None, k_block]):
+        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+        for inner_k in hl.tile(outer_k.begin, outer_k.end):
+            acc = torch.addmm(acc, x[tile_m, inner_k], y[inner_k, tile_n])
+        # Add bias only on the first k-split iteration
+        if outer_k.begin == 0:
+            acc = acc + bias[tile_n]
+        hl.atomic_add(out, [tile_m, tile_n], acc)
+    return out
+
+
+def matmul_split_k(
+    x: torch.Tensor, y: torch.Tensor, bias: torch.Tensor | None = None
+) -> torch.Tensor:
+    """Wrapper function for tritonbench that dispatches based on bias presence."""
+    if bias is None:
+        return matmul_split_k_no_bias(x, y)
+    return matmul_split_k_with_bias(x, y, bias)
+
+
 def check(m: int, k: int, n: int) -> None:
     x = torch.randn([m, k], device="cuda", dtype=torch.float16)
     y = torch.randn([k, n], device="cuda", dtype=torch.float16)
-    run_example(matmul_split_k, torch.matmul, (x, y), atol=1)
+
+    # Test without bias
+    run_example(matmul_split_k_no_bias, torch.matmul, (x, y), atol=1)
+
+    # Test with bias
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+    expected_with_bias = lambda x, y, bias: torch.matmul(x, y) + bias  # noqa: E731
+    run_example(matmul_split_k_with_bias, expected_with_bias, (x, y, bias), atol=1)
 
 
 def main() -> None:
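
A note on the `outer_k.begin == 0` guard above: each split-k program accumulates its partial product into `out` via `hl.atomic_add`, so the bias must be folded into exactly one partial sum or it would be added `split_k` times. A plain-PyTorch sketch of the same accumulation scheme (illustrative shapes and split count, not part of the commit):

    import torch

    m, k, n, split_k = 64, 1024, 64, 8
    x = torch.randn(m, k)
    y = torch.randn(k, n)
    bias = torch.randn(n)

    out = torch.zeros(m, n)
    k_block = k // split_k
    for s in range(split_k):
        lo, hi = s * k_block, (s + 1) * k_block
        partial = x[:, lo:hi] @ y[lo:hi, :]
        if s == 0:  # mirror the outer_k.begin == 0 check: add bias exactly once
            partial = partial + bias
        out += partial  # stands in for hl.atomic_add across programs

    print(torch.allclose(out, x @ y + bias, rtol=1e-4, atol=1e-3))  # True up to fp32 accumulation order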

test/test_autotuner.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
 datadir = Path(__file__).parent / "data"
 basic_kernels = import_path(datadir / "basic_kernels.py")
 examples_dir = Path(__file__).parent.parent / "examples"
-examples_matmul = import_path(examples_dir / "matmul.py").matmul
+examples_matmul = import_path(examples_dir / "matmul.py").matmul_no_bias
 
 
 class TestAutotuner(TestCase):

test/test_examples.expected

Lines changed: 190 additions & 0 deletions
@@ -1103,6 +1103,45 @@ def matmul_layernorm(x: torch.Tensor, y: torch.Tensor, weight: torch.Tensor, bia
     _launcher(_matmul_layernorm_kernel, (triton.cdiv(128, _BLOCK_SIZE_1),), x, y, weight, bias, out, out.stride(0), _BLOCK_SIZE_1, _RDIM_SIZE_0, _BLOCK_SIZE_2, num_warps=4, num_stages=3)
     return out
 
+--- assertExpectedJournal(TestExamples.test_matmul_no_bias)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _matmul_no_bias_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+    num_blocks_0 = tl.cdiv(128, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+    for offset_2 in tl.range(0, 256, _BLOCK_SIZE_2):
+        indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_2).to(tl.int32)
+        acc_copy = acc
+        acc_copy_0 = acc_copy
+        load = tl.load(x + (indices_0[:, None] * 256 + indices_2[None, :] * 1), None)
+        load_1 = tl.load(y + (indices_2[:, None] * 512 + indices_1[None, :] * 1), None)
+        acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
+    v_0 = acc.to(tl.float16)
+    tl.store(out + (indices_0[:, None] * 512 + indices_1[None, :] * 1), v_0, None)
+
+def matmul_no_bias(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f'size mismatch {k} != {k2}'
+    out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _BLOCK_SIZE_2 = 32
+    _launcher(_matmul_no_bias_kernel, (triton.cdiv(128, _BLOCK_SIZE_0) * triton.cdiv(512, _BLOCK_SIZE_1),), x, y, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=3)
+    return out
+
 --- assertExpectedJournal(TestExamples.test_matmul_split_k)
 from __future__ import annotations
 
@@ -1152,6 +1191,157 @@ def matmul_split_k(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launc
     _launcher(_matmul_split_k_kernel, (triton.cdiv(64, _BLOCK_SIZE_0) * triton.cdiv(64, _BLOCK_SIZE_1) * triton.cdiv(1024, _BLOCK_SIZE_2),), x, y, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
     return out
 
+--- assertExpectedJournal(TestExamples.test_matmul_split_k_no_bias)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+import helion._testing.matmul_split_k as _source_module
+
+@triton.jit
+def _matmul_split_k_no_bias_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr, _BLOCK_SIZE_3: tl.constexpr):
+    num_blocks_0 = tl.cdiv(64, _BLOCK_SIZE_0)
+    num_blocks_1 = tl.cdiv(64, _BLOCK_SIZE_1)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0 % num_blocks_1
+    pid_2 = tl.program_id(0) // (num_blocks_0 * num_blocks_1)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    offset_2 = pid_2 * _BLOCK_SIZE_2
+    acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+    tile_end = tl.minimum(offset_2 + _BLOCK_SIZE_2, 1024)
+    for offset_3 in tl.range(offset_2.to(tl.int32), tile_end.to(tl.int32), _BLOCK_SIZE_3):
+        indices_3 = offset_3 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
+        mask_3 = indices_3 < tile_end
+        acc_copy = acc
+        acc_copy_0 = acc_copy
+        load = tl.load(x + (indices_0[:, None] * 1024 + indices_3[None, :] * 1), mask_3[None, :], other=0)
+        load_1 = tl.load(y + (indices_3[:, None] * 64 + indices_1[None, :] * 1), mask_3[:, None], other=0)
+        acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
+    tl.atomic_add(out + (indices_0[:, None] * 64 + indices_1[None, :] * 1), acc, mask=None, sem='relaxed')
+
+def matmul_split_k_no_bias(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f'size mismatch {k} != {k2}'
+    out = torch.zeros([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
+    split_k = 8
+    k_block = helion.next_power_of_2(helion.cdiv(k, split_k))
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_2 = k_block
+    _BLOCK_SIZE_3 = 32
+    _launcher(_matmul_split_k_no_bias_kernel, (triton.cdiv(64, _BLOCK_SIZE_0) * triton.cdiv(64, _BLOCK_SIZE_1) * triton.cdiv(1024, _BLOCK_SIZE_2),), x, y, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestExamples.test_matmul_split_k_with_bias)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+import helion._testing.matmul_split_k as _source_module
+
+@triton.jit
+def _matmul_split_k_with_bias_kernel(x, y, bias, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr, _BLOCK_SIZE_3: tl.constexpr):
+    num_blocks_0 = tl.cdiv(64, _BLOCK_SIZE_0)
+    num_blocks_1 = tl.cdiv(64, _BLOCK_SIZE_1)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0 % num_blocks_1
+    pid_2 = tl.program_id(0) // (num_blocks_0 * num_blocks_1)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    offset_2 = pid_2 * _BLOCK_SIZE_2
+    acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+    tile_end = tl.minimum(offset_2 + _BLOCK_SIZE_2, 1024)
+    for offset_3 in tl.range(offset_2.to(tl.int32), tile_end.to(tl.int32), _BLOCK_SIZE_3):
+        indices_3 = offset_3 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
+        mask_3 = indices_3 < tile_end
+        acc_copy = acc
+        acc_copy_0 = acc_copy
+        load = tl.load(x + (indices_0[:, None] * 1024 + indices_3[None, :] * 1), mask_3[None, :], other=0)
+        load_1 = tl.load(y + (indices_3[:, None] * 64 + indices_1[None, :] * 1), mask_3[:, None], other=0)
+        acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
+    eq = offset_2 == 0
+    if eq:
+        acc_copy_1 = acc
+        acc_copy_1_0 = acc_copy_1
+        load_2 = tl.load(tl.make_block_ptr(bias, [64], [1], [offset_1], [_BLOCK_SIZE_1], [0]), boundary_check=[0], padding_option='zero')
+        v_0 = load_2[None, :]
+        v_1 = v_0.to(tl.float32)
+        acc = acc_copy_1_0 + v_1
+    tl.atomic_add(out + (indices_0[:, None] * 64 + indices_1[None, :] * 1), acc, mask=None, sem='relaxed')
+
+def matmul_split_k_with_bias(x: torch.Tensor, y: torch.Tensor, bias: torch.Tensor, *, _launcher=_default_launcher):
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f'size mismatch {k} != {k2}'
+    bias_size = bias.size(0)
+    assert bias_size == n, f'bias size mismatch, expected {n}, got {bias_size}'
+    out = torch.zeros([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
+    split_k = 8
+    k_block = helion.next_power_of_2(helion.cdiv(k, split_k))
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_2 = k_block
+    _BLOCK_SIZE_3 = 32
+    _launcher(_matmul_split_k_with_bias_kernel, (triton.cdiv(64, _BLOCK_SIZE_0) * triton.cdiv(64, _BLOCK_SIZE_1) * triton.cdiv(1024, _BLOCK_SIZE_2),), x, y, bias, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestExamples.test_matmul_with_bias)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _matmul_with_bias_kernel(x, y, bias, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+    num_blocks_0 = tl.cdiv(128, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+    for offset_2 in tl.range(0, 256, _BLOCK_SIZE_2):
+        indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_2).to(tl.int32)
+        acc_copy = acc
+        acc_copy_0 = acc_copy
+        load = tl.load(x + (indices_0[:, None] * 256 + indices_2[None, :] * 1), None)
+        load_1 = tl.load(y + (indices_2[:, None] * 512 + indices_1[None, :] * 1), None)
+        acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
+    load_2 = tl.load(bias + indices_1 * 1, None)
+    v_0 = load_2[None, :]
+    v_1 = v_0.to(tl.float32)
+    v_2 = acc + v_1
+    v_3 = v_2.to(tl.float16)
+    tl.store(out + (indices_0[:, None] * 512 + indices_1[None, :] * 1), v_3, None)
+
+def matmul_with_bias(x: torch.Tensor, y: torch.Tensor, bias: torch.Tensor, *, _launcher=_default_launcher):
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f'size mismatch {k} != {k2}'
+    out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _BLOCK_SIZE_2 = 32
+    _launcher(_matmul_with_bias_kernel, (triton.cdiv(128, _BLOCK_SIZE_0) * triton.cdiv(512, _BLOCK_SIZE_1),), x, y, bias, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=3)
+    return out
+
 --- assertExpectedJournal(TestExamples.test_moe_matmul_ogs)
 from __future__ import annotations
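
As a sanity check on the split-k journals above (a hand computation against the 64x1024 @ 1024x64 test shapes they encode): with split_k = 8 the host function picks k_block = next_power_of_2(cdiv(1024, 8)) = 128, so the launch grid is cdiv(64, 16) * cdiv(64, 16) * cdiv(1024, 128) = 4 * 4 * 8 = 128 programs, one per (tile_m, tile_n, outer_k) slice. The same arithmetic in plain Python (the local helpers below stand in for helion.cdiv and helion.next_power_of_2):

    def cdiv(a: int, b: int) -> int:
        return -(-a // b)

    def next_power_of_2(v: int) -> int:
        return 1 << (v - 1).bit_length()

    split_k = 8
    k_block = next_power_of_2(cdiv(1024, split_k))            # 128
    grid = cdiv(64, 16) * cdiv(64, 16) * cdiv(1024, k_block)  # 4 * 4 * 8 = 128
    print(k_block, grid)  # matches the _launcher grid expression in the journal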
