Commit 29bd548

[Benchmark] Add low mem dropout example
stack-info: PR: #641, branch: karthickai/stack/1
1 parent 8cf9e61 commit 29bd548

4 files changed: +218, -0 lines changed

benchmarks/run.py

Lines changed: 13 additions & 0 deletions
@@ -205,6 +205,11 @@ class RunResult:
         "examples.int4_gemm",
         "int4_gemm_tritonbench",
     ),
+    "low_mem_dropout": (
+        "tritonbench.operators.low_mem_dropout.operator",
+        "examples.low_mem_dropout",
+        "low_mem_dropout_tritonbench",
+    ),
 }

@@ -321,6 +326,14 @@ class RunResult:
         "helion_grouped_gemm_jagged_persistent_tritonbench-speedup": "helion_speedup",
         "helion_grouped_gemm_jagged_persistent_tritonbench-accuracy": "helion_accuracy",
     },
+    "low_mem_dropout": {
+        "seeded_dropout-accuracy": "triton_accuracy",
+        "seeded_dropout-speedup": "triton_speedup",
+        "torch_compile_dropout-accuracy": "torch_compile_accuracy",
+        "torch_compile_dropout-speedup": "torch_compile_speedup",
+        "helion_low_mem_dropout_tritonbench-accuracy": "helion_accuracy",
+        "helion_low_mem_dropout_tritonbench-speedup": "helion_speedup",
+    },
 }

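The first hunk registers the new operator as a (TritonBench operator module, Helion example module, wrapper function) triple; the second maps the per-backend metric names reported by TritonBench onto normalized speedup/accuracy columns. A rough sketch of how a harness could resolve such an entry (illustrative only; the resolve() helper is hypothetical, not the actual benchmarks/run.py logic):

# Hypothetical helper, for illustration only; benchmarks/run.py may do this differently.
import importlib
from typing import Callable

OPERATOR_MAPPING: dict[str, tuple[str, str, str]] = {
    "low_mem_dropout": (
        "tritonbench.operators.low_mem_dropout.operator",  # TritonBench operator module
        "examples.low_mem_dropout",  # Helion example module
        "low_mem_dropout_tritonbench",  # wrapper exported by the example
    ),
}


def resolve(name: str) -> Callable:
    # Import the Helion example module and return its TritonBench wrapper.
    _op_module_path, example_module, wrapper_name = OPERATOR_MAPPING[name]
    example = importlib.import_module(example_module)
    return getattr(example, wrapper_name)
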
examples/low_mem_dropout.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
+"""
+Low-Memory Dropout Example
+==========================
+
+This example demonstrates how to implement a low-memory dropout using Helion.
+"""
+
+# %%
+# Imports
+# -------
+from __future__ import annotations
+
+from typing import Callable
+
+import torch
+
+import helion
+import helion.language as hl
+
+
+# %%
+# Low mem dropout forward implementation
+# -------------------
+@helion.kernel()
+def low_mem_dropout(p: float, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Applies dropout on x using p
+    Args:
+        p (float): dropout probability
+        x (torch.Tensor): input tensor
+    Returns:
+        Output tensor, mask tensor
+    """
+    scale = 1.0 / (1.0 - p)
+    # flatten to 1D so we can use tile
+    n = x.numel()
+    x_flat = x.view(-1)
+    out_flat = torch.empty_like(x_flat)
+    mask_flat = torch.empty_like(x_flat, dtype=torch.bool)
+
+    for tidx in hl.tile(n):
+        xi = x_flat[tidx].to(torch.float32)
+        r = torch.rand_like(xi, dtype=torch.float32)
+        keep = r > p
+        yscaled = xi * scale
+        yi = torch.where(keep, yscaled, 0.0)
+        out_flat[tidx] = yi.to(x.dtype)
+        mask_flat[tidx] = keep
+    return out_flat.view_as(x), mask_flat.view_as(x)
+
+
+# %%
+# Low mem dropout backward implementation
+# -------------------
+@helion.kernel()
+def low_mem_dropout_bwd(p: float, grad_y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Low-memory dropout applies the same seeded randomness in the backward pass
+    as in the forward pass, so the backward computation mirrors the forward.
+    Args:
+        p (float): Dropout probability
+        grad_y (torch.Tensor): Gradient tensor
+    Returns:
+        Output tensor, mask tensor
+    """
+    scale = 1.0 / (1.0 - p)
+    n = grad_y.numel()
+    grad_y_flat = grad_y.view(-1)
+    out_flat = torch.empty_like(grad_y_flat)
+    mask_flat = torch.empty_like(grad_y_flat, dtype=torch.bool)
+    for tidx in hl.tile(n):
+        gi = grad_y_flat[tidx].to(torch.float32)
+        r = torch.rand_like(gi, dtype=torch.float32)
+        keep = r > p
+        g_scaled = gi * scale
+        gxi = torch.where(keep, g_scaled, 0.0)
+        out_flat[tidx] = gxi.to(grad_y.dtype)
+        mask_flat[tidx] = keep
+    return out_flat.view_as(grad_y), mask_flat.view_as(grad_y)
+
+
+# %%
+# TritonBench Wrapper
+# -------------------
+def low_mem_dropout_tritonbench(tb_op: object, p: float, x: torch.Tensor) -> Callable:
+    """
+    Wrapper for TritonBench compatibility.
+
+    Args:
+        tb_op: TritonBench operator instance
+        p (float): dropout probability
+        x (torch.Tensor): Input tensor
+
+    Returns:
+        Callable: A function that runs low_mem_dropout and returns the output tensor.
+    """
+
+    def _inner() -> torch.Tensor:
+        out, _ = low_mem_dropout(p, x)
+        return out
+
+    return _inner
+
+
+# %%
+# Verification Function
+# -------------------
+def check(p: float, size: int) -> None:
+    """
+    Verify that the forward and backward kernels regenerate the same dropout mask when seeded identically.
+
+    Args:
+        p (float): dropout probability
+        size (int): input tensor size
+    """
+
+    x = torch.randn(size=(size,)).cuda()
+
+    torch.manual_seed(123)
+    y, fwd_mask = low_mem_dropout(p, x)
+
+    # reseed so the backward pass regenerates the same random mask
+    torch.manual_seed(123)
+    grad_y = torch.ones_like(x)
+    grad_x, bwd_mask = low_mem_dropout_bwd(p, grad_y)
+    assert torch.equal(fwd_mask, bwd_mask)
+
+
+# %%
+# Main Function
+# -----------
+def main() -> None:
+    """
+    Main entry point that runs the low-memory dropout kernel verification with different tensor sizes.
+    Tests two configurations:
+    - p=0.25, size=8192
+    - p=0.25, size=32768
+    """
+    check(0.25, 8192)
+    check(0.25, 32768)
+
+
+if __name__ == "__main__":
+    main()
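
The key property of the "low memory" variant is that the forward pass does not need to save the dropout mask for backward: because the randomness is seeded, the backward kernel can regenerate an identical mask from the same seed, trading a small amount of recomputation for memory. A minimal eager-PyTorch sketch of the same idea, independent of Helion (the explicit seed argument and per-call torch.Generator are assumptions made for illustration):

# Hypothetical sketch: store only an integer seed and regenerate the keep-mask
# in backward instead of saving a mask tensor.
import torch


class ToyLowMemDropout(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: torch.Tensor, p: float, seed: int) -> torch.Tensor:
        ctx.p, ctx.seed = p, seed
        g = torch.Generator(device=x.device).manual_seed(seed)
        keep = torch.rand(x.shape, generator=g, device=x.device) > p
        return torch.where(keep, x / (1.0 - p), torch.zeros_like(x))

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor):
        # Same seed, same stream: the mask is recomputed, never stored.
        g = torch.Generator(device=grad_out.device).manual_seed(ctx.seed)
        keep = torch.rand(grad_out.shape, generator=g, device=grad_out.device) > ctx.p
        grad_x = torch.where(keep, grad_out / (1.0 - ctx.p), torch.zeros_like(grad_out))
        return grad_x, None, None


x = torch.randn(8192, requires_grad=True)
y = ToyLowMemDropout.apply(x, 0.25, 123)
y.sum().backward()  # the mask is rebuilt here from seed 123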

test/test_examples.expected

Lines changed: 37 additions & 0 deletions
@@ -2276,6 +2276,43 @@ def layer_norm_fwd(x: torch.Tensor, normalized_shape: list[int], weight: torch.T
     _launcher(_helion_layer_norm_fwd, (triton.cdiv(m, _BLOCK_SIZE_0),), x, weight, out, mean, rstd, mean.stride(0), out.stride(0), out.stride(1), rstd.stride(0), weight.stride(0), x.stride(0), x.stride(1), m, n, eps, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=3)
     return (out, mean, rstd)
 
+--- assertExpectedJournal(TestExamples.test_low_mem_dropout)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_low_mem_dropout(x_flat, out_flat, mask_flat, mask_flat_stride_0, out_flat_stride_0, x_flat_stride_0, n, p, scale, _BLOCK_SIZE_0: tl.constexpr, rng_seed_buffer):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < n
+    xi = tl.load(x_flat + indices_0 * x_flat_stride_0, mask_0, other=0)
+    rand = tl.rand(tl.load(rng_seed_buffer + 0), indices_0).to(tl.float32)
+    v_0 = rand > p
+    v_1 = xi * scale
+    v_2 = 0.0
+    v_3 = v_2[None]
+    v_4 = tl.where(v_0, v_1, v_3)
+    tl.store(out_flat + indices_0 * out_flat_stride_0, v_4, mask_0)
+    tl.store(mask_flat + indices_0 * mask_flat_stride_0, v_0, mask_0)
+
+def low_mem_dropout(p: float, x: torch.Tensor, *, _launcher=_default_launcher):
+    from torch._inductor import inductor_prims
+    _rng_seed_buffer = inductor_prims.seeds(1, torch.device('cuda'))
+    '\n    Applies dropout on x using p\n    Args:\n        p (float): dropout probability\n        x (torch.Tensor): input tensor\n    Returns:\n        Output tensor, mask tensor\n    '
+    scale = 1.0 / (1.0 - p)
+    n = x.numel()
+    x_flat = x.view(-1)
+    out_flat = torch.empty_like(x_flat)
+    mask_flat = torch.empty_like(x_flat, dtype=torch.bool)
+    _BLOCK_SIZE_0 = 8
+    _launcher(_helion_low_mem_dropout, (triton.cdiv(n, _BLOCK_SIZE_0),), x_flat, out_flat, mask_flat, mask_flat.stride(0), out_flat.stride(0), x_flat.stride(0), n, p, scale, _BLOCK_SIZE_0, _rng_seed_buffer, num_warps=4, num_stages=3)
+    return (out_flat.view_as(x), mask_flat.view_as(x))
+
 --- assertExpectedJournal(TestExamples.test_matmul)
 from __future__ import annotations

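In the generated kernel above, the per-element randomness comes from tl.rand(seed, offsets), with the seed read from a buffer produced by inductor_prims.seeds(1, ...). Each element's random value is therefore a pure function of (seed, element index), which is what lets a backward kernel running under the same seed recompute exactly the keep-mask the forward produced, without materializing it. A toy counter-based illustration of that property (a cheap integer hash; not Philox, and not what Triton actually uses):

# Toy stand-in for tl.rand(seed, offsets): a deterministic map (seed, index) -> [0, 1).
import torch


def counter_rand(seed: int, indices: torch.Tensor) -> torch.Tensor:
    x = (indices.to(torch.int64) * 747796405 + seed) & 0xFFFFFFFF
    x = ((x ^ (x >> 16)) * 65521) & 0xFFFFFFFF
    x = x ^ (x >> 13)
    return x.to(torch.float64) / 2**32


idx = torch.arange(16)
assert torch.equal(counter_rand(123, idx), counter_rand(123, idx))  # reproducible per (seed, index)
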
test/test_examples.py

Lines changed: 24 additions & 0 deletions
@@ -308,6 +308,30 @@ def test_welford(self):
             )
         )
 
+    def test_low_mem_dropout(self):
+        from examples.low_mem_dropout import low_mem_dropout
+        from examples.low_mem_dropout import low_mem_dropout_bwd
+
+        from helion._testing import code_and_output
+
+        p, size = 0.25, 8
+
+        x = torch.randn(size=(size,)).cuda()
+
+        torch.manual_seed(123)
+        code, (_, fwd_mask) = code_and_output(
+            low_mem_dropout,
+            (p, x),
+        )
+
+        # reseed so the backward kernel regenerates the same random mask
+        torch.manual_seed(123)
+        grad_y = torch.ones_like(x)
+        _, bwd_mask = low_mem_dropout_bwd(p, grad_y)
+        assert torch.equal(fwd_mask, bwd_mask)
+
+        self.assertExpectedJournal(code)
+
     def test_rms_norm_fwd(self):
         args = (
             torch.randn([128, 256], device=DEVICE, dtype=torch.float16),
