Fix hl.rand to use tile-specific offsets instead of fixed offsets, ensuring unique random numbers per tile

karthickai · karthickai · commit 29ceafbdbf71 · 2025-10-01T01:13:00.000-07:00
stack-info: PR: #685, branch: karthickai/stack/3
diff --git a/helion/language/__init__.py b/helion/language/__init__.py
@@ -21,6 +21,7 @@
 from .matmul_ops import dot as dot
 from .memory_ops import load as load
 from .memory_ops import store as store
+from .random_ops import rand as rand
 from .reduce_ops import reduce as reduce
 from .scan_ops import associative_scan as associative_scan
 from .scan_ops import cumprod as cumprod
diff --git a/helion/language/random_ops.py b/helion/language/random_ops.py
@@ -0,0 +1,194 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from .._compiler.ast_extension import expr_from_string
+from .._compiler.compile_environment import CompileEnvironment
+from .._compiler.device_function import SymbolArgument
+from ..exc import NotInsideKernel
+from . import _decorators
+from .ref_tile import RefTile
+
+if TYPE_CHECKING:
+    import ast
+
+    from .._compiler.inductor_lowering import CodegenState
+
+__all__ = ["rand"]
+
+
+@_decorators.api(tiles_as_sizes=True)
+def rand(
+    shape: list[object],
+    seed: int,
+    dtype: torch.dtype = torch.float32,
+    device: torch.device | None = None,
+) -> torch.Tensor:
+    """
+    The main purpose of ``hl.rand`` is to explicitly pass a seed arg for deterministic
+    randomness in helion kernels, whereas ``torch.rand_like`` doesn't take a seed arg
+    (though it can be seeded globally). ``hl.rand`` lowers to ``tl.rand(seed, offset)``
+    with ``offset`` derived from each element's index variables, linearized row-major.
+
+    Note:
+        Only use within ``hl.tile()`` loops for creating local tensors.
+        For host allocations, use ``torch.rand()``.
+
+    Args:
+        shape: A list of sizes
+        seed: int seed for the random number generator
+        dtype: currently only float32 supported
+        device: device of the result; defaults to the compile environment's device
+
+    Returns:
+        torch.Tensor: A device tensor of the given shape and dtype filled with random values
+
+    Examples:
+        .. code-block:: python
+
+            @helion.kernel
+            def process_kernel(x: torch.Tensor, seed: int) -> torch.Tensor:
+                output = torch.zeros_like(x)
+                (m,) = x.shape
+                for (tile_m,) in hl.tile([m]):
+                    output[tile_m] = hl.rand([tile_m], seed=seed)
+                return output
+    """
+    raise NotInsideKernel
+
+
+@_decorators.register_fake(rand)
+def _rand_fake(
+    shape: list[int | torch.SymInt],
+    seed: int,
+    dtype: torch.dtype = torch.float32,
+    device: torch.device | None = None,
+) -> torch.Tensor:  # fake impl for rand: shape/dtype/device only; seed is unused
+    if not isinstance(shape, (list, tuple)):
+        raise TypeError(f"Expected list[SymInt], got {type(shape).__name__}")
+    env = CompileEnvironment.current()
+    env.add_kernel_tensor_size(shape)  # hand the requested sizes to the compile env
+    return torch.empty(  # uninitialized storage; no random values are drawn here
+        [*shape],
+        dtype=dtype,
+        device=env.device if device is None else device,  # explicit device wins
+    )
+
+
+@_decorators.codegen(rand)
+def _rand_codegen(state: CodegenState) -> ast.AST:
+    """Emit ``tl.rand(seed, offset)``; ``offset`` is a row-major linear index over
+    per-dimension index expressions. NOTE(review): ``dtype`` is not consulted here.
+    """
+    fake_value = state.fake_value
+    assert isinstance(fake_value, torch.Tensor)
+
+    tensor_shape = fake_value.size()
+    ndim = len(tensor_shape)
+    if ndim == 0:
+        raise ValueError("hl.rand() requires at least one dimension")
+
+    seed_ast = state.ast_arg(1)  # positional arg 1 of rand() is ``seed``
+    env = CompileEnvironment.current()
+
+    symbol_args = []  # SymbolArgument names; only the literal name "seed" is skipped
+    rdim_args = {}  # other kernel args whose name starts with _RDIM_SIZE_, by name
+    for arg in state.device_function.arguments:
+        if isinstance(arg, SymbolArgument) and arg.name != "seed":
+            symbol_args.append(arg.name)
+        elif arg.name.startswith("_RDIM_SIZE_"):
+            rdim_args[arg.name] = arg
+
+    index_vars = []  # per-dimension index expression, as source text
+    size_names = []  # per-dimension size expression, used to build strides
+    used_rdims = set()  # _RDIM_SIZE_ names already assigned to a dimension
+    symbol_idx = 0  # next unconsumed entry of symbol_args
+
+    for i in range(ndim):
+        block_id = env.get_block_id(tensor_shape[i])
+        if block_id is not None:
+            rdim_name = f"_RDIM_SIZE_{block_id}"
+            if rdim_name in rdim_args:  # this block id has its own _RDIM_SIZE_ arg
+                index_vars.append(f"tl.arange(0, {rdim_name})")
+                size_names.append(rdim_name)
+                used_rdims.add(rdim_name)
+                continue
+
+        if block_id is not None:  # block-backed dimension without an _RDIM_SIZE_ arg
+            index_vars.append(state.codegen.index_var(block_id))
+            if symbol_idx < len(symbol_args):  # NOTE(review): paired by order — confirm
+                size_names.append(symbol_args[symbol_idx])
+                symbol_idx += 1
+            else:
+                size_names.append(str(tensor_shape[i]))  # fall back to the size's text
+            continue
+
+        available_rdims = [name for name in rdim_args if name not in used_rdims]
+        if available_rdims:  # no block id: take the first unused _RDIM_SIZE_ arg
+            rdim_name = available_rdims[0]
+            index_vars.append(f"tl.arange(0, {rdim_name})")
+            size_names.append(rdim_name)
+            used_rdims.add(rdim_name)
+        else:
+            raise RuntimeError(
+                "hl.rand() requires tiled dimensions. "
+                "Use hl.rand() inside hl.tile() loops with tile variables."
+            )
+
+    if ndim == 1:  # 1-D: the single index expression already is the offset
+        offset_expr = expr_from_string(index_vars[0])
+    else:
+        broadcast_slices = []  # e.g. "[:, None]" and "[None, :]" when ndim == 2
+        for i in range(ndim):
+            slice_parts = ["None"] * ndim
+            slice_parts[i] = ":"
+            broadcast_slices.append(f"[{', '.join(slice_parts)}]")
+
+        offset_parts = []  # one additive term per dimension
+        for i in range(ndim):
+            broadcasted_index = f"{index_vars[i]}{broadcast_slices[i]}"
+
+            if i < ndim - 1:  # stride = product of all later dimension sizes
+                stride_expr = " * ".join(size_names[i + 1 :])
+                offset_parts.append(f"{broadcasted_index} * {stride_expr}")
+            else:
+                offset_parts.append(broadcasted_index)  # innermost dim has stride 1
+
+        offset_expr = expr_from_string(" + ".join(offset_parts))
+
+    return expr_from_string(
+        "tl.rand({seed}, {offset})", seed=seed_ast, offset=offset_expr
+    )
+
+
+@_decorators.get_masked_value(rand)
+def _(
+    node: torch.fx.Node,
+) -> float:
+    return 0  # masked value registered for rand is the constant 0; node is unused
+
+
+@_decorators.ref(rand)
+def _(
+    shape: list[int | RefTile],
+    seed: int,
+    dtype: torch.dtype = torch.float32,
+    device: torch.device | None = None,
+) -> torch.Tensor:  # ref impl: seeded torch.rand over the concrete shape
+    processed_shape: list[int] = []
+    for s in shape:
+        if isinstance(s, RefTile):
+            processed_shape.append(s.end - s.begin)  # a tile contributes its extent
+        else:
+            processed_shape.append(int(s))
+    env = CompileEnvironment.current()
+    gen = torch.Generator(device=env.device if device is None else device)
+    gen.manual_seed(seed)  # NOTE(review): reseeded per call; same-shape tiles match
+    return torch.rand(
+        processed_shape,
+        dtype=dtype,
+        generator=gen,
+        device=env.device if device is None else device,  # same device as ``gen``
+    )
diff --git a/test/test_random.expected b/test/test_random.expected
@@ -0,0 +1,124 @@
+This file is automatically generated by assertExpectedJournal calls in test_random.py.
+Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.
+
+--- assertExpectedJournal(TestRandom.test_hl_rand_1d)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_rand_kernel_tiled_1d(output, output_stride_0, m, seed, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    rand = tl.rand(seed, indices_0)
+    tl.store(output + indices_0 * output_stride_0, rand, mask_0)
+
+def rand_kernel_tiled_1d(x: torch.Tensor, seed: int, *, _launcher=_default_launcher):
+    output = torch.zeros_like(x)
+    m, = x.shape
+    _BLOCK_SIZE_0 = 128
+    _launcher(_helion_rand_kernel_tiled_1d, (triton.cdiv(m, _BLOCK_SIZE_0),), output, output.stride(0), m, seed, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return output
+
+--- assertExpectedJournal(TestRandom.test_hl_rand_2d)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_rand_kernel_tiled_2d(output, output_stride_0, output_stride_1, m, n, seed, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    num_blocks_0 = tl.cdiv(m, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < n
+    rand = tl.rand(seed, indices_0[:, None] * n + indices_1[None, :])
+    tl.store(output + (indices_0[:, None] * output_stride_0 + indices_1[None, :] * output_stride_1), rand, mask_0[:, None] & mask_1[None, :])
+
+def rand_kernel_tiled_2d(x: torch.Tensor, seed: int, *, _launcher=_default_launcher):
+    output = torch.zeros_like(x)
+    m, n = x.shape
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _launcher(_helion_rand_kernel_tiled_2d, (triton.cdiv(m, _BLOCK_SIZE_0) * triton.cdiv(n, _BLOCK_SIZE_1),), output, output.stride(0), output.stride(1), m, n, seed, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    return output
+
+--- assertExpectedJournal(TestRandom.test_hl_rand_3d)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_rand_kernel_tiled_3d(output, output_stride_0, output_stride_1, output_stride_2, b, m, n, seed, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+    num_blocks_0 = tl.cdiv(b, _BLOCK_SIZE_0)
+    num_blocks_1 = tl.cdiv(m, _BLOCK_SIZE_1)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0 % num_blocks_1
+    pid_2 = tl.program_id(0) // (num_blocks_0 * num_blocks_1)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < b
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < m
+    offset_2 = pid_2 * _BLOCK_SIZE_2
+    indices_2 = (offset_2 + tl.arange(0, _BLOCK_SIZE_2)).to(tl.int32)
+    mask_2 = indices_2 < n
+    rand = tl.rand(seed, indices_0[:, None, None] * m * n + indices_1[None, :, None] * n + indices_2[None, None, :])
+    tl.store(output + (indices_0[:, None, None] * output_stride_0 + indices_1[None, :, None] * output_stride_1 + indices_2[None, None, :] * output_stride_2), rand, mask_0[:, None, None] & mask_1[None, :, None] & mask_2[None, None, :])
+
+def rand_kernel_tiled_3d(x: torch.Tensor, seed: int, *, _launcher=_default_launcher):
+    output = torch.zeros_like(x)
+    b, m, n = x.shape
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_2 = 16
+    _launcher(_helion_rand_kernel_tiled_3d, (triton.cdiv(b, _BLOCK_SIZE_0) * triton.cdiv(m, _BLOCK_SIZE_1) * triton.cdiv(n, _BLOCK_SIZE_2),), output, output.stride(0), output.stride(1), output.stride(2), b, m, n, seed, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=3)
+    return output
+
+--- assertExpectedJournal(TestRandom.test_hl_rand_non_tiled_dimensions)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_rand_kernel_partial_tile(output, output_stride_0, output_stride_1, output_stride_2, m, n, seed, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _RDIM_SIZE_2: tl.constexpr):
+    num_blocks_0 = tl.cdiv(m, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < n
+    indices_2 = tl.arange(0, _RDIM_SIZE_2).to(tl.int32)
+    rand = tl.rand(seed, indices_0[:, None, None] * n * _RDIM_SIZE_2 + indices_1[None, :, None] * _RDIM_SIZE_2 + tl.arange(0, _RDIM_SIZE_2)[None, None, :])
+    tl.store(output + (indices_0[:, None, None] * output_stride_0 + indices_1[None, :, None] * output_stride_1 + indices_2[None, None, :] * output_stride_2), rand, mask_0[:, None, None] & mask_1[None, :, None])
+
+def rand_kernel_partial_tile(x: torch.Tensor, seed: int, *, _launcher=_default_launcher):
+    output = torch.zeros_like(x)
+    m, n, k = x.shape
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _RDIM_SIZE_2 = 8
+    _launcher(_helion_rand_kernel_partial_tile, (triton.cdiv(m, _BLOCK_SIZE_0) * triton.cdiv(n, _BLOCK_SIZE_1),), output, output.stride(0), output.stride(1), output.stride(2), m, n, seed, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _RDIM_SIZE_2, num_warps=4, num_stages=3)
+    return output
diff --git a/test/test_random.py b/test/test_random.py