Add hl.split and hl.join (#791)

jansel · web-flow · commit ca832fa4c9f9 · 2025-10-02T21:47:03.000-07:00
diff --git a/docs/api/index.md b/docs/api/index.md
@@ -88,6 +88,8 @@ runtime
    full
    arange
    subscript
+   split
+   join
    reduce
    associative_scan
    cumsum
diff --git a/docs/api/language.md b/docs/api/language.md
@@ -194,6 +194,18 @@ The `Tile` class represents a portion of an iteration space with the following k
 .. autofunction:: subscript
 ```
 
+### split()
+
+```{eval-rst}
+.. autofunction:: split
+```
+
+### join()
+
+```{eval-rst}
+.. autofunction:: join
+```
+
 ## StackTensor
 ### StackTensor class
 ```{eval-rst}
diff --git a/helion/_compiler/roll_reduction.py b/helion/_compiler/roll_reduction.py
@@ -22,6 +22,8 @@
 from ..language.matmul_ops import dot as hl_dot
 from ..language.memory_ops import store
 from ..language.reduce_ops import _reduce
+from ..language.view_ops import join as hl_join
+from ..language.view_ops import split as hl_split
 from .compile_environment import CompileEnvironment
 from .inductor_lowering import APIFuncLowering
 from .inductor_lowering import ReductionLowering
@@ -119,6 +121,28 @@ def should_go_in_inner_graph(self, node: torch.fx.Node) -> bool:
                 return self.should_go_in_inner_graph(arg)
             return False
 
+        if node.target is hl_split:
+            base = node.args[0]
+            if isinstance(base, torch.fx.Node):
+                return self.should_go_in_inner_graph(base)
+            return False
+
+        if node.target is operator.getitem:
+            base = node.args[0]
+            if isinstance(base, torch.fx.Node) and base.target is hl_split:
+                return self.should_go_in_inner_graph(base)
+
+        if node.target is hl_join:
+            left = node.args[0]
+            right = node.args[1]
+            left_inner = isinstance(
+                left, torch.fx.Node
+            ) and self.should_go_in_inner_graph(left)
+            right_inner = isinstance(
+                right, torch.fx.Node
+            ) and self.should_go_in_inner_graph(right)
+            return left_inner or right_inner
+
         if self.is_reduction(node):
             return True
 
@@ -178,8 +202,13 @@ def start_new_graph(self) -> None:
 
         inner_nodes: dict[torch.fx.Node, torch.fx.Node] = self.inner_nodes
         outputs = {}
+        inner_node_set = set(inner_nodes)
         for orig_node, inner_node in inner_nodes.items():
-            if self.is_reduction(orig_node) and orig_node not in self.outer_nodes:
+            needs_output = orig_node not in self.outer_nodes and (
+                self.is_reduction(orig_node)
+                or any(user not in inner_node_set for user in orig_node.users)
+            )
+            if needs_output:
                 outputs[orig_node] = inner_node
             self.available.add(orig_node)
         graph = self.inner_graph
diff --git a/helion/language/__init__.py b/helion/language/__init__.py
@@ -38,6 +38,8 @@
 from .tunable_ops import register_block_size as register_block_size
 from .tunable_ops import register_reduction_dim as register_reduction_dim
 from .tunable_ops import register_tunable as register_tunable
+from .view_ops import join as join
+from .view_ops import split as split
 from .view_ops import subscript as subscript
 
 _MEMORY_OPS = (
diff --git a/helion/language/view_ops.py b/helion/language/view_ops.py
@@ -2,6 +2,7 @@
 
 import collections
 from typing import TYPE_CHECKING
+from typing import cast
 
 import torch
 
@@ -15,7 +16,7 @@
 
     from .._compiler.inductor_lowering import CodegenState
 
-__all__ = ["subscript"]
+__all__ = ["join", "split", "subscript"]
 
 
 @_decorators.api(tiles_as_sizes=True)
@@ -114,3 +115,93 @@ def _(node: torch.fx.Node) -> float | bool | None:
     other = node.args[0]
     assert isinstance(other, torch.fx.Node)
     return cached_masked_value(other)
+
+
+@_decorators.api(is_device_only=True)
+def split(tensor: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Split a tensor whose last dimension has size two into two separate tensors.
+
+    Args:
+        tensor: The input tensor whose last dimension has length two.
+
+    Returns:
+        A tuple ``(lo, hi)`` holding ``tensor[..., 0]`` and ``tensor[..., 1]``,
+        each with the shape of ``tensor`` minus its last dimension.
+
+    See Also:
+        - :func:`~helion.language.join`
+    """
+    raise NotInsideKernel
+
+
+@_decorators.register_fake(split)
+def _(tensor: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    out_shape = tensor.shape[:-1]
+    return (
+        tensor.new_empty(out_shape),
+        tensor.new_empty(out_shape),
+    )
+
+
+@_decorators.codegen(split)
+def _(state: CodegenState) -> list[ast.AST]:
+    split_call = expr_from_string("tl.split({tensor})", tensor=state.ast_arg(0))
+    return [
+        expr_from_string("{value}[0]", value=split_call),
+        expr_from_string("{value}[1]", value=split_call),
+    ]
+
+
+@_decorators.ref(split)
+def _(tensor: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    return cast("tuple[torch.Tensor, torch.Tensor]", torch.unbind(tensor, dim=-1))
+
+
+@_decorators.api(is_device_only=True)
+def join(
+    tensor0: torch.Tensor,
+    tensor1: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Join two tensors along a new minor (last) dimension of size two.
+
+    Args:
+        tensor0: First tensor to join.
+        tensor1: Second tensor to join. Must have the same dtype and device
+            as ``tensor0`` and be broadcast-compatible with it.
+
+    Returns:
+        torch.Tensor: A tensor with shape ``broadcast_shape + (2,)`` where
+        ``broadcast_shape`` is the broadcast of the input shapes.
+
+    See Also:
+        - :func:`~helion.language.split`
+    """
+    raise NotInsideKernel
+
+
+@_decorators.register_fake(join)
+def _(tensor0: torch.Tensor, tensor1: torch.Tensor) -> torch.Tensor:
+    if tensor0.dtype != tensor1.dtype:
+        raise TypeError("join() requires both tensors to have the same dtype")
+    if tensor0.device != tensor1.device:
+        raise ValueError("join() requires both tensors to be on the same device")
+
+    broadcast_shape = torch.broadcast_shapes(tensor0.shape, tensor1.shape)
+    return tensor0.new_empty([*broadcast_shape, 2])
+
+
+@_decorators.codegen(join)
+def _(state: CodegenState) -> ast.AST:
+    return expr_from_string(
+        "tl.join({tensor0}, {tensor1})",
+        tensor0=state.ast_arg(0),
+        tensor1=state.ast_arg(1),
+    )
+
+
+@_decorators.ref(join)
+def _(tensor0: torch.Tensor, tensor1: torch.Tensor) -> torch.Tensor:
+    left, right = torch.broadcast_tensors(tensor0, tensor1)
+    return torch.stack((left, right), dim=-1)
diff --git a/test/test_views.py b/test/test_views.py
@@ -156,6 +156,41 @@ def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         _code, result = code_and_output(fn, args)
         torch.testing.assert_close(result, args[0] + args[1])
 
+    def test_split_join_roundtrip(self):
+        @helion.kernel(config={"block_size": 64})
+        def fn(x: torch.Tensor) -> torch.Tensor:
+            n = x.size(0)
+            out = torch.empty_like(x)
+            for tile in hl.tile(n):
+                lo, hi = hl.split(x[tile, :])
+                out[tile, :] = hl.join(hi, lo)
+            return out
+
+        x = torch.randn([256, 2], device=DEVICE)
+        code, result = code_and_output(fn, (x,))
+        expected = torch.stack((x[:, 1], x[:, 0]), dim=-1)
+        torch.testing.assert_close(result, expected)
+        self.assertIn("tl.split", code)
+        self.assertIn("tl.join", code)
+
+    def test_join_broadcast_scalar(self):
+        @helion.kernel(config={"block_size": 64})
+        def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            n = x.size(0)
+            out = torch.empty([n, 2], dtype=x.dtype, device=x.device)
+            for tile in hl.tile(n):
+                scalar = hl.load(y, [0])
+                out[tile, :] = hl.join(x[tile], scalar)
+            return out
+
+        x = torch.randn([128], device=DEVICE)
+        y = torch.randn([1], device=DEVICE)
+        code, result = code_and_output(fn, (x, y))
+        broadcast_y = torch.broadcast_to(y, x.shape)
+        expected = torch.stack((x, broadcast_y), dim=-1)
+        torch.testing.assert_close(result, expected)
+        self.assertIn("tl.join", code)
+
     def test_reshape_input_types(self):
         @helion.kernel(static_shapes=True)
         def reshape_reduction_dim(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: