 # %%
 from __future__ import annotations
 
+from typing import Optional
+
 import torch
 
 import helion
@@ -21,7 +23,7 @@ def layer_norm_fwd(
     x: torch.Tensor,
     normalized_shape: list[int],
     weight: torch.Tensor,
-    bias: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
     eps: float = 1e-5,
 ) -> torch.Tensor:
     """
@@ -30,14 +32,15 @@ def layer_norm_fwd(
         x (torch.Tensor): Input tensor of shape [batch_size, dim], expected to be FP16.
         normalized_shape (list[int]): List containing the dimension to normalize over (should be length 1).
         weight (torch.Tensor): Learnable scale parameter of shape [dim].
-        bias (torch.Tensor): Learnable bias parameter of shape [dim].
+        bias (Optional[torch.Tensor]): Learnable bias parameter of shape [dim].
         eps (float, optional): Small value added to variance for numerical stability. Default is 1e-5.
     Returns:
         torch.Tensor: The layer-normalized output tensor of shape [batch_size, dim], in FP16.
     """
     m, n = x.size()
     assert weight.size(0) == n, f"weight size mismatch {weight.size(0)} != {n}"
-    assert bias.size(0) == n, f"bias size mismatch {bias.size(0)} != {n}"
+    if bias is not None:
+        assert bias.size(0) == n, f"bias size mismatch {bias.size(0)} != {n}"
     assert len(normalized_shape) == 1, (
         "Helion layer norm only supports 1D layer norm currently"
     )
@@ -49,7 +52,10 @@ def layer_norm_fwd(
         acc = x[tile_m, :].to(torch.float32)
         var, mean = torch.var_mean(acc, dim=-1, keepdim=True, correction=0)
         normalized = (acc - mean) * torch.rsqrt(var + eps)
-        acc = normalized * (weight[:].to(torch.float32)) + (bias[:].to(torch.float32))
+        if bias is not None:
+            acc = normalized * (weight[:].to(torch.float32)) + (bias[:].to(torch.float32))
+        else:
+            acc = normalized * (weight[:].to(torch.float32))
         out[tile_m, :] = acc.to(x.dtype)
     return out
 
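
For readers who want the math outside the tile loop: below is a minimal eager-PyTorch sketch of the per-row computation the kernel performs, including the new optional-bias branch. The standalone function and its name `layer_norm_reference` are illustrative assumptions, not part of this commit.

    # Hypothetical eager-mode reference (not part of this commit): compute
    # mean/variance over the last dim with correction=0, normalize with rsqrt,
    # scale by weight, and add bias only when it is provided.
    def layer_norm_reference(
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
        eps: float = 1e-5,
    ) -> torch.Tensor:
        acc = x.to(torch.float32)
        var, mean = torch.var_mean(acc, dim=-1, keepdim=True, correction=0)
        normalized = (acc - mean) * torch.rsqrt(var + eps)
        acc = normalized * weight.to(torch.float32)
        if bias is not None:
            acc = acc + bias.to(torch.float32)
        return acc.to(x.dtype)
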
@@ -70,15 +76,16 @@ def main() -> None:
     weight = torch.randn([dim], device=device, dtype=torch.float16)
     bias = torch.randn([dim], device=device, dtype=torch.float16)
     eps = 1e-4
-    run_example(
-        layer_norm_fwd,
-        torch.nn.functional.layer_norm,
-        (x, [dim], weight, bias, eps),
-        kernel_name="helion",
-        baseline_name="torch",
-        rtol=1e-3,
-        atol=1e-3,
-    )
+    for b in [bias, None]:
+        run_example(
+            layer_norm_fwd,
+            torch.nn.functional.layer_norm,
+            (x, [dim], weight, b, eps),
+            kernel_name="helion",
+            baseline_name="torch",
+            rtol=1e-3,
+            atol=1e-3,
+        )
 
 
 # %%
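
As a usage note, here is a hedged sketch of calling the updated kernel directly, with and without a bias. The shapes, the "cuda" device, and the tolerance check mirror main() above but are assumptions, not output of this change; running it requires a GPU that Helion supports.

    # Illustrative call sites (assumed shapes and device).
    x = torch.randn([2048, 1024], device="cuda", dtype=torch.float16)
    weight = torch.randn([1024], device="cuda", dtype=torch.float16)
    bias = torch.randn([1024], device="cuda", dtype=torch.float16)

    y_with_bias = layer_norm_fwd(x, [1024], weight, bias)
    y_without_bias = layer_norm_fwd(x, [1024], weight)  # bias now defaults to None

    ref = torch.nn.functional.layer_norm(x, [1024], weight, bias)
    torch.testing.assert_close(y_with_bias, ref, rtol=1e-3, atol=1e-3)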