178 changes: 178 additions & 0 deletions benchmarks/modules/benchmark_fused_conv_l2.py
@@ -0,0 +1,178 @@
import torch
from einops import rearrange

from fla.modules.convolution import ShortConvolution
from fla.modules.l2norm import l2norm
from fla.utils import device

def separate_conv_l2(x, conv, head_dim):
    """Separate Conv + L2 Norm"""
    y, _ = conv(x)
    y = rearrange(y, 'b t (h d) -> b t h d', d=head_dim)
    y = l2norm(y, eps=1e-5)
    y = rearrange(y, 'b t h d -> b t (h d)')
    return y

def fused_conv_l2(x, conv_fused, head_dim):
    """Fused Conv + L2 Norm"""
    y, _ = conv_fused(x, head_dim=head_dim)
    return y

if __name__ == "__main__":
    import torch.utils.benchmark as benchmark

    # Test configurations
    B, T, D, W = 4, 2048, 2048, 4
    H = 16
    head_dim = D // H

    print("="*80)
    print(f"Benchmarking Conv + L2 Norm: B={B}, T={T}, D={D}, W={W}, H={H}, head_dim={head_dim}")
    print("="*80)

    dtype = torch.bfloat16

    # Create input
    x = torch.randn(B, T, D, device=device, dtype=dtype, requires_grad=True)

    # Separate Conv (no norm)
    conv_separate = ShortConvolution(
        hidden_size=D,
        kernel_size=W,
        bias=False,
        activation='silu',
        norm=None,
        device=device,
        dtype=dtype,
    )

    # Fused Conv + L2 Norm
    conv_fused = ShortConvolution(
        hidden_size=D,
        kernel_size=W,
        bias=False,
        activation='silu',
        norm='l2',
        norm_eps=1e-5,
        device=device,
        dtype=dtype,
    )

    # Copy weights
    conv_fused.weight.data.copy_(conv_separate.weight.data)

    # Benchmark Forward
    print("\n" + "="*80)
    print("Forward Pass")
    print("="*80)

    t_sep_fwd = benchmark.Timer(
        stmt="separate_conv_l2(x, conv, head_dim)",
        globals={"separate_conv_l2": separate_conv_l2, "x": x, "conv": conv_separate, "head_dim": head_dim},
    )
    m_sep_fwd = t_sep_fwd.timeit(100)
    print(f"Separate: {m_sep_fwd}")

    t_fused_fwd = benchmark.Timer(
        stmt="fused_conv_l2(x, conv, head_dim)",
        globals={"fused_conv_l2": fused_conv_l2, "x": x, "conv": conv_fused, "head_dim": head_dim},
    )
    m_fused_fwd = t_fused_fwd.timeit(100)
    print(f"Fused: {m_fused_fwd}")

    # Benchmark Backward
    print("\n" + "="*80)
    print("Backward Pass")
    print("="*80)

    # Pre-compute forward for backward benchmark
    y_sep = separate_conv_l2(x, conv_separate, head_dim)
    grad_sep = torch.randn_like(y_sep)

    def backward_sep():
        for xi in [x]:
            if isinstance(xi, torch.Tensor):
                xi.grad = None
        y_sep.backward(grad_sep, retain_graph=True)

    t_sep_bwd = benchmark.Timer(
        stmt="backward_sep()",
        globals={"backward_sep": backward_sep},
    )
    m_sep_bwd = t_sep_bwd.timeit(100)
    print(f"Separate: {m_sep_bwd}")

    y_fused = fused_conv_l2(x, conv_fused, head_dim)
    grad_fused = torch.randn_like(y_fused)

    def backward_fused():
        for xi in [x]:
            if isinstance(xi, torch.Tensor):
                xi.grad = None
        y_fused.backward(grad_fused, retain_graph=True)

    t_fused_bwd = benchmark.Timer(
        stmt="backward_fused()",
        globals={"backward_fused": backward_fused},
    )
    m_fused_bwd = t_fused_bwd.timeit(100)
    print(f"Fused: {m_fused_bwd}")

    # Benchmark Combined
    print("\n" + "="*80)
    print("Forward + Backward Pass")
    print("="*80)

    def combined_sep():
        for xi in [x]:
            if isinstance(xi, torch.Tensor):
                xi.grad = None
        y = separate_conv_l2(x, conv_separate, head_dim)
        y.backward(grad_sep, retain_graph=True)

    t_sep_combined = benchmark.Timer(
        stmt="combined_sep()",
        globals={"combined_sep": combined_sep},
    )
    m_sep_combined = t_sep_combined.timeit(100)
    print(f"Separate: {m_sep_combined}")

    def combined_fused():
        for xi in [x]:
            if isinstance(xi, torch.Tensor):
                xi.grad = None
        y = fused_conv_l2(x, conv_fused, head_dim)
        y.backward(grad_fused, retain_graph=True)

Comment on lines +121 to +146
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

Avoid retain_graph=True in the combined forward+backward benchmarks

In combined_sep / combined_fused you rebuild the graph on every call:

def combined_sep():
    ...
    y = separate_conv_l2(x, conv_separate, head_dim)
    y.backward(grad_sep, retain_graph=True)

def combined_fused():
    ...
    y = fused_conv_l2(x, conv_fused, head_dim)
    y.backward(grad_fused, retain_graph=True)

Because each benchmark iteration creates a new graph, retain_graph=True keeps all of them alive, causing linear memory growth and potentially OOM for these tensor sizes, as well as skewing timings.

You only need retain_graph=True in the backward‑only benchmarks where the same forward graph is reused; here you can safely drop it:

-        y = separate_conv_l2(x, conv_separate, head_dim)
-        y.backward(grad_sep, retain_graph=True)
+        y = separate_conv_l2(x, conv_separate, head_dim)
+        y.backward(grad_sep)

...
-        y = fused_conv_l2(x, conv_fused, head_dim)
-        y.backward(grad_fused, retain_graph=True)
+        y = fused_conv_l2(x, conv_fused, head_dim)
+        y.backward(grad_fused)

I'll verify the review comment by examining the actual code structure and backward/combined benchmark implementations.



🏁 Script executed:

#!/bin/bash
# Examine the benchmark file structure around the claimed lines
cat benchmarks/modules/benchmark_fused_conv_l2.py | head -150 | tail -60

Length of output: 1801


🏁 Script executed:

#!/bin/bash
# Check the backward-only benchmarks mentioned in the scratchpad (lines ~92-97, 108-113)
sed -n '80,120p' benchmarks/modules/benchmark_fused_conv_l2.py

Length of output: 1209


🏁 Script executed:

#!/bin/bash
# Get a full view of the entire benchmark file to understand structure
wc -l benchmarks/modules/benchmark_fused_conv_l2.py

Length of output: 49


🏁 Script executed:

#!/bin/bash
# Search for all retain_graph occurrences in the file
rg -n "retain_graph" benchmarks/modules/benchmark_fused_conv_l2.py -C 3

Length of output: 1071


🏁 Script executed:

#!/bin/bash
# Look for the forward-only benchmark section to confirm structure
sed -n '1,80p' benchmarks/modules/benchmark_fused_conv_l2.py | tail -40

Length of output: 1088


Remove retain_graph=True from the combined forward+backward benchmarks

The code inspection confirms the review's analysis. The backward-only benchmarks (lines 89–112) correctly use retain_graph=True because they precompute the forward pass once and then reuse the same graph for 100 backward iterations. However, the combined benchmarks (lines 127–131 and 141–145) rebuild the forward graph on each timer iteration, making retain_graph=True unnecessary and harmful—all 100 graphs remain in memory simultaneously, causing linear memory growth and timing distortion.

Remove retain_graph=True from combined_sep() and combined_fused() at lines 131 and 145:

def combined_sep():
    for xi in [x]:
        if isinstance(xi, torch.Tensor):
            xi.grad = None
    y = separate_conv_l2(x, conv_separate, head_dim)
-   y.backward(grad_sep, retain_graph=True)
+   y.backward(grad_sep)

def combined_fused():
    for xi in [x]:
        if isinstance(xi, torch.Tensor):
            xi.grad = None
    y = fused_conv_l2(x, conv_fused, head_dim)
-   y.backward(grad_fused, retain_graph=True)
+   y.backward(grad_fused)
🤖 Prompt for AI Agents
In benchmarks/modules/benchmark_fused_conv_l2.py around lines 121 to 146, the
combined forward+backward benchmark functions combined_sep() and
combined_fused() incorrectly pass retain_graph=True to backward, which causes
every iteration to keep its autograd graph in memory; remove the
retain_graph=True argument from both y.backward(...) calls (lines ~131 and ~145)
so backward() uses its default behavior and the graph is freed each iteration;
keep the existing grad reset logic intact and do not add retain_graph elsewhere.

    t_fused_combined = benchmark.Timer(
        stmt="combined_fused()",
        globals={"combined_fused": combined_fused},
    )
    m_fused_combined = t_fused_combined.timeit(100)
    print(f"Fused: {m_fused_combined}")

    # Summary
    time_sep_fwd = m_sep_fwd.median * 1000
    time_sep_bwd = m_sep_bwd.median * 1000
    time_sep_combined = m_sep_combined.median * 1000

    time_fused_fwd = m_fused_fwd.median * 1000
    time_fused_bwd = m_fused_bwd.median * 1000
    time_fused_combined = m_fused_combined.median * 1000

    print(f"\n{'='*80}")
    print(f"{'Method':<35} {'Forward':<12} {'Backward':<12} {'Combined':<12} {'Speedup':<10}")
    print("-"*80)
    print(f"{'Separate (FLA)':<35} {time_sep_fwd:>10.3f}ms {time_sep_bwd:>10.3f}ms {time_sep_combined:>10.3f}ms {'1.00x':<10}")
    print(f"{'Fused (Recompute)':<35} {time_fused_fwd:>10.3f}ms {time_fused_bwd:>10.3f}ms {time_fused_combined:>10.3f}ms {time_sep_combined/time_fused_combined:<10.2f}x")

    speedup_fwd = (time_sep_fwd / time_fused_fwd - 1) * 100
    speedup_bwd = (time_sep_bwd / time_fused_bwd - 1) * 100
    speedup_combined = (time_sep_combined / time_fused_combined - 1) * 100

    print(f"\n{'='*80}")
    print(f"Forward Speedup: {speedup_fwd:>+8.2f}%")
    print(f"Backward Speedup: {speedup_bwd:>+8.2f}%")
    print(f"Combined Speedup: {speedup_combined:>+8.2f}%")
    print(f"\nMemory Saved: {B*T*D*2/1024/1024:.2f} MB per Conv layer (Y_act not stored)")
    print(f"{'='*80}")
12 changes: 10 additions & 2 deletions fla/layers/comba.py
@@ -91,6 +91,7 @@ def __init__(
conv_bias: bool = False,
layer_idx: int = None,
norm_eps: float = 1e-5,
fuse_conv_l2: bool = True,
**kwargs,
) -> Comba:
super().__init__()
@@ -106,6 +107,7 @@ def __init__(
self.use_inner_decay = use_inner_decay
self.conv_size = conv_size
self.conv_bias = conv_bias
self.fuse_conv_l2 = fuse_conv_l2 and self.use_short_conv

self.head_dim = head_dim
self.num_heads = num_heads
@@ -179,12 +181,16 @@ def __init__(
kernel_size=conv_size,
bias=conv_bias,
activation='silu',
norm='l2' if self.fuse_conv_l2 else None,
norm_eps=norm_eps,
)
self.k_conv1d = ShortConvolution(
hidden_size=self.key_dim,
kernel_size=conv_size,
bias=conv_bias,
activation='silu',
norm='l2' if self.fuse_conv_l2 else None,
norm_eps=norm_eps,
)
self.v_conv1d = ShortConvolution(
hidden_size=self.value_dim,
@@ -243,12 +249,14 @@ def forward(
cache=conv_state_q,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
head_dim=self.head_k_dim if self.fuse_conv_l2 else None,
)
k, conv_state_k = self.k_conv1d(
x=self.k_proj(hidden_states),
cache=conv_state_k,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
head_dim=self.head_k_dim if self.fuse_conv_l2 else None,
)
v, conv_state_v = self.v_conv1d(
x=self.v_proj(hidden_states),
@@ -291,7 +299,7 @@ def forward(
initial_state=recurrent_state,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
use_qk_l2norm_in_kernel=True,
use_qk_l2norm_in_kernel=not self.fuse_conv_l2,
)
elif mode == 'fused_recurrent':
o, recurrent_state = fused_recurrent_comba(
Expand All @@ -304,7 +312,7 @@ def forward(
initial_state=recurrent_state,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
use_qk_l2norm_in_kernel=True,
use_qk_l2norm_in_kernel=not self.fuse_conv_l2,
)
else:
raise NotImplementedError(f"Not supported mode `{mode}`.")
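The layer changes here and in the two files below follow the same pattern: when fuse_conv_l2 is enabled, the per-head L2 normalization runs inside ShortConvolution (norm='l2' at construction, head_dim at call time), and the recurrence kernel's own q/k normalization is switched off so it is not applied twice. A simplified sketch of that dispatch, with a hypothetical helper name and most arguments omitted (not the literal layer code):

def project_qk(hidden_states, q_proj, k_proj, q_conv1d, k_conv1d, head_k_dim, fuse_conv_l2):
    # With fusion, ShortConvolution normalizes each head of width head_k_dim itself.
    hd = head_k_dim if fuse_conv_l2 else None
    q, _ = q_conv1d(q_proj(hidden_states), head_dim=hd)
    k, _ = k_conv1d(k_proj(hidden_states), head_dim=hd)
    # The kernel must then skip its own L2 norm, otherwise q/k would be normalized twice.
    return q, k, dict(use_qk_l2norm_in_kernel=not fuse_conv_l2)
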
20 changes: 18 additions & 2 deletions fla/layers/delta_net.py
@@ -87,13 +87,23 @@ def __init__(
qk_activation: str = 'silu',
qk_norm: str = 'l2',
norm_eps: float = 1e-5,
fuse_conv_l2: bool = True,
fuse_norm: bool | None = None,
**kwargs,
) -> DeltaNet:
super().__init__()

self.mode = mode
self.qk_activation = qk_activation
self.qk_norm = qk_norm
if fuse_norm is not None:
warnings.warn(
"`fuse_norm` is deprecated for DeltaNet; use `fuse_conv_l2` to control the fused "
"ShortConvolution + L2 kernel.",
stacklevel=2,
)
fuse_conv_l2 = fuse_norm
self.fuse_conv_l2 = fuse_conv_l2 and use_short_conv and (qk_norm == 'l2')

assert self.qk_activation in ['silu', 'relu', 'elu', 'identity']
assert self.qk_norm in ['l2', 'sum']
@@ -136,12 +146,16 @@ def __init__(
kernel_size=conv_size,
bias=conv_bias,
activation='silu' if qk_activation == 'silu' else None,
norm='l2' if self.fuse_conv_l2 else None,
norm_eps=norm_eps,
)
self.k_conv1d = ShortConvolution(
hidden_size=self.key_dim,
kernel_size=conv_size,
bias=conv_bias,
activation='silu' if qk_activation == 'silu' else None,
norm='l2' if self.fuse_conv_l2 else None,
norm_eps=norm_eps,
)
self.v_conv1d = ShortConvolution(
hidden_size=self.value_dim,
@@ -200,12 +214,14 @@ def forward(
cache=conv_state_q,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
head_dim=self.head_k_dim if self.fuse_conv_l2 else None
)
k, conv_state_k = self.k_conv1d(
x=self.k_proj(hidden_states),
cache=conv_state_k,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
head_dim=self.head_k_dim if self.fuse_conv_l2 else None
)
v, conv_state_v = self.v_conv1d(
x=self.v_proj(hidden_states),
@@ -252,7 +268,7 @@ def forward(
initial_state=recurrent_state,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
use_qk_l2norm_in_kernel=(self.qk_norm == 'l2'),
use_qk_l2norm_in_kernel=(self.qk_norm == 'l2' and not self.fuse_conv_l2),
)
elif mode == 'chunk':
o, recurrent_state = chunk_delta_rule(
Expand All @@ -263,7 +279,7 @@ def forward(
initial_state=recurrent_state,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
use_qk_l2norm_in_kernel=(self.qk_norm == 'l2'),
use_qk_l2norm_in_kernel=(self.qk_norm == 'l2' and not self.fuse_conv_l2),
)
else:
raise NotImplementedError(f"Not supported mode `{mode}`.")
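DeltaNet additionally keeps the old fuse_norm flag as a deprecated alias. A standalone sketch of the resolution logic mirroring the hunk above (illustrative only, not the layer code itself):

import warnings

def resolve_fuse_conv_l2(fuse_conv_l2=True, fuse_norm=None, use_short_conv=True, qk_norm='l2'):
    # An explicitly passed deprecated flag overrides fuse_conv_l2 and emits a warning.
    if fuse_norm is not None:
        warnings.warn("`fuse_norm` is deprecated for DeltaNet; use `fuse_conv_l2` instead.", stacklevel=2)
        fuse_conv_l2 = fuse_norm
    # The fused path only applies with a short conv and an L2 qk norm.
    return fuse_conv_l2 and use_short_conv and qk_norm == 'l2'
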
12 changes: 10 additions & 2 deletions fla/layers/gated_deltanet.py
@@ -100,6 +100,7 @@ def __init__(
conv_bias: bool = False,
layer_idx: int = None,
norm_eps: float = 1e-5,
fuse_conv_l2: bool = True,
**kwargs,
) -> GatedDeltaNet:
super().__init__()
@@ -113,6 +114,7 @@ def __init__(
self.use_short_conv = use_short_conv
self.conv_size = conv_size
self.conv_bias = conv_bias
self.fuse_conv_l2 = fuse_conv_l2 and self.use_short_conv

self.head_dim = head_dim
self.num_heads = num_heads
@@ -174,12 +176,16 @@ def __init__(
kernel_size=conv_size,
bias=conv_bias,
activation='silu',
norm='l2' if self.fuse_conv_l2 else None,
norm_eps=norm_eps,
)
self.k_conv1d = ShortConvolution(
hidden_size=self.key_dim,
kernel_size=conv_size,
bias=conv_bias,
activation='silu',
norm='l2' if self.fuse_conv_l2 else None,
norm_eps=norm_eps,
)
self.v_conv1d = ShortConvolution(
hidden_size=self.value_dim,
@@ -239,12 +245,14 @@ def forward(
cache=conv_state_q,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
head_dim=self.head_k_dim if self.fuse_conv_l2 else None,
)
k, conv_state_k = self.k_conv1d(
x=self.k_proj(hidden_states),
cache=conv_state_k,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
head_dim=self.head_k_dim if self.fuse_conv_l2 else None,
)
v, conv_state_v = self.v_conv1d(
x=self.v_proj(hidden_states),
@@ -280,7 +288,7 @@ def forward(
initial_state=recurrent_state,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
use_qk_l2norm_in_kernel=True,
use_qk_l2norm_in_kernel=not self.fuse_conv_l2,
)
elif mode == 'fused_recurrent':
o, recurrent_state = fused_recurrent_gated_delta_rule(
Expand All @@ -292,7 +300,7 @@ def forward(
initial_state=recurrent_state,
output_final_state=use_cache,
cu_seqlens=cu_seqlens,
use_qk_l2norm_in_kernel=True,
use_qk_l2norm_in_kernel=not self.fuse_conv_l2,
)
else:
raise NotImplementedError(f"Not supported mode `{mode}`.")