[KDA] Faster inter computation in 64x64 intra fwd (#658)

yzhangcs · web-flow · commit d4b33a3a1d41 · 2025-11-24T03:08:02.000+08:00
* Update intra inter impls

* Support passing headdim

* Minor fix

* Fix potential OOD
diff --git a/benchmarks/benchmark_training_throughput.py b/benchmarks/benchmark_training_throughput.py
@@ -56,6 +56,7 @@ def profile(
     context_len: int = 2048,
     varlen: bool = False,
     num_heads: int | None = None,
+    head_dim: int | None = None,
     num_hidden_layers: int | None = None,
     warmup_steps: int = 16,
     steps: int = 32,
@@ -71,6 +72,9 @@ def profile(
     config = configs[name] if name in configs else AutoConfig.from_pretrained(name)
     if num_heads is not None:
         config.num_heads = num_heads
+    if head_dim is not None:
+        config.head_dim = head_dim
+        config.hidden_size = config.num_heads * config.head_dim
     if num_hidden_layers is not None:
         config.num_hidden_layers = num_hidden_layers
     model = AutoModelForCausalLM.from_config(config).cuda().to(dtype)
@@ -147,6 +151,7 @@ def profile(
     parser.add_argument("--context_len", default=None, type=int)
     parser.add_argument("--varlen", action='store_true')
     parser.add_argument("--num_heads", default=None, type=int)
+    parser.add_argument("--head_dim", default=None, type=int)
     parser.add_argument("--num_hidden_layers", default=None, type=int)
     parser.add_argument("--warmup_steps", default=64, type=int)
     parser.add_argument("--steps", default=256, type=int)
@@ -159,6 +164,7 @@ def profile(
         context_len=args.context_len,
         varlen=args.varlen,
         num_heads=args.num_heads,
+        head_dim=args.head_dim,
         num_hidden_layers=args.num_hidden_layers,
         warmup_steps=args.warmup_steps,
         steps=args.steps,
diff --git a/fla/ops/kda/chunk_intra.py b/fla/ops/kda/chunk_intra.py
@@ -39,23 +39,25 @@ def chunk_kda_fwd_kernel_intra_sub_inter(
     K: tl.constexpr,
     BT: tl.constexpr,
     BC: tl.constexpr,
+    BC2: tl.constexpr,
     BK: tl.constexpr,
     NC: tl.constexpr,
     IS_VARLEN: tl.constexpr,
 ):
-    i_t, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_i, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
     i_b, i_h = i_bh // H, i_bh % H
-    i_i, i_j = i_c // NC, i_c % NC
     if IS_VARLEN:
         i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
         bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
 
-    if i_t * BT + i_i * BC >= T:
-        return
-    if i_i <= i_j:
+    tl.static_assert(NC <= 4, "This kernel is specialized for NC <= 4")
+
+    i_ti = i_t * BT + i_i * BC
+    i_tn = i_ti + BC2
+    if i_ti >= T:
         return
 
     q += (bos * H + i_h) * K
@@ -64,40 +66,95 @@ def chunk_kda_fwd_kernel_intra_sub_inter(
     Aqk += (bos * H + i_h) * BT
     Akk += (bos * H + i_h) * BT
 
-    p_b = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT + i_i * BC,), (BC,), (0,))
+    p_b = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_ti,), (BC,), (0,))
     b_b = tl.load(p_b, boundary_check=(0,))
 
     b_Aqk = tl.zeros([BC, BC], dtype=tl.float32)
     b_Akk = tl.zeros([BC, BC], dtype=tl.float32)
+
+    b_Aqk0 = tl.zeros([BC, BC], dtype=tl.float32)
+    b_Akk0 = tl.zeros([BC, BC], dtype=tl.float32)
+    b_Aqk1 = tl.zeros([BC, BC], dtype=tl.float32)
+    b_Akk1 = tl.zeros([BC, BC], dtype=tl.float32)
+    b_Aqk2 = tl.zeros([BC, BC], dtype=tl.float32)
+    b_Akk2 = tl.zeros([BC, BC], dtype=tl.float32)
     for i_k in range(tl.cdiv(K, BK)):
-        p_q = tl.make_block_ptr(q, (T, K), (H*K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+        p_q = tl.make_block_ptr(q, (T, K), (H*K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0))
+        p_k = tl.make_block_ptr(k, (T, K), (H*K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0))
+        p_g = tl.make_block_ptr(g, (T, K), (H*K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0))
         o_k = i_k * BK + tl.arange(0, BK)
         m_k = o_k < K
-        # [BK,]
-        b_gn = tl.load(g + (i_t * BT + i_i * BC) * H*K + o_k, mask=m_k, other=0)
+
         # [BC, BK]
-        p_g = tl.make_block_ptr(g, (T, K), (H*K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
-        p_k = tl.make_block_ptr(k, (T, K), (H*K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
-        b_kt = tl.make_block_ptr(k, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
-        p_gk = tl.make_block_ptr(g, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        b_k = tl.load(p_k, boundary_check=(0, 1))
         b_g = tl.load(p_g, boundary_check=(0, 1))
-        b_k = tl.load(p_k, boundary_check=(0, 1)) * exp(b_g - b_gn[None, :])
-        b_gk = tl.load(p_gk, boundary_check=(0, 1))
-        b_kt = tl.load(b_kt, boundary_check=(0, 1))
-        # [BC, BC]
-        b_ktg = b_kt * exp(b_gn[:, None] - b_gk)
-        b_Akk += tl.dot(b_k, b_ktg)
+        # [BK,]
+        b_gn = tl.load(g + i_ti * H*K + o_k, mask=m_k, other=0)
+        # [BC, BK]
+        b_gqk = exp(b_g - b_gn[None, :])
+        b_qg = b_q * b_gqk
+        b_kg = b_k * b_gqk
+        if i_i > 0:
+            p_kt0 = tl.make_block_ptr(k, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BC), (0, 1))
+            p_gk0 = tl.make_block_ptr(g, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BC), (0, 1))
+            b_kt0 = tl.load(p_kt0, boundary_check=(0, 1))
+            b_gk0 = tl.load(p_gk0, boundary_check=(0, 1))
+            b_ktg0 = b_kt0 * exp(b_gn[:, None] - b_gk0)
+            b_Aqk0 += tl.dot(b_qg, b_ktg0)
+            b_Akk0 += tl.dot(b_kg, b_ktg0)
+        if i_i > 1:
+            p_kt1 = tl.make_block_ptr(k, (K, T), (1, H*K), (i_k * BK, i_t * BT + BC), (BK, BC), (0, 1))
+            p_gk1 = tl.make_block_ptr(g, (K, T), (1, H*K), (i_k * BK, i_t * BT + BC), (BK, BC), (0, 1))
+            b_gk1 = tl.load(p_gk1, boundary_check=(0, 1))
+            b_kt1 = tl.load(p_kt1, boundary_check=(0, 1))
+            b_ktg1 = b_kt1 * exp(b_gn[:, None] - b_gk1)
+            b_Aqk1 += tl.dot(b_qg, b_ktg1)
+            b_Akk1 += tl.dot(b_kg, b_ktg1)
+        if i_i > 2:
+            p_kt2 = tl.make_block_ptr(k, (K, T), (1, H*K), (i_k * BK, i_t * BT + 2 * BC), (BK, BC), (0, 1))
+            p_gk2 = tl.make_block_ptr(g, (K, T), (1, H*K), (i_k * BK, i_t * BT + 2 * BC), (BK, BC), (0, 1))
+            b_gk2 = tl.load(p_gk2, boundary_check=(0, 1))
+            b_kt2 = tl.load(p_kt2, boundary_check=(0, 1))
+            b_ktg2 = b_kt2 * exp(b_gn[:, None] - b_gk2)
+            b_Aqk2 += tl.dot(b_qg, b_ktg2)
+            b_Akk2 += tl.dot(b_kg, b_ktg2)
+
+        if i_tn < T:
+            b_gn2 = tl.load(g + i_tn * H*K + o_k, mask=m_k, other=0)
+            b_gqk2 = exp(b_g - b_gn2[None, :])
+            b_ktg = tl.trans(b_k * exp(b_gn2[None, :] - b_g))
+            b_Aqk += tl.dot(b_q * b_gqk2, b_ktg)
+            b_Akk += tl.dot(b_k * b_gqk2, b_ktg)
 
-        b_q = tl.load(p_q, boundary_check=(0, 1))
-        b_qg = b_q * exp(b_g - b_gn[None, :]) * scale
-        b_Aqk += tl.dot(b_qg, b_ktg)
+    if i_i > 0:
+        p_Aqk0 = tl.make_block_ptr(Aqk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, 0), (BC, BC), (1, 0))
+        p_Akk0 = tl.make_block_ptr(Akk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, 0), (BC, BC), (1, 0))
+        tl.store(p_Aqk0, (b_Aqk0 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(p_Akk0, (b_Akk0 * b_b[:, None]).to(Akk.dtype.element_ty), boundary_check=(0, 1))
+    if i_i > 1:
+        p_Aqk1 = tl.make_block_ptr(Aqk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, BC), (BC, BC), (1, 0))
+        p_Akk1 = tl.make_block_ptr(Akk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, BC), (BC, BC), (1, 0))
+        tl.store(p_Aqk1, (b_Aqk1 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(p_Akk1, (b_Akk1 * b_b[:, None]).to(Akk.dtype.element_ty), boundary_check=(0, 1))
+    if i_i > 2:
+        p_Aqk2 = tl.make_block_ptr(Aqk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, 2 * BC), (BC, BC), (1, 0))
+        p_Akk2 = tl.make_block_ptr(Akk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, 2 * BC), (BC, BC), (1, 0))
+        tl.store(p_Aqk2, (b_Aqk2 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(p_Akk2, (b_Akk2 * b_b[:, None]).to(Akk.dtype.element_ty), boundary_check=(0, 1))
+
+    if i_tn >= T:
+        return
+    o_i = tl.arange(0, BC)
+    m_A = (o_i >= BC2)[:, None] & (o_i < BC2)
 
-    b_Akk *= b_b[:, None]
+    b_Aqk = tl.where(m_A, b_Aqk * scale, 0.)
+    b_Akk = tl.where(m_A, b_Akk * b_b[:, None], 0.)
 
-    p_Akk = tl.make_block_ptr(Akk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
-    tl.store(p_Akk, b_Akk.to(Akk.dtype.element_ty), boundary_check=(0, 1))
-    p_Aqk = tl.make_block_ptr(Aqk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+    p_Aqk = tl.make_block_ptr(Aqk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_i * BC), (BC, BC), (1, 0))
+    p_Akk = tl.make_block_ptr(Akk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_i * BC), (BC, BC), (1, 0))
     tl.store(p_Aqk, b_Aqk.to(Aqk.dtype.element_ty), boundary_check=(0, 1))
+    tl.store(p_Akk, b_Akk.to(Akk.dtype.element_ty), boundary_check=(0, 1))
 
 
 @triton.heuristics({
@@ -438,14 +495,14 @@ def chunk_kda_fwd_intra(
         chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
 
-    BC = min(16, BT)
+    BC, BC2 = 16, 8
     NC = triton.cdiv(BT, BC)
     BK = max(triton.next_power_of_2(K), 16)
 
     Aqk = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype)
     Akk = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype)
-    grid = (NT, NC * NC, B * H)
 
+    grid = (NC, NT, B * H)
     chunk_kda_fwd_kernel_intra_sub_inter[grid](
         q=q,
         k=k,
@@ -461,6 +518,7 @@ def chunk_kda_fwd_intra(
         K=K,
         BT=BT,
         BC=BC,
+        BC2=BC2,
         NC=NC,
     )
 
@@ -476,6 +534,7 @@ def chunk_kda_fwd_intra(
             scale=scale,
             cu_seqlens=cu_seqlens,
             chunk_size=BT,
+            sub_chunk_size=BC2,
         )
     else:
         # Original sub-chunk based implementation
@@ -494,7 +553,7 @@ def chunk_kda_fwd_intra(
             H=H,
             K=K,
             BT=BT,
-            BC=BC,
+            BC=BC2,
             BK=BK,
         )
 
@@ -519,8 +578,8 @@ def chunk_kda_bwd_intra(
     db: torch.Tensor,
     dg: torch.Tensor,
     cu_seqlens: torch.LongTensor | None = None,
-    chunk_size: int = 64,
     chunk_indices: torch.LongTensor | None = None,
+    chunk_size: int = 64,
 ):
     B, T, H, K = k.shape
     BT = chunk_size
@@ -530,7 +589,6 @@ def chunk_kda_bwd_intra(
     if chunk_indices is None and cu_seqlens is not None:
         chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
-    # NC = 4
     NC = triton.cdiv(BT, BC)
     NK = triton.cdiv(K, BK)
 
diff --git a/fla/ops/kda/chunk_intra_token_parallel.py b/fla/ops/kda/chunk_intra_token_parallel.py
@@ -36,6 +36,7 @@ def chunk_kda_fwd_kernel_intra_token_parallel(
     H: tl.constexpr,
     K: tl.constexpr,
     BT: tl.constexpr,
+    BC: tl.constexpr,
     BH: tl.constexpr,
     USE_EXP2: tl.constexpr,
     IS_VARLEN: tl.constexpr,
@@ -71,7 +72,7 @@ def chunk_kda_fwd_kernel_intra_token_parallel(
         bos = tl.load(cu_seqlens + i_n).to(tl.int32)
         eos = tl.load(cu_seqlens + i_n + 1).to(tl.int32)
         i_t = i_tg - bos
-        T = eos - bos # Current sequence length
+        T = eos - bos  # Current sequence length
 
         # Safety check
         if i_t >= T or i_tg >= eos:
@@ -85,8 +86,6 @@ def chunk_kda_fwd_kernel_intra_token_parallel(
         if i_t >= T:
             return
 
-    # Find which sub-chunk (BC=16) this token belongs to
-    BC: tl.constexpr = 16
     i_chunk = i_t // BT  # which BT=64 chunk
     i_subchunk = (i_t % BT) // BC  # which BC=16 sub-chunk within the BT chunk
 
@@ -103,18 +102,15 @@ def chunk_kda_fwd_kernel_intra_token_parallel(
 
     # Load q[i_t, h:h+BH, :] - shape [BH, K]
     # For varlen, we use global offset: bos + i_t = i_tg
-    p_q = tl.make_block_ptr(q + (bos + i_t) * H * K, (H, K), (K, 1),
-                            (i_h_start, 0), (BH, BK), (0, 1))
+    p_q = tl.make_block_ptr(q + (bos + i_t) * H * K, (H, K), (K, 1), (i_h_start, 0), (BH, BK), (0, 1))
     b_q = tl.load(p_q, boundary_check=(0, 1)).to(tl.float32)  # [BH, BK]
 
     # Load g[i_t, h:h+BH, :]
-    p_g = tl.make_block_ptr(g + (bos + i_t) * H * K, (H, K), (K, 1),
-                            (i_h_start, 0), (BH, BK), (0, 1))
+    p_g = tl.make_block_ptr(g + (bos + i_t) * H * K, (H, K), (K, 1), (i_h_start, 0), (BH, BK), (0, 1))
     b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32)  # [BH, BK]
 
     # Load k[i_t, h:h+BH, :] and beta[i_t, h:h+BH]
-    p_k = tl.make_block_ptr(k + (bos + i_t) * H * K, (H, K), (K, 1),
-                            (i_h_start, 0), (BH, BK), (0, 1))
+    p_k = tl.make_block_ptr(k + (bos + i_t) * H * K, (H, K), (K, 1), (i_h_start, 0), (BH, BK), (0, 1))
     b_k_self = tl.load(p_k, boundary_check=(0, 1)).to(tl.float32)  # [BH, BK]
 
     p_beta = beta + (bos + i_t) * H + i_h_start + o_h
@@ -124,28 +120,25 @@ def chunk_kda_fwd_kernel_intra_token_parallel(
     for j in range(subchunk_start, tl.minimum(i_t + 1, subchunk_end)):
 
         # Load k[j, h:h+BH, :] with pointer arithmetic
-        p_k_j = tl.make_block_ptr(k + (bos + j) * H * K, (H, K), (K, 1),
-                                  (i_h_start, 0), (BH, BK), (0, 1))
-        b_k_j = tl.load(p_k_j, boundary_check=(0, 1)).to(tl.float32)  # [BH, BK]
+        p_kj = tl.make_block_ptr(k + (bos + j) * H * K, (H, K), (K, 1), (i_h_start, 0), (BH, BK), (0, 1))
+        b_kj = tl.load(p_kj, boundary_check=(0, 1)).to(tl.float32)  # [BH, BK]
 
         # Load g[j, h:h+BH, :]
-        p_g_j = tl.make_block_ptr(g + (bos + j) * H * K, (H, K), (K, 1),
-                                  (i_h_start, 0), (BH, BK), (0, 1))
-        b_g_j = tl.load(p_g_j, boundary_check=(0, 1)).to(tl.float32)  # [BH, BK]
+        p_gj = tl.make_block_ptr(g + (bos + j) * H * K, (H, K), (K, 1), (i_h_start, 0), (BH, BK), (0, 1))
+        b_gj = tl.load(p_gj, boundary_check=(0, 1)).to(tl.float32)  # [BH, BK]
 
         # Compute gated key for all BH heads: [BH, BK]
         if USE_EXP2:
-            b_k_j_gated = b_k_j * exp2(b_g - b_g_j)
+            b_kgj = b_kj * exp2(b_g - b_gj)
         else:
-            b_k_j_gated = b_k_j * exp(b_g - b_g_j)
+            b_kgj = b_kj * exp(b_g - b_gj)
 
         # Apply mask for valid K dimension
-        b_k_j_gated = tl.where(m_k[None, :], b_k_j_gated, 0.0)
+        b_kgj = tl.where(m_k[None, :], b_kgj, 0.0)
 
-        # Compute Aqk and Akk for all BH heads: [BH]
-        b_Aqk = tl.sum(b_q * b_k_j_gated, axis=1) * scale  # [BH]
+        b_Aqk = tl.sum(b_q * b_kgj, axis=1) * scale  # [BH]
         # Akk: only accumulate if j < i_t
-        b_Akk = tl.sum(b_k_self * b_k_j_gated, axis=1) * tl.where(j < i_t, 1.0, 0.0)  # [BH]
+        b_Akk = tl.sum(b_k_self * b_kgj, axis=1) * tl.where(j < i_t, 1.0, 0.0)  # [BH]
 
         # Store with [B, T, H, BT] layout (no transpose needed later)
         j_pos = j % BT
@@ -165,6 +158,7 @@ def chunk_kda_fwd_intra_token_parallel(
     scale: float,
     cu_seqlens: torch.LongTensor | None = None,
     chunk_size: int = 64,
+    sub_chunk_size: int = 16,
     use_exp2: bool = False,
 ) -> None:
     """
@@ -187,6 +181,7 @@ def chunk_kda_fwd_intra_token_parallel(
     """
     B, T, H, K = q.shape
     BT = chunk_size
+    BC = sub_chunk_size
 
     # Grid: (total_tokens, H/BH) - each token gets its own block
     if cu_seqlens is not None:
@@ -215,5 +210,6 @@ def grid(meta):
         H=H,
         K=K,
         BT=BT,
+        BC=BC,
         USE_EXP2=use_exp2,
     )