From 9181bbde088b5b8b52ffd1b04fc100512791a2a9 Mon Sep 17 00:00:00 2001 From: Songlin Yang Date: Fri, 21 Nov 2025 09:28:11 +0000 Subject: [PATCH 1/2] Support grouped kv heads in ReBased --- fla/layers/rebased.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/fla/layers/rebased.py b/fla/layers/rebased.py index eaf7ec9dd6..1719425933 100644 --- a/fla/layers/rebased.py +++ b/fla/layers/rebased.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from einops import rearrange +from einops import rearrange, repeat from fla.modules.feature_map import RebasedFeatureMap from fla.ops.linear_attn import chunk_linear_attn, fused_chunk_linear_attn @@ -16,6 +16,17 @@ class ReBasedLinearAttention(nn.Module): + r""" + Implementation of ReBased linear attention with optional grouped keys/values. + + Args: + hidden_size (int): Model hidden size. + feature_dim (int): Dimensionality of the learnable quadratic feature map per head. + num_heads (int): Number of query heads. + num_key_value_heads (int): Number of unique key/value heads (GQA). Must divide `num_heads`. + When smaller than `num_heads`, keys and values are projected once per KV head and then + shared across ``num_heads // num_key_value_heads`` query heads. + """ def __init__( self, @@ -39,10 +50,16 @@ def __init__( self.mode = mode assert self.mode in ["fused_chunk", "parallel", 'chunk'] + if hidden_size % num_heads != 0: + raise ValueError("`hidden_size` must be divisible by `num_heads`.") + if num_heads % num_key_value_heads != 0: + raise ValueError("`num_heads` must be divisible by `num_key_value_heads`.") + self.feature_dim = feature_dim - self.num_key_value_heads = num_key_value_heads self.num_heads = num_heads - self.head_dim = self.hidden_size // self.num_key_value_heads + self.num_key_value_heads = num_key_value_heads + self.num_kv_groups = self.num_heads // self.num_key_value_heads + self.head_dim = self.hidden_size // self.num_heads self.use_gamma = use_gamma self.use_beta = use_beta self.normalize = normalize @@ -53,7 +70,7 @@ def __init__( self.feature_map = RebasedFeatureMap(self.feature_dim, use_gamma, use_beta, normalize) self.q_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_key_value_heads, bias=False) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) self.dropout = nn.Identity() @@ -69,7 +86,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs): k = rearrange( self.k_proj(hidden_states), "... (h d) -> ... h d", - h=self.num_heads, + h=self.num_key_value_heads, d=self.feature_dim, ) v = rearrange( @@ -78,7 +95,11 @@ def forward(self, hidden_states: torch.Tensor, **kwargs): h=self.num_key_value_heads, d=self.head_dim, ) - q, k = self.feature_map(q, flatten=(mode != 'parallel')), self.feature_map(k, flatten=(mode != 'parallel')) + q = self.feature_map(q, flatten=(mode != 'parallel')) + k = self.feature_map(k, flatten=(mode != 'parallel')) + if self.num_kv_groups > 1: + k = repeat(k, "... h d -> ... (h g) d", g=self.num_kv_groups) + v = repeat(v, "... h d -> ... 
(h g) d", g=self.num_kv_groups) if mode == "fused_chunk": o = fused_chunk_linear_attn( q=q, From c6bd19e85cc0e761debbd3919d1a43c51f9e7ef7 Mon Sep 17 00:00:00 2001 From: Songlin Yang Date: Sat, 22 Nov 2025 20:50:57 +0000 Subject: [PATCH 2/2] feat: Optimize KDA intra-chunk kernel with Recursive Block and Auto-Selection. - Merged Token Parallel and Recursive Block implementations into chunk_intra.py. - Added 'recurrent' (naive sequential) implementation for verification. - Updated chunk_kda_fwd_intra to use 'impl_type' argument and automatically select between 'token' and 'recursive' based on head dimension K (threshold K=128). - Added comprehensive benchmark script in benchmarks/ops/benchmark_kda_intra.py covering various shapes. - Performance: Recursive Block achieves ~15% speedup for small K (<=64), while Token Parallel remains superior for large K. --- benchmarks/ops/benchmark_kda_intra.py | 68 +++++ fla/ops/kda/chunk_intra.py | 405 ++++++++++++++++++++++++-- 2 files changed, 443 insertions(+), 30 deletions(-) create mode 100644 benchmarks/ops/benchmark_kda_intra.py diff --git a/benchmarks/ops/benchmark_kda_intra.py b/benchmarks/ops/benchmark_kda_intra.py new file mode 100644 index 0000000000..a51c628dc8 --- /dev/null +++ b/benchmarks/ops/benchmark_kda_intra.py @@ -0,0 +1,68 @@ + +import torch +import triton +from fla.ops.kda.chunk_intra import chunk_kda_fwd_intra + +def benchmark_intra_chunk(B=8, T=4096, H=16, K=128, chunk_size=64): + dtype = torch.bfloat16 + device = 'cuda' + + q = torch.randn(B, T, H, K, device=device, dtype=dtype) + k = torch.randn(B, T, H, K, device=device, dtype=dtype) + g = torch.randn(B, T, H, K, device=device, dtype=torch.float32) + beta = torch.randn(B, T, H, device=device, dtype=dtype) + + scale = 1.0 + + quantiles = [0.5, 0.2, 0.8] + + # Warmup + for _ in range(10): + chunk_kda_fwd_intra(q, k, g, beta, scale=scale, chunk_size=chunk_size, impl_type="token") + chunk_kda_fwd_intra(q, k, g, beta, scale=scale, chunk_size=chunk_size, impl_type="recursive") + chunk_kda_fwd_intra(q, k, g, beta, scale=scale, chunk_size=chunk_size, impl_type="recurrent") + + ms_token = triton.testing.do_bench( + lambda: chunk_kda_fwd_intra(q, k, g, beta, scale=scale, chunk_size=chunk_size, impl_type="token"), + quantiles=quantiles + ) + + ms_recursive = triton.testing.do_bench( + lambda: chunk_kda_fwd_intra(q, k, g, beta, scale=scale, chunk_size=chunk_size, impl_type="recursive"), + quantiles=quantiles + ) + + try: + ms_recurrent = triton.testing.do_bench( + lambda: chunk_kda_fwd_intra(q, k, g, beta, scale=scale, chunk_size=chunk_size, impl_type="recurrent"), + quantiles=quantiles + ) + t_recurrent = ms_recurrent[0] + except Exception as e: + t_recurrent = float('nan') + + # Format for table row + # Shape | Token | Recursive | Recurrent | Rec vs Token + row_str = f"B={B}, T={T}, H={H}, K={K}" + print(f"{row_str:<30} | {ms_token[0]:.3f} ms | {ms_recursive[0]:.3f} ms | {t_recurrent:.3f} ms | {ms_token[0]/ms_recursive[0]:.2f}x ") + +if __name__ == "__main__": + configs = [ + (8, 4096, 16, 128), + (1, 8192, 16, 128), + (8, 4096, 32, 64), + (1, 8192, 32, 64), + # Large Batch + (32, 512, 12, 64), + # High Head Dim + (2, 4096, 8, 256), + ] + + print(f"{'Shape':<30} | {'Token (Original)':<20} | {'Recursive (New)':<20} | {'Recurrent':<15} | {'Speedup (Rec/Tok)':<15}") + print("-" * 110) + + for B, T, H, K in configs: + try: + benchmark_intra_chunk(B=B, T=T, H=H, K=K, chunk_size=64) + except Exception as e: + print(f"Failed for shape B={B}, T={T}, H={H}, K={K}: {e}") diff --git 
a/fla/ops/kda/chunk_intra.py b/fla/ops/kda/chunk_intra.py index 8ab99931b6..0200473dc0 100644 --- a/fla/ops/kda/chunk_intra.py +++ b/fla/ops/kda/chunk_intra.py @@ -4,7 +4,6 @@ import triton import triton.language as tl -from fla.ops.kda.chunk_intra_token_parallel import chunk_kda_fwd_intra_token_parallel from fla.ops.utils import chunk_local_cumsum, prepare_chunk_indices, solve_tril from fla.ops.utils.op import exp from fla.utils import autotune_cache_kwargs @@ -143,10 +142,7 @@ def chunk_kda_fwd_kernel_intra_sub_intra( return o_i = tl.arange(0, BC) - o_k = tl.arange(0, BK) - m_k = o_k < K m_A = (i_t * BT + i_i * BC + o_i) < T - o_A = (i_t * BT + i_i * BC + o_i) * H*BT + i_i * BC q += (bos * H + i_h) * K k += (bos * H + i_h) * K @@ -160,28 +156,76 @@ def chunk_kda_fwd_kernel_intra_sub_intra( p_g = tl.make_block_ptr(g, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0)) b_q = tl.load(p_q, boundary_check=(0, 1)) b_k = tl.load(p_k, boundary_check=(0, 1)) - b_g = tl.load(p_g, boundary_check=(0, 1)) - - b_k = b_k * tl.load(beta + (i_t * BT + i_i * BC + o_i) * H, mask=m_A, other=0)[:, None] + b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32) * 1.44269504 + b_beta = tl.load(beta + (i_t * BT + i_i * BC + o_i) * H, mask=m_A, other=0) - p_kt = k + (i_t * BT + i_i * BC) * H*K + o_k - p_gk = g + (i_t * BT + i_i * BC) * H*K + o_k - - for j in range(0, min(BC, T - i_t * BT - i_i * BC)): - b_kt = tl.load(p_kt, mask=m_k, other=0).to(tl.float32) - b_gk = tl.load(p_gk, mask=m_k, other=0).to(tl.float32) - b_ktg = b_kt[None, :] * exp(b_g - b_gk[None, :]) - b_Aqk = tl.sum(b_q * b_ktg, 1) * scale - b_Akk = tl.sum(b_k * b_ktg, 1) - tl.store(Aqk + o_A + j, b_Aqk, mask=m_A) - tl.store(Akk + o_A + j, b_Akk, mask=m_A) - p_kt += H*K - p_gk += H*K + # Pre-compute masks for all steps + o_i = tl.arange(0, BC) + + # Accumulators + acc_Aqk = tl.zeros([BC, BC], dtype=tl.float32) + acc_Akk = tl.zeros([BC, BC], dtype=tl.float32) + + # Add diagonal + b_Aqk_diag = tl.sum(b_q * b_k, 1) + acc_Aqk = tl.where(o_i[:, None] == o_i[None, :], b_Aqk_diag[:, None], acc_Aqk) + + # Iterate from large spans down to small spans + # For BC=64, we need to handle span=32 (log2=5). + # Starting from 6 is safe for BC up to 128. + for log_span in range(3, -1, -1): + span = 1 << log_span + # Identify Q and K rows for this span + # For a block size of 2*span: + # Top half (0..span-1) are Keys + # Bottom half (span..2*span-1) are Queries + # Pivot is at index 'span' relative to block start. 
+ + # Global index within chunk is o_i + # Relative index in 2*span block: o_i % (2*span) + # Is Query if relative >= span + is_q = (o_i % (2*span)) >= span + is_k = (o_i % (2*span)) < span + + # Pivot index for each row + # The pivot is the start of the Q-half of the block (i.e., index `span` relative to block start) + # pivot = (o_i // (2*span)) * (2*span) + span + pivot_idx = (o_i // (2*span)) * (2*span) + span - 1 + + # Gather g_pivot from b_g using matrix multiplication (permutation) + # S[i, j] = 1 if j == pivot_idx[i] + S = ((o_i[None, :] == pivot_idx[:, None])).to(tl.float32) + b_g_pivot = tl.dot(S, b_g) + + mask_i = m_A[:, None] + mask_q = is_q[:, None] & mask_i + mask_k = is_k[:, None] & mask_i + + d_q = tl.where(mask_q, tl.exp2(b_g - b_g_pivot), 0.0) + d_k = tl.where(mask_k, tl.exp2(b_g_pivot - b_g), 0.0) + + # Mask inputs + b_q_masked = b_q * d_q + b_k_masked = b_k * d_k + b_k_q_masked = b_k * d_q + + b_Aqk_sub = tl.dot(b_q_masked, tl.trans(b_k_masked)) + b_Akk_sub = tl.dot(b_k_q_masked, tl.trans(b_k_masked)) + + # Filter cross-block terms + block_id = o_i // (2*span) + same_block = block_id[:, None] == block_id[None, :] + acc_Aqk += tl.where(same_block, b_Aqk_sub, 0.0) + acc_Akk += tl.where(same_block, b_Akk_sub, 0.0) + + acc_Aqk = tl.where(o_i[:, None] >= o_i[None, :], acc_Aqk * scale, 0.0) + acc_Akk = tl.where(o_i[:, None] > o_i[None, :], acc_Akk * b_beta[:, None].to(tl.float32), 0.0) + # Store final results + p_Aqk = tl.make_block_ptr(Aqk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_i * BC), (BC, BC), (1, 0)) + p_Akk = tl.make_block_ptr(Akk, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_i * BC), (BC, BC), (1, 0)) + tl.store(p_Aqk, acc_Aqk.to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk, acc_Akk.to(Akk.dtype.element_ty), boundary_check=(0, 1)) - tl.debug_barrier() - b_A = tl.zeros([BC, BC], dtype=tl.float32) - tl.store(Aqk + o_A[:, None] + o_i, b_A, mask=m_A[:, None] & (o_i[:, None] < o_i)) - tl.store(Akk + o_A[:, None] + o_i, b_A, mask=m_A[:, None] & (o_i[:, None] <= o_i)) @triton.heuristics({ @@ -391,6 +435,282 @@ def chunk_kda_bwd_kernel_intra( tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), boundary_check=(0, 1)) +@triton.heuristics({ + 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None, +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4, 8] + ], + key=["BK", "BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=['T']) +def chunk_kda_fwd_kernel_intra_sub_intra_recurrent( + q, + k, + g, + beta, + Aqk, + Akk, + scale, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_i, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if i_t * BT + i_i * BC >= T: + return + + o_i = tl.arange(0, BC) + o_k = tl.arange(0, BK) + m_k = o_k < K + m_A = (i_t * BT + i_i * BC + o_i) < T + o_A = (i_t * BT + i_i * BC + o_i) * H*BT + i_i * BC + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + g += (bos * H + i_h) * K + beta += bos * H + i_h + Aqk += (bos * H + i_h) * BT + Akk += (bos * H + i_h) * BT + + p_q = tl.make_block_ptr(q, (T, K), (H*K, 1), (i_t * BT + i_i * 
BC, 0), (BC, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0)) + p_g = tl.make_block_ptr(g, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_g = tl.load(p_g, boundary_check=(0, 1)) + + b_k = b_k * tl.load(beta + (i_t * BT + i_i * BC + o_i) * H, mask=m_A, other=0)[:, None] + + p_kt = k + (i_t * BT + i_i * BC) * H*K + o_k + p_gk = g + (i_t * BT + i_i * BC) * H*K + o_k + + for j in range(0, min(BC, T - i_t * BT - i_i * BC)): + b_kt = tl.load(p_kt, mask=m_k, other=0).to(tl.float32) + b_gk = tl.load(p_gk, mask=m_k, other=0).to(tl.float32) + b_ktg = b_kt[None, :] * exp(b_g - b_gk[None, :]) + b_Aqk = tl.sum(b_q * b_ktg, 1) * scale + b_Akk = tl.sum(b_k * b_ktg, 1) + tl.store(Aqk + o_A + j, b_Aqk, mask=m_A) + tl.store(Akk + o_A + j, b_Akk, mask=m_A) + p_kt += H*K + p_gk += H*K + + tl.debug_barrier() + b_A = tl.zeros([BC, BC], dtype=tl.float32) + tl.store(Aqk + o_A[:, None] + o_i[None, :], b_A, mask=m_A[:, None] & (o_i[:, None] < o_i[None, :])) + tl.store(Akk + o_A[:, None] + o_i[None, :], b_A, mask=m_A[:, None] & (o_i[:, None] <= o_i[None, :])) + + +@triton.heuristics({ + 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None, +}) +@triton.autotune( + configs=[ + triton.Config({'BH': BH}, num_warps=num_warps) + for BH in [1, 2, 4, 8] # Let autotune choose freely + for num_warps in [1, 2, 4, 8] + ], + key=["K", "H"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=['T', 'B']) +def chunk_kda_fwd_kernel_intra_token_parallel( + q, + k, + g, + beta, + Aqk, + Akk, + scale, + cu_seqlens, + B, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BH: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + # Each block processes one token (i) for BH heads + i_tg = tl.program_id(0) # global token index + i_hg = tl.program_id(1) # head_group index + + i_h_start = i_hg * BH + + if IS_VARLEN: + # Binary search to find which sequence this token belongs to + # i_tg is the global token index + # Range [0, B) where B is num_sequences passed from python + + left = 0 + right = B + i_n = 0 + + # Unrolled binary search (max B=2^32) + # We can limit iterations based on expected max batch size if needed + # 20 iterations covers B=1M, usually enough + for _ in range(20): + if left < right: + mid = (left + right) // 2 + end_val = tl.load(cu_seqlens + mid + 1).to(tl.int32) + if i_tg < end_val: + right = mid + else: + left = mid + 1 + i_n = left + + bos = tl.load(cu_seqlens + i_n).to(tl.int32) + eos = tl.load(cu_seqlens + i_n + 1).to(tl.int32) + i_t = i_tg - bos + T = eos - bos # Current sequence length + + # Safety check + if i_t >= T or i_tg >= eos: + return + + else: + i_b = i_tg // T + i_t = i_tg % T + bos = i_b * T + + if i_t >= T: + return + + # Find which sub-chunk (BC=16) this token belongs to + BC: tl.constexpr = 16 + i_chunk = i_t // BT # which BT=64 chunk + i_subchunk = (i_t % BT) // BC # which BC=16 sub-chunk within the BT chunk + + subchunk_start = i_chunk * BT + i_subchunk * BC + subchunk_end = tl.minimum(subchunk_start + BC, T) + + o_h = tl.arange(0, BH) + m_h = (i_h_start + o_h) < H + + # Marginalize over entire K dimension at once + BK: tl.constexpr = triton.next_power_of_2(K) + o_k = tl.arange(0, BK) + m_k = o_k < K + + # Load q[i_t, h:h+BH, :] - shape [BH, K] + # For varlen, we use global offset: bos + i_t = i_tg + p_q = tl.make_block_ptr(q + (bos + i_t) * H * K, (H, K), (K, 1), + (i_h_start, 0), (BH, BK), 
(0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)).to(tl.float32) # [BH, BK] + + # Load g[i_t, h:h+BH, :] + p_g = tl.make_block_ptr(g + (bos + i_t) * H * K, (H, K), (K, 1), + (i_h_start, 0), (BH, BK), (0, 1)) + b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32) # [BH, BK] + + # Load k[i_t, h:h+BH, :] and beta[i_t, h:h+BH] + p_k = tl.make_block_ptr(k + (bos + i_t) * H * K, (H, K), (K, 1), + (i_h_start, 0), (BH, BK), (0, 1)) + b_k_self = tl.load(p_k, boundary_check=(0, 1)).to(tl.float32) # [BH, BK] + + p_beta = beta + (bos + i_t) * H + i_h_start + o_h + b_beta = tl.load(p_beta, mask=m_h, other=0).to(tl.float32) # [BH] + b_k_self = b_k_self * b_beta[:, None] # [BH, K] + + for j in range(subchunk_start, tl.minimum(i_t + 1, subchunk_end)): + + # Load k[j, h:h+BH, :] with pointer arithmetic + p_k_j = tl.make_block_ptr(k + (bos + j) * H * K, (H, K), (K, 1), + (i_h_start, 0), (BH, BK), (0, 1)) + b_k_j = tl.load(p_k_j, boundary_check=(0, 1)).to(tl.float32) # [BH, BK] + + # Load g[j, h:h+BH, :] + p_g_j = tl.make_block_ptr(g + (bos + j) * H * K, (H, K), (K, 1), + (i_h_start, 0), (BH, BK), (0, 1)) + b_g_j = tl.load(p_g_j, boundary_check=(0, 1)).to(tl.float32) # [BH, BK] + + # Compute gated key for all BH heads: [BH, BK] + if USE_EXP2: + b_k_j_gated = b_k_j * tl.exp2(b_g - b_g_j) + else: + b_k_j_gated = b_k_j * exp(b_g - b_g_j) + + # Apply mask for valid K dimension + b_k_j_gated = tl.where(m_k[None, :], b_k_j_gated, 0.0) + + # Compute Aqk and Akk for all BH heads: [BH] + b_Aqk = tl.sum(b_q * b_k_j_gated, axis=1) * scale # [BH] + # Akk: only accumulate if j < i_t + b_Akk = tl.sum(b_k_self * b_k_j_gated, axis=1) * tl.where(j < i_t, 1.0, 0.0) # [BH] + + # Store with [B, T, H, BT] layout (no transpose needed later) + j_pos = j % BT + offs_h = i_h_start + o_h + offs_out = (bos + i_t) * H * BT + offs_h * BT + j_pos + tl.store(Aqk + offs_out, b_Aqk.to(Aqk.dtype.element_ty), mask=m_h) + tl.store(Akk + offs_out, b_Akk.to(Akk.dtype.element_ty), mask=m_h) + + +def chunk_kda_fwd_intra_token_parallel( + q: torch.Tensor, + k: torch.Tensor, + gk: torch.Tensor, + beta: torch.Tensor, + Aqk: torch.Tensor, + Akk: torch.Tensor, + scale: float, + cu_seqlens: torch.LongTensor | None = None, + chunk_size: int = 64, + use_exp2: bool = False, +) -> None: + B, T, H, K = q.shape + BT = chunk_size + + # Grid: (total_tokens, H/BH) - each token gets its own block + if cu_seqlens is not None: + total_tokens = q.shape[1] + # Use num_sequences as B for binary search + B_kernel = len(cu_seqlens) - 1 + else: + total_tokens = B * T + B_kernel = B + + def grid(meta): + BH = meta['BH'] + return (total_tokens, triton.cdiv(H, BH)) + + chunk_kda_fwd_kernel_intra_token_parallel[grid]( + q=q, + k=k, + g=gk, + beta=beta, + Aqk=Aqk, + Akk=Akk, + scale=scale, + cu_seqlens=cu_seqlens, + B=B_kernel, + T=T, + H=H, + K=K, + BT=BT, + USE_EXP2=use_exp2, + ) + + def chunk_kda_fwd_intra( q: torch.Tensor, k: torch.Tensor, @@ -401,7 +721,7 @@ def chunk_kda_fwd_intra( chunk_size: int = 64, chunk_indices: torch.LongTensor | None = None, output_dtype: torch.dtype = torch.float32, - use_token_parallel: bool = True, + impl_type: str = "auto", ) -> tuple[torch.Tensor, torch.Tensor]: r""" Args: @@ -422,8 +742,10 @@ def chunk_kda_fwd_intra( The chunk size. Default: 64. output_dtype (torch.dtype): The dtype of the output tensor. Default: `torch.float32` - use_token_parallel (bool): - Whether to use token-parallel implementation for sub_intra. Default: `True`. + impl_type (str): + The implementation type for sub_intra kernel. 
+ Options: "auto", "token", "recursive", "recurrent". + Default: "auto". Returns: Aqk (torch.Tensor): @@ -433,12 +755,16 @@ def chunk_kda_fwd_intra( """ B, T, H, K = k.shape assert K <= 256 + + if impl_type == "auto": + impl_type = "token" if K >= 128 else "recursive" + BT = chunk_size if chunk_indices is None and cu_seqlens is not None: chunk_indices = prepare_chunk_indices(cu_seqlens, BT) NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) - BC = min(16, BT) + BC = 16 NC = triton.cdiv(BT, BC) BK = max(triton.next_power_of_2(K), 16) @@ -464,7 +790,7 @@ def chunk_kda_fwd_intra( NC=NC, ) - if use_token_parallel: + if impl_type == "token": # Token-parallel implementation for sub_intra (each token gets its own block) chunk_kda_fwd_intra_token_parallel( q=q, @@ -477,6 +803,25 @@ def chunk_kda_fwd_intra( cu_seqlens=cu_seqlens, chunk_size=BT, ) + elif impl_type == "recurrent": + grid = (NT, NC, B * H) + chunk_kda_fwd_kernel_intra_sub_intra_recurrent[grid]( + q=q, + k=k, + g=gk, + beta=beta, + Aqk=Aqk, + Akk=Akk, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + K=K, + BT=BT, + BC=16, + BK=BK, + ) else: # Original sub-chunk based implementation grid = (NT, NC, B * H) @@ -494,7 +839,7 @@ def chunk_kda_fwd_intra( H=H, K=K, BT=BT, - BC=BC, + BC=16, BK=BK, )
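
A note on the grouping convention introduced by the first patch: `repeat(k, "... h d -> ... (h g) d", g=self.num_kv_groups)` lays the expanded heads out KV-head-major, so each block of `num_heads // num_key_value_heads` consecutive query heads shares one key/value head. The snippet below is a small standalone check of that layout (toy shapes and plain tensors rather than the layer itself), illustrating the convention the patch relies on.

```python
import torch
from einops import repeat

# Toy GQA shapes: 8 query heads sharing 2 KV heads -> 4 query heads per KV head.
B, T, HQ, HKV, D = 2, 5, 8, 2, 16
num_kv_groups = HQ // HKV

k = torch.randn(B, T, HKV, D)
# Same expansion as applied in ReBasedLinearAttention.forward after the feature map.
k_expanded = repeat(k, "... h d -> ... (h g) d", g=num_kv_groups)

assert k_expanded.shape == (B, T, HQ, D)
# KV head 0 serves query heads 0..3, KV head 1 serves query heads 4..7.
assert torch.equal(k_expanded[:, :, 0], k[:, :, 0])
assert torch.equal(k_expanded[:, :, 3], k[:, :, 0])
assert torch.equal(k_expanded[:, :, 4], k[:, :, 1])
```

The same expansion is applied to both `k` (in feature space) and `v` (in head-dim space), and when `num_key_value_heads == num_heads` the `num_kv_groups > 1` branch is skipped entirely, so the layer keeps its previous multi-head behaviour.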
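
The second patch adds a naive "recurrent" kernel for verification; a PyTorch-level counterpart can be useful as well when checking the "token" and "recursive" paths. The sketch below is a minimal, unoptimized reference for the quantity the sub_intra kernels compute, i.e. the BC x BC block-diagonal portion of `Aqk`/`Akk` in the `[B, T, H, BT]` layout. It assumes a fixed-length batch (no `cu_seqlens`), `chunk_size = 64` with sub-chunks of 16, and gates `g` already given as chunk-local cumulative sums in natural-log space; the helper name `chunk_kda_sub_intra_reference` is illustrative only, not part of the library.

```python
import torch


def chunk_kda_sub_intra_reference(q, k, g, beta, scale, chunk_size=64, sub_chunk_size=16):
    """Naive reference for the sub_intra kernels (fixed-length batches only).

    Shapes: q, k, g are [B, T, H, K]; beta is [B, T, H].
    For tokens i >= j inside the same sub-chunk of size BC:
        Aqk[i, j] = scale   * sum_d q[i, d] * k[j, d] * exp(g[i, d] - g[j, d])
        Akk[i, j] = beta[i] * sum_d k[i, d] * k[j, d] * exp(g[i, d] - g[j, d])   (j < i only)
    written at column (sub-chunk offset + j) of the BT-wide output, matching the
    [B, T, H, BT] layout used by the Triton kernels.
    """
    B, T, H, K = q.shape
    BT, BC = chunk_size, sub_chunk_size
    Aqk = q.new_zeros(B, T, H, BT, dtype=torch.float32)
    Akk = q.new_zeros(B, T, H, BT, dtype=torch.float32)
    qf, kf, gf, bf = q.float(), k.float(), g.float(), beta.float()
    for s in range(0, T, BC):
        e = min(s + BC, T)
        C = e - s
        qi, ki, gi = qf[:, s:e], kf[:, s:e], gf[:, s:e]             # [B, C, H, K]
        # decay[b, i, j, h, d] = exp(g[i, d] - g[j, d]) within the sub-chunk
        decay = torch.exp(gi[:, :, None] - gi[:, None, :])           # [B, C, C, H, K]
        Aq = torch.einsum('bihd,bjhd,bijhd->bhij', qi, ki, decay) * scale
        Ak = torch.einsum('bihd,bjhd,bijhd->bhij', ki, ki, decay)
        Ak = Ak * bf[:, s:e].permute(0, 2, 1)[..., None]             # scale row i by beta[i]
        idx = torch.arange(C, device=q.device)
        Aq = Aq.masked_fill(idx[:, None] < idx[None, :], 0.0)        # keep i >= j
        Ak = Ak.masked_fill(idx[:, None] <= idx[None, :], 0.0)       # keep i >  j
        col = s % BT                                                 # sub-chunk offset inside its chunk
        Aqk[:, s:e, :, col:col + C] = Aq.permute(0, 2, 1, 3)         # [B, C, H, C]
        Akk[:, s:e, :, col:col + C] = Ak.permute(0, 2, 1, 3)
    return Aqk, Akk
```

Only entries whose row and column fall in the same 16-token sub-chunk are covered here; the remaining lower-triangular columns of the BT-wide outputs come from the separate sub_inter kernel. A comparison against `chunk_kda_fwd_intra` (with any of the three `impl_type` options) should therefore be restricted to those block-diagonal positions and allow for rounding of bf16 inputs.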