diff --git a/fast_llm_external_models/apriel2/cache.py b/fast_llm_external_models/apriel2/cache.py deleted file mode 100644 index f83ae87d6..000000000 --- a/fast_llm_external_models/apriel2/cache.py +++ /dev/null @@ -1,406 +0,0 @@ -from __future__ import annotations - -import torch -from transformers.cache_utils import Cache - - -class _AttentionCache: - __slots__ = ["key", "value", "window", "cumulative_length"] - - def __init__(self, window=None): - self.key = None - self.value = None - self.window = window - self.cumulative_length = 0 - - def update(self, key, value): - new_tokens = key.shape[-2] - self.cumulative_length += new_tokens - - if self.key is None: - if self.window and key.shape[-2] > self.window: - self.key = key[..., -self.window :, :].contiguous() - self.value = value[..., -self.window :, :].contiguous() - else: - self.key = key.contiguous() - self.value = value.contiguous() - else: - if self.window: - self.key = self._window(self.key, key) - self.value = self._window(self.value, value) - else: - self.key = torch.cat([self.key, key], -2) - self.value = torch.cat([self.value, value], -2) - return self.key, self.value - - def _window(self, cache, new): - if cache.shape[-2] == self.window and new.shape[-2] == 1: - cache = cache.roll(-1, -2) - cache[..., -1:, :] = new - return cache - return torch.cat([cache, new], -2)[..., -self.window :, :].contiguous() - - def reset(self): - self.key = None - self.value = None - self.cumulative_length = 0 - - def reorder(self, beam_idx): - if self.key is not None: - self.key = self.key.index_select(0, beam_idx.to(self.key.device)) - self.value = self.value.index_select(0, beam_idx.to(self.value.device)) - - def crop(self, max_length): - if self.key is not None: - self.key = self.key[..., :max_length, :] - self.value = self.value[..., :max_length, :] - self.cumulative_length = self.key.shape[-2] - - def batch_repeat(self, repeats): - if self.key is not None: - self.key = self.key.repeat_interleave(repeats, dim=0) - self.value = self.value.repeat_interleave(repeats, dim=0) - - def batch_select(self, indices): - if self.key is not None: - self.key = self.key.index_select(0, indices.to(self.key.device)) - self.value = self.value.index_select(0, indices.to(self.value.device)) - - @property - def is_initialized(self): - return self.key is not None - - @property - def batch_size(self): - return self.key.shape[0] if self.key is not None else None - - -class _SSMCache: - __slots__ = ["conv", "recurrent"] - - def __init__(self): - self.conv = None - self.recurrent = None - - def reset(self): - self.conv = None - self.recurrent = None - - def reorder(self, beam_idx): - if self.conv is not None: - if isinstance(self.conv, tuple): - self.conv = tuple(c.index_select(0, beam_idx.to(c.device)) for c in self.conv) - else: - self.conv = self.conv.index_select(0, beam_idx.to(self.conv.device)) - if self.recurrent is not None: - self.recurrent = self.recurrent.index_select(0, beam_idx.to(self.recurrent.device)) - - def crop(self, max_length): - pass # SSM caches don't have sequence dimension to crop - - def batch_repeat(self, repeats): - if self.conv is not None: - if isinstance(self.conv, tuple): - self.conv = tuple(c.repeat_interleave(repeats, dim=0) for c in self.conv) - else: - self.conv = self.conv.repeat_interleave(repeats, dim=0) - if self.recurrent is not None: - self.recurrent = self.recurrent.repeat_interleave(repeats, dim=0) - - def batch_select(self, indices): - if self.conv is not None: - if isinstance(self.conv, tuple): - self.conv = 
tuple(c.index_select(0, indices.to(c.device)) for c in self.conv) - else: - self.conv = self.conv.index_select(0, indices.to(self.conv.device)) - if self.recurrent is not None: - self.recurrent = self.recurrent.index_select(0, indices.to(self.recurrent.device)) - - @property - def is_initialized(self): - return self.conv is not None - - @property - def batch_size(self): - if self.conv is None: - return None - if isinstance(self.conv, tuple): - return self.conv[0].shape[0] - return self.conv.shape[0] - - -class _DummyCacheLayer: - pass - - -class Apriel2Cache(Cache): - - def __init__(self, config): - super().__init__(layer_class_to_replicate=_DummyCacheLayer) - self.config = config - n = config.decoder["num_blocks"] - self.layers = [] - self.mixer_types = [] - self.active_mixers = [None] * n - - for i in range(n): - block = config.get_block_config(i) - mixer = block.get("mixer", {}) - mtype = mixer.get("type", "attention") - - if mtype == "stochastic": - sub = {} - main = mixer.get("main_mixer_name") - for name, cfg in mixer.get("mixers", {}).items(): - if cfg.get("type") == "attention": - sub[name] = _AttentionCache(cfg.get("window_size")) - else: - sub[name] = _SSMCache() - self.layers.append(sub) - self.mixer_types.append(mixer["mixers"][main].get("type") if main else "attention") - elif mtype == "attention": - self.layers.append(_AttentionCache(mixer.get("window_size"))) - self.mixer_types.append("attention") - else: - self.layers.append(_SSMCache()) - self.mixer_types.append(mtype) - - def update(self, key_states, value_states, layer_idx, cache_kwargs=None): - layer = self.layers[layer_idx] - if isinstance(layer, dict): - mixer = self.active_mixers[layer_idx] - if mixer is None: - raise RuntimeError(f"Stochastic layer {layer_idx} needs active_mixer set") - return layer[mixer].update(key_states, value_states) - return layer.update(key_states, value_states) - - def set_active_mixer(self, layer_idx, mixer_name): - self.active_mixers[layer_idx] = mixer_name - - def get_seq_length(self, layer_idx=0): - """Returns the cumulative sequence length of tokens seen by the cache. - - For sliding window caches, this returns the total tokens seen (not just cached). - This matches HuggingFace's DynamicSlidingWindowLayer behavior. - """ - layer = self.layers[layer_idx] - if isinstance(layer, dict): - mixer = self.active_mixers[layer_idx] - if mixer and isinstance(layer[mixer], _AttentionCache): - return layer[mixer].cumulative_length - return 0 - if isinstance(layer, _AttentionCache): - return layer.cumulative_length - return 0 - - def get_max_cache_shape(self, layer_idx=0): - layer = self.layers[layer_idx] - if isinstance(layer, dict): - mixer = self.active_mixers[layer_idx] - if mixer and isinstance(layer[mixer], _AttentionCache): - return layer[mixer].window - elif isinstance(layer, _AttentionCache): - return layer.window - return None - - def get_mask_sizes(self, cache_position, layer_idx): - """Return the length and offset of the cache, used to generate the attention mask. 
- - For standard (non-sliding) attention: - kv_offset = 0 (KV[0] corresponds to sequence position 0) - kv_length = cumulative_length + query_length - - For sliding window attention: - kv_offset = max(cumulative_length - window + 1, 0) - kv_length = min(cumulative_length, window - 1) + query_length - - For SSM/linear layers: - kv_offset = 0, kv_length = query_length (no KV cache to attend to) - """ - query_length = cache_position.shape[0] - layer = self.layers[layer_idx] - - # Handle stochastic layers by getting the active mixer's cache - if isinstance(layer, dict): - mixer = self.active_mixers[layer_idx] - if mixer is None: - # No active mixer set, return defaults - return query_length, 0 - cache = layer[mixer] - else: - cache = layer - - # SSM layers don't have KV cache for attention mask purposes - if isinstance(cache, _SSMCache): - return query_length, 0 - - # Attention cache - check if sliding window - if isinstance(cache, _AttentionCache): - cumulative = cache.cumulative_length - window = cache.window - - if window is not None: - # Sliding window attention - kv_offset = max(cumulative - window + 1, 0) - if cumulative >= window: - kv_length = window - 1 + query_length - else: - kv_length = cumulative + query_length - else: - # Full attention - kv_offset = 0 - kv_length = cumulative + query_length - - return kv_length, kv_offset - - # Fallback - return query_length, 0 - - @property - def has_previous_state(self): - return any(isinstance(cache, _SSMCache) and cache.conv is not None for cache in self._iter_caches()) - - @property - def key_cache(self): - return _LayerListAccessor(self, "key") - - @property - def value_cache(self): - return _LayerListAccessor(self, "value") - - @property - def conv_states(self): - return _LayerListAccessor(self, "conv") - - @property - def recurrent_states(self): - return _LayerListAccessor(self, "recurrent") - - def _iter_caches(self): - """Iterate over all leaf cache objects (flattening stochastic layer dicts).""" - for layer in self.layers: - if isinstance(layer, dict): - yield from layer.values() - else: - yield layer - - def reorder_cache(self, beam_idx): - for cache in self._iter_caches(): - cache.reorder(beam_idx) - - def reset(self): - for cache in self._iter_caches(): - cache.reset() - - def crop(self, max_length): - for cache in self._iter_caches(): - cache.crop(max_length) - - def batch_repeat_interleave(self, repeats): - for cache in self._iter_caches(): - cache.batch_repeat(repeats) - - def batch_select_indices(self, indices): - for cache in self._iter_caches(): - cache.batch_select(indices) - - @property - def is_compileable(self): - return False - - @property - def is_initialized(self): - return any(cache.is_initialized for cache in self._iter_caches()) - - @property - def is_sliding(self): - result = [] - for layer in self.layers: - if isinstance(layer, dict): - has_sliding = any( - isinstance(cache, _AttentionCache) and cache.window is not None for cache in layer.values() - ) - result.append(has_sliding) - elif isinstance(layer, _AttentionCache): - result.append(layer.window is not None) - else: - result.append(False) - return result - - @property - def max_batch_size(self): - for cache in self._iter_caches(): - bs = cache.batch_size - if bs is not None: - return bs - return None - - @property - def max_cache_len(self): - windows = [ - cache.window - for cache in self._iter_caches() - if isinstance(cache, _AttentionCache) and cache.window is not None - ] - return min(windows) if windows else None - - def __len__(self): - return 
len(self.layers) - - def __getitem__(self, idx): - layer = self.layers[idx] - if isinstance(layer, dict): - mixer = self.active_mixers[idx] - if mixer and isinstance(layer[mixer], _AttentionCache): - c = layer[mixer] - if c.key is not None: - return c.key, c.value - elif isinstance(layer, _AttentionCache): - if layer.key is not None: - return layer.key, layer.value - - for i, l in enumerate(self.layers): - if isinstance(l, _AttentionCache) and l.key is not None: - return torch.empty((0,), device=l.key.device, dtype=l.key.dtype), torch.empty( - (0,), device=l.key.device, dtype=l.key.dtype - ) - elif isinstance(l, dict): - for c in l.values(): - if isinstance(c, _AttentionCache) and c.key is not None: - return torch.empty((0,), device=c.key.device, dtype=c.key.dtype), torch.empty( - (0,), device=c.key.device, dtype=c.key.dtype - ) - return torch.empty((0,)), torch.empty((0,)) - - -class _LayerListAccessor: - __slots__ = ["cache", "attr"] - - def __init__(self, cache, attr): - self.cache = cache - self.attr = attr - - def __getitem__(self, idx): - layer = self.cache.layers[idx] - if isinstance(layer, dict): - mixer = self.cache.active_mixers[idx] - if mixer is None: - raise RuntimeError( - f"Stochastic layer {idx} requires set_active_mixer() to be called before accessing cache. " - f"Available mixers: {list(layer.keys())}" - ) - return getattr(layer[mixer], self.attr) - return getattr(layer, self.attr, None) - - def __setitem__(self, idx, value): - layer = self.cache.layers[idx] - if isinstance(layer, dict): - mixer = self.cache.active_mixers[idx] - if mixer is None: - raise RuntimeError( - f"Stochastic layer {idx} requires set_active_mixer() to be called before accessing cache. " - f"Available mixers: {list(layer.keys())}" - ) - setattr(layer[mixer], self.attr, value) - elif hasattr(layer, self.attr): - setattr(layer, self.attr, value) diff --git a/fast_llm_external_models/apriel2/modeling_apriel2.py b/fast_llm_external_models/apriel2/modeling_apriel2.py index a37d6fcc8..e30fbc9e3 100644 --- a/fast_llm_external_models/apriel2/modeling_apriel2.py +++ b/fast_llm_external_models/apriel2/modeling_apriel2.py @@ -17,6 +17,7 @@ from transformers.models.llama.modeling_llama import eager_attention_forward from transformers.models.mistral.modeling_mistral import MistralMLP, MistralRMSNorm, apply_rotary_pos_emb from transformers.processing_utils import Unpack +from transformers.cache_utils import Cache from transformers.utils import logging from transformers.utils.import_utils import ( is_causal_conv1d_available, @@ -24,14 +25,14 @@ is_torch_flex_attn_available, ) -from .cache import Apriel2Cache from .configuration_apriel2 import Apriel2Config, Apriel2TextConfig # GDN implementation - matches Fast-LLM's gdn.py exactly try: - from fla.ops.gated_delta_rule import chunk_gated_delta_rule + from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule except ImportError: chunk_gated_delta_rule = None + fused_recurrent_gated_delta_rule = None try: from fla.modules.fused_norm_gate import rms_norm_gated @@ -56,96 +57,447 @@ logger = logging.get_logger(__name__) -if not is_fast_path_available: - logger.warning( - "Mamba fast path not available. Requires CUDA, mamba_ssm, and causal_conv1d packages. " - "Falling back to PyTorch implementation (slower, CPU-compatible)." 
- ) +# ============================================================================= +# Cache Classes +# ============================================================================= -class BlockSequenceKwargs(TypedDict, total=False): - attention_mask: Optional[torch.Tensor] - position_ids: Optional[torch.LongTensor] - cache_position: Optional[torch.LongTensor] - past_key_values: Optional[Apriel2Cache] - output_attentions: bool - output_hidden_states: bool - use_cache: bool +class _AttentionCache: + __slots__ = ["key", "value", "window", "cumulative_length"] -class PreprocessingOutput(TypedDict, total=False): - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] - attention_mask: Optional[torch.Tensor] + def __init__(self, window=None): + self.key = None + self.value = None + self.window = window + self.cumulative_length = 0 + def update(self, key, value): + new_tokens = key.shape[-2] + self.cumulative_length += new_tokens -@torch.compile -def torch_causal_conv1d_fn(x, weight, bias=None, activation="silu"): - assert activation == "silu", f"Only silu activation is supported, got {activation}" + if self.key is None: + if self.window and key.shape[-2] > self.window: + self.key = key[..., -self.window :, :].contiguous() + self.value = value[..., -self.window :, :].contiguous() + else: + self.key = key.contiguous() + self.value = value.contiguous() + else: + if self.window: + self.key = self._window(self.key, key) + self.value = self._window(self.value, value) + else: + self.key = torch.cat([self.key, key], -2) + self.value = torch.cat([self.value, value], -2) + return self.key, self.value + + def _window(self, cache, new): + if cache.shape[-2] == self.window and new.shape[-2] == 1: + cache = cache.roll(-1, -2) + cache[..., -1:, :] = new + return cache + return torch.cat([cache, new], -2)[..., -self.window :, :].contiguous() + + def reset(self): + self.key = None + self.value = None + self.cumulative_length = 0 + + def reorder(self, beam_idx): + if self.key is not None: + self.key = self.key.index_select(0, beam_idx.to(self.key.device)) + self.value = self.value.index_select(0, beam_idx.to(self.value.device)) + + def crop(self, max_length): + if self.key is not None: + self.key = self.key[..., :max_length, :] + self.value = self.value[..., :max_length, :] + self.cumulative_length = self.key.shape[-2] + + def batch_repeat(self, repeats): + if self.key is not None: + self.key = self.key.repeat_interleave(repeats, dim=0) + self.value = self.value.repeat_interleave(repeats, dim=0) + + def batch_select(self, indices): + if self.key is not None: + self.key = self.key.index_select(0, indices.to(self.key.device)) + self.value = self.value.index_select(0, indices.to(self.value.device)) - seqlen = x.shape[-1] - kernel_size = weight.shape[-1] + @property + def is_initialized(self): + return self.key is not None - # Causal padding and depthwise conv - x = F.pad(x, (kernel_size - 1, 0)) - x = F.conv1d(x, weight.unsqueeze(1), bias=bias, groups=x.shape[1]) - x = x[..., :seqlen] + @property + def batch_size(self): + return self.key.shape[0] if self.key is not None else None - return F.silu(x) +class _SSMCache: + __slots__ = ["conv", "recurrent"] -@torch.compile -def torch_causal_conv1d_update(x, conv_state, weight, bias=None, activation="silu"): - """ - Single-step causal convolution update. 
+ def __init__(self): + self.conv = None + self.recurrent = None - Args: - x: New input [batch, dim] - conv_state: Previous state [batch, dim, kernel_size-1], updated in-place - weight: Convolution kernel [dim, kernel_size] - bias: Optional bias [dim] - activation: Activation function name + def reset(self): + self.conv = None + self.recurrent = None - Returns: - Output [batch, dim] - """ - assert activation == "silu", f"Only silu activation is supported, got {activation}" + def reorder(self, beam_idx): + if self.conv is not None: + if isinstance(self.conv, tuple): + self.conv = tuple(c.index_select(0, beam_idx.to(c.device)) for c in self.conv) + else: + self.conv = self.conv.index_select(0, beam_idx.to(self.conv.device)) + if self.recurrent is not None: + self.recurrent = self.recurrent.index_select(0, beam_idx.to(self.recurrent.device)) + + def crop(self, max_length): + pass # SSM caches don't have sequence dimension to crop + + def batch_repeat(self, repeats): + if self.conv is not None: + if isinstance(self.conv, tuple): + self.conv = tuple(c.repeat_interleave(repeats, dim=0) for c in self.conv) + else: + self.conv = self.conv.repeat_interleave(repeats, dim=0) + if self.recurrent is not None: + self.recurrent = self.recurrent.repeat_interleave(repeats, dim=0) + + def batch_select(self, indices): + if self.conv is not None: + if isinstance(self.conv, tuple): + self.conv = tuple(c.index_select(0, indices.to(c.device)) for c in self.conv) + else: + self.conv = self.conv.index_select(0, indices.to(self.conv.device)) + if self.recurrent is not None: + self.recurrent = self.recurrent.index_select(0, indices.to(self.recurrent.device)) + + @property + def is_initialized(self): + return self.conv is not None - dtype = x.dtype - # Concatenate state with new input to get full kernel_size window - # conv_state: [batch, dim, kernel_size-1], x: [batch, dim] -> full: [batch, dim, kernel_size] - full_state = torch.cat([conv_state, x.unsqueeze(-1)], dim=-1) + @property + def batch_size(self): + if self.conv is None: + return None + if isinstance(self.conv, tuple): + return self.conv[0].shape[0] + return self.conv.shape[0] - # Convolve: sum over last dimension - out = torch.sum(full_state * weight.unsqueeze(0), dim=-1) - if bias is not None: - out = out + bias - # Update state in-place: shift left and add new value - conv_state.copy_(full_state[:, :, 1:]) +class _DummyCacheLayer: + pass - return F.silu(out).to(dtype=dtype) +class Apriel2Cache(Cache): -def torch_selective_scan_fn( - u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=True, return_last_state=False -): - raise NotImplementedError("torch_selective_scan_fn not yet implemented. 
Install mamba_ssm for CUDA kernels.") + def __init__(self, config): + super().__init__(layer_class_to_replicate=_DummyCacheLayer) + self.config = config + n = config.decoder["num_blocks"] + self.layers = [] + self.mixer_types = [] + self.active_mixers = [None] * n + + for i in range(n): + block = config.get_block_config(i) + mixer = block.get("mixer", {}) + mtype = mixer.get("type", "attention") + + if mtype == "stochastic": + sub = {} + main = mixer.get("main_mixer_name") + for name, cfg in mixer.get("mixers", {}).items(): + if cfg.get("type") == "attention": + sub[name] = _AttentionCache(cfg.get("window_size")) + else: + sub[name] = _SSMCache() + self.layers.append(sub) + self.mixer_types.append(mixer["mixers"][main].get("type") if main else "attention") + elif mtype == "attention": + self.layers.append(_AttentionCache(mixer.get("window_size"))) + self.mixer_types.append("attention") + else: + self.layers.append(_SSMCache()) + self.mixer_types.append(mtype) + + def update(self, key_states, value_states, layer_idx, cache_kwargs=None): + layer = self.layers[layer_idx] + if isinstance(layer, dict): + mixer = self.active_mixers[layer_idx] + if mixer is None: + raise RuntimeError(f"Stochastic layer {layer_idx} needs active_mixer set") + return layer[mixer].update(key_states, value_states) + return layer.update(key_states, value_states) + + def set_active_mixer(self, layer_idx, mixer_name): + self.active_mixers[layer_idx] = mixer_name + + def get_seq_length(self, layer_idx=0): + """Returns the cumulative sequence length of tokens seen by the cache. + + For sliding window caches, this returns the total tokens seen (not just cached). + This matches HuggingFace's DynamicSlidingWindowLayer behavior. + """ + layer = self.layers[layer_idx] + if isinstance(layer, dict): + mixer = self.active_mixers[layer_idx] + if mixer and isinstance(layer[mixer], _AttentionCache): + return layer[mixer].cumulative_length + return 0 + if isinstance(layer, _AttentionCache): + return layer.cumulative_length + return 0 + + def get_max_cache_shape(self, layer_idx=0): + layer = self.layers[layer_idx] + if isinstance(layer, dict): + mixer = self.active_mixers[layer_idx] + if mixer and isinstance(layer[mixer], _AttentionCache): + return layer[mixer].window + elif isinstance(layer, _AttentionCache): + return layer.window + return None + + def get_mask_sizes(self, cache_position, layer_idx): + """Return the length and offset of the cache, used to generate the attention mask. 
+ + For standard (non-sliding) attention: + kv_offset = 0 (KV[0] corresponds to sequence position 0) + kv_length = cumulative_length + query_length + + For sliding window attention: + kv_offset = max(cumulative_length - window + 1, 0) + kv_length = min(cumulative_length, window - 1) + query_length + + For SSM/linear layers: + kv_offset = 0, kv_length = query_length (no KV cache to attend to) + """ + query_length = cache_position.shape[0] + layer = self.layers[layer_idx] + + # Handle stochastic layers by getting the active mixer's cache + if isinstance(layer, dict): + mixer = self.active_mixers[layer_idx] + if mixer is None: + # No active mixer set, return defaults + return query_length, 0 + cache = layer[mixer] + else: + cache = layer + + # SSM layers don't have KV cache for attention mask purposes + if isinstance(cache, _SSMCache): + return query_length, 0 + + # Attention cache - check if sliding window + if isinstance(cache, _AttentionCache): + cumulative = cache.cumulative_length + window = cache.window + + if window is not None: + # Sliding window attention + kv_offset = max(cumulative - window + 1, 0) + if cumulative >= window: + kv_length = window - 1 + query_length + else: + kv_length = cumulative + query_length + else: + # Full attention + kv_offset = 0 + kv_length = cumulative + query_length + return kv_length, kv_offset -def torch_selective_state_update(state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=True): - raise NotImplementedError("torch_selective_state_update not yet implemented. Install mamba_ssm for CUDA kernels.") + # Fallback + return query_length, 0 + @property + def has_previous_state(self): + return any(isinstance(cache, _SSMCache) and cache.conv is not None for cache in self._iter_caches()) -if is_fast_path_available: - from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn - from causal_conv1d import causal_conv1d_update as _causal_conv1d_update - from mamba_ssm.ops.selective_scan_interface import selective_scan_fn - from mamba_ssm.ops.triton.selective_state_update import selective_state_update -else: - _causal_conv1d_fn = None - _causal_conv1d_update = None - selective_scan_fn = torch_selective_scan_fn - selective_state_update = torch_selective_state_update + @property + def key_cache(self): + return _LayerListAccessor(self, "key") + + @property + def value_cache(self): + return _LayerListAccessor(self, "value") + + @property + def conv_states(self): + return _LayerListAccessor(self, "conv") + + @property + def recurrent_states(self): + return _LayerListAccessor(self, "recurrent") + + def _iter_caches(self): + """Iterate over all leaf cache objects (flattening stochastic layer dicts).""" + for layer in self.layers: + if isinstance(layer, dict): + yield from layer.values() + else: + yield layer + + def reorder_cache(self, beam_idx): + for cache in self._iter_caches(): + cache.reorder(beam_idx) + + def reset(self): + for cache in self._iter_caches(): + cache.reset() + + def crop(self, max_length): + for cache in self._iter_caches(): + cache.crop(max_length) + + def batch_repeat_interleave(self, repeats): + for cache in self._iter_caches(): + cache.batch_repeat(repeats) + + def batch_select_indices(self, indices): + for cache in self._iter_caches(): + cache.batch_select(indices) + + @property + def is_compileable(self): + return False + + @property + def is_initialized(self): + return any(cache.is_initialized for cache in self._iter_caches()) + + @property + def is_sliding(self): + result = [] + for layer in self.layers: + if 
isinstance(layer, dict): + has_sliding = any( + isinstance(cache, _AttentionCache) and cache.window is not None for cache in layer.values() + ) + result.append(has_sliding) + elif isinstance(layer, _AttentionCache): + result.append(layer.window is not None) + else: + result.append(False) + return result + + @property + def max_batch_size(self): + for cache in self._iter_caches(): + bs = cache.batch_size + if bs is not None: + return bs + return None + + @property + def max_cache_len(self): + windows = [ + cache.window + for cache in self._iter_caches() + if isinstance(cache, _AttentionCache) and cache.window is not None + ] + return min(windows) if windows else None + + def __len__(self): + return len(self.layers) + + def __getitem__(self, idx): + layer = self.layers[idx] + if isinstance(layer, dict): + mixer = self.active_mixers[idx] + if mixer and isinstance(layer[mixer], _AttentionCache): + c = layer[mixer] + if c.key is not None: + return c.key, c.value + elif isinstance(layer, _AttentionCache): + if layer.key is not None: + return layer.key, layer.value + + for i, l in enumerate(self.layers): + if isinstance(l, _AttentionCache) and l.key is not None: + return torch.empty((0,), device=l.key.device, dtype=l.key.dtype), torch.empty( + (0,), device=l.key.device, dtype=l.key.dtype + ) + elif isinstance(l, dict): + for c in l.values(): + if isinstance(c, _AttentionCache) and c.key is not None: + return torch.empty((0,), device=c.key.device, dtype=c.key.dtype), torch.empty( + (0,), device=c.key.device, dtype=c.key.dtype + ) + return torch.empty((0,)), torch.empty((0,)) + + +class _LayerListAccessor: + __slots__ = ["cache", "attr"] + + def __init__(self, cache, attr): + self.cache = cache + self.attr = attr + + def __getitem__(self, idx): + layer = self.cache.layers[idx] + if isinstance(layer, dict): + mixer = self.cache.active_mixers[idx] + if mixer is None: + raise RuntimeError( + f"Stochastic layer {idx} requires set_active_mixer() to be called before accessing cache. " + f"Available mixers: {list(layer.keys())}" + ) + return getattr(layer[mixer], self.attr) + return getattr(layer, self.attr, None) + + def __setitem__(self, idx, value): + layer = self.cache.layers[idx] + if isinstance(layer, dict): + mixer = self.cache.active_mixers[idx] + if mixer is None: + raise RuntimeError( + f"Stochastic layer {idx} requires set_active_mixer() to be called before accessing cache. " + f"Available mixers: {list(layer.keys())}" + ) + setattr(layer[mixer], self.attr, value) + elif hasattr(layer, self.attr): + setattr(layer, self.attr, value) + + +# ============================================================================= +# TypedDict Classes +# ============================================================================= + + +class BlockSequenceKwargs(TypedDict, total=False): + attention_mask: Optional[torch.Tensor] + position_ids: Optional[torch.LongTensor] + cache_position: Optional[torch.LongTensor] + past_key_values: Optional[Apriel2Cache] + output_attentions: bool + output_hidden_states: bool + use_cache: bool + + +class PreprocessingOutput(TypedDict, total=False): + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] + attention_mask: Optional[torch.Tensor] + + + + +# Require fast path CUDA kernels - no silent fallback to unoptimized code paths +if not is_fast_path_available: + raise ImportError( + "CausalConv1d and Mamba require CUDA kernels from causal_conv1d and mamba_ssm. 
" + "Install with: pip install causal-conv1d mamba-ssm" + ) + +from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn +from causal_conv1d import causal_conv1d_update as _causal_conv1d_update +from mamba_ssm.ops.selective_scan_interface import selective_scan_fn +from mamba_ssm.ops.triton.selective_state_update import selective_state_update class CausalConv1d(nn.Conv1d): @@ -158,7 +510,8 @@ class CausalConv1d(nn.Conv1d): Supports: - Prefill mode: process full sequence, optionally return final state for caching - Decode mode: single-token update using cached conv state - - CUDA fast path (causal_conv1d library) with automatic CPU/fallback support + + Requires causal_conv1d library for CUDA kernels (no PyTorch fallback). """ def __init__( @@ -185,10 +538,6 @@ def _weight(self) -> torch.Tensor: """Weight in [dim, kernel_size] format for causal_conv1d functions.""" return self.weight.squeeze(1) - def _use_fast_path(self, x: torch.Tensor) -> bool: - """Check if we can use CUDA fast path.""" - return _causal_conv1d_fn is not None and x.device.type == "cuda" - def forward( self, x: torch.Tensor, @@ -210,76 +559,61 @@ def forward( If return_final_state is True: (output, final_state) tuple """ batch_size, dim, seq_len = x.shape + state_len = self.kernel_size[0] - 1 + # Edge case: seq_len==1 with return_final_state # CUDA kernel limitation: return_final_states requires channel-last layout, - # which is impossible to achieve when seq_len==1. Fall back to PyTorch. - use_fast_path = self._use_fast_path(x) and not (return_final_state and seq_len == 1) - - if use_fast_path: - # CUDA fast path - if return_final_state: - # causal_conv1d requires channel-last layout for returning final states. - # Channel-last means: stride(1)==1 AND stride(2)==dim (channels are contiguous). - # For shape [batch, dim, seq], standard contiguous is (dim*seq, seq, 1). - # Channel-last is (dim*seq, 1, dim) - achieved via transpose+contiguous+transpose. - if x.stride(1) != 1 or x.stride(2) < dim: - x = x.transpose(1, 2).contiguous().transpose(1, 2) - # Allocate final state buffer with correct memory layout - # causal_conv1d requires final_states.stride(1) == 1 - final_state = x.new_zeros(batch_size, self.kernel_size[0] - 1, dim).transpose(1, 2) - else: - final_state = None - - out = _causal_conv1d_fn( - x, + # which is impossible when seq_len==1. Handle via update() with zero-init state. 
+ if return_final_state and seq_len == 1: + # Initialize zero state if none provided, with channel-last layout for CUDA kernel + if conv_state is None: + # Create channel-last state: stride(1) == 1 + conv_state = x.new_zeros(batch_size, state_len, dim).transpose(1, 2) + # Use update() which handles single tokens efficiently + out = _causal_conv1d_update( + x.squeeze(2), # [batch, dim, 1] -> [batch, dim] + conv_state, self._weight, bias=self.bias, - initial_states=conv_state, - return_final_states=return_final_state, - final_states_out=final_state, activation=self._activation, ) - - if return_final_state: - if isinstance(out, tuple): - out, final_state = out - # Return a contiguous copy (still in channel-last layout) so callers can modify it in-place - # final_state has shape [batch, dim, state_len] with channel-last strides - # We need to preserve the channel-last layout for subsequent CUDA kernel calls - if final_state.stride(1) != 1: - # Already contiguous in channel-last - pass - else: - # Make a copy that's safe to modify in-place - final_state = final_state.clone() - return out, final_state - return out + return out.unsqueeze(2), conv_state # [batch, dim, 1], updated state + + # Standard CUDA path + if return_final_state: + # causal_conv1d requires channel-last layout for returning final states. + # Channel-last means: stride(1)==1 AND stride(2)==dim (channels are contiguous). + # For shape [batch, dim, seq], standard contiguous is (dim*seq, seq, 1). + # Channel-last is (dim*seq, 1, dim) - achieved via transpose+contiguous+transpose. + if x.stride(1) != 1 or x.stride(2) < dim: + x = x.transpose(1, 2).contiguous().transpose(1, 2) + # Allocate final state buffer with correct memory layout + # causal_conv1d requires final_states.stride(1) == 1 + final_state = x.new_zeros(batch_size, state_len, dim).transpose(1, 2) else: - # PyTorch fallback - state_len = self.kernel_size[0] - 1 - - if conv_state is not None: - # Prepend state to input for proper convolution with history - x_with_state = torch.cat([conv_state, x], dim=-1) - out_with_state = torch_causal_conv1d_fn( - x_with_state, self._weight, bias=self.bias, activation=self._activation - ) - # Only keep outputs for the new input positions (not the state positions) - out = out_with_state[:, :, state_len:] - else: - out = torch_causal_conv1d_fn(x, self._weight, bias=self.bias, activation=self._activation) - - if return_final_state: - # Final state: last kernel_size-1 positions of input (with state if provided) - if conv_state is not None: - combined = torch.cat([conv_state, x], dim=-1) - final_state = combined[:, :, -state_len:].clone() - elif seq_len < state_len: - final_state = F.pad(x, (state_len - seq_len, 0)) - else: - final_state = x[:, :, -state_len:].clone() - return out, final_state - return out + final_state = None + + out = _causal_conv1d_fn( + x, + self._weight, + bias=self.bias, + initial_states=conv_state, + return_final_states=return_final_state, + final_states_out=final_state, + activation=self._activation, + ) + + if return_final_state: + if isinstance(out, tuple): + out, final_state = out + # final_state has shape [batch, dim, state_len] with channel-last strides + # Ensure it's safe for in-place updates by subsequent CUDA kernel calls + assert final_state is not None + if final_state.stride(1) == 1: + # Make a copy that's safe to modify in-place + final_state = final_state.clone() + return out, final_state + return out def update( self, @@ -296,22 +630,13 @@ def update( Returns: Output tensor [batch, dim] """ - if 
self._use_fast_path(x): - return _causal_conv1d_update( - x, - conv_state, - self._weight, - bias=self.bias, - activation=self._activation, - ) - else: - return torch_causal_conv1d_update( - x, - conv_state, - self._weight, - bias=self.bias, - activation=self._activation, - ) + return _causal_conv1d_update( + x, + conv_state, + self._weight, + bias=self.bias, + activation=self._activation, + ) def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -958,93 +1283,10 @@ def _l2norm(x: torch.Tensor, dim: int = -1, eps: float = 1e-6) -> torch.Tensor: return x * torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps) -def torch_chunk_gated_delta_rule( - query, - key, - value, - g, - beta, - chunk_size=64, - initial_state=None, - output_final_state=False, - use_qk_l2norm_in_kernel=False, -): - """Pure PyTorch fallback for chunk_gated_delta_rule - matches Fast-LLM's gdn.py.""" - initial_dtype = query.dtype - if use_qk_l2norm_in_kernel: - query = _l2norm(query, dim=-1, eps=1e-6) - key = _l2norm(key, dim=-1, eps=1e-6) - query, key, value, beta, g = ( - x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g) - ) - - batch_size, num_heads, sequence_length, k_head_dim = key.shape - v_head_dim = value.shape[-1] - pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size - query = F.pad(query, (0, 0, 0, pad_size)) - key = F.pad(key, (0, 0, 0, pad_size)) - value = F.pad(value, (0, 0, 0, pad_size)) - beta = F.pad(beta, (0, pad_size)) - g = F.pad(g, (0, pad_size)) - total_sequence_length = sequence_length + pad_size - scale = 1 / (query.shape[-1] ** 0.5) - query = query * scale - - v_beta = value * beta.unsqueeze(-1) - k_beta = key * beta.unsqueeze(-1) - # reshape to chunks - query, key, value, k_beta, v_beta = ( - x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1]) for x in (query, key, value, k_beta, v_beta) - ) - g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size) - mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=0) - - # chunk decay - g = g.cumsum(dim=-1) - decay_mask = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float()).tril() - attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0) - for i in range(1, chunk_size): - row = attn[..., i, :i].clone() - sub = attn[..., :i, :i].clone() - attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) - attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) - value = attn @ v_beta - k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) - last_recurrent_state = ( - torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value) - if initial_state is None - else initial_state.to(value) - ) - core_attn_out = torch.zeros_like(value) - mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=1) - - # for each chunk - for i in range(0, total_sequence_length // chunk_size): - q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i] - attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) - v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state - v_new = v_i - v_prime - attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state - core_attn_out[:, :, i] = attn_inter + attn @ v_new - last_recurrent_state = ( - last_recurrent_state * g[:, :, i, -1, None, None].exp() - + (k_i * (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(-1, -2) @ v_new - ) - - if not output_final_state: - last_recurrent_state = 
None - elif last_recurrent_state is not None: - last_recurrent_state = last_recurrent_state.to(initial_dtype) - core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1]) - core_attn_out = core_attn_out[:, :, :sequence_length] - core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype) - return core_attn_out, last_recurrent_state - - class GatedRMSNormalization(nn.Module): """ Gated RMS normalization layer matching Fast-LLM's implementation. - Uses fla.modules.fused_norm_gate.rms_norm_gated when available. + Uses fla.modules.fused_norm_gate.rms_norm_gated (required). Args: hidden_size: Size of the hidden dimension @@ -1054,18 +1296,16 @@ class GatedRMSNormalization(nn.Module): def __init__(self, hidden_size: int, eps: float = 1e-5, activation: str = "silu"): super().__init__() + if rms_norm_gated is None: + raise ImportError( + "GatedRMSNormalization requires rms_norm_gated from fla library. " + "Install with: pip install fla-core" + ) self.weight = nn.Parameter(torch.ones(hidden_size)) self.eps = eps self.activation = activation def forward(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor: - # Use PyTorch fallback on CPU since fla requires CUDA - if rms_norm_gated is not None and input_.device.type != "cpu": - return self._forward_fla(input_, gate) - else: - return self._forward_local(input_, gate) - - def _forward_fla(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor: return rms_norm_gated( input_, gate, @@ -1078,19 +1318,6 @@ def _forward_fla(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor residual_in_fp32=False, ) - def _forward_local(self, input_: torch.Tensor, gate: torch.Tensor) -> torch.Tensor: - """Pure PyTorch fallback for gated RMS normalization.""" - input_dtype = input_.dtype - hidden_states = input_.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.eps) - hidden_states = self.weight * hidden_states.to(input_dtype) - # Apply gating with configured activation - if self.activation == "sigmoid": - return hidden_states * torch.sigmoid(gate) - else: # silu - return hidden_states * F.silu(gate) - class Apriel2GatedDeltaNet(nn.Module): """ @@ -1156,13 +1383,11 @@ def __init__( # Normalization layer - named 'norm' with 'weight' param to match Fast-LLM self.norm = GatedRMSNormalization(self.value_head_dim, eps=self.norm_eps) - # Select kernel implementation - fla if available, else torch fallback - self._chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule - - if chunk_gated_delta_rule is None: - logger.warning( - "GatedDeltaNet fast path not available. Install fla library for optimized kernels. " - "Falling back to PyTorch implementation." + # Require FLA kernels - no silent fallback to unoptimized code paths + if chunk_gated_delta_rule is None or fused_recurrent_gated_delta_rule is None: + raise ImportError( + "GatedDeltaNet requires the fla library for optimized kernels. 
" + "Install with: pip install fla-core" ) def _fix_query_key_value_ordering(self, mixed_qkvz: torch.Tensor, mixed_ba: torch.Tensor): @@ -1272,21 +1497,16 @@ def forward(self, hidden_states: torch.Tensor, past_key_values=None, attention_m query = query.repeat_interleave(self.value_heads_per_key, dim=2) key = key.repeat_interleave(self.value_heads_per_key, dim=2) - # Run gated delta rule - # Use PyTorch fallback on CPU since fla requires CUDA - chunk_fn = self._chunk_gated_delta_rule - if query.device.type == "cpu" and chunk_gated_delta_rule is not None: - chunk_fn = torch_chunk_gated_delta_rule - + # Run gated delta rule (FLA kernels required) if not use_precomputed_states: # Chunked mode for prefill - output, last_recurrent_state = chunk_fn( + output, last_recurrent_state = chunk_gated_delta_rule( query, key, value, g=g, beta=beta_gate, - initial_state=None, + initial_state=recurrent_state, output_final_state=past_key_values is not None, use_qk_l2norm_in_kernel=True, ) @@ -1295,11 +1515,15 @@ def forward(self, hidden_states: torch.Tensor, past_key_values=None, attention_m last_recurrent_state = last_recurrent_state.to(hidden_states.dtype) else: # Recurrent mode for single token decode - # Convert recurrent_state to match hidden_states dtype if needed - if recurrent_state is not None and recurrent_state.dtype != hidden_states.dtype: - recurrent_state = recurrent_state.to(hidden_states.dtype) - output, last_recurrent_state = self._recurrent_gated_delta_rule( - query, key, value, g, beta_gate, recurrent_state + output, last_recurrent_state = fused_recurrent_gated_delta_rule( + query, + key, + value, + g=g, + beta=beta_gate, + initial_state=recurrent_state, + output_final_state=past_key_values is not None, + use_qk_l2norm_in_kernel=True, ) # Update recurrent state in cache @@ -1319,69 +1543,6 @@ def forward(self, hidden_states: torch.Tensor, past_key_values=None, attention_m return (output,) - def _recurrent_gated_delta_rule(self, query, key, value, g, beta, state): - """Single-step recurrent update for cached inference. - - Input shapes: [batch, seq=1, heads, dim] - State shape: [batch, heads, key_dim, value_dim] - - Implements the delta rule recurrence: - 1. Decay state: S = S * exp(g) - 2. Retrieve memory: mem = S @ k - 3. Compute delta: delta = (v - mem) * beta - 4. Update state: S = S + k ⊗ delta - 5. Output: o = S @ q (scaled) - """ - input_dtype = query.dtype - - # Transpose from [batch, seq, heads, dim] to [batch, heads, seq, dim] - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - - # L2 normalize query and key - query = _l2norm(query, dim=-1, eps=1e-6) - key = _l2norm(key, dim=-1, eps=1e-6) - - # Apply query scaling (matches chunked mode) - scale = 1.0 / (query.shape[-1] ** 0.5) - query = query * scale - - # Reshape for computation: [batch, heads, 1, dim] -> [batch, heads, dim] - query = query.squeeze(2) - key = key.squeeze(2) - value = value.squeeze(2) - g = g.squeeze(1) - beta = beta.squeeze(1) - - # 1. Decay state: S = S * exp(g) - decay = g.exp().to(input_dtype).unsqueeze(-1).unsqueeze(-1) # [batch, heads, 1, 1] - state = state * decay - - # 2. Retrieve memory: mem = S @ k = (S * k.unsqueeze(-1)).sum(dim=-2) - # state: [batch, heads, key_dim, value_dim], key: [batch, heads, key_dim] - kv_mem = (state * key.unsqueeze(-1)).sum(dim=-2) # [batch, heads, value_dim] - - # 3. Compute delta: delta = (v - mem) * beta - delta = (value - kv_mem) * beta.unsqueeze(-1) # [batch, heads, value_dim] - - # 4. 
Update state: S = S + k ⊗ delta - # k.unsqueeze(-1): [batch, heads, key_dim, 1] - # delta.unsqueeze(-2): [batch, heads, 1, value_dim] - state = state + key.unsqueeze(-1) * delta.unsqueeze(-2) - - # 5. Output: o = S @ q = (S * q.unsqueeze(-1)).sum(dim=-2) - output = (state * query.unsqueeze(-1)).sum(dim=-2) # [batch, heads, value_dim] - output = output.unsqueeze(2) # [batch, heads, 1, value_dim] - - # Transpose back to [batch, seq=1, heads, value_dim] - output = output.transpose(1, 2) - - # Ensure state matches output dtype - state = state.to(output.dtype) - - return output, state - @classmethod def setup( cls, @@ -1416,7 +1577,7 @@ class KimiDeltaAttention(nn.Module): - norm - gated RMS normalization Uses fla.ops.kda.chunk_kda and fused_recurrent_kda kernels. - Uses CausalConv1d for convolutions (CUDA fast path with PyTorch fallback). + Uses CausalConv1d for convolutions (requires causal_conv1d CUDA kernels). """ def __init__( @@ -1550,9 +1711,7 @@ def forward( **kwargs, ): batch_size, seq_len, _ = hidden_states.shape - mode = "fused_recurrent" if seq_len <= 64 else self.mode - if self.training: - mode = "chunk" + mode = "fused_recurrent" if (seq_len <= 64 and not self.training) else self.mode # Get cache states if available conv_state_q, conv_state_k, conv_state_v = None, None, None @@ -1570,10 +1729,9 @@ def forward( k, conv_state_k = self._apply_conv(self.k_proj(hidden_states), self.k_conv, conv_state_k, use_cache) v, conv_state_v = self._apply_conv(self.v_proj(hidden_states), self.v_conv, conv_state_v, use_cache) - # Gate kernel computation + # Gate kernel computation (raw g, gate applied inside kernel for chunk mode) g = self.f_b_proj(self.f_a_proj(hidden_states)) g = rearrange(g, "... (h d) -> ... h d", d=self.head_dim) - g = fused_kda_gate(g, self.A_log.float(), dt_bias=self.dt_bias) # Beta gating beta = self.beta_proj(hidden_states).float().sigmoid() @@ -1584,17 +1742,23 @@ def forward( # Run KDA kernel if mode == "chunk": + # For chunk mode: gate computed inside kernel (matches FLA reference) o, recurrent_state = chunk_kda( q=q, k=k, v=v, g=g, beta=beta, + A_log=self.A_log, + dt_bias=self.dt_bias, initial_state=recurrent_state, output_final_state=past_key_values is not None, use_qk_l2norm_in_kernel=True, + use_gate_in_kernel=True, ) else: + # For fused_recurrent mode: pre-compute gate (matches FLA reference) + g = fused_kda_gate(g, self.A_log.float(), dt_bias=self.dt_bias) o, recurrent_state = fused_recurrent_kda( q=q, k=k, diff --git a/fast_llm_external_models/tests/test_apriel2/conftest.py b/fast_llm_external_models/tests/test_apriel2/conftest.py index 21b90b097..de83c5597 100644 --- a/fast_llm_external_models/tests/test_apriel2/conftest.py +++ b/fast_llm_external_models/tests/test_apriel2/conftest.py @@ -7,7 +7,7 @@ import torch from transformers import LlavaConfig, LlavaForConditionalGeneration, MistralConfig -from fast_llm_external_models.apriel2.cache import _AttentionCache, _SSMCache +from fast_llm_external_models.apriel2.modeling_apriel2 import _AttentionCache, _SSMCache # Register custom marks @@ -831,7 +831,7 @@ def apriel2_config_with_bias(): @pytest.fixture def apriel2_cache(apriel2_config_tiny): """Create empty Apriel2Cache from tiny config.""" - from fast_llm_external_models.apriel2.cache import Apriel2Cache + from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache return Apriel2Cache(apriel2_config_tiny) diff --git a/fast_llm_external_models/tests/test_apriel2/test_cache_apriel2_specific.py 
b/fast_llm_external_models/tests/test_apriel2/test_cache_apriel2_specific.py index b45779454..f14f0d319 100644 --- a/fast_llm_external_models/tests/test_apriel2/test_cache_apriel2_specific.py +++ b/fast_llm_external_models/tests/test_apriel2/test_cache_apriel2_specific.py @@ -18,7 +18,7 @@ import pytest import torch -from fast_llm_external_models.apriel2.cache import Apriel2Cache +from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache # ============================================================================= # STOCHASTIC MIXER ROUTING diff --git a/fast_llm_external_models/tests/test_apriel2/test_cache_contracts.py b/fast_llm_external_models/tests/test_apriel2/test_cache_contracts.py index 8ceabfb91..337ff1fa3 100644 --- a/fast_llm_external_models/tests/test_apriel2/test_cache_contracts.py +++ b/fast_llm_external_models/tests/test_apriel2/test_cache_contracts.py @@ -27,7 +27,7 @@ import pytest import torch -from fast_llm_external_models.apriel2.cache import Apriel2Cache, _AttentionCache +from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache, _AttentionCache # ============================================================================= # SECTION 1: FULL ATTENTION - _AttentionCache vs DynamicLayer diff --git a/fast_llm_external_models/tests/test_apriel2/test_causal_conv1d.py b/fast_llm_external_models/tests/test_apriel2/test_causal_conv1d.py deleted file mode 100644 index 0567cd76e..000000000 --- a/fast_llm_external_models/tests/test_apriel2/test_causal_conv1d.py +++ /dev/null @@ -1,544 +0,0 @@ -"""Tests for CausalConv1d consistency across all code paths. - -The Key Consistency Property -============================ -For ANY input sequence, ALL of the following must produce the SAME output: - -1. Prefill entire sequence at once (CPU/PyTorch fallback) -2. Prefill entire sequence at once (CUDA fast path) -3. Prefill in chunks with state passing (CPU) -4. Prefill in chunks with state passing (CUDA) -5. Prefill prefix + decode remaining tokens one-by-one (CPU) -6. Prefill prefix + decode remaining tokens one-by-one (CUDA) -7. Mixed: CUDA prefill → CPU decode -8. 
Mixed: CPU prefill → CUDA decode - -This is critical because during inference: -- Prefill processes the prompt (potentially chunked for long prompts) -- Decode generates tokens one at a time -- If these paths diverge, generation quality degrades silently -""" - -import pytest -import torch - -from fast_llm_external_models.apriel2.modeling_apriel2 import CausalConv1d, _causal_conv1d_fn - -# ============================================================================= -# Fixtures -# ============================================================================= - - -@pytest.fixture -def conv(): - """CausalConv1d layer with fixed random weights (on CPU).""" - torch.manual_seed(42) - return CausalConv1d( - in_channels=64, - out_channels=64, - kernel_size=4, - groups=64, - bias=True, - activation="silu", - device="cpu", - ) - - -@pytest.fixture -def dim(): - return 64 - - -@pytest.fixture -def kernel_size(): - return 4 - - -# ============================================================================= -# Helpers -# ============================================================================= - - -def to_device(conv: CausalConv1d, device: str) -> CausalConv1d: - """Create a copy of conv on the specified device.""" - import copy - - return copy.deepcopy(conv).to(device) - - -def prefill(conv: CausalConv1d, x: torch.Tensor, state: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]: - """Prefill and return (output, final_state).""" - return conv(x, conv_state=state, return_final_state=True) - - -def decode_sequence( - conv: CausalConv1d, tokens: torch.Tensor, state: torch.Tensor -) -> tuple[torch.Tensor, torch.Tensor]: - """Decode multiple tokens one-by-one, return (stacked_outputs, final_state). - - Args: - conv: CausalConv1d layer - tokens: [batch, dim, num_tokens] - tokens to decode - state: [batch, dim, kernel_size-1] - initial state (modified in-place) - - Returns: - outputs: [batch, dim, num_tokens] - output for each token - state: final state after all tokens - """ - outputs = [] - for i in range(tokens.shape[-1]): - token = tokens[:, :, i] - out = conv.update(token, state) - outputs.append(out) - return torch.stack(outputs, dim=-1), state - - -# ============================================================================= -# Unit Tests -# ============================================================================= - - -class TestCausalConv1dBasics: - """Basic functionality tests.""" - - def test_output_shape(self, conv, dim): - """Output shape matches input shape.""" - x = torch.randn(2, dim, 16, device="cpu") - out = conv(x) - assert out.shape == x.shape - - def test_state_shape(self, conv, dim, kernel_size): - """Returned state has correct shape.""" - x = torch.randn(2, dim, 16, device="cpu") - out, state = conv(x, return_final_state=True) - assert state.shape == (2, dim, kernel_size - 1) - - def test_deterministic(self, conv, dim): - """Same input produces same output.""" - x = torch.randn(2, dim, 16, device="cpu") - out1 = conv(x) - out2 = conv(x) - torch.testing.assert_close(out1, out2) - - def test_update_output_shape(self, conv, dim, kernel_size): - """Update produces single token output.""" - token = torch.randn(2, dim, device="cpu") - state = torch.randn(2, dim, kernel_size - 1, device="cpu") - out = conv.update(token, state) - assert out.shape == (2, dim) - - def test_fast_path_detection(self, conv, dim): - """Fast path correctly detected based on device.""" - x_cpu = torch.randn(2, dim, 16, device="cpu") - assert not conv._use_fast_path(x_cpu) - - if 
torch.cuda.is_available(): - x_cuda = torch.randn(2, dim, 16, device="cuda") - conv_cuda = conv.cuda() - # Fast path available only if CUDA kernels installed - expected = _causal_conv1d_fn is not None - assert conv_cuda._use_fast_path(x_cuda) == expected - - -# ============================================================================= -# Backend Equivalence (CUDA vs CPU) -# ============================================================================= - - -@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") -@pytest.mark.skipif(_causal_conv1d_fn is None, reason="CUDA conv kernels required") -class TestBackendEquivalence: - """CUDA and CPU backends produce identical results.""" - - @pytest.mark.parametrize("seq_len", [1, 4, 8, 17, 32, 65]) - @pytest.mark.parametrize("batch_size", [1, 2, 4]) - def test_prefill_cuda_vs_cpu(self, conv, dim, seq_len, batch_size): - """CUDA prefill matches CPU prefill.""" - torch.manual_seed(123) - x = torch.randn(batch_size, dim, seq_len, device="cpu") - - # CPU - out_cpu = conv(x) - - # CUDA - conv_cuda = to_device(conv, "cuda") - out_cuda = conv_cuda(x.cuda()).cpu() - - torch.testing.assert_close(out_cuda, out_cpu, atol=1e-4, rtol=1e-4) - - @pytest.mark.parametrize("seq_len", [1, 4, 8, 17, 32]) - def test_prefill_with_state_cuda_vs_cpu(self, conv, dim, kernel_size, seq_len): - """CUDA prefill with state output matches CPU.""" - torch.manual_seed(123) - x = torch.randn(2, dim, seq_len, device="cpu") - - # CPU - out_cpu, state_cpu = prefill(conv, x) - - # CUDA - conv_cuda = to_device(conv, "cuda") - out_cuda, state_cuda = prefill(conv_cuda, x.cuda()) - out_cuda, state_cuda = out_cuda.cpu(), state_cuda.cpu() - - torch.testing.assert_close(out_cuda, out_cpu, atol=1e-4, rtol=1e-4) - torch.testing.assert_close(state_cuda, state_cpu, atol=1e-5, rtol=1e-5) - - def test_decode_cuda_vs_cpu(self, conv, dim, kernel_size): - """CUDA single-token decode matches CPU.""" - torch.manual_seed(123) - token = torch.randn(2, dim, device="cpu") - state = torch.randn(2, dim, kernel_size - 1, device="cpu") - - # CPU - state_cpu = state.clone() - out_cpu = conv.update(token, state_cpu) - - # CUDA - conv_cuda = to_device(conv, "cuda") - state_cuda = state.cuda() - out_cuda = conv_cuda.update(token.cuda(), state_cuda).cpu() - state_cuda = state_cuda.cpu() - - torch.testing.assert_close(out_cuda, out_cpu, atol=1e-4, rtol=1e-4) - torch.testing.assert_close(state_cuda, state_cpu, atol=1e-5, rtol=1e-5) - - -# ============================================================================= -# Chunking Consistency -# ============================================================================= - - -class TestChunkingConsistency: - """Chunked prefill matches full prefill.""" - - @pytest.mark.parametrize("total_len", [16, 33, 64]) - @pytest.mark.parametrize("chunk_size", [4, 7, 16]) - def test_chunked_prefill_cpu(self, conv, dim, total_len, chunk_size): - """CPU: Chunked prefill matches full prefill.""" - torch.manual_seed(123) - x = torch.randn(2, dim, total_len, device="cpu") - - # Reference: full prefill - ref_out, _ = prefill(conv, x) - - # Chunked prefill - outputs = [] - state = None - for start in range(0, total_len, chunk_size): - chunk = x[:, :, start : start + chunk_size] - out, state = prefill(conv, chunk, state) - outputs.append(out) - - chunked_out = torch.cat(outputs, dim=-1) - torch.testing.assert_close(chunked_out, ref_out, atol=1e-5, rtol=1e-5) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") - 
@pytest.mark.skipif(_causal_conv1d_fn is None, reason="CUDA conv kernels required") - @pytest.mark.parametrize("total_len", [16, 33, 64]) - @pytest.mark.parametrize("chunk_size", [4, 7, 16]) - def test_chunked_prefill_cuda(self, conv, dim, total_len, chunk_size): - """CUDA: Chunked prefill matches full prefill.""" - torch.manual_seed(123) - x = torch.randn(2, dim, total_len, device="cpu") - - conv_cuda = to_device(conv, "cuda") - - # Reference: full prefill - ref_out, _ = prefill(conv_cuda, x.cuda()) - - # Chunked prefill - outputs = [] - state = None - for start in range(0, total_len, chunk_size): - chunk = x[:, :, start : start + chunk_size].cuda() - out, state = prefill(conv_cuda, chunk, state) - outputs.append(out) - - chunked_out = torch.cat(outputs, dim=-1) - torch.testing.assert_close(chunked_out, ref_out, atol=1e-4, rtol=1e-4) - - -# ============================================================================= -# Decode Consistency -# ============================================================================= - - -class TestDecodeConsistency: - """Token-by-token decode matches batch prefill.""" - - @pytest.mark.parametrize("prefill_len", [4, 8, 16]) - @pytest.mark.parametrize("decode_len", [1, 5, 10]) - def test_prefill_then_decode_cpu(self, conv, dim, prefill_len, decode_len): - """CPU: Prefill + decode matches full prefill.""" - torch.manual_seed(123) - total_len = prefill_len + decode_len - x = torch.randn(2, dim, total_len, device="cpu") - - # Reference: full prefill - ref_out, _ = prefill(conv, x) - - # Prefill prefix, then decode rest - out_prefix, state = prefill(conv, x[:, :, :prefill_len]) - out_decode, _ = decode_sequence(conv, x[:, :, prefill_len:], state) - - combined = torch.cat([out_prefix, out_decode], dim=-1) - torch.testing.assert_close(combined, ref_out, atol=1e-5, rtol=1e-5) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") - @pytest.mark.skipif(_causal_conv1d_fn is None, reason="CUDA conv kernels required") - @pytest.mark.parametrize("prefill_len", [4, 8, 16]) - @pytest.mark.parametrize("decode_len", [1, 5, 10]) - def test_prefill_then_decode_cuda(self, conv, dim, prefill_len, decode_len): - """CUDA: Prefill + decode matches full prefill.""" - torch.manual_seed(123) - total_len = prefill_len + decode_len - x = torch.randn(2, dim, total_len, device="cuda") - - conv_cuda = to_device(conv, "cuda") - - # Reference: full prefill - ref_out, _ = prefill(conv_cuda, x) - - # Prefill prefix, then decode rest - out_prefix, state = prefill(conv_cuda, x[:, :, :prefill_len]) - out_decode, _ = decode_sequence(conv_cuda, x[:, :, prefill_len:], state) - - combined = torch.cat([out_prefix, out_decode], dim=-1) - torch.testing.assert_close(combined, ref_out, atol=1e-4, rtol=1e-4) - - -# ============================================================================= -# Global Consistency: The Ultimate Test -# ============================================================================= - - -class TestGlobalConsistency: - """ALL code paths must produce identical results for the same input.""" - - def test_all_cpu_paths_match(self, conv, dim): - """All CPU paths produce identical output.""" - torch.manual_seed(42) - - total_len = 24 - prefill_len = 16 - chunk_size = 8 - x = torch.randn(2, dim, total_len, device="cpu") - - # Reference: full prefill - reference, _ = prefill(conv, x) - - # Path 1: Chunked prefill - outputs = [] - state = None - for start in range(0, total_len, chunk_size): - chunk = x[:, :, start : start + chunk_size] - out, state = 
prefill(conv, chunk, state) - outputs.append(out) - path1 = torch.cat(outputs, dim=-1) - - # Path 2: Prefill + decode - out_prefix, state = prefill(conv, x[:, :, :prefill_len]) - out_decode, _ = decode_sequence(conv, x[:, :, prefill_len:], state) - path2 = torch.cat([out_prefix, out_decode], dim=-1) - - # Path 3: All decode (extreme case) - # Prefill first kernel_size-1 tokens, decode rest - init_len = conv.kernel_size[0] - 1 - out_init, state = prefill(conv, x[:, :, :init_len]) - out_decode, _ = decode_sequence(conv, x[:, :, init_len:], state) - path3 = torch.cat([out_init, out_decode], dim=-1) - - torch.testing.assert_close(path1, reference, atol=1e-5, rtol=1e-5) - torch.testing.assert_close(path2, reference, atol=1e-5, rtol=1e-5) - torch.testing.assert_close(path3, reference, atol=1e-5, rtol=1e-5) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") - @pytest.mark.skipif(_causal_conv1d_fn is None, reason="CUDA conv kernels required") - def test_all_paths_match_cross_device(self, conv, dim): - """All paths (CPU and CUDA) produce identical output.""" - torch.manual_seed(42) - - total_len = 24 - prefill_len = 16 - chunk_size = 8 - x = torch.randn(2, dim, total_len, device="cpu") - - conv_cuda = to_device(conv, "cuda") - - # REFERENCE: CPU full prefill (simplest, most trustworthy) - reference, _ = prefill(conv, x) - - results = {} - - # CPU paths - # --------- - - # CPU chunked - outputs, state = [], None - for start in range(0, total_len, chunk_size): - out, state = prefill(conv, x[:, :, start : start + chunk_size], state) - outputs.append(out) - results["cpu_chunked"] = torch.cat(outputs, dim=-1) - - # CPU prefill + decode - out_prefix, state = prefill(conv, x[:, :, :prefill_len]) - out_decode, _ = decode_sequence(conv, x[:, :, prefill_len:], state) - results["cpu_prefill_decode"] = torch.cat([out_prefix, out_decode], dim=-1) - - # CUDA paths - # ---------- - - # CUDA full prefill - results["cuda_full"], _ = prefill(conv_cuda, x.cuda()) - results["cuda_full"] = results["cuda_full"].cpu() - - # CUDA chunked - outputs, state = [], None - for start in range(0, total_len, chunk_size): - out, state = prefill(conv_cuda, x[:, :, start : start + chunk_size].cuda(), state) - outputs.append(out.cpu()) - results["cuda_chunked"] = torch.cat(outputs, dim=-1) - - # CUDA prefill + decode - out_prefix, state = prefill(conv_cuda, x[:, :, :prefill_len].cuda()) - out_decode, _ = decode_sequence(conv_cuda, x[:, :, prefill_len:].cuda(), state) - results["cuda_prefill_decode"] = torch.cat([out_prefix.cpu(), out_decode.cpu()], dim=-1) - - # Mixed paths - # ----------- - - # CPU prefill, CUDA decode - out_prefix, state = prefill(conv, x[:, :, :prefill_len]) - state = state.cuda() - out_decode, _ = decode_sequence(conv_cuda, x[:, :, prefill_len:].cuda(), state) - results["cpu_prefill_cuda_decode"] = torch.cat([out_prefix, out_decode.cpu()], dim=-1) - - # CUDA prefill, CPU decode - out_prefix, state = prefill(conv_cuda, x[:, :, :prefill_len].cuda()) - out_prefix, state = out_prefix.cpu(), state.cpu() - out_decode, _ = decode_sequence(conv, x[:, :, prefill_len:], state) - results["cuda_prefill_cpu_decode"] = torch.cat([out_prefix, out_decode], dim=-1) - - # Verify all match reference - tolerances = { - "cpu_chunked": 1e-5, - "cpu_prefill_decode": 1e-5, - "cuda_full": 1e-4, - "cuda_chunked": 1e-4, - "cuda_prefill_decode": 1e-4, - "cpu_prefill_cuda_decode": 1e-4, - "cuda_prefill_cpu_decode": 1e-4, - } - - for name, result in results.items(): - tol = tolerances[name] - 
torch.testing.assert_close( - result, reference, atol=tol, rtol=tol, msg=f"Path '{name}' diverged from reference" - ) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") - @pytest.mark.skipif(_causal_conv1d_fn is None, reason="CUDA conv kernels required") - def test_long_decode_no_drift(self, conv, dim): - """Long decode sequence doesn't accumulate errors.""" - torch.manual_seed(42) - - prefill_len = 8 - decode_len = 100 # Long decode to catch drift - total_len = prefill_len + decode_len - x = torch.randn(2, dim, total_len, device="cpu") - - conv_cuda = to_device(conv, "cuda") - - # Reference: CPU full prefill - reference, _ = prefill(conv, x) - - # CUDA prefill + long decode - out_prefix, state = prefill(conv_cuda, x[:, :, :prefill_len].cuda()) - out_decode, _ = decode_sequence(conv_cuda, x[:, :, prefill_len:].cuda(), state) - result = torch.cat([out_prefix.cpu(), out_decode.cpu()], dim=-1) - - # Check max error at each position doesn't grow - errors = (result - reference).abs().max(dim=1).values.max(dim=0).values # [seq_len] - - # First positions should have small error - assert errors[:prefill_len].max() < 1e-4, "Prefill error too large" - - # Decode errors shouldn't grow unboundedly - # Allow slightly more tolerance for later positions but not exponential growth - assert errors[prefill_len:].max() < 1e-3, "Decode error too large" - - # Check no systematic drift (errors shouldn't consistently increase) - decode_errors = errors[prefill_len:] - first_half = decode_errors[: len(decode_errors) // 2].mean() - second_half = decode_errors[len(decode_errors) // 2 :].mean() - assert second_half < first_half * 2, "Errors growing over decode steps (drift detected)" - - -# ============================================================================= -# Edge Cases -# ============================================================================= - - -class TestEdgeCases: - """Edge cases and boundary conditions.""" - - def test_single_token_prefill(self, conv, dim, kernel_size): - """Prefill with just 1 token works.""" - x = torch.randn(2, dim, 1, device="cpu") - out, state = prefill(conv, x) - - assert out.shape == (2, dim, 1) - assert state.shape == (2, dim, kernel_size - 1) - - def test_seq_shorter_than_kernel(self, conv, dim, kernel_size): - """Sequence shorter than kernel_size works.""" - seq_len = kernel_size - 2 # Shorter than kernel - x = torch.randn(2, dim, seq_len, device="cpu") - out, state = prefill(conv, x) - - assert out.shape == (2, dim, seq_len) - assert state.shape == (2, dim, kernel_size - 1) - - def test_seq_exactly_kernel_size(self, conv, dim, kernel_size): - """Sequence exactly kernel_size works.""" - x = torch.randn(2, dim, kernel_size, device="cpu") - out, state = prefill(conv, x) - - assert out.shape == (2, dim, kernel_size) - - def test_batch_size_one(self, conv, dim): - """Batch size 1 works.""" - x = torch.randn(1, dim, 16, device="cpu") - out, state = prefill(conv, x) - - assert out.shape == (1, dim, 16) - - def test_empty_decode_after_prefill(self, conv, dim, kernel_size): - """Zero decode steps after prefill is valid.""" - x = torch.randn(2, dim, 16, device="cpu") - out_prefill, state = prefill(conv, x) - - # No decode, just verify state is usable - token = torch.randn(2, dim, device="cpu") - out_token = conv.update(token, state) - assert out_token.shape == (2, dim) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") - @pytest.mark.skipif(_causal_conv1d_fn is None, reason="CUDA conv kernels required") - def 
test_state_device_transfer(self, conv, dim, kernel_size): - """State can be transferred between devices.""" - x = torch.randn(2, dim, 16, device="cpu") - - # Prefill on CPU - _, state_cpu = prefill(conv, x) - - # Transfer state to CUDA - state_cuda = state_cpu.cuda() - conv_cuda = to_device(conv, "cuda") - - # Decode on CUDA with transferred state - token = torch.randn(2, dim, device="cuda") - out = conv_cuda.update(token, state_cuda) - - assert out.shape == (2, dim) - assert out.device.type == "cuda" diff --git a/fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py b/fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py index 536d40330..bb4fe8bc6 100644 --- a/fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py +++ b/fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py @@ -53,6 +53,24 @@ def seq_len(request): return request.param +@pytest.fixture(params=[4, 32, 64]) +def prefill_len(request): + """Length of initial prefill phase in cache tests.""" + return request.param + + +@pytest.fixture(params=[4]) +def decode_steps(request): + """Number of decode steps in cache tests. Single value to limit test explosion.""" + return request.param + + +@pytest.fixture(params=[4, 16]) +def prefill2_len(request): + """Length of second prefill phase in cache tests.""" + return request.param + + @pytest.fixture(params=[256, 512]) def hidden_size(request): """Hidden sizes to test. 256 is minimal, 512 exercises larger matrices.""" @@ -102,6 +120,41 @@ def kda_config(request): return request.param +@pytest.fixture +def gdn_mixer_config(gdn_config): + """GDN mixer config dict derived from gdn_config tuple.""" + value_heads, key_heads, key_head_dim, value_head_dim = gdn_config + return { + "type": "gdn", + "value_heads": value_heads, + "key_heads": key_heads, + "key_head_dim": key_head_dim, + "value_head_dim": value_head_dim, + "convolution_layer": {"kernel_size": 4}, + "norm_eps": 1e-5, + } + + +@pytest.fixture +def kda_mixer_config(kda_config): + """KDA mixer config dict derived from kda_config tuple.""" + num_heads, head_dim = kda_config + return { + "type": "kda", + "heads": num_heads, + "head_dim": head_dim, + "convolution_layer": {"kernel_size": 4}, + "normalization": {"epsilon": 1e-5}, + } + + +@pytest.fixture +def kda_hidden_size(kda_config): + """Hidden size for KDA (constrained: num_heads * head_dim).""" + num_heads, head_dim = kda_config + return num_heads * head_dim + + # ============================================================================= # Test Mode Configuration # ============================================================================= @@ -110,11 +163,8 @@ def kda_config(request): @pytest.fixture( params=[ "precise", - # "fast" mode (bf16/sdpa) is intentionally skipped: - # - These are correctness tests, not performance benchmarks - # - bf16 has ~3 decimal digits precision, masking real bugs - # - Small tensor sizes make GPU overhead dominate anyway - pytest.param("fast", marks=pytest.mark.skip(reason="Correctness tests use fp32")), + # "fast" mode (bf16/sdpa) - enabled for testing + "fast", ] ) def test_mode(request): @@ -178,6 +228,10 @@ def assert_close( atol: Absolute tolerance msg: Context message for failure """ + # Cast to same dtype for comparison (fp32 for precision) + if actual.dtype != expected.dtype: + actual = actual.float() + expected = expected.float() if not torch.allclose(actual, expected, rtol=rtol, atol=atol): diff = (actual - expected).abs() max_diff = diff.max().item() @@ -211,6 +265,20 @@ def 
assert_deterministic(out1: torch.Tensor, out2: torch.Tensor, mixer_name: str ) +def make_apriel2_config(hidden_size: int, mixer_config: dict): + """Create minimal Apriel2TextConfig for single-layer mixer testing.""" + from fast_llm_external_models.apriel2.configuration_apriel2 import Apriel2TextConfig + + return Apriel2TextConfig( + hidden_size=hidden_size, + decoder={ + "type": "fixed", + "num_blocks": 1, + "block": {"mixer": mixer_config}, + }, + ) + + def extract_module_weights(module: nn.Module) -> dict[W, torch.Tensor]: """Extract weights from a module as a dict with W keys for conversion plan.""" weights = {} @@ -443,26 +511,15 @@ def test_attention_determinism(self, attention_config): assert_deterministic(out1, out2, "Apriel2Attention") @pytest.mark.skipif(not torch.cuda.is_available(), reason="GDN requires CUDA") - def test_gdn_determinism(self, gdn_config): + def test_gdn_determinism(self, gdn_mixer_config): """Verify Apriel2GatedDeltaNet produces identical output on repeated calls.""" from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2GatedDeltaNet - value_heads, key_heads, key_head_dim, value_head_dim = gdn_config hidden_size = 256 batch_size, seq_len = 2, 32 - config_dict = { - "type": "gdn", - "value_heads": value_heads, - "key_heads": key_heads, - "key_head_dim": key_head_dim, - "value_head_dim": value_head_dim, - "convolution_layer": {"kernel_size": 4}, - "norm_eps": 1e-5, - } - torch.manual_seed(42) - model = Apriel2GatedDeltaNet(hidden_size, config_dict, layer_idx=0) + model = Apriel2GatedDeltaNet(hidden_size, gdn_mixer_config, layer_idx=0) model.eval() torch.manual_seed(123) @@ -475,28 +532,18 @@ def test_gdn_determinism(self, gdn_config): assert_deterministic(out1, out2, "Apriel2GatedDeltaNet") @pytest.mark.skipif(not torch.cuda.is_available(), reason="KDA requires CUDA") - def test_kda_determinism(self, kda_config): + def test_kda_determinism(self, kda_mixer_config, kda_hidden_size): """Verify Apriel2 KimiDeltaAttention produces identical output on repeated calls.""" from fast_llm_external_models.apriel2.modeling_apriel2 import KimiDeltaAttention - num_heads, head_dim = kda_config - hidden_size = num_heads * head_dim batch_size, seq_len = 2, 32 - config_dict = { - "type": "kda", - "heads": num_heads, - "head_dim": head_dim, - "convolution_layer": {"kernel_size": 4}, - "normalization": {"epsilon": 1e-5}, - } - torch.manual_seed(42) - model = KimiDeltaAttention(hidden_size, config_dict, layer_idx=0) + model = KimiDeltaAttention(kda_hidden_size, kda_mixer_config, layer_idx=0) model.eval() torch.manual_seed(123) - hidden_states = torch.randn(batch_size, seq_len, hidden_size) + hidden_states = torch.randn(batch_size, seq_len, kda_hidden_size) with torch.no_grad(): out1 = model(hidden_states)[0] @@ -725,13 +772,41 @@ def test_noncausal_vs_pixtral( class TestGDNEquivalence: """Verify Apriel2GatedDeltaNet matches Qwen3NextGatedDeltaNet.""" - @pytest.fixture - def qwen3_config(self, hidden_size, gdn_config): - """Create Qwen3NextConfig for GDN testing.""" + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GDN requires CUDA") + @pytest.mark.parametrize("seed", [42, 123, 456]) + def test_vs_qwen3next( + self, + gdn_config, + gdn_mixer_config, + hidden_size, + batch_size, + prefill_len, + decode_steps, + prefill2_len, + seed, + tolerance, + test_dtype, + ): + """Verify Apriel2GatedDeltaNet matches Qwen3NextGatedDeltaNet output. + + Three-phase test (prefill → decode → prefill) verifies cache handling. 
+ + Note: Phase 3 diverges because Qwen3Next has a bug where chunk mode + always uses initial_state=None, ignoring cached recurrent state. + """ from transformers.models.qwen3_next.configuration_qwen3_next import Qwen3NextConfig + from transformers.models.qwen3_next.modeling_qwen3_next import ( + Qwen3NextDynamicCache, + Qwen3NextGatedDeltaNet, + ) + + from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache, Apriel2GatedDeltaNet value_heads, key_heads, key_head_dim, value_head_dim = gdn_config - return Qwen3NextConfig( + seq_len = prefill_len + decode_steps + prefill2_len + + # Create config with layer_types (required by Qwen3NextDynamicCache) + qwen3_config = Qwen3NextConfig( hidden_size=hidden_size, linear_num_value_heads=value_heads, linear_num_key_heads=key_heads, @@ -744,43 +819,16 @@ def qwen3_config(self, hidden_size, gdn_config): num_key_value_heads=2, head_dim=64, torch_dtype=torch.get_default_dtype(), + num_hidden_layers=1, + layer_types=["linear_attention"], ) - @pytest.mark.skipif(not torch.cuda.is_available(), reason="GDN requires CUDA") - @pytest.mark.parametrize("seed", [42, 123, 456]) - def test_vs_qwen3next( - self, - qwen3_config, - gdn_config, - hidden_size, - batch_size, - seq_len, - seed, - tolerance, - ): - """Verify Apriel2GatedDeltaNet matches Qwen3NextGatedDeltaNet output.""" - from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextGatedDeltaNet - - from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2GatedDeltaNet - - value_heads, key_heads, key_head_dim, value_head_dim = gdn_config - - config_dict = { - "type": "gdn", - "value_heads": value_heads, - "key_heads": key_heads, - "key_head_dim": key_head_dim, - "value_head_dim": value_head_dim, - "convolution_layer": {"kernel_size": 4}, - "norm_eps": 1e-5, - } - - # Create models + # Create models with same weights torch.manual_seed(seed) - qwen_gdn = Qwen3NextGatedDeltaNet(qwen3_config, layer_idx=0) - apriel2_gdn = Apriel2GatedDeltaNet(hidden_size, config_dict, layer_idx=0) + qwen_gdn = Qwen3NextGatedDeltaNet(qwen3_config, layer_idx=0).to(device="cuda", dtype=test_dtype) + apriel_gdn = Apriel2GatedDeltaNet(hidden_size, gdn_mixer_config, layer_idx=0).to(device="cuda", dtype=test_dtype) - # Transfer weights + # Transfer weights using conversion plan plan = plan_qwen3next_gdn_to_apriel2( num_k_heads=key_heads, num_v_heads=value_heads, @@ -789,36 +837,119 @@ def test_vs_qwen3next( ) source_weights = extract_module_weights(qwen_gdn) target_weights = execute(plan, source_weights, seed=seed) - load_weights_into_module(apriel2_gdn, target_weights) - - # Create input - torch.manual_seed(seed) - hidden_states = torch.randn(batch_size, seq_len, hidden_size) + load_weights_into_module(apriel_gdn, target_weights) qwen_gdn.eval() - apriel2_gdn.eval() + apriel_gdn.eval() + + rtol, atol = tolerance + + # Create full input sequence + torch.manual_seed(seed + 1) + hidden_states = torch.randn(batch_size, seq_len, hidden_size, device="cuda", dtype=test_dtype) + + # Create caches + qwen_cache = Qwen3NextDynamicCache(qwen3_config) + apriel_cache = Apriel2Cache(make_apriel2_config(hidden_size, gdn_mixer_config)) + + # ========== PHASE 1: Initial Prefill ========== + prefill_input = hidden_states[:, :prefill_len, :] with torch.no_grad(): - qwen_out = qwen_gdn(hidden_states) - apriel2_out = apriel2_gdn(hidden_states)[0] + qwen_out1 = qwen_gdn( + prefill_input, + cache_params=qwen_cache, + cache_position=torch.arange(prefill_len, device="cuda"), + ) + apriel_out1 = apriel_gdn( + 
prefill_input, + past_key_values=apriel_cache, + cache_position=torch.arange(prefill_len, device="cuda"), + )[0] - rtol, atol = tolerance assert_close( - apriel2_out, - qwen_out, + apriel_out1, + qwen_out1, + rtol=rtol, + atol=atol, + msg=f"Phase 1 (prefill): output mismatch (batch={batch_size}, prefill={prefill_len})", + ) + + # Compare recurrent states + assert_close( + apriel_cache.recurrent_states[0], + qwen_cache.recurrent_states[0], rtol=rtol, atol=atol, - msg=f"Apriel2GatedDeltaNet vs Qwen3NextGatedDeltaNet (batch={batch_size}, seq={seq_len})", + msg="Phase 1: recurrent_state mismatch", ) + # ========== PHASE 2: Decode (single tokens) ========== + for i in range(decode_steps): + pos = prefill_len + i + decode_input = hidden_states[:, pos : pos + 1, :] + + with torch.no_grad(): + qwen_out = qwen_gdn( + decode_input, + cache_params=qwen_cache, + cache_position=torch.tensor([pos], device="cuda"), + ) + apriel_out = apriel_gdn( + decode_input, + past_key_values=apriel_cache, + cache_position=torch.tensor([pos], device="cuda"), + )[0] + + assert_close( + apriel_out, + qwen_out, + rtol=rtol, + atol=atol, + msg=f"Phase 2 (decode step {i}): output mismatch", + ) + + # Compare recurrent states after decode + assert_close( + apriel_cache.recurrent_states[0], + qwen_cache.recurrent_states[0], + rtol=rtol, + atol=atol, + msg="Phase 2: recurrent_state mismatch", + ) + + # ========== PHASE 3: Prefill again (decode→prefill transition) ========== + # NOTE: Qwen3Next passes initial_state=None in chunk mode, so outputs diverge. + prefill2_start = prefill_len + decode_steps + prefill2_input = hidden_states[:, prefill2_start : prefill2_start + prefill2_len, :] + + with torch.no_grad(): + qwen_out3 = qwen_gdn( + prefill2_input, + cache_params=qwen_cache, + cache_position=torch.arange(prefill2_start, prefill2_start + prefill2_len, device="cuda"), + ) + apriel_out3 = apriel_gdn( + prefill2_input, + past_key_values=apriel_cache, + cache_position=torch.arange(prefill2_start, prefill2_start + prefill2_len, device="cuda"), + )[0] + + # Phase 3 diverges due to Qwen3Next bug - just verify we can run it + _ = (qwen_out3, apriel_out3) # Outputs computed but not compared + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GDN requires CUDA") @pytest.mark.parametrize("seed", [42, 123, 456]) - @pytest.mark.parametrize("prefill_len", [4, 8, 16]) def test_chunked_vs_recurrent( self, - gdn_config, - seed, + gdn_mixer_config, + hidden_size, + batch_size, prefill_len, + decode_steps, + seed, + tolerance, + test_dtype, ): """Verify GDN recurrent mode (decode) matches chunked mode (prefill). @@ -826,45 +957,25 @@ def test_chunked_vs_recurrent( subsequent single-token decodes using recurrent mode should produce the same output as if we had run the full sequence through chunked mode. 
""" - from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2GatedDeltaNet + from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache, Apriel2GatedDeltaNet - value_heads, key_heads, key_head_dim, value_head_dim = gdn_config - hidden_size = 256 - batch_size = 2 - total_len = prefill_len + 4 # Prefill + 4 decode steps - - config_dict = { - "type": "gdn", - "value_heads": value_heads, - "key_heads": key_heads, - "key_head_dim": key_head_dim, - "value_head_dim": value_head_dim, - "convolution_layer": {"kernel_size": 4}, - "norm_eps": 1e-5, - } + total_len = prefill_len + decode_steps # Create model torch.manual_seed(seed) - model = Apriel2GatedDeltaNet(hidden_size, config_dict, layer_idx=0) - model = model.cuda() + model = Apriel2GatedDeltaNet(hidden_size, gdn_mixer_config, layer_idx=0).to(device="cuda", dtype=test_dtype) model.eval() # Create input sequence torch.manual_seed(seed + 1) - full_hidden_states = torch.randn(batch_size, total_len, hidden_size, device="cuda") + full_hidden_states = torch.randn(batch_size, total_len, hidden_size, device="cuda", dtype=test_dtype) # === Reference: Run full sequence through chunked mode === with torch.no_grad(): reference_output = model(full_hidden_states)[0] # === Test: Prefill + decode === - # Create a simple cache object to hold conv and recurrent states - class SimpleCache: - def __init__(self): - self.conv_states = {0: None} - self.recurrent_states = {0: None} - - cache = SimpleCache() + cache = Apriel2Cache(make_apriel2_config(hidden_size, gdn_mixer_config)) # Prefill phase prefill_input = full_hidden_states[:, :prefill_len, :] @@ -877,13 +988,14 @@ def __init__(self): # Decode phase - one token at a time decode_outputs = [] - for i in range(prefill_len, total_len): - decode_input = full_hidden_states[:, i : i + 1, :] + for i in range(decode_steps): + pos = prefill_len + i + decode_input = full_hidden_states[:, pos : pos + 1, :] with torch.no_grad(): decode_output = model( decode_input, past_key_values=cache, - cache_position=torch.tensor([i], device="cuda"), + cache_position=torch.tensor([pos], device="cuda"), )[0] decode_outputs.append(decode_output) @@ -891,16 +1003,16 @@ def __init__(self): test_output = torch.cat([prefill_output] + decode_outputs, dim=1) # Use looser tolerance for chunked vs recurrent comparison - # (different processing order leads to numerical differences) + # (different numerical accumulation order leads to larger differences) + rtol, atol = tolerance assert_close( test_output, reference_output, - rtol=1e-3, - atol=1e-3, - msg=f"GDN chunked vs recurrent mode (prefill={prefill_len}, total={total_len})", + rtol=rtol * 5, + atol=atol * 5, + msg=f"GDN chunked vs recurrent mode (prefill={prefill_len}, decode={decode_steps})", ) - # ============================================================================= # SECTION 2: EQUIVALENCE TESTS - KimiDeltaAttention # ============================================================================= @@ -914,79 +1026,189 @@ class TestKDAEquivalence: def test_vs_fla( self, kda_config, + kda_mixer_config, + kda_hidden_size, batch_size, - seq_len, + prefill_len, + decode_steps, + prefill2_len, seed, tolerance, + test_dtype, ): - """Verify Apriel2 KimiDeltaAttention matches FLA KimiDeltaAttention output.""" + """Verify Apriel2 KimiDeltaAttention matches FLA KimiDeltaAttention output. + + Three-phase test (prefill → decode → prefill) verifies cache handling. 
+ + Unlike GDN (where Qwen3Next has a bug), FLA KDA correctly passes initial_state + in chunk mode, so all three phases should match. + """ from fla.layers.kda import KimiDeltaAttention as FLA_KDA + from fla.models.utils import Cache as FLACache - from fast_llm_external_models.apriel2.modeling_apriel2 import KimiDeltaAttention as Apriel2_KDA + from fast_llm_external_models.apriel2.modeling_apriel2 import ( + Apriel2Cache, + KimiDeltaAttention as Apriel2_KDA, + ) num_heads, head_dim = kda_config - hidden_size = num_heads * head_dim + seq_len = prefill_len + decode_steps + prefill2_len - config_dict = { - "type": "kda", - "heads": num_heads, - "head_dim": head_dim, - "convolution_layer": {"kernel_size": 4}, - "normalization": {"epsilon": 1e-5}, - } - - # Create FLA KDA + # Create FLA KDA with same weights torch.manual_seed(seed) fla_kda = FLA_KDA( - hidden_size=hidden_size, + hidden_size=kda_hidden_size, num_heads=num_heads, head_dim=head_dim, conv_size=4, conv_bias=False, norm_eps=1e-5, layer_idx=0, - ) + ).to(device="cuda", dtype=test_dtype) # FLA has g_proj.1 bias=True but Apriel2/upstream Kimi doesn't - zero it out fla_kda.g_proj[1].bias.data.zero_() # Create Apriel2 KDA - apriel2_kda = Apriel2_KDA(hidden_size, config_dict, layer_idx=0) + apriel_kda = Apriel2_KDA(kda_hidden_size, kda_mixer_config, layer_idx=0).to(device="cuda", dtype=test_dtype) - # Transfer weights + # Transfer weights using conversion plan plan = plan_fla_kda_to_apriel2() source_weights = extract_module_weights(fla_kda) target_weights = execute(plan, source_weights, seed=seed) - load_weights_into_module(apriel2_kda, target_weights) - - # Create input - torch.manual_seed(seed) - hidden_states = torch.randn(batch_size, seq_len, hidden_size) + load_weights_into_module(apriel_kda, target_weights) fla_kda.eval() - apriel2_kda.eval() + apriel_kda.eval() + + rtol, atol = tolerance + + # Create full input sequence + torch.manual_seed(seed + 1) + hidden_states = torch.randn(batch_size, seq_len, kda_hidden_size, device="cuda", dtype=test_dtype) + + # Create caches + fla_cache = FLACache() + apriel_cache = Apriel2Cache(make_apriel2_config(kda_hidden_size, kda_mixer_config)) + + # Force chunk mode for prefill + fla_kda.mode = "chunk" + apriel_kda.mode = "chunk" + + # ========== PHASE 1: Initial Prefill ========== + prefill_input = hidden_states[:, :prefill_len, :] with torch.no_grad(): - # use_cache=True ensures FLA initializes conv cache for short sequences - fla_out = fla_kda(hidden_states, use_cache=True)[0] - apriel2_out = apriel2_kda(hidden_states)[0] + fla_out1 = fla_kda( + prefill_input, + past_key_values=fla_cache, + use_cache=True, + )[0] + apriel_out1 = apriel_kda( + prefill_input, + past_key_values=apriel_cache, + )[0] - rtol, atol = tolerance assert_close( - apriel2_out, - fla_out, + apriel_out1, + fla_out1, rtol=rtol, atol=atol, - msg=f"Apriel2 KDA vs FLA KDA (batch={batch_size}, seq={seq_len}, hidden={hidden_size})", + msg=f"Phase 1 (prefill): output mismatch (batch={batch_size}, prefill={prefill_len})", + ) + + # Compare recurrent states + assert_close( + apriel_cache.recurrent_states[0], + fla_cache[0]["recurrent_state"], + rtol=rtol, + atol=atol, + msg="Phase 1: recurrent_state mismatch", + ) + + # ========== PHASE 2: Decode (single tokens) ========== + fla_kda.mode = "fused_recurrent" + apriel_kda.mode = "fused_recurrent" + + for i in range(decode_steps): + pos = prefill_len + i + decode_input = hidden_states[:, pos : pos + 1, :] + + with torch.no_grad(): + fla_out = fla_kda( + decode_input, + 
past_key_values=fla_cache, + use_cache=True, + )[0] + apriel_out = apriel_kda( + decode_input, + past_key_values=apriel_cache, + )[0] + + assert_close( + apriel_out, + fla_out, + rtol=rtol, + atol=atol, + msg=f"Phase 2 (decode step {i}): output mismatch", + ) + + # Compare recurrent states after decode + assert_close( + apriel_cache.recurrent_states[0], + fla_cache[0]["recurrent_state"], + rtol=rtol, + atol=atol, + msg="Phase 2: recurrent_state mismatch", + ) + + # ========== PHASE 3: Prefill again (decode→prefill transition) ========== + # FLA KDA correctly uses initial_state in chunk mode, so this should match + fla_kda.mode = "chunk" + apriel_kda.mode = "chunk" + + prefill2_start = prefill_len + decode_steps + prefill2_input = hidden_states[:, prefill2_start : prefill2_start + prefill2_len, :] + + with torch.no_grad(): + fla_out3 = fla_kda( + prefill2_input, + past_key_values=fla_cache, + use_cache=True, + )[0] + apriel_out3 = apriel_kda( + prefill2_input, + past_key_values=apriel_cache, + )[0] + + assert_close( + apriel_out3, + fla_out3, + rtol=rtol, + atol=atol, + msg="Phase 3 (decode→prefill): output mismatch", + ) + + # Compare final recurrent states + assert_close( + apriel_cache.recurrent_states[0], + fla_cache[0]["recurrent_state"], + rtol=rtol, + atol=atol, + msg="Phase 3: recurrent_state mismatch", ) @pytest.mark.skipif(not torch.cuda.is_available(), reason="KDA requires CUDA") @pytest.mark.parametrize("seed", [42, 123, 456]) - @pytest.mark.parametrize("prefill_len", [4, 8, 16]) def test_chunked_vs_recurrent( self, - kda_config, - seed, + kda_mixer_config, + kda_hidden_size, + batch_size, prefill_len, + decode_steps, + seed, + tolerance, + test_dtype, ): """Verify KDA recurrent mode (fused_recurrent_kda) matches chunked mode (chunk_kda). @@ -994,45 +1216,26 @@ def test_chunked_vs_recurrent( subsequent single-token decodes using recurrent mode should produce the same output as if we had run the full sequence through chunked mode. 
""" - from fast_llm_external_models.apriel2.modeling_apriel2 import KimiDeltaAttention + from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache, KimiDeltaAttention - num_heads, head_dim = kda_config - hidden_size = num_heads * head_dim - batch_size = 2 - total_len = prefill_len + 4 # Prefill + 4 decode steps - - config_dict = { - "type": "kda", - "heads": num_heads, - "head_dim": head_dim, - "convolution_layer": {"kernel_size": 4}, - "normalization": {"epsilon": 1e-5}, - } + total_len = prefill_len + decode_steps # Create model torch.manual_seed(seed) - model = KimiDeltaAttention(hidden_size, config_dict, layer_idx=0) - model = model.cuda() + model = KimiDeltaAttention(kda_hidden_size, kda_mixer_config, layer_idx=0).to(device="cuda", dtype=test_dtype) model.eval() # Create input sequence torch.manual_seed(seed + 1) - full_hidden_states = torch.randn(batch_size, total_len, hidden_size, device="cuda") + full_hidden_states = torch.randn(batch_size, total_len, kda_hidden_size, device="cuda", dtype=test_dtype) # === Reference: Run full sequence through chunked mode === - # Force chunk mode by using long sequence or setting mode directly model.mode = "chunk" with torch.no_grad(): reference_output = model(full_hidden_states)[0] # === Test: Prefill + decode === - # Create a simple cache object to hold conv and recurrent states - class SimpleCache: - def __init__(self): - self.conv_states = {0: None} - self.recurrent_states = {0: None} - - cache = SimpleCache() + cache = Apriel2Cache(make_apriel2_config(kda_hidden_size, kda_mixer_config)) # Prefill phase - force chunk mode model.mode = "chunk" @@ -1043,11 +1246,12 @@ def __init__(self): past_key_values=cache, )[0] - # Decode phase - one token at a time (will use fused_recurrent since seq_len=1 <= 64) - model.mode = "fused_recurrent" # Ensure recurrent mode for decode + # Decode phase - one token at a time + model.mode = "fused_recurrent" decode_outputs = [] - for i in range(prefill_len, total_len): - decode_input = full_hidden_states[:, i : i + 1, :] + for i in range(decode_steps): + pos = prefill_len + i + decode_input = full_hidden_states[:, pos : pos + 1, :] with torch.no_grad(): decode_output = model( decode_input, @@ -1059,69 +1263,13 @@ def __init__(self): test_output = torch.cat([prefill_output] + decode_outputs, dim=1) # Use looser tolerance for chunked vs recurrent comparison - # (different processing order leads to numerical differences) + # (different numerical accumulation order leads to larger differences) + rtol, atol = tolerance assert_close( test_output, reference_output, - rtol=1e-3, - atol=1e-3, - msg=f"KDA chunked vs recurrent mode (prefill={prefill_len}, total={total_len})", - ) - - -# ============================================================================= -# SECTION 3: FAST PATH vs SLOW PATH TESTS -# ============================================================================= - - -class TestFastVsSlowPath: - """Verify CUDA kernel outputs match PyTorch fallback outputs. - - These tests ensure the optimized CUDA kernels (from fla-core) produce - the same results as the pure PyTorch implementations used on CPU or - when CUDA kernels are unavailable. 
- """ - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA") - def test_gdn_fast_vs_slow(self, gdn_config, batch_size): - """Verify GDN CUDA kernel matches PyTorch fallback.""" - from fast_llm_external_models.apriel2.modeling_apriel2 import ( - Apriel2GatedDeltaNet, - chunk_gated_delta_rule, - torch_chunk_gated_delta_rule, + rtol=rtol * 5, + atol=atol * 5, + msg=f"KDA chunked vs recurrent mode (prefill={prefill_len}, decode={decode_steps})", ) - if chunk_gated_delta_rule is None: - pytest.skip("Fast path (fla) not available") - - value_heads, key_heads, key_head_dim, value_head_dim = gdn_config - hidden_size, seq_len = 256, 32 - - config_dict = { - "type": "gdn", - "value_heads": value_heads, - "key_heads": key_heads, - "key_head_dim": key_head_dim, - "value_head_dim": value_head_dim, - "convolution_layer": {"kernel_size": 4}, - "norm_eps": 1e-5, - } - - torch.manual_seed(42) - model = Apriel2GatedDeltaNet(hidden_size, config_dict, layer_idx=0) - model.eval() - - torch.manual_seed(123) - hidden_states = torch.randn(batch_size, seq_len, hidden_size) - - with torch.no_grad(): - # Fast path (CUDA kernel) - model._chunk_gated_delta_rule = chunk_gated_delta_rule - fast_out = model(hidden_states)[0].clone() - - # Slow path (PyTorch fallback) - model._chunk_gated_delta_rule = torch_chunk_gated_delta_rule - slow_out = model(hidden_states)[0].clone() - - # Looser tolerance for kernel vs reference comparison - assert_close(fast_out, slow_out, rtol=1e-3, atol=1e-3, msg="GDN fast path (CUDA) vs slow path (PyTorch)") diff --git a/fast_llm_external_models/tests/test_apriel2/test_model_structure.py b/fast_llm_external_models/tests/test_apriel2/test_model_structure.py index 56d2bc6a6..1adbcda70 100644 --- a/fast_llm_external_models/tests/test_apriel2/test_model_structure.py +++ b/fast_llm_external_models/tests/test_apriel2/test_model_structure.py @@ -2,7 +2,7 @@ import torch -from fast_llm_external_models.apriel2.cache import Apriel2Cache, _AttentionCache, _SSMCache +from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache, _AttentionCache, _SSMCache from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2ForCausalLM diff --git a/fast_llm_external_models/tests/test_apriel2/test_modeling.py b/fast_llm_external_models/tests/test_apriel2/test_modeling.py index 8e2f610bb..500e1d5ad 100644 --- a/fast_llm_external_models/tests/test_apriel2/test_modeling.py +++ b/fast_llm_external_models/tests/test_apriel2/test_modeling.py @@ -62,7 +62,7 @@ def test_model_end_to_end(self, config_name, request): # Test 1: Empty cache should give different results than filled cache # This verifies cache is being used at all - from fast_llm_external_models.apriel2.cache import Apriel2Cache, _AttentionCache + from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2Cache, _AttentionCache empty_cache = Apriel2Cache(config)