Commit 4b722f7

Merge pull request #2971 from AI-Hypercomputer:rbierneni-qwen3-next-caching
PiperOrigin-RevId: 879149561
2 parents: 72e96f5 + ecd0f7a

5 files changed: 296 additions & 29 deletions

src/maxtext/inference/kvcache.py

Lines changed: 76 additions & 1 deletion
@@ -230,7 +230,11 @@ def kv_cache_as_linen(
   )
 
 
-class KVCache(nnx.Module):
+class BaseCache(nnx.Module):
+  """Abstract base class for Caches."""
+
+
+class KVCache(BaseCache):
   """Implementation of the KVCache."""
 
   def __init__(

@@ -290,6 +294,7 @@ def __init__(
       use_chunked_prefill: Whether to use chunked prefill.
       rngs: The random number generators for initialization.
     """
+    super().__init__()
     self.max_prefill_length = max_prefill_length
     self.max_target_length = max_target_length
     self.batch = batch
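The new BaseCache gives the sequence-shaped KV cache and the fixed-size linear-attention cache introduced below a common ancestor, so engine code can hold mixed per-layer caches in one structure. A minimal sketch of the idea, using hypothetical toy classes (not the MaxText ones) alongside the real flax.nnx API:

import jax.numpy as jnp
from flax import nnx

class BaseCache(nnx.Module):
  """Common ancestor, mirroring the diff above."""

class ToyKVCache(BaseCache):  # hypothetical stand-in for KVCache
  def __init__(self, max_len: int, dim: int):
    self.key = nnx.Cache(jnp.zeros((max_len, dim)))  # shape grows with sequence length

class ToyRecurrentCache(BaseCache):  # hypothetical stand-in for GatedDeltaNetCache
  def __init__(self, dim: int):
    self.state = nnx.Cache(jnp.zeros((dim, dim)))  # fixed size, independent of length

layer_caches = [ToyKVCache(1024, 8), ToyRecurrentCache(8)]
print(all(isinstance(c, BaseCache) for c in layer_caches))  # True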
@@ -844,6 +849,76 @@ def __call__(
       raise ValueError(f"Model Mode isn't supported! {model_mode=}")
 
 
+class GatedDeltaNetCache(BaseCache):
+  """Cache for Linear Attention (Gated Delta Net).
+
+  Stores the fixed-size recurrent state and the sliding window state for convolution.
+  """
+
+  def __init__(
+      self,
+      batch: int,
+      num_heads: int,
+      k_head_dim: int,
+      v_head_dim: int,
+      conv_kernel_size: int,
+      conv_dim: int,
+      dtype: DType,
+      cache_batch_axis_name: str = CACHE_BATCH,
+      cache_heads_axis_name: str = CACHE_HEADS,
+  ):
+    super().__init__()
+    self.batch = batch
+    self.dtype = dtype
+
+    # 1. Recurrent State (S) for the Delta Rule
+    # Shape: [Batch, Heads, K_Dim, V_Dim]
+    # We maintain the running state matrix.
+    self.recurrent_state = nnx.Cache(
+        jnp.zeros((int(batch), num_heads, k_head_dim, v_head_dim), dtype=dtype),
+        # Sharding: Batch, Heads, None (K), None (V)
+        sharding=(cache_batch_axis_name, cache_heads_axis_name, None, None),
+    )
+
+    # 2. Convolution State for the 1D Conv
+    # Shape: [Batch, Kernel_Size - 1, Conv_Dim]
+    # We store the last (K-1) inputs to perform the sliding window conv during decoding.
+    self.conv_state = nnx.Cache(
+        jnp.zeros((int(batch), conv_kernel_size - 1, conv_dim), dtype=dtype),
+        # Sharding: Batch, None (Time), None (Dim)
+        sharding=(cache_batch_axis_name, None, None),
+    )
+
+  def __call__(self):
+    """Returns the cache variables for the layer to use."""
+    return self
+
+
+def gated_delta_net_cache_as_linen(
+    *,
+    batch: int,
+    num_heads: int,
+    head_dim: int,
+    conv_kernel_size: int,
+    conv_dim: int,
+    dtype: DType,
+    name: str | None = None,
+):
+  """Initializes the GatedDeltaNetCache and returns it as a Linen module."""
+  return nnx_wrappers.to_linen(
+      GatedDeltaNetCache,
+      batch=batch,
+      num_heads=num_heads,
+      head_dim=head_dim,
+      conv_kernel_size=conv_kernel_size,
+      conv_dim=conv_dim,
+      dtype=dtype,
+      metadata_fn=variable_to_logically_partitioned,
+      name=name,
+      abstract_init=False,
+  )
+
+
 def mla_kv_cache_as_linen(
     *,
     max_prefill_length: int,
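To make the two buffers concrete: during decode, each new token folds into the running state matrix via the gated delta rule while the convolution window shifts by one. Below is a minimal single-token sketch, assuming per-head gates alpha (decay) and beta (write strength) and single-batch shapes; it illustrates the recurrence, not the MaxText kernel (where the convolution window also feeds the Q/K/V projections):

import jax.numpy as jnp

def gated_delta_step(recurrent_state, conv_state, q, k, v, x_t, alpha, beta):
  """One-token decode step for a single batch element.

  recurrent_state: [heads, k_dim, v_dim]; conv_state: [kernel-1, conv_dim]
  q, k: [heads, k_dim]; v: [heads, v_dim]; x_t: [conv_dim]; alpha, beta: [heads]
  """
  # Roll the convolution window: append the newest input, drop the oldest.
  window = jnp.concatenate([conv_state, x_t[None, :]], axis=0)  # [kernel, conv_dim]
  new_conv_state = window[1:]                                   # last (kernel-1) inputs

  # Gated delta rule: decay the state, then write the prediction error.
  pred = jnp.einsum("hkv,hk->hv", recurrent_state, k)           # S^T k, what S predicts for v
  new_state = alpha[:, None, None] * recurrent_state + beta[:, None, None] * jnp.einsum(
      "hk,hv->hkv", k, v - pred                                 # outer-product write
  )
  out = jnp.einsum("hkv,hk->hv", new_state, q)                  # read the state with the query
  return out, new_state, new_conv_state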

src/maxtext/inference/maxengine/maxengine.py

Lines changed: 23 additions & 1 deletion
@@ -1146,6 +1146,9 @@ def copy(path, partial_cache, full_cache, annotations):
         "cached_prefill_value_scale",
     ]:
       full_cache = jax.lax.dynamic_update_index_in_dim(full_cache, partial_cache, slot, batch_idx)
+    elif path_key in ["recurrent_state", "conv_state"]:
+      # Direct update for fixed-size linear attention states
+      full_cache = jax.lax.dynamic_update_index_in_dim(full_cache, partial_cache, slot, batch_idx)
     else:
       raise ValueError(f"We don't have a strategy for inserting {path_key}")
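The branch works because these states have a fixed shape regardless of prompt length, so inserting a finished prefill into the batched decode state is a whole-slot overwrite. A toy run with made-up shapes:

import jax
import jax.numpy as jnp

full_state = jnp.zeros((8, 4, 16, 16))    # [slots, heads, k_dim, v_dim], decode-time state
partial_state = jnp.ones((1, 4, 16, 16))  # state produced by prefilling one prompt
slot, batch_idx = 3, 0                    # write into slot 3 along the batch axis
full_state = jax.lax.dynamic_update_index_in_dim(full_state, partial_state, slot, batch_idx)
print(full_state[3].min(), full_state[2].max())  # 1.0 0.0 -- only slot 3 was written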

@@ -1258,6 +1261,10 @@ def copy(path, partial_cache, full_cache, annotations):
         "cached_prefill_value_scale",
     ]:
       return jax.lax.dynamic_update_index_in_dim(full_cache, partial_cache, slot, batch_idx)
+    elif path_key in ["recurrent_state", "conv_state"]:
+      # For linear attention, the state is fixed size. We simply copy the result
+      # from the prefill step (partial_cache) into the decode state (full_cache).
+      return jax.lax.dynamic_update_index_in_dim(full_cache, partial_cache, slot, batch_idx)
     else:
       raise ValueError(f"We don't have a strategy for inserting {path_key}")

@@ -1447,6 +1454,15 @@ def copy(path, partial_cache, full_cache, annotations):
         partial_cache = jax.lax.dynamic_slice(partial_cache, start_indices, slice_size)
 
       return jax.lax.dynamic_update_index_in_dim(full_cache, partial_cache, slot, batch_idx)
+    elif path_key in ["recurrent_state", "conv_state"]:
+      # SSM states are the "final state" after prefill, so we just overwrite the slot.
+      # We don't need to slice by sequence length like we do for KV cache.
+      if num_prompts > 1:
+        raise NotImplementedError(
+            "Packed prefill is currently incompatible with linear attention states (GDN). "
+            "Prompt memory will bleed into adjacent prompts. Please disable packed prefill."
+        )
+      return jax.lax.dynamic_update_index_in_dim(full_cache, partial_cache, slot, batch_idx)
     else:
       raise ValueError(f"We don't have a strategy for inserting {path_key}")
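The guard exists because a recurrent state, unlike a per-token KV cache, cannot be un-packed after the fact. A toy contrast, using a plain additive state as a stand-in for the gated delta recurrence:

import jax.numpy as jnp

prompt_a = jnp.array([1.0, 2.0])      # tokens of prompt A
prompt_b = jnp.array([10.0, 20.0])    # tokens of prompt B, packed after A
packed = jnp.concatenate([prompt_a, prompt_b])

kv_cache = packed                      # per-token cache: prompt A is recoverable
print(kv_cache[:2])                    # [1. 2.] -- slice out prompt A's span

state = packed.sum()                   # fixed-size state: folds in the whole pack
print(state)                           # 33.0 -- prompt B has bled into the state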

@@ -1660,7 +1676,13 @@ def initialize():
     def is_lp(k):
       return isinstance(k, flax.linen.spmd.LogicallyPartitioned)
 
-    self.kv_cache_annotations_named = jax.tree_util.tree_map(lambda x: tuple(x.names), cache, is_leaf=is_lp)
+    self.kv_cache_annotations_named = jax.tree_util.tree_map(
+        lambda x: tuple(x.logical_axes)
+        if hasattr(x, "logical_axes")
+        else (tuple(x.names) if hasattr(x, "names") else ()),
+        cache,
+        is_leaf=is_lp,
+    )
     zeroed = max_utils.unbox_logicallypartioned(init_state)
     return zeroed
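The widened lambda has to cope with leaves that expose logical_axes (nnx-style metadata), names (Linen's LogicallyPartitioned), or neither. A toy pytree walk, with a hypothetical Box class standing in for the annotated leaves, shows the dispatch:

import jax

class Box:  # hypothetical stand-in for an annotated cache leaf
  def __init__(self, names=None, logical_axes=None):
    if names is not None:
      self.names = names
    if logical_axes is not None:
      self.logical_axes = logical_axes

def axes_of(x):
  if hasattr(x, "logical_axes"):
    return tuple(x.logical_axes)
  return tuple(x.names) if hasattr(x, "names") else ()

cache = {"kv": Box(names=("cache_batch", "cache_heads")),
         "gdn": Box(logical_axes=("cache_batch", None, None)),
         "plain": Box()}
print(jax.tree_util.tree_map(axes_of, cache, is_leaf=lambda k: isinstance(k, Box)))
# {'gdn': ('cache_batch', None, None), 'kv': ('cache_batch', 'cache_heads'), 'plain': ()}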

src/maxtext/layers/decoders.py

Lines changed: 15 additions & 4 deletions
@@ -980,15 +980,22 @@ def __call__(
       }
       if cfg.decoder_block == DecoderBlockType.QWEN3_NEXT:
         layer_kwargs = {"layer_idx": lyr}
+      kv_cache = None
+      if kv_caches is not None and cfg.decoder_block != DecoderBlockType.QWEN3_NEXT:
+        kv_cache = kv_caches[lyr]
+      elif kv_caches is not None and cfg.decoder_block == DecoderBlockType.QWEN3_NEXT:
+        # For Qwen3Next, kv_caches is a dictionary of lists of caches.
+        if (lyr + 1) % cfg.inhomogeneous_layer_cycle_interval == 0:
+          kv_cache = (kv_caches["key_cache"][lyr], kv_caches["value_cache"][lyr])
+
       if cfg.decoder_block == DecoderBlockType.GPT_OSS:
         layer_kwargs = {"attention_type": gpt_oss.get_attention_type(layer_id=lyr)}
       if cfg.decoder_block == DecoderBlockType.OLMO3:
         layer_kwargs = {"attention_type": olmo3.get_attention_type(layer_id=lyr)}
       layer = RemattedBlockLayer(
           config=cfg, mesh=mesh, name=f"layers_{lyr}", quant=self.quant, model_mode=self.model_mode, **layer_kwargs
       )
-      kv_cache = kv_caches[lyr] if kv_caches is not None else None
-      y, kv_cache = layer(
+      y, returned_cache = layer(
           y,
           decoder_segment_ids,
           decoder_positions,
@@ -1001,8 +1008,12 @@ def __call__(
           attention_metadata=attention_metadata,
           **layer_call_kwargs,
       )
-      if kv_caches is not None and kv_cache is not None:
-        kv_caches[lyr] = kv_cache
+      if kv_caches is not None and returned_cache is not None:
+        if cfg.decoder_block != DecoderBlockType.QWEN3_NEXT:
+          kv_caches[lyr] = returned_cache
+        elif (lyr + 1) % cfg.inhomogeneous_layer_cycle_interval == 0:
+          kv_caches["key_cache"][lyr] = returned_cache[0]
+          kv_caches["value_cache"][lyr] = returned_cache[1]
 
       if deepstack_visual_embeds is not None and lyr < len(deepstack_visual_embeds):
         visual_embeds = deepstack_visual_embeds[lyr]
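For reference, the cycle test (lyr + 1) % cfg.inhomogeneous_layer_cycle_interval == 0 picks out the full-attention layers that own entries in key_cache/value_cache; the interleaved Gated DeltaNet layers carry their state in recurrent_state/conv_state instead. A hypothetical interval of 4 (a 3:1 linear-to-full layout):

interval = 4  # stand-in for cfg.inhomogeneous_layer_cycle_interval
for lyr in range(8):
  is_full = (lyr + 1) % interval == 0
  print(f"layer {lyr}: {'full attention (KV cache)' if is_full else 'linear attention (GDN state)'}")
# layers 3 and 7 own KV-cache entries; layers 0-2 and 4-6 use GDN states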
