
Commit a2a238f

Integrate Engram into custom model
1 parent f3d9f5c commit a2a238f

9 files changed: 211 additions & 35 deletions


src/maxtext/configs/base.yml

Lines changed: 18 additions & 0 deletions

@@ -315,6 +315,7 @@ out_proj: 'remat'
 mla_q: 'remat'
 mla_kv: 'remat'
 attention_out: 'remat'
+engram: 'remat'

 optimizer_memory_host_offload: False
 parameter_memory_host_offload: False

@@ -1102,3 +1103,20 @@ force_q_layout: false
 mhc_expansion_rate: 1
 # The number of iterations for the Sinkhorn-Knopp algorithm.
 sinkhorn_iterations: 20
+
+################################## DeepSeek Engram ##################################
+# Indices of transformer layers where Engram modules are integrated; leave empty [] to disable.
+# Example: [1, 4] attaches to the 2nd and 5th layers.
+engram_layers: []
+# The maximum n-gram order 'n'. Example: n=3 covers both 2-grams and 3-grams.
+engram_max_ngram_size: 3
+# Number of heads dedicated to the Engram.
+engram_num_heads: 8
+# Head dimension of each Engram head.
+engram_head_dim: 1280
+# List of minimum head vocab sizes for each n-gram order.
+engram_vocab_bases: []
+# Temporal window size for the Engram convolution.
+engram_kernel_size: 4
+# The seed for Engram hash mapping.
+engram_seed: 0

src/maxtext/configs/models/deepseek-custom.yml

Lines changed: 8 additions & 0 deletions

@@ -59,3 +59,11 @@ index_topk: 256 # Reduced from 2048
 # Hyper-connections: mHC enabled
 mhc_expansion_rate: 4
 sinkhorn_iterations: 20
+# Engram
+engram_layers: [1, 4]
+engram_num_heads: 8
+engram_head_dim: 512
+engram_vocab_bases: [226240, 226240]
+engram_max_ngram_size: 3
+engram_kernel_size: 4
+engram_seed: 0
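A note on these values: engram_vocab_bases carries one entry per n-gram order above 1, so with engram_max_ngram_size: 3 it lists a minimum table size for the 2-gram heads and one for the 3-gram heads (the validation added in types.py below enforces this count). Given the `from sympy import isprime` import and the "Pre-calculate unique prime vocab sizes for every head" comment in engram.py, each head presumably gets a distinct prime at or above its base; the helper below is only a hypothetical sketch of that derivation, not the committed code.

from sympy import isprime

def distinct_primes_at_least(base: int, num_heads: int) -> list[int]:
  """Hypothetical sketch: choose `num_heads` distinct primes >= `base`.

  One prime-sized hash table per head keeps the modulo step in the n-gram
  hashing from producing correlated collisions across heads.
  """
  primes, candidate = [], base
  while len(primes) < num_heads:
    if isprime(candidate):
      primes.append(candidate)
    candidate += 1
  return primes

# deepseek-custom.yml: 8 heads per order, bases [226240, 226240] for 2-grams and 3-grams.
print(distinct_primes_at_least(226240, 8))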

src/maxtext/configs/types.py

Lines changed: 31 additions & 0 deletions

@@ -897,6 +897,7 @@ class RematAndOffload(BaseModel):
       RematLocation.REMAT,
       description="Remat policy for the attention output.",
   )
+  engram: RematLocation = Field(RematLocation.REMAT, description="Remat policy for the engram output.")

   optimizer_memory_host_offload: bool = Field(False, description="Offload optimizer state to host memory.")
   parameter_memory_host_offload: bool = Field(False, description="Offload parameters to host memory.")

@@ -1630,6 +1631,23 @@ class SpecialTokens(BaseModel):
   solution_end_token: str = Field("</answer>", description="Token to mark the end of a solution section.")


+class Engram(BaseModel):
+  """Configuration for DeepSeek Engram (https://www.arxiv.org/pdf/2601.07372)."""
+
+  engram_layers: list[int] = Field(
+      default_factory=list,
+      description="Indices of transformer layers where Engram modules are integrated.",
+  )
+  engram_num_heads: int = Field(8, description="Number of heads dedicated to the Engram.")
+  engram_head_dim: int = Field(1280, description="Head dimension of each Engram head.")
+  engram_vocab_bases: list[int] = Field(
+      default_factory=list, description="List of minimum head vocab sizes for each n-gram order."
+  )
+  engram_max_ngram_size: int = Field(3, description="The maximum n-gram order 'n'.")
+  engram_kernel_size: int = Field(4, description="Temporal window size for the Engram convolution.")
+  engram_seed: int = Field(0, description="The seed for Engram hash mapping.")
+
+
 class DerivedValues(BaseModel):
   """Holds all fields that are derived from other config values for perfect legacy compatibility."""

@@ -1782,6 +1800,7 @@ class MaxTextConfig(
     Quantization,
     # Core Model Architecture
     ModelArchitecture,
+    Engram,
     MTP,
     Logits,
     # Attention Mechanisms

@@ -2262,6 +2281,18 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
         and self.gradient_accumulation_steps > 1
     ):
       raise ValueError("FP8 quantization is not compatible with gradient accumulation.")
+    if self.engram_layers:
+      if not self.hf_access_token or not self.tokenizer_path:
+        raise ValueError(
+            "Engram requires both 'hf_access_token' and 'tokenizer_path' to load the Hugging Face tokenizer."
+        )
+      if self.scan_layers:
+        raise NotImplementedError("Currently Engram only supports the unscanned version. Please set scan_layers=False.")
+      if len(self.engram_vocab_bases) != (self.engram_max_ngram_size - 1):
+        raise ValueError(
+            f"Engram vocab size mismatch: expected {self.engram_max_ngram_size - 1} (max_ngram_size - 1), "
+            f"but got {self.engram_vocab_bases}."
+        )
     if self.num_experts > 1:
       is_fully_moe = (
           self.interleave_moe_layer_step == 1
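The new cross-field checks tie the Engram options together: Engram needs a reachable Hugging Face tokenizer, currently only works with scan_layers=False, and expects exactly engram_max_ngram_size - 1 entries in engram_vocab_bases (one per n-gram order from 2 to N). A minimal standalone restatement of the shape check, with hypothetical function and argument names rather than the MaxTextConfig API, just to show which settings pass:

def check_engram_vocab_bases(engram_layers, engram_max_ngram_size, engram_vocab_bases):
  """Hypothetical sketch of the vocab-bases check added above; not the real validator."""
  if not engram_layers:
    return  # Engram disabled, nothing to validate
  expected = engram_max_ngram_size - 1  # one base per n-gram order 2..N
  if len(engram_vocab_bases) != expected:
    raise ValueError(f"expected {expected} vocab bases, got {engram_vocab_bases}")

check_engram_vocab_bases([1, 4], 3, [226240, 226240])  # passes (deepseek-custom.yml values)
# check_engram_vocab_bases([1, 4], 3, [226240])        # would raise ValueError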

src/maxtext/layers/decoders.py

Lines changed: 11 additions & 1 deletion

@@ -916,11 +916,19 @@ def __call__(
       num_moe_layers = cfg.num_decoder_layers - cfg.first_num_dense_layers
       num_layers_list = [cfg.first_num_dense_layers, num_moe_layers]
       # Iterate over the two layer groups (dense and MoE) and apply layer transformation
+      global_layer_idx_offset = 0
       for layer, num_layers, layer_prefix in zip(layers, num_layers_list, layer_prefixes):
         for index in range(num_layers):
+          global_layer_idx = global_layer_idx_offset + index
           kv_cache = kv_caches[index] if kv_caches is not None else None
+          input_tokens = decoder_input_tokens if cfg.engram_layers else None
           y, kv_cache = layer(
-              config=cfg, mesh=mesh, name=f"{layer_prefix}_{index}", quant=self.quant, model_mode=self.model_mode
+              config=cfg,
+              mesh=mesh,
+              name=f"{layer_prefix}_{index}",
+              quant=self.quant,
+              model_mode=self.model_mode,
+              layer_idx=global_layer_idx,
           )(
               y,
               decoder_segment_ids,

@@ -932,9 +940,11 @@ def __call__(
               slot=slot,
               kv_cache=kv_cache,
               attention_metadata=attention_metadata,
+              decoder_input_tokens=input_tokens,
           )
           if kv_caches is not None and kv_cache is not None:
             kv_caches[index] = kv_cache
+        global_layer_idx_offset += num_layers
     else:
       for lyr in range(cfg.num_decoder_layers):
         RemattedBlockLayer = RemattedBlockLayers[0]
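In this branch the decoder is built as two unscanned groups: first_num_dense_layers dense layers followed by the MoE layers. The new global_layer_idx gives every layer its absolute position across both groups, so that each layer can presumably check whether its index appears in the config's engram_layers list, which counts from the start of the model. A toy sketch of that indexing with made-up layer counts:

# Toy sketch of the global layer indexing added above; layer counts are made up.
first_num_dense_layers, num_moe_layers = 3, 5
num_layers_list = [first_num_dense_layers, num_moe_layers]
engram_layers = [1, 4]  # absolute indices, as in deepseek-custom.yml

global_layer_idx_offset = 0
for group, num_layers in zip(["dense", "moe"], num_layers_list):
  for index in range(num_layers):
    global_layer_idx = global_layer_idx_offset + index
    if global_layer_idx in engram_layers:
      print(f"{group} layer {index} is global layer {global_layer_idx}: Engram attached")
  global_layer_idx_offset += num_layers
# Prints: dense layer 1 is global layer 1, moe layer 1 is global layer 4.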

src/maxtext/layers/engram.py

Lines changed: 36 additions & 24 deletions

@@ -35,6 +35,7 @@
 from sympy import isprime
 from tokenizers import Regex, normalizers

+
 class CompressedTokenizer:
   """
   A canonicalizing wrapper that reduces vocabulary sparsity for n-gram lookup.

@@ -50,7 +51,8 @@ class CompressedTokenizer:

   def __init__(self, tokenizer: HFTokenizer):
     normalizer = self._build_normalizer()
-    self.lookup_table, self.num_new_token = self._build_lookup_table(tokenizer, normalizer)
+    self.lookup_table_np, self.num_new_token = self._build_lookup_table(tokenizer, normalizer)
+    self.lookup_table = jnp.array(self.lookup_table_np, dtype=jnp.int64)

   def __len__(self) -> int:
     return self.num_new_token

@@ -118,19 +120,18 @@ def _build_lookup_table(self, tokenizer: HFTokenizer, normalizer: normalizers.Se

     return old2new, len(key2new)

-  def __call__(self, input_ids) -> np.ndarray:
+  def __call__(self, input_ids) -> Array:
     """
     Maps original token IDs to compressed IDs.
     """
-    input_ids = np.asarray(input_ids, dtype=np.int64)
+    input_ids = jnp.asarray(input_ids, dtype=jnp.int64)

-    # Identify valid tokens (ignore padding/masks usually marked with negative IDs)
-    valid_mask = input_ids >= 0
-    valid_ids = input_ids[valid_mask]
+    # Map negative IDs to 0 for lookup, then mask output back.
+    safe_ids = jnp.where(input_ids < 0, 0, input_ids)
+    mapped_ids = self.lookup_table[safe_ids]

-    # Vectorized lookup: O(1) per token
-    output_ids = input_ids.copy()
-    output_ids[valid_mask] = self.lookup_table[valid_ids]
+    # Restore negative IDs (padding)
+    output_ids = jnp.where(input_ids < 0, input_ids, mapped_ids)
     return output_ids
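The rewritten __call__ swaps numpy's boolean-mask assignment, which JAX arrays do not support, for a branch-free jnp.where pattern: clamp negative IDs to a safe index, gather from the table, then put the negatives back. A self-contained toy version of that pattern (the table and IDs below are made up):

import jax.numpy as jnp

lookup_table = jnp.array([10, 11, 12, 13])             # toy old->new id map
input_ids = jnp.array([[2, 0, -1], [3, -1, -1]])        # -1 marks padding / masked slots

safe_ids = jnp.where(input_ids < 0, 0, input_ids)       # clamp negatives to a valid index
mapped = lookup_table[safe_ids]                          # vectorized gather
output = jnp.where(input_ids < 0, input_ids, mapped)     # restore the negative (padding) IDs
print(output)  # [[12 10 -1] [13 -1 -1]]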

@@ -177,11 +178,16 @@ def __init__(
     # Initialize compressed tokenizer
     self.compressed_tokenizer = CompressedTokenizer(tokenizer)
     self.tokenizer_vocab_size = len(self.compressed_tokenizer)
-    if pad_id is not None:
-      self.pad_id = int(self.compressed_tokenizer.lookup_table[pad_id])
+    if pad_id is None:
+      raise ValueError("The `pad_id` must be provided and cannot be None.")
+    # Pre-calculate pad_id on CPU using numpy array to avoid ConcretizationTypeError
+    self.pad_id = int(self.compressed_tokenizer.lookup_table_np[pad_id])

     # Pre-calculate odd multipliers for hashing: {layer_id: multipliers}
-    self.layer_multipliers = self._calculate_multipliers_across_layers(seed)
+    # Store as JAX arrays
+    self.layer_multipliers = {
+        k: jnp.array(v, dtype=jnp.int64) for k, v in self._calculate_multipliers_across_layers(seed).items()
+    }

     # Pre-calculate unique prime vocab sizes for every head
     # Structure: {layer_id: [[2gram_head1, ..., 2gram_headH], ..., [Ngram_head1, ..., Ngram_headH]]}

@@ -254,7 +260,7 @@ def get_vocab_sizes(self, layer_id: int) -> List[int]:
     """
     return [head_size for ngram_size in self.vocab_size_across_layers[layer_id] for head_size in ngram_size]

-  def _get_ngram_hashes(self, compressed_ids: np.ndarray, layer_id: int) -> np.ndarray:
+  def _get_ngram_hashes(self, compressed_ids: Array, layer_id: int) -> Array:
     """
     Computes hash indices for all n-grams in the input batch.
@@ -265,22 +271,21 @@ def _get_ngram_hashes(self, compressed_ids: np.ndarray, layer_id: int) -> np.nda
     Returns:
       hash_ids: [B, S, H_total] where H_total = H * num_ngram_orders
     """
-    x = np.asarray(compressed_ids, dtype=np.int64)
-    B, S = x.shape
+    x = jnp.asarray(compressed_ids, dtype=jnp.int64)
+    B, _ = x.shape

     # 1. Create Sliding Windows via Shifting
     shifted_inputs = []
     for k in range(self.max_ngram_size):
       if k == 0:
-        # e.g., k=0, [The, cat, sat]
         shifted_inputs.append(x)
       else:
         # Pre-allocate full array with PAD_ID
-        shifted_x = np.full((B, S), self.pad_id, dtype=np.int64)
+        padding = jnp.full((B, k), self.pad_id, dtype=jnp.int64)
         # Fast memory copy, slicing and assignment
         # e.g., k=1, [PAD, The, cat]
         #       k=2, [PAD, PAD, The]
-        shifted_x[:, k:] = x[:, :-k]
+        shifted_x = jnp.concatenate([padding, x[:, :-k]], axis=1)
         shifted_inputs.append(shifted_x)

     # 2. Retrieve layer-specific hash multipliers

@@ -299,21 +304,21 @@

     for n in range(2, self.max_ngram_size + 1):
       # Update hash with next history token
-      ngram_hash = np.bitwise_xor(ngram_hash, shifted_inputs[n - 1] * multipliers[n - 1])
+      ngram_hash = jnp.bitwise_xor(ngram_hash, shifted_inputs[n - 1] * multipliers[n - 1])

       # Retrieve prime vocab sizes for all heads of this n-gram order
       vocab_sizes_for_this_gram = vocab_sizes[n - 2]
-      mods = np.array(vocab_sizes_for_this_gram, dtype=np.int64)
+      mods = jnp.array(vocab_sizes_for_this_gram, dtype=jnp.int64)

       # Broadcast Modulo: Map hash to valid table indices
       # [B, S, 1] % [H] -> [B, S, H]
       head_hashes = ngram_hash[..., None] % mods
       all_hashes.append(head_hashes)

     # Concatenate all heads: [B, S, H_total] where H_total = H * num_ngram_orders
-    return np.concatenate(all_hashes, axis=2)
+    return jnp.concatenate(all_hashes, axis=2)

-  def __call__(self, input_ids) -> dict[int, np.ndarray]:
+  def __call__(self, input_ids) -> dict[int, Array]:
     # input_ids from standard tokenizer
     compressed_ids = self.compressed_tokenizer(input_ids)
     hash_ids_for_all_layers = {}
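The two hunks above port the n-gram hashing to JAX: one shifted copy of the sequence per history position, per-position odd multipliers combined with XOR, and a modulo against each head's prime table size. Below is a toy, self-contained walk-through of that pipeline for 2-grams and 3-grams; the multipliers, the primes, and the initialization of ngram_hash from the unshifted tokens are assumptions for illustration (the real values come from engram_seed and from code not shown in this diff):

import jax.numpy as jnp

PAD_ID = 0
x = jnp.array([[5, 9, 2, 7]])                   # compressed token IDs, shape [B=1, S=4]
multipliers = jnp.array([3, 7, 11])             # one odd multiplier per history position (toy values)
prime_vocab_sizes = {2: jnp.array([101, 103]),  # per-head table sizes for 2-grams (toy primes)
                     3: jnp.array([107, 109])}  # and for 3-grams

# Shifted copies: position k holds the token k steps back (PAD where no history exists).
shifted = [x]
for k in range(1, 3):
  pad = jnp.full((x.shape[0], k), PAD_ID)
  shifted.append(jnp.concatenate([pad, x[:, :-k]], axis=1))

ngram_hash = x * multipliers[0]                 # assumed initialization from the unshifted tokens
all_hashes = []
for n in (2, 3):
  ngram_hash = jnp.bitwise_xor(ngram_hash, shifted[n - 1] * multipliers[n - 1])
  all_hashes.append(ngram_hash[..., None] % prime_vocab_sizes[n])   # [B, S, H]
hash_ids = jnp.concatenate(all_hashes, axis=2)                      # [B, S, H_total]
print(hash_ids.shape)  # (1, 4, 4)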
@@ -323,6 +328,13 @@ def __call__(self, input_ids) -> dict[int, np.ndarray]:
     return hash_ids_for_all_layers


+class StaticWrapper:
+  """Wrapper to prevent nnx from treating the value as a variable."""
+
+  def __init__(self, val):
+    self.val = val
+
+
 class MultiHeadEmbedding(nnx.Module):
   """
   A flattened table representation for multi-head embedding spaces across n-gram orders.

@@ -350,7 +362,7 @@ def __init__(
     # Compute starting index for each head's segment in the flattened table.
     # Offsets serve as the "base address" for each head.
     offsets = np.cumsum([0] + vocab_sizes[:-1])  # prefix sum
-    self.offsets = jnp.array(offsets, dtype=jnp.int32)
+    self.offsets = StaticWrapper(np.array(offsets, dtype=np.int64))

     # The total embedding size is the sum of all individual head vocabularies.
     self.embedding = Embed(num_embeddings=sum(vocab_sizes), num_features=head_dim, config=config, mesh=mesh, rngs=rngs)

@@ -368,7 +380,7 @@ def __call__(self, input_ids: Array, model_mode: str = MODEL_MODE_TRAIN) -> Arra
     """
     # Broadcasting Add: [B, S, H] + [H] -> [B, S, H]
     # Shifts local indices (0..Prime-1) to global table positions.
-    shifted_ids = input_ids + self.offsets
+    shifted_ids = input_ids + self.offsets.val

     # Embedding lookup: [B, S, H_total] -> [B, S, H_total, D_head]
     return self.embedding(shifted_ids, model_mode=model_mode)
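MultiHeadEmbedding keeps a single flattened Embed table for all heads; the prefix-sum offsets relocate each head's local hash index into its own segment, and wrapping the constant offsets in StaticWrapper keeps nnx from registering them as model state. A toy version of just the offset arithmetic (head sizes are made up):

import numpy as np

vocab_sizes = [101, 103, 107, 109]            # per-head table sizes (toy primes)
offsets = np.cumsum([0] + vocab_sizes[:-1])   # base address of each head's segment
print(offsets)                                # [  0 101 204 311]

# A local index of 5 in head 2 lands at row 204 + 5 = 209 of the flattened table.
local_ids = np.array([[3, 7, 5, 2]])          # per-head local indices for one token position, shape [1, H]
global_ids = local_ids + offsets
print(global_ids)                             # [[  3 108 209 313]]

The switch from a jnp array to a host-side numpy constant matches the StaticWrapper docstring: the offsets are fixed addresses derived from the config, not parameters or traced values.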
