
Commit 20b297f

Merge pull request #3223 from AI-Hypercomputer:engram_flops_clean
PiperOrigin-RevId: 877595416
2 parents 23a82de + ccf89db commit 20b297f

3 files changed

Lines changed: 129 additions & 19 deletions


src/maxtext/layers/engram.py

Lines changed: 6 additions & 6 deletions
@@ -469,7 +469,7 @@ def __call__(self, x: Array) -> Array:
     B: Batch size
     S: Sequence length (temporal dimension)
     G: Number of branches (mhc_expansion_rate)
-    D: Hidden size (base_emb_dim)
+    D: Hidden size (emb_dim)
     """
     B, S, G, D = x.shape

@@ -557,7 +557,7 @@ def __init__(
     # retrieved n-gram memory -> Key, from D_en to [G, D]
     self.key_proj = DenseGeneral(
         in_features_shape=self.engram_dim,
-        out_features_shape=(mhc_expansion_rate, config.base_emb_dim),
+        out_features_shape=(mhc_expansion_rate, config.emb_dim),
         axis=-1,
         kernel_init=self.kernel_init,
         kernel_axes=("engram_dim", "mhc", "embed"),
@@ -578,7 +578,7 @@ def __init__(
     @nnx.vmap(in_axes=0, out_axes=0)
     def create_norms(rngs):
       return RMSNorm(
-          num_features=config.base_emb_dim,
+          num_features=config.emb_dim,
           dtype=config.dtype,
           weight_dtype=config.weight_dtype,
           epsilon=config.normalization_layer_epsilon,
@@ -594,7 +594,7 @@ def create_norms(rngs):
     # Value Projection (Shared): Retrieved memory -> Value
     self.value_proj = DenseGeneral(
         in_features_shape=self.engram_dim,
-        out_features_shape=config.base_emb_dim,
+        out_features_shape=config.emb_dim,
         axis=-1,
         kernel_init=self.kernel_init,
         kernel_axes=("engram_dim", "embed"),
@@ -611,7 +611,7 @@ def create_norms(rngs):
     # Applies depthwise causal convolution to smooth the retrieved memory over time.
     self.short_conv = ShortConv(
         config=config,
-        hidden_size=config.base_emb_dim,
+        hidden_size=config.emb_dim,
         kernel_size=self.conv_kernel_size,
         dilation=self.max_ngram_size,
         mhc_expansion_rate=mhc_expansion_rate,
@@ -635,7 +635,7 @@ def __call__(self, hidden_states: Array, hash_input_ids: Array) -> Array:
     S: Sequence Length
     G: mhc_expansion_rate, Number of Branches
     H_total: Total number of heads across n-grams. num_head * num_ngrams
-    D: base_emb_dim
+    D: emb_dim
     D_head: Dimension of a single head embedding
     D_en: Dimension of flattened embedding across heads and n-grams
     """

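The engram.py changes only swap config.base_emb_dim for config.emb_dim (the scaled hidden size) in the Engram key/value projections, norms, and short convolution, so the module tracks the model width that is actually used at runtime. As a shape-only illustration of what those projections produce, here is a minimal sketch with made-up sizes and plain einsum kernels standing in for the actual DenseGeneral modules:

import jax
import jax.numpy as jnp

# Hypothetical sizes, chosen only to make the shapes concrete.
B, S, G, D = 2, 16, 4, 256   # batch, sequence, branches (mhc_expansion_rate), emb_dim
D_en = 8 * 64 * 2            # engram_num_heads * engram_head_dim * (max_ngram_size - 1)

rng = jax.random.PRNGKey(0)
memory = jax.random.normal(rng, (B, S, D_en))        # retrieved n-gram memory, flattened over heads/n-grams
w_key = jnp.zeros((D_en, G, D))                      # stand-in for the key_proj kernel: engram_dim -> (G, D)
w_value = jnp.zeros((D_en, D))                       # stand-in for the shared value_proj kernel: engram_dim -> D

keys = jnp.einsum("bse,egd->bsgd", memory, w_key)    # [B, S, G, D], one key per branch
values = jnp.einsum("bse,ed->bsd", memory, w_value)  # [B, S, D], shared across branches
print(keys.shape, values.shape)                      # (2, 16, 4, 256) (2, 16, 256)

Using emb_dim keeps these projections consistent with the rest of the model when the base dimensions are scaled.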
src/maxtext/utils/maxtext_utils.py

Lines changed: 50 additions & 13 deletions
@@ -669,6 +669,38 @@ def calculate_llama4_vision_layers_tflops_per_device(config):
   return total_tflops, learnable_weight_tflops, total_attn_tflops


+def calculate_engram_tflops(config):
+  """Calculate engram TFLOPs per device."""
+  B = config.per_device_batch_size
+  S = config.max_target_length
+  G = config.mhc_expansion_rate  # Multi-manifold branches
+  D = config.emb_dim  # Hidden dimension
+  k = config.engram_kernel_size  # Conv window
+
+  num_ngram_orders = config.engram_max_ngram_size - 1
+  engram_dim = config.engram_num_heads * config.engram_head_dim * num_ngram_orders
+
+  # 1. Key Projection
+  key_flops = 2 * (B * S) * engram_dim * (G * D)
+  # 2. Value Projection
+  value_flops = 2 * (B * S) * engram_dim * D
+  # 3. QK Attention
+  attention_flops = 2 * (B * S) * G * D
+  # 4. Short Convolution
+  # Standard flops as 2 * kernel_size * input_channels * output_channels / feature_group_count
+  # In Engram, the feature_group_count = input_channels = output_channels
+  # Unlike global attention, convolution work is constant per token (not O(S^2)),
+  # so we do not apply the 0.5 causal scaling factor.
+  total_channels = G * D
+  conv_flops = 2 * (B * S) * k * total_channels
+
+  num_layers = len(config.engram_layers)
+  # account for both the forward (1x) and backward (2x) passes
+  learnable_tflops = num_layers * (key_flops + value_flops + conv_flops) * 3 / 1e12
+  attention_tflops = num_layers * attention_flops * 3 / 1e12
+  return learnable_tflops, attention_tflops
+
+
 def calculate_vision_encoder_tflops(config):
   """Calculate vision encoder TFLOPs per prefill step per device."""
   if config.model_name.startswith("gemma3"):

@@ -786,18 +818,11 @@ def calculate_tflops_training_per_device(config, log=True):
   )
   attention_tflops = causal_attention_flops * config.num_decoder_layers * 3 / 10**12

-  learnable_weight_tflops = learnable_weight_tflops * config.gradient_accumulation_steps
-  attention_tflops = attention_tflops * config.gradient_accumulation_steps
-
-  # DPO includes one additional forward pass per gradient accumulation step
-  if config.use_dpo:
-    reference_model_tflops = learnable_weight_tflops / 3  # additional forward pass
-    reference_model_attention_tflops = attention_tflops / 3
-    attention_tflops = attention_tflops + reference_model_attention_tflops
-  else:
-    reference_model_tflops = 0
-
-  total_tflops = learnable_weight_tflops + attention_tflops + reference_model_tflops
+  # Engram flops
+  if config.engram_layers:
+    engram_learnable_tflops, engram_attention_tflops = calculate_engram_tflops(config)
+    learnable_weight_tflops += engram_learnable_tflops
+    attention_tflops += engram_attention_tflops

   if config.use_multimodal:
     # Add vision layers TFLOPs for multimodal models

@@ -810,10 +835,22 @@
       f"and {100 * mm_attention_tflops/mm_total_tflops:.2f}% attention flops;\n",
       f"learnable weight {mm_learnable_weight_tflops:.2f} TFLOPs, attention {mm_attention_tflops:.2f} TFLOPs",
     )
-    total_tflops += mm_total_tflops
     learnable_weight_tflops += mm_learnable_weight_tflops
     attention_tflops += mm_attention_tflops

+  learnable_weight_tflops = learnable_weight_tflops * config.gradient_accumulation_steps
+  attention_tflops = attention_tflops * config.gradient_accumulation_steps
+
+  # DPO includes one additional forward pass per gradient accumulation step
+  if config.use_dpo:
+    reference_model_tflops = learnable_weight_tflops / 3  # additional forward pass
+    reference_model_attention_tflops = attention_tflops / 3
+    attention_tflops = attention_tflops + reference_model_attention_tflops
+  else:
+    reference_model_tflops = 0
+
+  total_tflops = learnable_weight_tflops + attention_tflops + reference_model_tflops
+
   if log:
     print(
       "Per train step:\n",

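calculate_engram_tflops above counts each projection as 2 x tokens x in_features x out_features multiply-accumulate FLOPs and multiplies by 3 to cover the forward and backward passes. A standalone sketch of that arithmetic, using sizes borrowed from the unit test below purely as an illustration:

# Illustrative arithmetic only; this mirrors the formula in calculate_engram_tflops.
B, S, G, D, k = 4, 8192, 1, 2048, 4        # batch, seq len, branches, emb_dim, conv kernel
num_heads, head_dim, max_ngram = 8, 1280, 3
num_layers = 2                             # len(engram_layers)

engram_dim = num_heads * head_dim * (max_ngram - 1)   # 20480
key_flops = 2 * (B * S) * engram_dim * (G * D)
value_flops = 2 * (B * S) * engram_dim * D
attention_flops = 2 * (B * S) * G * D
conv_flops = 2 * (B * S) * k * (G * D)

learnable_tflops = num_layers * (key_flops + value_flops + conv_flops) * 3 / 1e12
attention_tflops = num_layers * attention_flops * 3 / 1e12
print(f"{learnable_tflops:.2f} learnable TFLOPs, {attention_tflops:.6f} attention TFLOPs")
# ~32.99 learnable TFLOPs; the attention term stays tiny because it is linear in S, not O(S^2).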
tests/unit/flop_calculation_test.py

Lines changed: 73 additions & 0 deletions
@@ -487,3 +487,76 @@ def test_deepseek32_671b_flops(self):
     )
     calculated_tflops, _, _ = calculate_tflops_training_per_device(cfg)
     self.assertFlopsAlmostEqual(calculated_tflops, golden_tflops)
+
+  @pytest.mark.cpu_only
+  def test_custom_engram_flops(self):
+    """Test model with Engram FLops calculation"""
+    kwargs = {
+        # Model bases
+        "model_name": "deepseek2-16b",
+        "override_model_config": True,
+        # Core workload parameters
+        "per_device_batch_size": 4,
+        "max_target_length": 8192,
+        "num_experts": 64,
+        "num_experts_per_tok": 6,
+        "shared_experts": 2,
+        # Model dimensions
+        "base_emb_dim": 2048,
+        "base_num_query_heads": 16,
+        "base_num_kv_heads": 16,
+        "base_mlp_dim": 10944,
+        "base_moe_mlp_dim": 1408,
+        "base_num_decoder_layers": 27,
+        "first_num_dense_layers": 1,
+        "mlp_activations": ["silu", "linear"],
+        "vocab_size": 102400,
+        # MLA
+        "q_lora_rank": 0,
+        "kv_lora_rank": 512,
+        "qk_nope_head_dim": 128,
+        "qk_rope_head_dim": 64,
+        "v_head_dim": 128,
+        "skip_jax_distributed_system": True,
+        # Engram
+        "mhc_expansion_rate": 1,
+        "engram_layers": [2, 15],
+        "engram_num_heads": 8,
+        "engram_head_dim": 1280,
+        "engram_kernel_size": 4,
+        "engram_max_ngram_size": 3,
+        "engram_vocab_bases": [226240, 226240],
+        "tokenizer_type": "huggingface",
+        "tokenizer_path": "deepseek-ai/DeepSeek-V3.2",
+        "hf_access_token": "fake",
+        "scan_layers": False,
+    }
+    B = kwargs["per_device_batch_size"]
+    S = kwargs["max_target_length"]
+    G = kwargs["mhc_expansion_rate"]
+    D = kwargs["base_emb_dim"]
+    K = kwargs["engram_kernel_size"]
+    H = kwargs["engram_num_heads"]
+    H_D = kwargs["engram_head_dim"]
+    L = len(kwargs["engram_layers"])
+    N = kwargs["engram_max_ngram_size"]
+
+    attention_flops = self.compute_deepseek_attention_flops_per_device(kwargs)
+    # deepseek2-16b has ~2.4B active parameters
+    # https://arxiv.org/pdf/2405.04434
+    golden_param_size = 2.4e9
+
+    # calculate Engram active params
+    engram_dim = H * H_D * (N - 1)
+    key_params = engram_dim * (G * D)
+    value_params = engram_dim * D
+    conv_params = K * (G * D)
+    engram_active_params = L * (key_params + value_params + conv_params)
+    golden_tflops = 6 * B * S * (golden_param_size + engram_active_params) / 1e12 + attention_flops
+
+    cfg = pyconfig.initialize(
+        [None, get_test_config_path()],
+        **kwargs,
+    )
+    calculated_tflops, _, _ = calculate_tflops_training_per_device(cfg)
+    self.assertFlopsAlmostEqual(calculated_tflops, golden_tflops)

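The golden value folds the Engram parameters into the standard 6 * B * S * params approximation, where 6 = 2 FLOPs per multiply-accumulate times 3 for the forward plus backward passes, matching the scaling in calculate_engram_tflops. A back-of-envelope check with the test's settings (illustrative arithmetic only):

# Parameter count implied by the test's Engram settings.
B, S, G, D = 4, 8192, 1, 2048
K, H, H_D, L, N = 4, 8, 1280, 2, 3

engram_dim = H * H_D * (N - 1)                   # 20480
key_params = engram_dim * (G * D)                # ~41.9M
value_params = engram_dim * D                    # ~41.9M
conv_params = K * (G * D)                        # 8192
engram_active_params = L * (key_params + value_params + conv_params)  # ~167.8M
print(6 * B * S * engram_active_params / 1e12)   # ~33.0 extra TFLOPs from the Engram layers

This agrees with the per-component breakdown sketched after calculate_engram_tflops above.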