Skip to content

Commit 95d5969

Browse files
committed
DeepSeek3.2: Onboard sparse attention
1 parent c636924 commit 95d5969

11 files changed

Lines changed: 1417 additions & 22 deletions

pytest.ini

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@ addopts =
1414
--ignore=tests/unit/gemma3_layers_test.py
1515
--ignore=tests/unit/gpt_vs_reference_test.py
1616
--ignore=tests/unit/llama4_layers_test.py
17-
--ignore=tests/unit/mla_vs_reference_test.py
17+
--ignore=tests/unit/yarn_vs_reference_test.py
1818
--ignore=tests/unit/moba_vs_reference_test.py
1919
--ignore=tests/unit/offline_engine_test.py
2020
--ignore=tests/unit/profiler_test.py
2121
--ignore=tests/unit/qwen3_omni_layers_test.py
2222
--ignore=tests/unit/qwen3_next_vs_reference_test.py
23+
--ignore=tests/unit/deepseek32_vs_reference_test.py
2324
markers =
2425
tpu_only: marks tests to be run on TPUs only
2526
gpu_only: marks tests to be run on GPUs only

src/MaxText/configs/base.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,13 @@ moba: False
328328
moba_chunk_size: 1024
329329
moba_topk: 8
330330

331+
# DeepSeek Sparse Attention (DSA)
332+
# DeepSeek 3.2 introduces an indexer in MLA
333+
use_sparse_indexer: False
334+
index_head_dim: 128
335+
index_n_heads: 64
336+
index_topk: 2048
337+
331338
# MLA parameters
332339
q_lora_rank: 0
333340
kv_lora_rank: 512
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright 2023–2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# model config for DeepSeek V3.2 - 671B
16+
# Identical to the deepseek3-671b config, except for the added indexer config.
17+
18+
base_emb_dim: 7168
19+
base_num_query_heads: 128
20+
base_num_kv_heads: 128
21+
base_mlp_dim: 18432
22+
base_moe_mlp_dim: 2048
23+
base_num_decoder_layers: 61
24+
first_num_dense_layers: 3
25+
mlp_activations: ["silu","linear"]
26+
vocab_size: 129280
27+
enable_dropout: False
28+
logits_via_embedding: False
29+
normalization_layer_epsilon: 1.0e-6
30+
num_experts: 256
31+
num_experts_per_tok: 8
32+
shared_experts: 1
33+
routed_scaling_factor: 2.5
34+
routed_score_func: "sigmoid"
35+
routed_bias: True
36+
decoder_block: "deepseek"
37+
# MLA
38+
attention_type: "mla"
39+
q_lora_rank: 1536
40+
kv_lora_rank: 512
41+
qk_nope_head_dim: 128
42+
qk_rope_head_dim: 64
43+
v_head_dim: 128
44+
# RoPE
45+
mscale: 1.0
46+
rope_type: "yarn"
47+
rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
48+
max_position_embeddings: 163840
49+
original_max_position_embeddings: 4096
50+
rope_factor: 40
51+
beta_fast: 32
52+
rope_interleave: True
53+
rope_truncate: True
54+
rope_attention_scaling: False
55+
# Indexer for DeepSeek Sparse Attention
56+
use_sparse_indexer: True
57+
index_n_heads: 64
58+
index_head_dim: 128
59+
index_topk: 2048

src/MaxText/configs/types.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ class ProfilerType(str, Enum):
207207
"deepseek3-671b-2dfsdp",
208208
"deepseek3-test",
209209
"deepseek3-tiny",
210+
"deepseek3.2-671b",
210211
"kimi-k2-1t",
211212
"gemma-7b",
212213
"gemma-2b",
@@ -502,6 +503,15 @@ class MlaAttention(BaseModel):
502503
v_head_dim: NonNegativeInt = Field(128, description="Dimension of V heads in MLA.")
503504

504505

506+
class AttentionIndexer(BaseModel):
507+
"""Configuration for DeepSeek Sparse Attention (DSA): DeepSeek3.2-style MLA with indexer."""
508+
509+
use_sparse_indexer: bool = Field(False, description="Whether to use sparse indexer for MLA.")
510+
index_head_dim: NonNegativeInt = Field(128, description="Head dim for indexer query and key.")
511+
index_n_heads: NonNegativeInt = Field(64, description="Number of query heads in indexer.")
512+
index_topk: NonNegativeInt = Field(2048, description="Number of tokens selected per query token in the indexer.")
513+
514+
505515
class Llama4Attention(BaseModel):
506516
"""Configuration specific to Llama4-style models."""
507517

@@ -1686,6 +1696,7 @@ class MaxTextConfig(
16861696
Attention,
16871697
MlaAttention,
16881698
MoBa,
1699+
AttentionIndexer,
16891700
Llama4Attention,
16901701
SplashAttention,
16911702
PagedAttention,
@@ -2120,6 +2131,11 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
21202131
raise ValueError("`local_checkpoint_period` must be > 0 for emergency checkpointing.")
21212132
if self.moba and self.attention not in ("dot_product",):
21222133
raise ValueError("MoBA is only supported with dot_product attention.")
2134+
if self.use_sparse_indexer:
2135+
if self.q_lora_rank == 0:
2136+
raise NotImplementedError("Sparse indexer has not been implemented for q_lora_rank = 0.")
2137+
if self.attention not in ("dot_product",):
2138+
raise ValueError("Sparse indexer is only supported with dot_product attention.")
21232139
if self.attention_type == AttentionType.CHUNK.value and (
21242140
not isinstance(self.chunk_attn_window_size, int) or self.chunk_attn_window_size <= 0
21252141
):
@@ -2259,9 +2275,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
22592275
f"`python3 -m MaxText.muon_utils {self.model_name} True`"
22602276
)
22612277
if self.force_q_layout and not self.use_jax_splash:
2262-
raise ValueError(
2263-
"`force_q_layout` can only be true if `use_jax_splash` is also true."
2264-
)
2278+
raise ValueError("`force_q_layout` can only be true if `use_jax_splash` is also true.")
22652279

22662280
# I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
22672281
# Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.

0 commit comments

Comments
 (0)