Commit 79eecc9
Introduce a DSv3 config with 2 logical fsdp sharding axes.
PiperOrigin-RevId: 843675516
1 parent 69adf5d commit 79eecc9

5 files changed

Lines changed: 197 additions & 94 deletions
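For orientation: the headline change is a new DeepSeek V3 config that shards weights across two logical FSDP mesh axes (`fsdp` and `fsdp_transpose`), so per-device parameter memory shrinks with the product of the two axis sizes rather than a single axis. A rough sketch of the arithmetic, with illustrative mesh sizes (the 671B count comes from the model name; the axis sizes below are assumptions, not values from this commit):

```python
# Back-of-envelope memory math for 2D FSDP sharding; the axis sizes are
# illustrative assumptions, not values taken from this commit.
params = 671e9        # DeepSeek V3 - 671B
bytes_per_param = 2   # bfloat16 weights

fsdp, fsdp_transpose = 64, 4  # hypothetical mesh axis sizes
per_device_gib = params * bytes_per_param / (fsdp * fsdp_transpose) / 2**30
print(f"~{per_device_gib:.1f} GiB of parameters per device")  # ~4.9 GiB
```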

src/MaxText/configs/base.yml

Lines changed: 67 additions & 64 deletions
@@ -18,6 +18,7 @@ run_name: ""
 
 model_name: "default" # override config settings to match a specific model. other than the override, nothing should use this!
 override_model_config: False # When set to true, allows overriding model parameters via CLI for the purpose of debugging/testing.
+override_logical_axis_rules: False # When set, overrides logical axis rules instead of merging them.
 debug:
   rl: False # RL-specific debugging
 
@@ -70,72 +71,72 @@ checkpoint_storage_concurrent_gb: 96
 
 # Bool flag for enabling Orbax v1.
 enable_orbax_v1: False
-# function for processing loaded checkpoint dict into a format maxtext can understand. (for other formats, i.e. safetensors)
-checkpoint_conversion_fn: none
-# optional checkpoint context to use for loading. options: "orbax", "safetensors"
+# Function for processing loaded checkpoint dict into a format MaxText can understand. (for other formats, i.e. SafeTensors)
+checkpoint_conversion_fn: None
+# Optional checkpoint context to use for loading. Options: "orbax", "safetensors"
 source_checkpoint_layout: "orbax"
-############################### end checkpointing ##################################
+############################### END CHECKPOINTING ##################################
 
 
-reuse_example_batch: 0 # for testing tpu performance, this options repeated uses the same batch.
+reuse_example_batch: 0 # for testing TPU performance, this option repeatedly uses the same batch.
 
 
-metrics_file: "" # for testing, local file that stores scalar metrics. if empty, no metrics are written.
-# if true save metrics such as loss and tflops to gcs in {base_output_directory}/{run_name}/metrics/
-gcs_metrics: false
+metrics_file: "" # for testing, local file that stores scalar metrics. If empty, no metrics are written.
+# If true, save metrics such as loss and TFLOPS to GCS in {base_output_directory}/{run_name}/metrics/
+gcs_metrics: False
 
-# if true save config to gcs in {base_output_directory}/{run_name}/
-save_config_to_gcs: false
+# If true, save config to GCS in {base_output_directory}/{run_name}/
+save_config_to_gcs: False
 
-# gradient dtype
+# Gradient dtype
 grad_dtype: "float32"
 
-# activation dtypes.
+# Activation dtypes.
 dtype: "bfloat16"
-# used to configure quantization in the transformer layers, defaults to null implying bf16.
-# possible alternative settings are as follows:
+# Used to configure quantization in the transformer layers, defaults to null implying bf16.
+# Possible alternative settings are as follows:
 # 'int8' for dynamic range quantization using 8-bits
-# 'intmp' for mixed precision quantization for inference as described here: src/MaxText/configs/quantization/readme.md
-# 'fp8' for 8-bit floating-point gemms on nvidia gpus.
-# 'nanoo_fp8' for 8-bit floating-point gemms on amd mi300/mi325 gpus.
-# 'fp8_full' for fp8 quantization with static scaling.
+# 'intmp' for mixed precision quantization for inference as described here: src/MaxText/configs/quantization/README.md
+# 'fp8' for 8-bit floating-point GeMMs on NVIDIA GPUs.
+# 'nanoo_fp8' for 8-bit floating-point GeMMs on AMD MI300/MI325 GPUs.
+# 'fp8_full' for FP8 quantization with static scaling.
 quantization: ""
-# used to configure constant_bound_config in aqt lib for static scaling, e.g. constant_bound_config='0.5, 0.5, 0.5, 0.5, 0.5, 0.5'
+# Used to configure constant_bound_config in aqt lib for static scaling, e.g. constant_bound_config='0.5, 0.5, 0.5, 0.5, 0.5, 0.5'
 constant_bound_config: ""
-# choose one of default, high, and highest.
-# https://kolonist26-jax-kr.readthedocs.io/en/latest/jax.lax.html#jax.lax.precision
+# Choose one of default, high, and highest.
+# https://kolonist26-jax-kr.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
 matmul_precision: "default"
-activations_in_float32: false # sets activations to float32 before nonlinearity it true, else dtype
-# used to replicate the quantization scale to avoid the inefficient xla fusion for 2d sharding.
-replicate_quant_scale: false
-# path to file with quantization config for intmp.
+activations_in_float32: False # Sets activations to float32 before nonlinearity if true, else dtype
+# Used to replicate the quantization scale to avoid the inefficient XLA fusion for 2d sharding.
+replicate_quant_scale: False
+# Path to file with quantization config for intmp.
 quant_cfg_path: ""
-quantize_kvcache: false # set to true to quantize kv cache values, defaults to false
-# valid kv_quant_axis values:
-# - "" is valid only when quantize_kvcache is false
+quantize_kvcache: False # Set to True to quantize KV Cache values, defaults to False
+# Valid kv_quant_axis values:
+# - "" is valid only when quantize_kvcache is False
 # - "dkv" indicates quantize kv cache over the cache_kv, i.e. kv dimension axis
 # - "heads_and_dkv" indicates quantize kv cache over cache_heads and cache_kv axes
-# default to "heads_and_dkv" for faster compution, kv_quant_axis is not used when quantize_kvcache is false
+# Default to "heads_and_dkv" for faster computation, kv_quant_axis is not used when quantize_kvcache is False
 # - "dkv" is expected with better accuracy but degraded computation
 kv_quant_axis: "heads_and_dkv"
 kv_quant_dtype: "int8"
-checkpoint_is_quantized: false # set to true if reading from a saved aqt quantized checkpoint
-# saves params quantized on fly at following path
+checkpoint_is_quantized: False # Set to True if reading from a saved aqt quantized checkpoint
+# Saves params quantized on the fly at the following path
 save_quantized_params_path: ""
-#used to configure the mode in which model is called
+# Used to configure the mode in which the model is called
 # when left as is, corresponds to training
 # accepted values are "inference"
 model_call_mode: ""
-use_qwix_quantization: false # whether to use qwix for quantization. if set to true, the model will be quantized using qwix.
-# quantization calibration method used for weights and activations. supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#l70-l80
+use_qwix_quantization: False # Whether to use qwix for quantization. If set to True, the model will be quantized using qwix.
+# Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
 weight_quantization_calibration_method: "absmax"
 act_quantization_calibration_method: "absmax"
 bwd_quantization_calibration_method: "absmax"
-# shard the range finding operation for quantization. by default this is set to number of slices.
+# Shard the range finding operation for quantization. By default this is set to the number of slices.
 quantization_local_shard_count: -1
 
-decoder_block: "llama2" # which style of decoderblock to use.
-# global parameter scale needs to be a power of 2. if you want finer grained control of the model sizes
+decoder_block: "llama2" # which style of DecoderBlock to use.
+# Global parameter scale needs to be a power of 2. If you want finer-grained control of the model sizes,
 # then you should explicitly set base_embed_dim, base_num_query_heads, base_num_kv_heads,
 # base_mlp_dim, base_num_decoder_layers and/or head_dim.
 weight_dtype: "float32"
@@ -149,39 +150,39 @@ head_dim: 128
 mlp_activations: ["silu", "linear"]
 mlp_activations_limit: -1.0
 dropout_rate: 0.0
-logits_via_embedding: false
-normalize_embedding_logits: true # whether to normalize pre-softmax logits if logits_via_embedding is true
-logits_dot_in_fp32: false # whether to use fp32 in logits_dense or shared_embedding dot product for stability
-cast_logits_to_fp32: true # whether to cast the logits to fp32. the higher precision is generally beneficial, but it can vary slightly.
-float32_qk_product: false # in dot_product attention, whether to cast to fp32 the inputs to qk product
-float32_logits: false # in dot_product attention, whether to cast to fp32 the inputs to softmax
-float32_weight_sum: true # whether to use full fp32 precision for weight_sum during final unpermute in moe
-
-# multi-token prediction configs
-# the number of auxiliary prediction layers to use for mtp.
-# set to 0 to disable the feature.
+logits_via_embedding: False
+normalize_embedding_logits: True # whether to normalize pre-softmax logits if logits_via_embedding is true
+logits_dot_in_fp32: False # whether to use fp32 in logits_dense or shared_embedding dot product for stability
+cast_logits_to_fp32: True # whether to cast the logits to fp32. The higher precision is generally beneficial, but it can vary slightly.
+float32_qk_product: False # in dot_product attention, whether to cast to fp32 the inputs to qk product
+float32_logits: False # in dot_product attention, whether to cast to fp32 the inputs to softmax
+float32_weight_sum: True # whether to use full fp32 precision for weight_sum during final unpermute in moe
+
+# Multi-Token Prediction Configs
+# The number of auxiliary prediction layers to use for MTP.
+# Set to 0 to disable the feature.
 mtp_num_layers: 0
-# the scaling factor (lambda) for the mtp auxiliary loss. the final loss is:
+# The scaling factor (lambda) for the MTP auxiliary loss. The final loss is:
 # main_loss + mtp_loss_scaling_factor * avg_mtp_loss
 mtp_loss_scaling_factor: 0.1
-# specifies which mtp layer (1-indexed) is used to calculate metrics like the
-# acceptance rate during evaluation. for example, a value of `1` targets the
-# first auxiliary prediction head. set to 0 to disable this specific evaluation
+# Specifies which MTP layer (1-indexed) is used to calculate metrics like the
+# acceptance rate during evaluation. For example, a value of `1` targets the
+# first auxiliary prediction head. Set to 0 to disable this specific evaluation.
 mtp_eval_target_module: 0
 
 # mixture of experts (moe)
 num_experts: 1
 num_experts_per_tok: 1
-megablox: true
-sparse_matmul: true
+megablox: True
+sparse_matmul: True
 capacity_factor: -1.0 # a factor to decide expert capacity for token dropping, and no dropping by default
 load_balance_loss_weight: 0.01 # weight for the load balance loss
-use_random_routing: false # whether to use random routing for debug/test purpose
-use_custom_sort_vjp: true # whether to use a custom sort vjp for sparse matmul ops
-use_ring_of_experts: false # whether to use ring of experts for sparse matmul expert parallelism
-# tunable tiling dimensions used for mlp gmm
-# megablox/jax ragged dot - supports forward pass only (6 configs: `wi_tile_fwd...` and `wo_tile_fwd_...`)
-# tokamax ragged dot - supports all 18 configs
+use_random_routing: False # whether to use random routing for debug/test purposes
+use_custom_sort_vjp: True # whether to use a custom sort vjp for sparse matmul ops
+use_ring_of_experts: False # whether to use ring of experts for sparse matmul expert parallelism
+# Tunable tiling dimensions used for MLP GMM
+# Megablox/JAX Ragged Dot - supports forward pass only (6 configs: `wi_tile_fwd...` and `wo_tile_fwd_...`)
+# Tokamax Ragged Dot - supports all 18 configs
 wi_tile_fwd_batch_seq: 512
 wi_tile_fwd_embed_dim: 1024
 wi_tile_fwd_mlp_dim: 1024
@@ -201,17 +202,19 @@ wo_tile_dlhs_mlp_dim: 1024
 wo_tile_drhs_batch_seq: 512
 wo_tile_drhs_embed_dim: 1024
 wo_tile_drhs_mlp_dim: 1024
-norm_topk_prob: false # boolean to enable the top-k probability normalization. qwen3-specific normalization of router weights.
+norm_topk_prob: False # Boolean to enable the top-k probability normalization. Qwen3-specific normalization of router weights.
 
-# how the expert axis is used to shard attention weights and activations
+# How the expert axis is used to shard attention weights and activations
 # "fsdp" (ep acts as fsdp parallelism)
 # "context" (ep acts as context parallelism, training only)
 expert_shard_attention_option: "fsdp"
 
-# when moe weight matrices are sharded on both fsdp and fsdp-transpose axes, use two separate all-gather calls
-moe_fsdp_use_two_stage_all_gather: false
+# When MoE weight matrices are sharded on both FSDP and FSDP-transpose axes, use two separate All-Gather calls
+moe_fsdp_use_two_stage_all_gather: False
 # Shard the MoE weights on the num_expert dimension. This can be performant when num_experts % fsdp_parallelism != 0.
 fsdp_shard_on_exp: False
+# Use the fsdp and fsdp_transpose axes for sharding the MoE weights
+use_2d_fsdp_sharding: False
 
 # deepseek moe
 base_moe_mlp_dim: 7168 # intermediate dimension at MoE layer. For a fully MoE model, base_mlp_dim must be equal to base_moe_mlp_dim.
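To make the new flags concrete, here is a minimal JAX sketch of the layout `use_2d_fsdp_sharding` asks for: one MoE-style weight tiled along both the `fsdp` and `fsdp_transpose` mesh axes. The mesh shape is an assumption for illustration (8 devices as 4 x 2); nothing below is MaxText internals:

```python
import jax
import jax.numpy as jnp
from jax.experimental import mesh_utils
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Assumes 8 devices arranged 4 x 2, i.e. ici_fsdp_parallelism=4 and
# ici_fsdp_transpose_parallelism=2.
mesh = Mesh(mesh_utils.create_device_mesh((4, 2)), axis_names=("fsdp", "fsdp_transpose"))

# An [embed, mlp] weight tiled along BOTH axes ("2D FSDP"): each device holds
# a 1/8 tile instead of the 1/4 slab that 1D FSDP would give it.
w = jnp.zeros((7168, 18432), dtype=jnp.bfloat16)
w = jax.device_put(w, NamedSharding(mesh, P("fsdp", "fsdp_transpose")))
print(w.sharding.spec)  # PartitionSpec('fsdp', 'fsdp_transpose')
```

With `moe_fsdp_use_two_stage_all_gather`, re-materializing such a weight before the matmul would then take two All-Gathers, one per axis, as the comment above describes.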
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Model config for DeepSeek V3 - 671B that uses FSDP on two logical axes.
+
+# For DeepSeek default device-limited routing,
+# please set n_routing_groups=8 and topk_routing_group=4 in your command-line arguments.
+
+base_emb_dim: 7168
+base_num_query_heads: 128
+base_num_kv_heads: 128
+base_mlp_dim: 18432
+base_moe_mlp_dim: 2048
+base_num_decoder_layers: 61
+first_num_dense_layers: 3
+mlp_activations: ["silu","linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+num_experts: 256
+num_experts_per_tok: 8
+shared_experts: 1
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek"
+# MLA
+attention_type: "mla"
+q_lora_rank: 1536
+kv_lora_rank: 512
+qk_nope_head_dim: 128
+qk_rope_head_dim: 64
+v_head_dim: 128
+mscale: 1.0
+# RoPE
+rope_type: "yarn"
+rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
+max_position_embeddings: 163840
+original_max_position_embeddings: 4096
+rope_factor: 40
+beta_fast: 32
+rope_interleave: True
+rope_truncate: True
+rope_attention_scaling: False
+
+override_logical_axis_rules: True
+mesh_axes: ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']
+data_sharding: [['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
+logical_axis_rules: [
+  ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
+  ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
+  ['activation_heads', []],
+  ['embed', ['fsdp']],
+  ['embed_no_exp', ['fsdp']],
+  ['q_lora', ['fsdp']],
+  ['kv_lora', ['fsdp']],
+  ['q_lora_up_proj', ['fsdp_transpose']],
+  ['kv_lora_up_proj', ['fsdp_transpose']],
+  ['q_heads', ['fsdp_transpose']],
+  ['kv_heads', ['fsdp_transpose']],
+  ['heads', ['fsdp_transpose']],
+  ['mlp', ['fsdp_transpose']],
+]
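Reading the rule list: each entry maps a logical tensor axis (e.g. `embed`, `mlp`) to the mesh axes it is sharded over, so the MLA projections and MLP weights split across `fsdp` and `fsdp_transpose` while activation heads stay unsharded. A minimal sketch of how such rules resolve to a `PartitionSpec` (the resolver and the rule subset are illustrative; in MaxText/Flax this job is done by the logical-axis-rules machinery):

```python
from jax.sharding import PartitionSpec as P

# Rule subset copied from the config above; an empty list there means the
# logical axis is replicated.
LOGICAL_AXIS_RULES = {
    "activation_heads": (),
    "embed": ("fsdp",),
    "q_lora": ("fsdp",),
    "heads": ("fsdp_transpose",),
    "mlp": ("fsdp_transpose",),
}

def logical_to_spec(logical_axes):
  """Resolve logical axis names to mesh axes via the rule table."""
  return P(*(LOGICAL_AXIS_RULES.get(name) or None for name in logical_axes))

# An MLP kernel is logically [embed, mlp], so it lands on both FSDP axes:
print(logical_to_spec(("embed", "mlp")))  # PartitionSpec('fsdp', 'fsdp_transpose')
```

Because the config sets `override_logical_axis_rules: True`, these rules replace the base.yml list instead of merging into it, so the table above fully describes the model's sharding once it is selected via `model_name=deepseek3-671b-2dfsdp`.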

src/MaxText/configs/types.py

Lines changed: 40 additions & 28 deletions
@@ -187,6 +187,7 @@ class ProfilerType(str, Enum):
     "deepseek2-16b",
     "deepseek2-236b",
     "deepseek3-671b",
+    "deepseek3-671b-2dfsdp",
     "deepseek3-test",
     "deepseek3-tiny",
     "kimi-k2-1t",
@@ -233,6 +234,10 @@ class RunInfo(BaseModel):
   )
   model_name: ModelName = Field("default", description="The name of the model configuration to use.")
   override_model_config: bool = Field(False, description="If True, allows overriding model parameters via CLI.")
+  override_logical_axis_rules: bool = Field(
+      False,
+      description="If True, logical_axis_rules will be overridden instead of merged.",
+  )
   log_config: bool = Field(
       True,
       description="If True, prints the final configuration after initialization.",
@@ -563,6 +568,10 @@ class MoEGeneral(BaseModel):
       description="Shard the MoE weights on the num_expert dimension. Can be performant when "
       "num_experts % fsdp_parallelism != 0.",
   )
+  use_2d_fsdp_sharding: bool = Field(
+      False,
+      description="Use `fsdp` and `fsdp_transpose` axes for 2D FSDP sharding.",
+  )
   norm_topk_prob: bool = Field(
       False,
       description="Enable top-k probability normalization for router weights (Qwen3-specific).",
@@ -2137,34 +2146,37 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
           self.dcn_autoregressive_parallelism,
       ]
     else:
-      self.ici_parallelism = [
-          self.ici_data_parallelism,
-          self.ici_pipeline_parallelism,
-          self.ici_fsdp_parallelism,
-          self.ici_fsdp_transpose_parallelism,
-          self.ici_sequence_parallelism,
-          self.ici_context_parallelism,
-          self.ici_context_autoregressive_parallelism,
-          self.ici_tensor_parallelism,
-          self.ici_tensor_transpose_parallelism,
-          self.ici_tensor_sequence_parallelism,
-          self.ici_expert_parallelism,
-          self.ici_autoregressive_parallelism,
-      ]
-      self.dcn_parallelism = [
-          self.dcn_data_parallelism,
-          self.dcn_pipeline_parallelism,
-          self.dcn_fsdp_parallelism,
-          self.dcn_fsdp_transpose_parallelism,
-          self.dcn_sequence_parallelism,
-          self.dcn_context_parallelism,
-          self.dcn_context_autoregressive_parallelism,
-          self.dcn_tensor_parallelism,
-          self.dcn_tensor_transpose_parallelism,
-          self.dcn_tensor_sequence_parallelism,
-          self.dcn_expert_parallelism,
-          self.dcn_autoregressive_parallelism,
-      ]
+      ici_map = {
+          "data": self.ici_data_parallelism,
+          "stage": self.ici_pipeline_parallelism,
+          "fsdp": self.ici_fsdp_parallelism,
+          "fsdp_transpose": self.ici_fsdp_transpose_parallelism,
+          "sequence": self.ici_sequence_parallelism,
+          "context": self.ici_context_parallelism,
+          "context_autoregressive": self.ici_context_autoregressive_parallelism,
+          "tensor": self.ici_tensor_parallelism,
+          "tensor_transpose": self.ici_tensor_transpose_parallelism,
+          "tensor_sequence": self.ici_tensor_sequence_parallelism,
+          "expert": self.ici_expert_parallelism,
+          "autoregressive": self.ici_autoregressive_parallelism,
+      }
+      self.ici_parallelism = [ici_map[axis] for axis in self.mesh_axes]
+
+      dcn_map = {
+          "data": self.dcn_data_parallelism,
+          "stage": self.dcn_pipeline_parallelism,
+          "fsdp": self.dcn_fsdp_parallelism,
+          "fsdp_transpose": self.dcn_fsdp_transpose_parallelism,
+          "sequence": self.dcn_sequence_parallelism,
+          "context": self.dcn_context_parallelism,
+          "context_autoregressive": self.dcn_context_autoregressive_parallelism,
+          "tensor": self.dcn_tensor_parallelism,
+          "tensor_transpose": self.dcn_tensor_transpose_parallelism,
+          "tensor_sequence": self.dcn_tensor_sequence_parallelism,
+          "expert": self.dcn_expert_parallelism,
+          "autoregressive": self.dcn_autoregressive_parallelism,
+      }
+      self.dcn_parallelism = [dcn_map[axis] for axis in self.mesh_axes]
 
     # Final string-to-enum conversions if they haven't been coerced by pydantic yet.
     if isinstance(self.decoder_block, str):
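The net effect of this refactor: the two fixed-order lists become name-keyed maps indexed by `mesh_axes`, so a config that redefines the mesh (like the new DSv3 config's five-axis `mesh_axes`) still gets its parallelism degrees in matching order. A standalone sketch of the pattern, with made-up degree values:

```python
# Name-keyed lookup as introduced above; the degrees are made-up examples.
ici_map = {
    "data": 1,
    "stage": 1,
    "fsdp": 64,
    "fsdp_transpose": 4,
    "sequence": 1,
    "context": 1,
    "expert": 1,
    "autoregressive": 1,
}

# The five-axis mesh from the new DSv3 config: axes the config omits are
# simply never looked up, and the order follows mesh_axes, not a fixed list.
mesh_axes = ["data", "fsdp", "fsdp_transpose", "expert", "context"]
ici_parallelism = [ici_map[axis] for axis in mesh_axes]
print(ici_parallelism)  # [1, 64, 4, 1, 1]
```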
