
Commit 46218db

Update loss free & loss control load balance
1 parent 08216c6 commit 46218db

13 files changed

Lines changed: 215 additions & 43 deletions


docs/reference/core_concepts/moe_configuration.md

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@ Dropping:
 
 `routed_bias`: If enabled, adds a learnable bias term to the gate logits to facilitate load balancing.
 
+`routed_bias_update_rate`: Defines the update rate for the routed bias term above. Applicable only to the DeepSeek decoder block.
+
 `routed_score_func`: Defines the scoring function for the router.
 
 `routed_scaling_factor`: A scalar multiplier applied to the expert weights.
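
To make the two options concrete: in DeepSeek-V3-style loss-free balancing, the bias shifts only which experts are *selected*, while the combine weights still come from the pre-bias scores, and `routed_bias_update_rate` controls how fast the bias is nudged (see `calculate_load_balance_updates` in `moe.py` below). A minimal sketch, not MaxText code; the function and variable names here are hypothetical:

```python
# Illustrative sketch of bias-adjusted top-k routing (DeepSeek-V3 style).
import jax
import jax.numpy as jnp

def biased_top_k(scores, bias, k):
  """scores: (tokens, num_experts); bias: (num_experts,)."""
  # The bias only influences which experts are selected...
  _, top_k_idx = jax.lax.top_k(scores + bias, k)
  # ...while the combine weights use the original, pre-bias scores.
  top_k_weights = jnp.take_along_axis(scores, top_k_idx, axis=-1)
  return top_k_weights, top_k_idx

scores = jnp.array([[0.3, 0.2, 0.1, 0.4]])
bias = jnp.array([0.0, 0.0, 0.25, -0.25])  # nudge load toward expert 2, away from 3
print(biased_top_k(scores, bias, k=2))     # selects experts 2 and 0
```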

src/MaxText/configs/base.yml

Lines changed: 2 additions & 1 deletion
@@ -177,7 +177,7 @@ num_experts_per_tok: 1
 megablox: true
 sparse_matmul: true
 capacity_factor: -1.0 # a factor to decide expert capacity for token dropping, and no dropping by default
-load_balance_loss_weight: 0.01 # weight for the load balance loss
+load_balance_loss_weight: 0.0 # weight for the load balance loss
 use_random_routing: false # whether to use random routing for debug/test purpose
 use_custom_sort_vjp: true # whether to use a custom sort vjp for sparse matmul ops
 use_ring_of_experts: false # whether to use ring of experts for sparse matmul expert parallelism
@@ -224,6 +224,7 @@ shared_experts: 1
 routed_scaling_factor: 1.0 # scaling factor for routing scores
 routed_score_func: "" # scoring function for routing
 routed_bias: False # a flag if a learnable bias is added for routing
+routed_bias_update_rate: 0.0 # the update rate applied to the router bias term
 mlp_bias: False # a flag if a learnable bias is added for MLP matmul
 n_routing_groups: -1 # number of groups for routing, disabled by default
 topk_routing_group: -1 # number of top groups to route inputs. For EP,

src/MaxText/configs/types.py

Lines changed: 4 additions & 1 deletion
@@ -552,7 +552,7 @@ class MoEGeneral(BaseModel):
   num_experts: PositiveInt = Field(1, description="The total number of experts in each MoE layer.")
   num_experts_per_tok: PositiveInt = Field(1, description="The number of experts to route each token to.")
   capacity_factor: float = Field(-1.0, description="Expert capacity factor. If < 0, no token dropping.")
-  load_balance_loss_weight: NonNegativeFloat = Field(0.01, description="Weight for the load balancing auxiliary loss.")
+  load_balance_loss_weight: NonNegativeFloat = Field(0.0, description="Weight for the load balancing auxiliary loss.")
   use_custom_sort_vjp: bool = Field(True, description="Whether to use a custom sort VJP for sparse matmul ops.")
   use_ring_of_experts: bool = Field(
       False,
@@ -639,6 +639,7 @@ class DeepSeekMoE(BaseModel):
   routed_scaling_factor: float = Field(1.0, description="Scaling factor for routing scores.")
   routed_score_func: str = Field("", description="Scoring function for routing (e.g., 'softmax', 'sigmoid').")
   routed_bias: bool = Field(False, description="Whether to add a bias term for routing.")
+  routed_bias_update_rate: float = Field(0.0, description="Update rate applied to the router bias term.")
   mlp_bias: bool = Field(False, description="Whether to add a learnable bias for MLP matmul.")
   n_routing_groups: int = Field(-1, description="Number of groups for routing, disabled by default.")
   topk_routing_group: int = Field(-1, description="Number of top groups to route inputs to.")
@@ -2043,6 +2044,8 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
     )
     if self.decoder_block == DecoderBlockType.GPT_OSS and not self.sparse_matmul and self.capacity_factor != -1:
       raise ValueError("GPT-OSS MoE only supports dropless (capacity_factor=-1) with dense matmul.")
+    if self.routed_bias and self.routed_bias_update_rate > 0.0 and self.decoder_block != DecoderBlockType.DEEPSEEK:
+      raise ValueError("Loss-free load balancing is only supported for the DeepSeek decoder block.")
     if self.use_multimodal:
       valid_mm_models = (
           "gemma3-4b",

src/MaxText/layers/deepseek.py

Lines changed: 10 additions & 4 deletions
@@ -138,8 +138,14 @@ def self_attention_with_norm(
   return hidden_states, intermediate_inputs
 
 
-def post_process(cfg, layer_output, sow, kv_cache=None):
+def post_process(cfg, layer_output, load_balance_loss, moe_bias_updates, sow, kv_cache=None):
   """postprocessing."""
+  if cfg.load_balance_loss_weight > 0.0 and load_balance_loss is not None:
+    sow("intermediates", "moe_lb_loss", load_balance_loss)
+
+  if cfg.routed_bias and cfg.routed_bias_update_rate > 0.0 and moe_bias_updates is not None:
+    sow("intermediates", "moe_bias_updates", moe_bias_updates)
+
   if cfg.record_internal_nn_metrics:
     sow("intermediates", "activation_mean", jnp.mean(layer_output))
     sow("intermediates", "activation_stdev", jnp.std(layer_output))
@@ -233,7 +239,7 @@ def __call__(
         layer_output,
         logical_axis_names,
     )
-    return post_process(cfg, layer_output, self.sow)
+    return post_process(cfg, layer_output, None, None, self.sow)
 
 
 class DeepSeekMoELayer(nn.Module):
@@ -296,7 +302,7 @@ def __call__(
     # NOTE: the naming mismatch here is to ensure reverse compatibility with existing checkpoints.
    # The `name` represents the weight name in JAX/checkpoints and so the class name
    # is just for readability.
-    mlp_lnx = moe.get_routed_and_shared_moe(
+    mlp_lnx, load_balance_loss, moe_bias_updates = moe.get_routed_and_shared_moe(
        name="DeepSeekMoeBlock_0",
        config=cfg,
        mesh=self.mesh,
@@ -314,4 +320,4 @@ def __call__(
        layer_output,
        logical_axis_names,
    )
-    return post_process(cfg, layer_output, self.sow)
+    return post_process(cfg, layer_output, load_balance_loss, moe_bias_updates, self.sow)
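
For readers unfamiliar with `sow`: values written to the "intermediates" collection surface when the module is applied with that collection marked mutable. A self-contained toy using the standard Flax linen API (hypothetical module, not MaxText code):

```python
import flax.linen as nn
import jax
import jax.numpy as jnp

class TinyMoeLike(nn.Module):
  @nn.compact
  def __call__(self, x):
    # Stand-in for post_process: record an auxiliary value for the trainer.
    self.sow("intermediates", "moe_lb_loss", jnp.mean(x**2))
    return x

m = TinyMoeLike()
x = jnp.ones((2, 3))
variables = m.init(jax.random.PRNGKey(0), x)
y, state = m.apply(variables, x, mutable=["intermediates"])
print(state["intermediates"]["moe_lb_loss"])  # tuple of sown values: (Array(1.0),)
```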

src/MaxText/layers/gpt_oss.py

Lines changed: 2 additions & 2 deletions
@@ -182,7 +182,7 @@ def __call__(
     )
 
     load_balance_loss = None
-    mlp_lnx, load_balance_loss = self.GptOssMlp(hidden_states)
+    mlp_lnx, load_balance_loss, _ = self.GptOssMlp(hidden_states)
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
@@ -193,7 +193,7 @@ def __call__(
         ("activation_batch", "activation_norm_length", "activation_embed"),
     )
 
-    if load_balance_loss is not None:
+    if cfg.load_balance_loss_weight > 0.0 and load_balance_loss is not None:
       self.sow("intermediates", "moe_lb_loss", load_balance_loss)
 
     if cfg.record_internal_nn_metrics:

src/MaxText/layers/llama4.py

Lines changed: 5 additions & 1 deletion
@@ -484,8 +484,9 @@ def __call__(
     hidden_states = self.post_self_attention_layer_norm(intermediate_inputs)
     hidden_states = nn.with_logical_constraint(hidden_states, self.activation_axis_names)
 
+    load_balance_loss = None
     if self.is_moe_layer:
-      mlp_lnx = self.moe_block(hidden_states)
+      mlp_lnx, load_balance_loss, _ = self.moe_block(hidden_states)
     else:
       mlp_lnx = self.mlp(hidden_states, deterministic=deterministic)
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, self.activation_axis_names)
@@ -494,6 +495,9 @@ def __call__(
     layer_output = self.dropout(layer_output, deterministic=deterministic)
     layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)
 
+    if self.config.load_balance_loss_weight > 0.0 and load_balance_loss is not None:
+      self.sow("intermediates", "moe_lb_loss", load_balance_loss)
+
     if cfg.record_internal_nn_metrics:
       self.sow("intermediates", "activation_mean", jnp.mean(layer_output))
       self.sow("intermediates", "activation_stdev", jnp.std(layer_output))

src/MaxText/layers/mixtral.py

Lines changed: 2 additions & 2 deletions
@@ -172,14 +172,14 @@ def __call__(
     # NOTE: the naming mismatch here is to ensure reverse compatibility with existing checkpoints.
     # The `name` represents the weight name in JAX/checkpoints and so the class name
     # is just for readability.
-    mlp_lnx, load_balance_loss = self.MoeBlock_0(hidden_states)
+    mlp_lnx, load_balance_loss, _ = self.MoeBlock_0(hidden_states)
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, self.activation_axis_names)
 
     layer_output = mlp_lnx + intermediate_inputs
     layer_output = self.dropout(layer_output, deterministic=deterministic)
     layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)
 
-    if load_balance_loss is not None:
+    if self.config.load_balance_loss_weight > 0.0 and load_balance_loss is not None:
       self.sow("intermediates", "moe_lb_loss", load_balance_loss)
 
     if self.config.record_internal_nn_metrics:

src/MaxText/layers/moe.py

Lines changed: 77 additions & 17 deletions
@@ -27,6 +27,7 @@
 from jax import ad_checkpoint as adc
 from jax.experimental import xla_metadata
 from jax.sharding import NamedSharding, Mesh
+from jax.sharding import PartitionSpec as P
 import jax.numpy as jnp
 from MaxText import common_types as ctypes
 from MaxText import max_logging
@@ -133,6 +134,30 @@ def random_routing(rng_key, gate_logits, num_experts_per_tok):
   return top_k_weights, top_k_indices
 
 
+def calculate_load_balance_updates(top_k_indices, num_experts, rate):
+  """
+  Computes a bias adjustment update based on expert load.
+  Used in DeepSeek V3: https://arxiv.org/html/2412.19437v1.
+  Implementation reference: https://arxiv.org/pdf/2408.15664.
+
+  Args:
+    top_k_indices: Shape (batch, sequence, top_k).
+    num_experts: Total number of experts.
+    rate: The update rate.
+
+  Returns:
+    update: The value to add to the expert bias. Shape (num_experts,).
+  """
+  flat_indices = top_k_indices.ravel()
+  expert_counts = jnp.bincount(flat_indices, length=num_experts)
+
+  total_tokens = flat_indices.size
+  average_load = total_tokens / num_experts
+  direction = jnp.sign(average_load - expert_counts)
+  output = direction * rate
+  return output
+
+
 class GateLogit(nnx.Module):
   """A layer used to compute gate logits, allowing to return the pre bias values for DeepSeek routing."""
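
The helper is easy to sanity-check in isolation. A runnable toy that mirrors the function above (the input values are illustrative):

```python
import jax.numpy as jnp

def calculate_load_balance_updates(top_k_indices, num_experts, rate):
  flat_indices = top_k_indices.ravel()
  expert_counts = jnp.bincount(flat_indices, length=num_experts)
  average_load = flat_indices.size / num_experts
  return jnp.sign(average_load - expert_counts) * rate

# 3 tokens, top-2 routing over 4 experts; experts 0/1 are hot, 3 is idle.
idx = jnp.array([[[0, 1], [0, 1], [0, 2]]])
print(calculate_load_balance_updates(idx, num_experts=4, rate=0.001))
# [-0.001 -0.001  0.001  0.001]: overloaded experts are biased down,
# underloaded ones up, by exactly `routed_bias_update_rate` per step.
```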

@@ -436,6 +461,10 @@ def get_tensor_transpose_parallelism_size(self):
   def get_context_autoregressive_parallelism_size(self):
     return self.mesh.shape.get("context_autoregressive", 1)
 
+  def should_update_load_balance(self):
+    """Determines if loss-free load balancing updates should be applied."""
+    return self.config.routed_bias and self.config.routed_bias_update_rate > 0.0
+
   def get_topk(self, gate_logits, pre_bias_logits, rngs=None):
     """get topk."""
     # shape of top_k_weights & top_k_indices:
@@ -560,6 +589,18 @@ def permute(self, inputs, gate_logits, pre_bias_logits, use_custom_sort_vjp=True
     inputs_2d = jnp.reshape(inputs, (bsz_times_seq_len, inputs_shape[2]))
     weights, selected_experts = self.get_topk(gate_logits, pre_bias_logits, rngs)
 
+    lb_loss = None
+    if self.config.load_balance_loss_weight > 0.0:
+      softmax_probs = jax.nn.softmax(gate_logits.astype(jnp.float32), axis=-1).astype(self.dtype)
+      lb_loss = self.load_balance_loss(selected_experts, softmax_probs)
+
+    if self.should_update_load_balance():
+      bias_updates = calculate_load_balance_updates(
+          selected_experts, self.config.num_experts, self.config.routed_bias_update_rate
+      )
+    else:
+      bias_updates = None
+
     if self.config.decoder_block == ctypes.DecoderBlockType.LLAMA4:
       # weights will be of shape (batch_size, seq_len, num_experts_per_tok)
       router_scores = jax.nn.sigmoid(weights.astype(jnp.float32))  # weights are top_k_weights here
@@ -589,6 +630,8 @@ def permute(self, inputs, gate_logits, pre_bias_logits, use_custom_sort_vjp=True
         weights,
         group_size,
         sorted_experts,
+        lb_loss,
+        bias_updates,
     )
 
   def unpermute(
@@ -1010,9 +1053,13 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_a
             w0_bias_pspec,
             w1_bias_pspec,
             wo_bias_pspec,
-            None,
+            P(),  # Replicate the input key
+        ),
+        out_specs=(
+            self._logical_to_mesh_axes((batch_logical_axis, "activation_norm_length", "activation_embed")),
+            P(),  # Handle None or replicate the output
+            P(),  # Handle None or replicate the output
         ),
-        out_specs=(self._logical_to_mesh_axes((batch_logical_axis, "activation_norm_length", "activation_embed"))),
         check_vma=False,
     )
     def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, rngs):
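
Context for the `out_specs` change: `shard_map` needs one `PartitionSpec` per output, so once the mapped function returns `lb_loss` and `bias_updates` alongside the activations, the single output spec must become a tuple, with `P()` marking the extra outputs as replicated. A minimal standalone illustration using the recent `jax.shard_map` API (not MaxText code; names are illustrative):

```python
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(jax.devices(), ("data",))

def f(x):
  y = x * 2
  # psum makes the scalar identical on every shard, so the replicated
  # spec P() is a valid out_spec for it.
  aux = jax.lax.psum(jnp.sum(x), "data")
  return y, aux

g = jax.shard_map(f, mesh=mesh, in_specs=(P("data"),), out_specs=(P("data"), P()))
y, aux = g(jnp.arange(8.0))
print(y.shape, aux)  # (8,) 28.0
```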
@@ -1035,7 +1082,7 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
 
       # "Route" tokens within each shard.
       num_experts_per_shard = self.config.num_experts // num_expert_parallelism
-      x, sorted_selected_experts, weights, group_sizes, selected_experts = self.permute(
+      x, sorted_selected_experts, weights, group_sizes, selected_experts, lb_loss, bias_updates = self.permute(
          x,
          logits,
          pre_bias_logits,
@@ -1049,7 +1096,7 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
       mask = jnp.arange(x.shape[0]) < jnp.sum(group_sizes)
       x = jnp.where(mask[:, None], x, 0)
     else:
-      x, sorted_selected_experts, weights, group_sizes, selected_experts = self.permute(
+      x, sorted_selected_experts, weights, group_sizes, selected_experts, lb_loss, bias_updates = self.permute(
          x, logits, pre_bias_logits, self.config.use_custom_sort_vjp, rngs
      )
 
@@ -1264,7 +1311,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
         use_custom_sort_vjp=self.config.use_custom_sort_vjp,
     )
 
-    return output, None
+    return output, lb_loss, bias_updates
 
   if self.config.moe_fsdp_use_two_stage_all_gather:
     # Unshard on fsdp axis
@@ -1563,7 +1610,7 @@ def dense_matmul(
       w0_bias,
       w1_bias,
       wo_bias,
-  ) -> tuple[jax.Array, Optional[jax.Array]]:
+  ) -> tuple[jax.Array, Optional[jax.Array], Optional[jax.Array]]:
     """Dense matrix multiplication."""
     # gate_logits: batch, length, expert
     gate_logits = self._maybe_shard_with_logical(gate_logits, ("activation_batch", "activation_norm_length", None))
@@ -1581,11 +1628,23 @@ def dense_matmul(
     weights = self.reshape_and_update_weights(top_k_weights, top_k_indices)
     matmul_precision = jax.lax.Precision(self.config.matmul_precision)
 
+    # Calculate load balance loss
     if self.config.model_call_mode != "inference":
       softmax_probs = jax.nn.softmax(gate_logits.astype(jnp.float32), axis=-1).astype(self.dtype)
-      loss = self.load_balance_loss(top_k_indices, softmax_probs)
+      lb_loss = (
+          self.load_balance_loss(top_k_indices, softmax_probs) if self.config.load_balance_loss_weight > 0.0 else None
+      )
     else:
-      loss = None
+      lb_loss = None
+
+    # Calculate routed bias updates (loss-free)
+    if self.should_update_load_balance():
+      bias_updates = calculate_load_balance_updates(
+          top_k_indices, self.config.num_experts, self.config.routed_bias_update_rate
+      )
+    else:
+      bias_updates = None
+
     batch_size = inputs.shape[0]
     seq_len = inputs.shape[1]
 
@@ -1783,7 +1842,7 @@ def dense_matmul(
               output.shape[3],
           ),
       )
-      return output, loss
+      return output, lb_loss, bias_updates
     else:
       inputs = self._maybe_shard_with_logical(inputs, ("activation_batch", "activation_norm_length", "activation_embed"))
       with jax.named_scope("wi_0"):
@@ -1831,7 +1890,7 @@ def dense_matmul(
           weights,
           precision=matmul_precision,
       ).astype(self.dtype)
-      return output, None
+      return output, lb_loss, bias_updates
 
   def retrieve_quantized_weight(
       self,
@@ -1864,7 +1923,7 @@ def retrieve_quantized_weight(
 
   def __call__(
       self, inputs: jax.Array, out_sharding: NamedSharding | None = None
-  ) -> tuple[jax.Array, Optional[jax.Array]]:
+  ) -> tuple[jax.Array, Optional[jax.Array], Optional[jax.Array]]:
     cfg = self.config
     inputs = inputs.astype(cfg.dtype)
     gate_logits, pre_bias_logits = self.gate(inputs)
@@ -1893,13 +1952,14 @@ def __call__(
           w1_bias,
           wo_bias,
       )
-      return self.sparse_matmul(
+      output, lb_loss, bias_updates = self.sparse_matmul(
          inputs, gate_logits, pre_bias_logits, w0_kernel, w1_kernel, wo_kernel, w0_bias, w1_bias, wo_bias
      )
     else:
-      return self.dense_matmul(
+      output, lb_loss, bias_updates = self.dense_matmul(
          inputs, gate_logits, pre_bias_logits, w0_kernel, w1_kernel, wo_kernel, w0_bias, w1_bias, wo_bias
      )
+    return output, lb_loss, bias_updates
 
 
 class RoutedAndSharedMoE(nnx.Module):
@@ -1916,7 +1976,7 @@ def __init__(
       dtype: ctypes.DType = jnp.float32,
       quant: Optional[quantizations.AqtQuantization] = None,
   ):
-    """nitializes the RoutedAndSharedMoE module.
+    """Initializes the RoutedAndSharedMoE module.
 
     Attributes:
       config: The main config setting.
@@ -1973,10 +2033,10 @@ def __call__(
       inputs: jax.Array,
       intermediate_sharding: NamedSharding | None = None,
       out_sharding: NamedSharding | None = None,
-  ) -> jax.Array:
-    routed_experts, _ = self.routed_moe(inputs, out_sharding=out_sharding)
+  ) -> tuple[jax.Array, Optional[jax.Array], Optional[jax.Array]]:
+    routed_experts, load_balance_loss, moe_bias_updates = self.routed_moe(inputs, out_sharding=out_sharding)
     shared_experts = self.shared_experts(inputs, intermediate_sharding=intermediate_sharding, out_sharding=out_sharding)
-    return routed_experts + shared_experts
+    return routed_experts + shared_experts, load_balance_loss, moe_bias_updates
 
 
 def get_gate_logit(
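
Taken together, these changes plumb the loss-free signal from the router out through the layer stack to whatever consumes the sown `moe_bias_updates`. A self-contained simulation of the update rule in action (illustrative only, not this commit's trainer code):

```python
import jax
import jax.numpy as jnp

num_experts, k, rate = 4, 2, 0.01
scores = jax.random.normal(jax.random.PRNGKey(0), (256, num_experts))  # fixed token affinities
bias = jnp.zeros(num_experts)
for _ in range(200):
  # Selection uses biased scores, as in DeepSeek-V3 routing.
  _, top_k_idx = jax.lax.top_k(scores + bias, k)
  counts = jnp.bincount(top_k_idx.ravel(), length=num_experts)
  # Same rule as calculate_load_balance_updates above.
  bias = bias + jnp.sign(top_k_idx.size / num_experts - counts) * rate
print(counts)  # per-expert load approaches the uniform average (128 each)
```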
