
Commit f27ac67

Merge pull request #2926 from AI-Hypercomputer:chengnuojin-debug-sharding
PiperOrigin-RevId: 855366909
2 parents 6801c95 + 8cca866 commit f27ac67

16 files changed

Lines changed: 65 additions & 33 deletions

src/MaxText/data_loader.py

Lines changed: 3 additions & 4 deletions
@@ -20,7 +20,7 @@
 from jax.experimental import checkify

 from MaxText import exceptions
-from MaxText.sharding import get_input_data_sharding, maybe_shard_with_name
+from MaxText.sharding import get_input_data_sharding
 from MaxText.utils.goodput_utils import (
     GoodputEvent,
     maybe_record_goodput,
@@ -70,10 +70,9 @@ def load_next_batch_pre_sharding(self):

   def load_next_batch(self, *args, **kwargs):
     """Loads the next batch with sharding hint"""
-    return maybe_shard_with_name(
+    return jax.device_put(
         self.load_next_batch_pre_sharding(),
         self.input_data_shardings,
-        self.config.shard_mode,
     )

   def check_example_batch(self):
@@ -154,7 +153,7 @@ def _slice(data):
     self.buffer_start = slice_end
     output = jax.tree.map(_slice, self.batch_buffer)
     self.rampup_active = rampup_manager.update()
-    return maybe_shard_with_name(output, self.input_data_shardings, self.config.shard_mode)
+    return jax.device_put(output, self.input_data_shardings)


 def create_dataloader(config, mesh, data_iterator, goodput_recorder, rampup_manager):
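
The data_loader.py hunks above swap MaxText's maybe_shard_with_name wrapper for plain jax.device_put, which accepts a pytree of shardings that mirrors the batch pytree. A minimal, standalone sketch of that JAX pattern (the mesh shape, batch contents, and axis names below are invented for illustration; real MaxText shardings come from get_input_data_sharding):

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Toy mesh and host-side batch.
mesh = Mesh(np.array(jax.devices()).reshape(-1, 1), ("data", "model"))
batch = {
    "inputs": np.zeros((8, 128), dtype=np.int32),
    "targets": np.zeros((8, 128), dtype=np.int32),
}
# One NamedSharding per leaf: shard the batch dimension over the "data" axis.
shardings = jax.tree.map(lambda _: NamedSharding(mesh, P("data", None)), batch)
sharded_batch = jax.device_put(batch, shardings)  # transfer + lay out in one call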

src/MaxText/gradient_accumulation.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def gradient_accumulation_loss_and_grad(

   def _maybe_shard_with_name(inputs, sharding_names):
     """Wrapper of maybe_shard_with_name with fixed shard_mode"""
-    return maybe_shard_with_name(inputs, sharding_names, config.shard_mode)
+    return maybe_shard_with_name(inputs, sharding_names, config.shard_mode, debug_sharding=config.debug_sharding)

   # For more efficient DP/ZeRO-1 + GA
   if config.shard_mode == ShardMode.EXPLICIT and config.ici_data_parallelism > 1:
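
Across these hunks the new config.debug_sharding flag is simply threaded through to the sharding helpers. As a rough idea of what such a flag can enable (a hypothetical sketch, not MaxText's maybe_shard_with_name), one can report the sharding JAX actually chooses for a constrained intermediate via jax.debug.inspect_array_sharding:

import jax

def constrain_with_debug(x, sharding, debug_sharding=False):
  # Apply the sharding constraint as usual.
  x = jax.lax.with_sharding_constraint(x, sharding)
  if debug_sharding:
    # Inside a jitted function, print the sharding the compiler settles on.
    jax.debug.inspect_array_sharding(x, callback=print)
  return x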

src/MaxText/layers/attention_op.py

Lines changed: 10 additions & 11 deletions
@@ -1198,17 +1198,10 @@ def wrap_splash_kernel(single_head_mask, shard_head_size=1):
         segment_axis_names_splash_kernel = self._logical_to_mesh_axes((Q_LENGTH,))
       else:
         segment_axis_names_splash_kernel = self._logical_to_mesh_axes((Q_LENGTH_NO_EXP,))
-    elif (
-        self.config.use_jax_splash
-        and self.config.expert_shard_attention_option == EP_AS_FSDP
-    ):
+    elif self.config.use_jax_splash and self.config.expert_shard_attention_option == EP_AS_FSDP:
       if self.config.use_max_logit_estimate > 0:
-        sa_config = dataclasses.replace(
-            sa_config, max_logit_const=self.config.use_max_logit_estimate
-        )
-      segment_axis_names_splash_kernel = nn.logical_to_mesh_axes((
-          Q_LENGTH_NO_EXP,
-      ))
+        sa_config = dataclasses.replace(sa_config, max_logit_const=self.config.use_max_logit_estimate)
+      segment_axis_names_splash_kernel = nn.logical_to_mesh_axes((Q_LENGTH_NO_EXP,))
     else:
       # Create multi-head mask
       multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
@@ -1327,7 +1320,13 @@ def _maybe_shard_with_pspec(inputs, pspec: jax.sharding.PartitionSpec | None):
       if pspec is None:
         return None
       sharding = NamedSharding(self.mesh, pspec)
-      return maybe_shard_with_name(inputs, sharding, shard_mode=self.config.shard_mode)
+      return maybe_shard_with_name(
+          inputs,
+          sharding,
+          shard_mode=self.config.shard_mode,
+          debug_sharding=self.config.debug_sharding,
+          extra_stack_level=1,
+      )

     query = _maybe_shard_with_pspec(query, axis_names_q)
     key = _maybe_shard_with_pspec(key, axis_names_kv)
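
The second hunk builds a NamedSharding from a logical PartitionSpec and hands it to the sharding helper. A self-contained sketch of that core JAX pattern, with made-up mesh axes and array shapes, and plain jax.lax.with_sharding_constraint standing in for MaxText's helper:

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()).reshape(1, -1), ("data", "model"))

@jax.jit
def scale_query(q):
  # Constrain q to a sharding built from a PartitionSpec, as the hunk does.
  q = jax.lax.with_sharding_constraint(q, NamedSharding(mesh, P("data", None, "model")))
  return q * 2.0

out = scale_query(np.ones((4, 16, 8), dtype=np.float32))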

src/MaxText/layers/attentions.py

Lines changed: 1 addition & 0 deletions
@@ -525,6 +525,7 @@ def __init__(
         maybe_shard_with_logical,
         mesh=mesh,
         shard_mode=config.shard_mode,
+        debug_sharding=config.debug_sharding,
     )

   def _init_projections(self, inputs_q_shape: Tuple, inputs_kv_shape: Tuple) -> None:
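
This hunk, like the similar ones below in decoders.py, linears.py, and llama2.py, threads debug_sharding into what appears to be a partial application of maybe_shard_with_logical created once in __init__ so call sites only pass the array and its logical axes. A generic sketch of that binding pattern with functools.partial, using stand-in names and a no-op helper (the real MaxText signature may differ):

import functools

# Stand-in for MaxText's maybe_shard_with_logical.
def shard_with_logical(x, logical_axes, *, mesh=None, shard_mode=None, debug_sharding=False):
  # A real helper would apply a sharding constraint derived from logical_axes.
  return x

# Bind the per-model settings once; call sites then pass only (x, logical_axes).
maybe_shard = functools.partial(
    shard_with_logical,
    mesh="<mesh>",
    shard_mode="auto",
    debug_sharding=True,
)
y = maybe_shard([[1.0, 2.0]], ("activation_batch", "activation_embed"))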

src/MaxText/layers/decoders.py

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ def __call__(
         sharding.maybe_shard_with_logical,
         mesh=mesh,
         shard_mode=cfg.shard_mode,
+        debug_sharding=cfg.debug_sharding,
     )

     if self.model_mode == MODEL_MODE_PREFILL:

src/MaxText/layers/deepseek.py

Lines changed: 5 additions & 1 deletion
@@ -129,7 +129,11 @@ def mlp_op(self, x, deterministic, *args, **kwargs):

   def with_logical_constraint(self, x):
     return maybe_shard_with_logical(
-        x, logical_axes=self.logical_axis_names, mesh=self.mesh, shard_mode=self.config.shard_mode
+        x,
+        logical_axes=self.logical_axis_names,
+        mesh=self.mesh,
+        shard_mode=self.config.shard_mode,
+        debug_sharding=self.config.debug_sharding,
     )

   def dropout_op(self, x, deterministic):

src/MaxText/layers/deepseek_batchsplit.py

Lines changed: 1 addition & 1 deletion
@@ -179,7 +179,7 @@ def mlp_logical_axis_names(self):
   def with_logical_constraint(self, x):
     return maybe_shard_with_logical(
         x, logical_axes=self.logical_axis_names,
-        mesh=self.mesh, shard_mode=self.config.shard_mode
+        mesh=self.mesh, shard_mode=self.config.shard_mode, debug_sharding=self.config.debug_sharding,
     )

   def pre_attention_norm_op(self, x):

src/MaxText/layers/linears.py

Lines changed: 1 addition & 0 deletions
@@ -459,6 +459,7 @@ def __init__(
         maybe_shard_with_logical,
         mesh=mesh,
         shard_mode=config.shard_mode,
+        debug_sharding=config.debug_sharding,
     )

   def get_norm_layer(self, num_features: int):

src/MaxText/layers/llama2.py

Lines changed: 1 addition & 0 deletions
@@ -134,6 +134,7 @@ def __init__(
         maybe_shard_with_logical,
         mesh=self.mesh,
         shard_mode=config.shard_mode,
+        debug_sharding=config.debug_sharding,
     )

   def __call__(

src/MaxText/layers/moe.py

Lines changed: 7 additions & 1 deletion
@@ -444,7 +444,13 @@ def __init__(
     self.wo_bias = None

   def _maybe_shard_with_logical(self, inputs, logical_name):
-    return maybe_shard_with_logical(inputs, logical_name, mesh=self.mesh, shard_mode=self.config.shard_mode)
+    return maybe_shard_with_logical(
+        inputs,
+        logical_name,
+        mesh=self.mesh,
+        shard_mode=self.config.shard_mode,
+        debug_sharding=self.config.debug_sharding,
+    )

   def _logical_to_mesh_axes(self, logical_name):
     return logical_to_mesh_axes(logical_name, mesh=self.mesh, rules=self.config.logical_axis_rules)
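
The _logical_to_mesh_axes helper shown as context maps logical axis names to mesh axes via the configured rules. A tiny sketch of the underlying idea using Flax's nn.logical_to_mesh_axes (the rules and names below are invented, and MaxText's own logical_to_mesh_axes wrapper may behave differently):

import flax.linen as nn

rules = (("activation_batch", "data"), ("activation_embed", "model"))
pspec = nn.logical_to_mesh_axes(("activation_batch", "activation_embed"), rules)
print(pspec)  # PartitionSpec('data', 'model')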
