
Commit e7d7ba1

debug for NaNs

1 parent 7572e09 commit e7d7ba1
3 files changed: 30 additions & 85 deletions

src/maxdiffusion/models/attention_flax.py
Lines changed: 0 additions & 48 deletions

```diff
@@ -51,21 +51,6 @@
 def _maybe_aqt_einsum(quant: Quant):
   return jnp.einsum if quant is None else quant.einsum()
 
-def check_nan_attn(tensor: jax.Array, name: str, tag: str = ""):
-  if tensor is None:
-    # This print is fine, it's not in JIT on None
-    print(f"[DEBUG ATTN PY {jax.process_index()}] {tag} {name}: Tensor is None")
-    return
-
-  # These are JAX boolean arrays (tracers when JITted)
-  has_nans = jnp.isnan(tensor).any()
-  has_infs = jnp.isinf(tensor).any()
-
-  # Pass the tracers as keyword arguments to jax.debug.print
-  jax.debug.print(f"[DEBUG ATTN JIT {jax.process_index()}] {tag} {name}: "
-                  "Shape: {shape}, Has NaNs: {has_nans_val}, Has Infs: {has_infs_val}",
-                  shape=tensor.shape, has_nans_val=has_nans, has_infs_val=has_infs)
-
 
 
 def _check_attention_inputs(query: Array, key: Array, value: Array) -> None:
```
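The removed helper leaned on a detail worth keeping in mind: inside a jitted function, `jnp.isnan(tensor).any()` is a tracer, so it cannot drive a Python `if`, but it can be handed to `jax.debug.print`, which fills it in at run time. A minimal standalone sketch of that pattern, outside this repo:

```python
import jax
import jax.numpy as jnp

@jax.jit
def f(x):
    # Traced boolean: unusable in a Python `if`, fine as a debug-print argument.
    has_nans = jnp.isnan(x).any()
    jax.debug.print("has NaNs: {h}", h=has_nans)
    return x * 2.0

f(jnp.array([1.0, jnp.nan]))  # prints "has NaNs: True" when the kernel runs
```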
```diff
@@ -961,13 +946,7 @@ def __call__(
       rotary_emb: Optional[jax.Array] = None,
       deterministic: bool = True,
       rngs: nnx.Rngs = None,
-      tag: str = "attn"
   ) -> jax.Array:
-    check_nan_attn(hidden_states, "Input hidden_states", tag)
-    if encoder_hidden_states is not None:
-      check_nan_attn(encoder_hidden_states, "Input encoder_hidden_states", tag)
-    if rotary_emb is not None:
-      check_nan_attn(rotary_emb, "Input rotary_emb", tag)
 
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
     encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", "tensor"))
@@ -982,79 +961,58 @@ def __call__(
     with self.conditional_named_scope("attn_qkv_proj"):
       with self.conditional_named_scope("proj_query"):
         query_proj = self.query(hidden_states)
-        check_nan_attn(query_proj, "query_proj", tag)
       with self.conditional_named_scope("proj_key"):
         key_proj = self.key(encoder_hidden_states)
-        check_nan_attn(key_proj, "key_proj", tag)
       with self.conditional_named_scope("proj_value"):
         value_proj = self.value(encoder_hidden_states)
-        check_nan_attn(value_proj, "value_proj", tag)
 
       if self.qk_norm:
         with self.conditional_named_scope("attn_q_norm"):
           query_proj = self.norm_q(query_proj)
-          check_nan_attn(query_proj, "query_proj normed", tag)
         with self.conditional_named_scope("attn_k_norm"):
           key_proj = self.norm_k(key_proj)
-          check_nan_attn(key_proj, "key_proj normed", tag)
 
       if rotary_emb is not None:  # Only for SELF-ATTENTION
         with self.conditional_named_scope("attn_rope"):
           # Unflatten is done HERE for RoPE
           query_proj = _unflatten_heads(query_proj, self.heads)
-          check_nan_attn(query_proj, "query_proj unflattened", tag)
           key_proj = _unflatten_heads(key_proj, self.heads)
-          check_nan_attn(key_proj, "key_proj unflattened", tag)
           value_proj = _unflatten_heads(value_proj, self.heads)
-          check_nan_attn(value_proj, "value_proj unflattened", tag)
           # output of _unflatten_heads Batch, heads, seq_len, head_dim
           query_proj, key_proj = self._apply_rope(query_proj, key_proj, rotary_emb)
-          check_nan_attn(query_proj, "query_proj after RoPE", tag)
-          check_nan_attn(key_proj, "key_proj after RoPE", tag)
           query_proj = checkpoint_name(query_proj, "query_proj")
           key_proj = checkpoint_name(key_proj, "key_proj")
           value_proj = checkpoint_name(value_proj, "value_proj")
         with self.conditional_named_scope("attn_compute"):
           attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
-          check_nan_attn(attn_output, "attn_output from attention_op", tag)
 
       else:
         # NEW PATH for I2V CROSS-ATTENTION
         with self.conditional_named_scope("proj_query"):
           query_proj = self.query(hidden_states)
-          check_nan_attn(query_proj, "query_proj I2V", tag)
         if self.qk_norm:
           with self.conditional_named_scope("attn_q_norm"):
             query_proj = self.norm_q(query_proj)
-            check_nan_attn(query_proj, "query_proj normed I2V", tag)
 
         encoder_hidden_states_img = encoder_hidden_states[:, :self.image_seq_len, :]
         encoder_hidden_states_text = encoder_hidden_states[:, self.image_seq_len:, :]
-        check_nan_attn(encoder_hidden_states_img, "EHS_img", tag)
-        check_nan_attn(encoder_hidden_states_text, "EHS_text", tag)
 
         # Text K/V
         with self.conditional_named_scope("proj_key"):
           key_proj_text = self.key(encoder_hidden_states_text)
-          check_nan_attn(key_proj_text, "key_proj_text", tag)
         if self.qk_norm:
           with self.conditional_named_scope("attn_k_norm"):
             key_proj_text = self.norm_k(key_proj_text)
-            check_nan_attn(key_proj_text, "key_proj_text normed", tag)
         with self.conditional_named_scope("proj_value"):
           value_proj_text = self.value(encoder_hidden_states_text)
-          check_nan_attn(value_proj_text, "value_proj_text", tag)
 
         # Image K/V
         with self.conditional_named_scope("add_proj_k"):
           key_proj_img = self.add_k_proj(encoder_hidden_states_img)
-          check_nan_attn(key_proj_img, "key_proj_img", tag)
         with self.conditional_named_scope("norm_add_k"):
           key_proj_img = self.norm_added_k(key_proj_img)
-          check_nan_attn(key_proj_img, "key_proj_img normed", tag)
         with self.conditional_named_scope("add_proj_v"):
           value_proj_img = self.add_v_proj(encoder_hidden_states_img)
-          check_nan_attn(value_proj_img, "value_proj_img", tag)
 
         # Checkpointing
         query_proj = checkpoint_name(query_proj, "query_proj")
@@ -1066,25 +1024,19 @@ def __call__(
         # Attention - tensors are (B, S, D)
         with self.conditional_named_scope("cross_attn_text_apply"):
           attn_output_text = self.attention_op.apply_attention(query_proj, key_proj_text, value_proj_text)
-          check_nan_attn(attn_output_text, "attn_output_text_h", tag)
         with self.conditional_named_scope("norm_added_q"):
           query_proj_img = self.norm_added_q(query_proj)
-          check_nan_attn(query_proj_img, "query_proj_img normed", tag)
         with self.conditional_named_scope("cross_attn_img_apply"):
           attn_output_img = self.attention_op.apply_attention(query_proj_img, key_proj_img, value_proj_img)
-          check_nan_attn(attn_output_img, "attn_output_img", tag)
 
         attn_output = attn_output_text + attn_output_img
-        check_nan_attn(attn_output, "attn_output final I2V", tag)
 
     attn_output = attn_output.astype(dtype=dtype)
     attn_output = checkpoint_name(attn_output, "attn_output")
 
     with self.conditional_named_scope("attn_out_proj"):
       hidden_states = self.proj_attn(attn_output)
-      check_nan_attn(hidden_states, "hidden_states after proj_attn", tag)
     hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
-    check_nan_attn(hidden_states, "hidden_states after dropout", tag)
     return hidden_states
```
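If these checks are ever needed again, one cheap way to keep them in the tree without paying for them is a module-level switch, so no debug callback is staged into the jitted graph when it is off. A hypothetical sketch (`DEBUG_NANS` is not a flag in this repo):

```python
import jax
import jax.numpy as jnp

DEBUG_NANS = False  # hypothetical switch, flipped on only while hunting NaNs

def maybe_check(tensor: jax.Array, name: str) -> None:
    # Evaluated at trace time: with the flag off, nothing is recorded
    # in the compiled computation at all.
    if not DEBUG_NANS:
        return
    jax.debug.print(name + ": NaNs={n}, Infs={i}",
                    n=jnp.isnan(tensor).any(),
                    i=jnp.isinf(tensor).any())
```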

src/maxdiffusion/models/wan/transformers/transformer_wan.py
Lines changed: 1 addition & 35 deletions

```diff
@@ -40,21 +40,6 @@
 
 BlockSizes = common_types.BlockSizes
 
-def check_nan(tensor: jax.Array, name: str):
-  if tensor is None:
-    # jax.debug.print works fine with regular python strings and values
-    print(f"[DEBUG NaN Check] {name} on process {jax.process_index()}: Tensor is None")
-    return
-
-  has_nans = jnp.isnan(tensor).any()
-  has_infs = jnp.isinf(tensor).any()
-
-  # Pass the JAX arrays (has_nans, has_infs) as kwargs
-  # Use placeholders {} in the f-string for these runtime values
-  jax.debug.print(f"[DEBUG NaN Check] {name} on process {jax.process_index()}: "
-                  "Has NaNs: {has_nans_val}, Has Infs: {has_infs_val}",
-                  has_nans_val=has_nans, has_infs_val=has_infs)
-
 
 def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int, use_real: bool):
   h_dim = w_dim = 2 * (attention_head_dim // 6)
@@ -388,15 +373,8 @@ def __call__(
       deterministic: bool = True,
       rngs: nnx.Rngs = None,
   ):
-    check_nan(hidden_states, "TransformerBlock Input hidden_states")
-    check_nan(encoder_hidden_states, "TransformerBlock Input encoder_hidden_states")
-    check_nan(temb, "TransformerBlock Input temb")
-    if rotary_emb is not None:
-      check_nan(rotary_emb, "TransformerBlock Input rotary_emb")
     with self.conditional_named_scope("transformer_block"):
       with self.conditional_named_scope("adaln"):
-        scale_shift_all = (self.adaln_scale_shift_table.value + temb.astype(jnp.float32))
-        check_nan(scale_shift_all, "AdaLN scale_shift_all")
         shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
           (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
         )
@@ -408,56 +386,44 @@ def __call__(
       with self.conditional_named_scope("self_attn"):
         with self.conditional_named_scope("self_attn_norm"):
           norm_hidden_states = self.norm1(hidden_states.astype(jnp.float32))
-          check_nan(norm_hidden_states, "Self-Attn norm1 output")
           norm_hidden_states = (norm_hidden_states * (1 + scale_msa) + shift_msa).astype(
             hidden_states.dtype
           )
-          check_nan(norm_hidden_states, "Self-Attn norm_hidden_states after AdaLN")
         with self.conditional_named_scope("self_attn_attn"):
           attn_output = self.attn1(
             hidden_states=norm_hidden_states,
             encoder_hidden_states=norm_hidden_states,
             rotary_emb=rotary_emb,
             deterministic=deterministic,
             rngs=rngs,
-            tag="SELF",
           )
-          check_nan(attn_output, "Self-Attn attn_output (attn1)")
         with self.conditional_named_scope("self_attn_residual"):
           hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
-          check_nan(hidden_states, "Self-Attn hidden_states after residual")
 
       # 2. Cross-attention
       residual = hidden_states
       norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32)).astype(hidden_states.dtype)
-      check_nan(norm_hidden_states, "Cross-Attn norm_hidden_states (norm2)")
       attn_output = self.attn2(
-        hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs, tag="CROSS"
+        hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs
       )
-      check_nan(attn_output, "Cross-Attn attn_output (attn2)")
       hidden_states = residual + attn_output
-      check_nan(hidden_states, "Cross-Attn hidden_states after residual")
 
       # 3. Feed-forward
       residual = hidden_states
       with self.conditional_named_scope("mlp"):
         with self.conditional_named_scope("mlp_norm"):
           norm_hidden_states = self.norm3(hidden_states.astype(jnp.float32))
-          check_nan(norm_hidden_states, "MLP norm3 output")
           norm_hidden_states = (norm_hidden_states * (1 + c_scale_msa) + c_shift_msa).astype(
             hidden_states.dtype
           )
-          check_nan(norm_hidden_states, "MLP norm_hidden_states after AdaLN")
 
         with self.conditional_named_scope("mlp_ffn"):
           ff_output = self.ffn(norm_hidden_states, deterministic=deterministic, rngs=rngs)
-          check_nan(ff_output, "MLP ff_output")
 
         with self.conditional_named_scope("mlp_residual"):
           hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(
             hidden_states.dtype
          )
-          check_nan(hidden_states, "MLP hidden_states after residual (Block Output)")
       return hidden_states
```
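With the per-layer prints gone from the transformer block, JAX's built-in NaN checker is the usual lighter-weight substitute. This is standard JAX configuration, not something this commit touches:

```python
import jax
import jax.numpy as jnp

# When enabled, JAX checks outputs for NaNs; on a hit inside jit it re-runs
# the function un-jitted and raises FloatingPointError at the offending op.
jax.config.update("jax_debug_nans", True)

def f(x):
    return jnp.log(x)  # NaN for negative input

jax.jit(f)(jnp.array(-1.0))  # raises FloatingPointError
```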

src/maxdiffusion/schedulers/scheduling_unipc_multistep_flax.py
Lines changed: 29 additions & 2 deletions

```diff
@@ -30,6 +30,16 @@
   add_noise_common,
 )
 
+def check_nan_jit(tensor: jax.Array, name: str, step: jax.Array):
+  if tensor is None:
+    return
+
+  has_nans = jnp.isnan(tensor).any()
+  has_infs = jnp.isinf(tensor).any()
+  jax.debug.print(f"[DEBUG SCHEDULER {jax.process_index()}] Step: {{step}} - {name}: "
+                  "Shape: {shape}, Has NaNs: {has_nans_val}, Has Infs: {has_infs_val}",
+                  step=step, shape=tensor.shape, has_nans_val=has_nans, has_infs_val=has_infs)
+
 
 @flax.struct.dataclass
 class UniPCMultistepSchedulerState:
```
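One subtlety in `check_nan_jit`: the format string is an f-string, so `{jax.process_index()}` and `{name}` are baked in at trace time, while the doubled braces in `{{step}}` survive as a literal `{step}` placeholder for `jax.debug.print` to fill at run time. A standalone illustration of the escaping:

```python
import jax
import jax.numpy as jnp

name = "x_t"                                # interpolated at trace time
fmt = f"[{name}] step {{step}}: max={{m}}"  # -> "[x_t] step {step}: max={m}"

@jax.jit
def report(step, x):
    jax.debug.print(fmt, step=step, m=x.max())  # placeholders filled at run time
    return x

report(jnp.int32(3), jnp.arange(4.0))  # prints "[x_t] step 3: max=3.0"
```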
```diff
@@ -285,14 +295,18 @@ def convert_model_output(
       state: UniPCMultistepSchedulerState,
       model_output: jnp.ndarray,
       sample: jnp.ndarray,
+      step: jax.Array,
   ) -> jnp.ndarray:
     """
     Converts the model output based on the prediction type and current state.
     """
     sigma = state.sigmas[state.step_index]  # Current sigma
+    check_nan_jit(sigma, "convert_model_output sigma", step)
 
     # Ensure sigma is a JAX array for _sigma_to_alpha_sigma_t
     alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+    check_nan_jit(alpha_t, "convert_model_output alpha_t", step)
+    check_nan_jit(sigma_t, "convert_model_output sigma_t", step)
 
     if self.config.predict_x0:
       if self.config.prediction_type == "epsilon":
@@ -310,6 +324,7 @@ def convert_model_output(
             f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, "
             "`v_prediction`, or `flow_prediction` for the UniPCMultistepScheduler."
         )
+      check_nan_jit(x0_pred, "convert_model_output x0_pred", step)
 
       if self.config.thresholding:
         raise NotImplementedError("Dynamic thresholding isn't implemented.")
@@ -336,6 +351,7 @@ def multistep_uni_p_bh_update(
       model_output: jnp.ndarray,
       sample: jnp.ndarray,
       order: int,
+      step: jax.Array,
   ) -> jnp.ndarray:
     """
     One step for the UniP (B(h) version) - the Predictor.
@@ -358,6 +374,7 @@ def multistep_uni_p_bh_update(
     lambda_s0 = jnp.log(alpha_s0 + 1e-10) - jnp.log(sigma_s0 + 1e-10)
 
     h = lambda_t - lambda_s0
+    check_nan_jit(h, "predictor h", step)
 
     def rk_d1_loop_body(i, carry):
       # Loop from i = 0 to order-2
@@ -371,6 +388,7 @@ def rk_d1_loop_body(i, carry):
 
       rk = (lambda_si - lambda_s0) / h
       Di = (mi - m0) / rk
+      check_nan_jit(Di, f"predictor Di[{i}]", step)
 
       rks = rks.at[i].set(rk)
       D1s = D1s.at[i].set(Di)
@@ -467,7 +485,7 @@ def solve_for_rhos_p(R_mat, b_vec, current_order):
     else:  # Predict epsilon
       x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
       x_t = x_t_ - sigma_t * B_h * pred_res
-
+    check_nan_jit(x_t, "predictor x_t", step)
     return x_t.astype(x.dtype)
 
   def multistep_uni_c_bh_update(
@@ -477,6 +495,7 @@ def multistep_uni_c_bh_update(
       last_sample: jnp.ndarray,  # Sample after predictor `x_{t-1}`
       this_sample: jnp.ndarray,  # Sample before corrector `x_t` (after predictor step)
       order: int,
+      step: jax.Array,
   ) -> jnp.ndarray:
     """
     One step for the UniC (B(h) version) - the Corrector.
@@ -620,7 +639,8 @@ def solve_for_rhos(R_mat, b_vec, current_order):
     else:
       x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
       x_t = x_t_ - sigma_t * B_h * (corr_res + final_rho * D1_t)
-
+
+    check_nan_jit(x_t, "corrector x_t", step)
     return x_t.astype(x.dtype)
 
   def index_for_timestep(
@@ -674,6 +694,10 @@ def step(
     Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
     the multistep UniPC.
     """
+    step_val = state.step_index  # For debug, might be None initially
+
+    check_nan_jit(model_output, "step input model_output", step_val)
+    check_nan_jit(sample, "step input sample", step_val)
 
     sample = sample.astype(jnp.float32)
 
@@ -685,6 +709,7 @@ def step(
     # Initialize step_index if it's the first step
     if state.step_index is None:
       state = self._init_step_index(state, timestep_scalar)
+      step_val = state.step_index
 
     # Determine if corrector should be used
     use_corrector = (
@@ -695,6 +720,7 @@ def step(
 
     # Convert model_output (noise/v_pred) to x0_pred or epsilon_pred, based on prediction_type
     model_output_for_history = self.convert_model_output(state, model_output, sample)
+    check_nan_jit(model_output_for_history, "model_output_for_history", step_val)
 
     # Apply corrector if applicable
     sample = jax.lax.cond(
@@ -708,6 +734,7 @@ def step(
       ),
       lambda: sample,
     )
+    check_nan_jit(sample, "sample_corrected", step_val)
 
     # Update history buffers (model_outputs and timestep_list)
     # Shift existing elements to the left and add new one at the end.
```
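Threading a `step` array through the predictor and corrector is what makes each print attributable inside traced control flow, where no Python-side counter exists. A self-contained sketch of the same idea (the loop body here is illustrative, not the scheduler's):

```python
import jax
import jax.numpy as jnp

def check(tensor, name, step):
    jax.debug.print("step {s} - " + name + ": NaNs={n}, Infs={i}",
                    s=step, n=jnp.isnan(tensor).any(), i=jnp.isinf(tensor).any())

@jax.jit
def sample_loop(x):
    def body(i, x):
        x = x / (x.sum() - x.sum())  # division by zero: produces Infs, then NaNs
        check(x, "x", i)             # i is the traced fori_loop index
        return x
    return jax.lax.fori_loop(0, 3, body, x)

sample_loop(jnp.ones(4))  # each iteration prints under its step index
```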
