debug added for latents + normalisation changed

prishajain1 · prishajain1 · commit c00f53d41fc5 · 2025-12-30T13:51:24.000+05:30
diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py
@@ -1291,6 +1291,8 @@ def scan_fn(carry, input_slice):
             input_slice = jnp.expand_dims(input_slice, 1)
             out_slice, new_carry = self.decoder(input_slice, carry)
             out_swapped = out_slice[:, jnp.array([0, 2, 1, 3]), ...]
+            jax.debug.print("Decoder output shape: {shape}", shape=out_slice.shape)
+            jax.debug.print("After swap shape: {shape}", shape=out_swapped.shape)
             
             return new_carry, out_swapped
 
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -532,11 +532,13 @@ def prepare_latents_i2v_base(
 
       with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
           encoded_output = self.vae.encode(video_condition)[0].mode()
+      
+      encoded_output = jnp.transpose(encoded_output, (0, 2, 3, 4, 1))
 
       # Normalize latents
       latents_mean = jnp.array(self.vae.latents_mean).reshape(1, 1, 1, 1, self.vae.z_dim)
-      latents_std = 1.0 / jnp.array(self.vae.latents_std).reshape(1, 1, 1, 1, self.vae.z_dim)
-      latent_condition = (encoded_output - latents_mean) * latents_std
+      latents_std = jnp.array(self.vae.latents_std).reshape(1, 1, 1, 1, self.vae.z_dim)
+      latent_condition = (encoded_output - latents_mean) / latents_std
       latent_condition = latent_condition.astype(dtype)
 
       return latent_condition, video_condition
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py
@@ -107,6 +107,10 @@ def prepare_latents(
         
         num_channels_latents = self.vae.z_dim
         num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        jax.debug.print("num_frames: {nf}, num_latent_frames: {nlf}, expected: {exp}",
+                        nf=num_frames,
+                        nlf=latents.shape[1],
+                        exp=num_latent_frames)
         latent_height = height // self.vae_scale_factor_spatial
         latent_width = width // self.vae_scale_factor_spatial
 
@@ -124,6 +128,13 @@ def prepare_latents(
             mask_lat_size = mask_lat_size.at[:, :, 1:-1, :, :].set(0)     
         first_frame_mask = mask_lat_size[:, :, 0:1]
         first_frame_mask = jnp.repeat(first_frame_mask, self.vae_scale_factor_temporal, axis=2)
+        jax.debug.print("first_frame_mask.shape:{shape}, is None:{isnone}",
+                        shape = first_frame_mask.shape if first_frame_mask is not None else (-1,),
+                        isnone = first_frame_mask is None)
+        jax.debug.print("first_frame_mask_stats: min={mn:.2f}, max={mx:.2f}, mean={mean:.2f}",
+                        mn=jnp.min(first_frame_mask) if first_frame_mask is not None else 0.0,
+                        mx=jnp.max(first_frame_mask) if first_frame_mask is not None else 0.0,
+                        mean=jnp.mean(first_frame_mask) if first_frame_mask is not None else 0.0)
         mask_lat_size = jnp.concatenate([first_frame_mask, mask_lat_size[:, :, 1:]], axis=2)
         mask_lat_size = mask_lat_size.reshape(
           batch_size, 
@@ -135,6 +146,12 @@ def prepare_latents(
         )
         mask_lat_size = jnp.transpose(mask_lat_size, (0, 2, 4, 5, 3, 1)).squeeze(-1)
         condition = jnp.concatenate([mask_lat_size, latent_condition], axis=-1)
+        jax.debug.print("condition shape: {shape}, channel dim: {c}",
+                        shape=condition.shape,
+                        c=condition.shape[-1])
+        jax.debug.print("condition stats: mask_mean={mm:.4f}, latent_mean={lm:.4f}",
+                        mm=jnp.mean(condition[..., 0]),
+                        lm=jnp.mean(condition[..., 1:]))
 
         return latents, condition, None
 
@@ -300,11 +317,24 @@ def loop_body(step, vals):
         encoder_hidden_states_image=image_embeds_input,
     )
     noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
+    jax.debug.print("Step {s}: latents_prev std={std:.6f}, mean={mean:.6f}",
+                    s=step,
+                    std=jnp.std(latents),
+                    mean=jnp.mean(latents))
     latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+    jax.debug.print("Step {s}: latents_next std={std:.6f}, mean={mean:.6f}",
+                    s=step,
+                    std=jnp.std(latents),
+                    mean=jnp.mean(latents))
     latents = latents.astype(original_dtype)
     return latents, scheduler_state, rng
 
   max_logging.log(f"Running fori_loop for {num_inference_steps} steps.")
   latents, _, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state, rng))
+  jax.debug.print("Final latents states: min={lmin:.6f}, max={lmax:.6f}, mean={lmean:.6f}, std={lstd:.6f}",
+                  lmin=jnp.min(latents),
+                  lmax=jnp.max(latents),
+                  lmean=jnp.mean(latents),
+                  lstd=jnp.std(latents))
   max_logging.log("Finished fori_loop.")
   return latents