Skip to content

Commit 02670b2

Browse files
committed
Fix chessboard issue
1 parent 92bb868 commit 02670b2

3 files changed

Lines changed: 5 additions & 8 deletions

File tree

src/maxdiffusion/generate_ltx2.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,9 @@ class DummyOut: pass
160160
params=upsample_params,
161161
prng_seed=generator,
162162
latents=latents,
163+
height=config.height,
164+
width=config.width,
165+
num_frames=config.num_frames,
163166
latents_normalized=False, # Upsampler operates on normalized latents; VAE decoder handles denorm internally
164167
adain_factor=getattr(config, "upsampler_adain_factor", 0.0),
165168
tone_map_compression_ratio=getattr(config, "upsampler_tone_map_compression_ratio", 0.0),

src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -907,8 +907,7 @@ def __call__(
907907
p_t = self.patch_size_t
908908

909909
hidden_states = sample.reshape(B, T // p_t, p_t, H // p, p, W // p, p, C)
910-
# 0:B, 1:T_p, 3:H_p, 5:W_p, 7:C, 2:p_t, 4:p_h, 6:p_w
911-
hidden_states = hidden_states.transpose(0, 1, 3, 5, 7, 2, 4, 6)
910+
hidden_states = hidden_states.transpose(0, 1, 3, 5, 7, 2, 6, 4)
912911
hidden_states = hidden_states.reshape(B, T // p_t, H // p, W // p, -1)
913912

914913
num_blocks = len(self.down_blocks) + 1
@@ -1108,8 +1107,7 @@ def __call__(
11081107
hidden_states = hidden_states.reshape(B, T, H, W, C_out_final, p_t, p, p)
11091108

11101109
# Pair H (2) with p_h (7) and W (3) with p_w (6)
1111-
# 0:B, 1:T, 5:p_t, 2:H, 6:p_h, 3:W, 7:p_w, 4:C_out_final
1112-
hidden_states = hidden_states.transpose(0, 1, 5, 2, 6, 3, 7, 4)
1110+
hidden_states = hidden_states.transpose(0, 1, 5, 2, 7, 3, 6, 4)
11131111
hidden_states = hidden_states.reshape(B, T * p_t, H * p, W * p, C_out_final)
11141112

11151113
return hidden_states

src/maxdiffusion/pipelines/ltx2/pipeline_ltx2_latent_upsample.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,11 +200,8 @@ def __call__(
200200
latents_std = getattr(self.vae, "latents_std")
201201
latents = self._denormalize_latents(latents, latents_mean, latents_std, scaling_factor)
202202

203-
logging.info(f"[JAX Pipeline] Latents AFTER denorm (upsampler input): shape={latents.shape}, mean={jnp.mean(latents):.4f}, std={jnp.std(latents):.4f}, range=[{jnp.min(latents):.4f}, {jnp.max(latents):.4f}]")
204-
205203
# Run Latent Upsampler model — expects (batch, frames, height, width, channels)
206204
latents_upsampled = self.latent_upsampler.apply({'params': params['latent_upsampler']}, latents)
207-
logging.info(f"[JAX Pipeline] Latents AFTER upsampling: shape={latents_upsampled.shape}, mean={jnp.mean(latents_upsampled):.4f}, std={jnp.std(latents_upsampled):.4f}, range=[{jnp.min(latents_upsampled):.4f}, {jnp.max(latents_upsampled):.4f}]")
208205

209206
if adain_factor > 0.0:
210207
latents = self.adain_filter_latent(latents_upsampled, latents, adain_factor)
@@ -243,7 +240,6 @@ def __call__(
243240
# Cast latents to VAE dtype before decoding (matches main pipeline behavior)
244241
vae_dtype = getattr(self.vae, 'dtype', jnp.float32)
245242
latents = latents.astype(vae_dtype)
246-
logging.info(f"[Upsampler VAE decode] latents shape={latents.shape}, dtype={latents.dtype}")
247243

248244
# Decode latents to video
249245
if timestep is not None:

0 commit comments

Comments (0)