fix: remove debug prints from LTX2 transformer and use a single-element timestep in the forward test

prishajain1 · prishajain1 · commit e0888dbab4f8 · 2026-02-06T23:20:47.000+05:30
diff --git a/src/maxdiffusion/models/ltx_2/transformer_ltx2.py b/src/maxdiffusion/models/ltx_2/transformer_ltx2.py
@@ -258,22 +258,12 @@ def __call__(
         # 1. Video and Audio Self-Attention
         norm_hidden_states = self.norm1(hidden_states)
 
-        import sys
-        
         # Calculate Video AdaLN values
         num_ada_params = self.scale_shift_table.shape[0]
         # table shape: (6, dim) -> (1, 1, 6, dim)
         scale_shift_table_reshaped = jnp.expand_dims(self.scale_shift_table, axis=(0, 1))
         # temb shape: (batch, temb_dim) -> (batch, 1, 6, dim)  (assuming temb_dim is num_ada_params * dim)
-        print(f"DEBUG_BLOCK: scale_shift_table_reshaped shape: {scale_shift_table_reshaped.shape}")
-        print(f"DEBUG_BLOCK: temb shape before reshape: {temb.shape}")
-        sys.stdout.flush()
-        
         temb_reshaped = temb.reshape(batch_size, 1, num_ada_params, -1)
-        
-        print(f"DEBUG_BLOCK: temb_reshaped shape: {temb_reshaped.shape}")
-        sys.stdout.flush()
-        
         ada_values = scale_shift_table_reshaped + temb_reshaped
 
         shift_msa = ada_values[:, :, 0, :]
@@ -297,15 +287,7 @@ def __call__(
 
         num_audio_ada_params = self.audio_scale_shift_table.shape[0]
         audio_scale_shift_table_reshaped = jnp.expand_dims(self.audio_scale_shift_table, axis=(0, 1))
-
-        print(f"DEBUG_BLOCK_AUDIO: audio_scale_shift_table_reshaped shape: {audio_scale_shift_table_reshaped.shape}")
-        print(f"DEBUG_BLOCK_AUDIO: temb_audio shape before reshape: {temb_audio.shape}")
-        sys.stdout.flush()
-
         temb_audio_reshaped = temb_audio.reshape(batch_size, 1, num_audio_ada_params, -1)
-
-        print(f"DEBUG_BLOCK_AUDIO: temb_audio_reshaped shape: {temb_audio_reshaped.shape}")
-        sys.stdout.flush()
         audio_ada_values = audio_scale_shift_table_reshaped + temb_audio_reshaped
 
         audio_shift_msa = audio_ada_values[:, :, 0, :]
@@ -518,10 +500,6 @@ def __init__(
         self.audio_caption_projection = NNXPixArtAlphaTextProjection(
             rngs=rngs, in_features=self.caption_channels, hidden_size=audio_inner_dim, dtype=self.dtype, weights_dtype=self.weights_dtype
         )
-        import sys
-        print(f"DEBUG IN INIT: inner_dim={inner_dim}, num_attention_heads={num_attention_heads}, attention_head_dim={attention_head_dim}")
-        sys.stdout.flush()
-        
         # 3. Timestep Modulation Params and Embedding
         self.time_embed = LTX2AdaLayerNormSingle(
             rngs=rngs, embedding_dim=inner_dim, num_mod_params=6, use_additional_conditions=False, dtype=self.dtype, weights_dtype=self.weights_dtype
diff --git a/src/maxdiffusion/tests/ltx_2_transformer_test.py b/src/maxdiffusion/tests/ltx_2_transformer_test.py
@@ -221,7 +221,7 @@ def test_transformer_3d_model_instantiation_and_forward(self):
         hidden_states = jnp.zeros((self.batch_size, self.seq_len, self.in_channels))
         audio_hidden_states = jnp.zeros((self.batch_size, 10, self.audio_in_channels))
         
-        timestep = jnp.array([1.0, 2.0]) # (B,)
+        timestep = jnp.array([1.0]) # (B,)
         
         encoder_hidden_states = jnp.zeros((self.batch_size, 5, 32)) # (B, Lc, Dc)
         audio_encoder_hidden_states = jnp.zeros((self.batch_size, 5, 32))