@@ -677,20 +677,20 @@ def load_transformer(
 
   @staticmethod
   def _pack_text_embeds(
-      text_hidden_states: torch.Tensor,
-      sequence_lengths: torch.Tensor,
+      text_hidden_states: jax.Array,
+      sequence_lengths: jax.Array,
       padding_side: str = "left",
       scale_factor: int = 8,
       eps: float = 1e-6,
-  ) -> torch.Tensor:
+  ) -> jax.Array:
     """
-    Packs and normalizes text encoder hidden states using PyTorch to save device HBM.
+    Packs and normalizes text encoder hidden states natively in JAX.
     """
     batch_size, seq_len, hidden_dim, num_layers = text_hidden_states.shape
     original_dtype = text_hidden_states.dtype
 
     # Create padding mask
-    token_indices = torch.arange(seq_len, device=text_hidden_states.device).unsqueeze(0)
+    token_indices = jnp.arange(seq_len)[None, :]
     if padding_side == "right":
       mask = token_indices < sequence_lengths[:, None]
     elif padding_side == "left":
@@ -700,20 +700,20 @@ def _pack_text_embeds(
       raise ValueError(f"padding_side must be 'left' or 'right', got {padding_side}")
     mask = mask[:, :, None, None]
 
-    masked_text_hidden_states = text_hidden_states.masked_fill(~mask, 0.0)
-    num_valid_positions = (sequence_lengths * hidden_dim).view(batch_size, 1, 1, 1)
-    masked_mean = masked_text_hidden_states.sum(dim=(1, 2), keepdim=True) / (num_valid_positions + eps)
+    masked_text_hidden_states = jnp.where(mask, text_hidden_states, 0.0)
+    num_valid_positions = (sequence_lengths * hidden_dim).reshape(batch_size, 1, 1, 1)
+    masked_mean = jnp.sum(masked_text_hidden_states, axis=(1, 2), keepdims=True) / (num_valid_positions + eps)
 
-    x_min = text_hidden_states.masked_fill(~mask, float("inf")).amin(dim=(1, 2), keepdim=True)
-    x_max = text_hidden_states.masked_fill(~mask, float("-inf")).amax(dim=(1, 2), keepdim=True)
+    x_min = jnp.min(jnp.where(mask, text_hidden_states, jnp.inf), axis=(1, 2), keepdims=True)
+    x_max = jnp.max(jnp.where(mask, text_hidden_states, -jnp.inf), axis=(1, 2), keepdims=True)
 
     normalized_hidden_states = (text_hidden_states - masked_mean) / (x_max - x_min + eps)
     normalized_hidden_states = normalized_hidden_states * scale_factor
 
-    normalized_hidden_states = normalized_hidden_states.flatten(2)
-    mask_flat = mask.squeeze(-1).expand(-1, -1, hidden_dim * num_layers)
-    normalized_hidden_states = normalized_hidden_states.masked_fill(~mask_flat, 0.0)
-    normalized_hidden_states = normalized_hidden_states.to(dtype=original_dtype)
+    normalized_hidden_states = normalized_hidden_states.reshape(batch_size, seq_len, -1)
+    mask_flat = jnp.broadcast_to(mask.squeeze(-1), (batch_size, seq_len, hidden_dim * num_layers))
+    normalized_hidden_states = jnp.where(mask_flat, normalized_hidden_states, 0.0)
+    normalized_hidden_states = normalized_hidden_states.astype(original_dtype)
     return normalized_hidden_states
 
   def _get_gemma_prompt_embeds(
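Aside: a minimal standalone sketch (not part of this commit) of the masking idiom the JAX version relies on. `jnp.where(mask, x, fill)` plays the role of PyTorch's `x.masked_fill(~mask, fill)`, and min/max reductions over `inf`-filled copies replace `amin`/`amax`. Shapes and values here are illustrative only, using the right-padding branch shown above.

```python
import jax.numpy as jnp

# Toy inputs: (batch, seq, hidden, layers), right padding.
batch_size, seq_len, hidden_dim, num_layers = 2, 5, 3, 2
text_hidden_states = jnp.ones((batch_size, seq_len, hidden_dim, num_layers))
sequence_lengths = jnp.array([3, 5])

token_indices = jnp.arange(seq_len)[None, :]                  # (1, seq_len)
mask = (token_indices < sequence_lengths[:, None])[:, :, None, None]

# jnp.where(mask, x, fill) stands in for x.masked_fill(~mask, fill).
masked = jnp.where(mask, text_hidden_states, 0.0)
x_min = jnp.min(jnp.where(mask, text_hidden_states, jnp.inf), axis=(1, 2), keepdims=True)
x_max = jnp.max(jnp.where(mask, text_hidden_states, -jnp.inf), axis=(1, 2), keepdims=True)

print(masked.shape, x_min.shape, x_max.shape)  # (2, 5, 3, 2) (2, 1, 1, 2) (2, 1, 1, 2)
```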
@@ -733,7 +733,6 @@ def _get_gemma_prompt_embeds(
     self.tokenizer.pad_token = self.tokenizer.eos_token
 
     prompt = [p.strip() for p in prompt]
-    # Return Numpy tensors to be compatible with JAX if no text encoder, else PyTorch
 
     if self.text_encoder is not None:
       # PyTorch Text Encoder
@@ -748,49 +747,41 @@ def _get_gemma_prompt_embeds(
       text_input_ids = text_inputs.input_ids
       prompt_attention_mask = text_inputs.attention_mask
 
-      # Move to device if needed (assuming text_encoder is on correct device or CPU)
-      # For now, keep on CPU or same device as model
       text_input_ids = text_input_ids.to(self.text_encoder.device)
       prompt_attention_mask = prompt_attention_mask.to(self.text_encoder.device)
 
-      max_logging.log(f"DEBUG: text_encoder is on {self.text_encoder.device}")
-      max_logging.log(f"DEBUG: text_input_ids is on {text_input_ids.device}")
-
       with torch.no_grad():
         text_encoder_outputs = self.text_encoder(
             input_ids=text_input_ids, attention_mask=prompt_attention_mask, output_hidden_states=True
         )
 
-      text_encoder_hidden_states = text_encoder_outputs.hidden_states
-      del text_encoder_outputs  # Free memory
+      text_encoder_hidden_states = torch.stack(text_encoder_outputs.hidden_states, dim=-1)
+      sequence_lengths = prompt_attention_mask.sum(dim=-1)
 
-      prompt_embeds_list = []
-      for state in text_encoder_hidden_states:
-        state_np = state.cpu().to(torch.float32).numpy()
-        prompt_embeds_list.append(jnp.array(state_np, dtype=jnp.bfloat16))
-
-      prompt_embeds = prompt_embeds_list
+      # Convert to JAX arrays to do native JAX math
+      hidden_states_jax = jnp.array(text_encoder_hidden_states.cpu().to(torch.float32).numpy())
+      sequence_lengths_jax = jnp.array(sequence_lengths.cpu().numpy())
+      prompt_attention_mask_jax = jnp.array(prompt_attention_mask.cpu().numpy())
+
+      del text_encoder_outputs  # Free memory
       del text_encoder_hidden_states  # Free PyTorch tensor memory
 
-      prompt_attention_mask = jnp.array(prompt_attention_mask.cpu().to(torch.float32).numpy(), dtype=jnp.bfloat16)
+      prompt_embeds = self._pack_text_embeds(
+          hidden_states_jax,
+          sequence_lengths_jax,
+          padding_side=self.tokenizer.padding_side,
+          scale_factor=scale_factor,
+      )
+      prompt_attention_mask = prompt_attention_mask_jax
     else:
       raise ValueError("`text_encoder` is required to encode prompts.")
+
     if dtype is not None:
-      if isinstance(prompt_embeds, list):
-        prompt_embeds = [state.astype(dtype) for state in prompt_embeds]
-      else:
-        prompt_embeds = prompt_embeds.astype(dtype)
-
-    if isinstance(prompt_embeds, list):
-      _, seq_len, _ = prompt_embeds[0].shape
-      prompt_embeds = [
-          jnp.repeat(state, num_videos_per_prompt, axis=0).reshape(batch_size * num_videos_per_prompt, seq_len, -1)
-          for state in prompt_embeds
-      ]
-    else:
-      _, seq_len, _ = prompt_embeds.shape
-      prompt_embeds = jnp.repeat(prompt_embeds, num_videos_per_prompt, axis=0)
-      prompt_embeds = prompt_embeds.reshape(batch_size * num_videos_per_prompt, seq_len, -1)
+      prompt_embeds = prompt_embeds.astype(dtype)
+
+    _, seq_len, _ = prompt_embeds.shape
+    prompt_embeds = jnp.repeat(prompt_embeds, num_videos_per_prompt, axis=0)
+    prompt_embeds = prompt_embeds.reshape(batch_size * num_videos_per_prompt, seq_len, -1)
 
     prompt_attention_mask = prompt_attention_mask.reshape(batch_size, -1)
     prompt_attention_mask = jnp.repeat(prompt_attention_mask, num_videos_per_prompt, axis=0)
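For reference, a rough sketch of the PyTorch-to-JAX handoff this commit introduces: stack the per-layer hidden states on a trailing axis, move a float32 copy to host, and wrap it as a JAX array. The tensors and shapes below are placeholders, not the pipeline's real encoder outputs.

```python
import jax.numpy as jnp
import torch

# Placeholder encoder outputs: 4 hidden-state layers of shape (batch, seq, hidden).
hidden_states = tuple(torch.randn(2, 16, 8) for _ in range(4))
attention_mask = torch.ones(2, 16, dtype=torch.long)

# Stack layers on a trailing axis, then hand a float32 CPU copy to JAX.
stacked = torch.stack(hidden_states, dim=-1)                         # (2, 16, 8, 4)
hidden_states_jax = jnp.array(stacked.cpu().to(torch.float32).numpy())
sequence_lengths_jax = jnp.array(attention_mask.sum(dim=-1).cpu().numpy())

print(hidden_states_jax.shape, sequence_lengths_jax)                 # (2, 16, 8, 4) [16 16]
```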