Skip to content

Commit f1a5791

Browse files
committed
replicating stacked-batch activations
1 parent afb5173 commit f1a5791

1 file changed

Lines changed: 49 additions & 13 deletions

File tree

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,17 +1431,37 @@ def _sched_cfg_get(key: str, default):
14311431
else:
14321432
perturbation_mask = None
14331433

1434+
# CFG / STG / modality stack duplicate batch rows as [uncond, cond, ...]. Guidance mixes rows
1435+
# (e.g. cond vs STG perturb) that must refer to the same sample. If axis 0 is sharded across
1436+
# data parallel mesh, those rows land on different chips and guidance is wrong — video can
1437+
# still look plausible while audio (tighter cross-modal coupling) goes silent or garbage.
14341438
if hasattr(self, "mesh") and self.mesh is not None:
1435-
data_sharding_3d = NamedSharding(self.mesh, P())
1436-
data_sharding_2d = NamedSharding(self.mesh, P())
1437-
if hasattr(self, "config") and hasattr(self.config, "data_sharding"):
1438-
data_sharding_3d = NamedSharding(self.mesh, P(*self.config.data_sharding[:3]))
1439-
data_sharding_2d = NamedSharding(self.mesh, P(*self.config.data_sharding[:2]))
1440-
if isinstance(prompt_embeds_jax, list):
1441-
prompt_embeds_jax = [jax.device_put(x, data_sharding_3d) for x in prompt_embeds_jax]
1439+
if do_cfg:
1440+
rep = NamedSharding(self.mesh, P())
1441+
max_logging.log(
1442+
"LTX2: replicating stacked-batch activations on all devices (required for CFG/STG; "
1443+
"data-parallel sharding of batch breaks cross-row guidance)."
1444+
)
1445+
if isinstance(prompt_embeds_jax, list):
1446+
prompt_embeds_jax = [jax.device_put(x, rep) for x in prompt_embeds_jax]
1447+
else:
1448+
prompt_embeds_jax = jax.device_put(prompt_embeds_jax, rep)
1449+
prompt_attention_mask_jax = jax.device_put(prompt_attention_mask_jax, rep)
1450+
latents_jax = jax.device_put(latents_jax, rep)
1451+
audio_latents_jax = jax.device_put(audio_latents_jax, rep)
1452+
if perturbation_mask is not None:
1453+
perturbation_mask = jax.device_put(perturbation_mask, rep)
14421454
else:
1443-
prompt_embeds_jax = jax.device_put(prompt_embeds_jax, data_sharding_3d)
1444-
prompt_attention_mask_jax = jax.device_put(prompt_attention_mask_jax, data_sharding_2d)
1455+
data_sharding_3d = NamedSharding(self.mesh, P())
1456+
data_sharding_2d = NamedSharding(self.mesh, P())
1457+
if hasattr(self, "config") and hasattr(self.config, "data_sharding"):
1458+
data_sharding_3d = NamedSharding(self.mesh, P(*self.config.data_sharding[:3]))
1459+
data_sharding_2d = NamedSharding(self.mesh, P(*self.config.data_sharding[:2]))
1460+
if isinstance(prompt_embeds_jax, list):
1461+
prompt_embeds_jax = [jax.device_put(x, data_sharding_3d) for x in prompt_embeds_jax]
1462+
else:
1463+
prompt_embeds_jax = jax.device_put(prompt_embeds_jax, data_sharding_3d)
1464+
prompt_attention_mask_jax = jax.device_put(prompt_attention_mask_jax, data_sharding_2d)
14451465

14461466
# GraphDef and State
14471467
graphdef, state = nnx.split(self.transformer)
@@ -1471,21 +1491,33 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
14711491
video_embeds_sharded = video_embeds
14721492
audio_embeds_sharded = audio_embeds
14731493

1474-
if not self.transformer.scan_layers:
1494+
if hasattr(self, "mesh") and self.mesh is not None and do_cfg:
1495+
rep = NamedSharding(self.mesh, P())
1496+
video_embeds_sharded = jax.device_put(video_embeds_sharded, rep)
1497+
audio_embeds_sharded = jax.device_put(audio_embeds_sharded, rep)
1498+
new_attention_mask = jax.device_put(new_attention_mask, rep)
1499+
1500+
if not self.transformer.scan_layers and not do_cfg:
14751501
activation_axes = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_embed"))
14761502
spec = NamedSharding(self.mesh, P(*activation_axes))
1477-
video_embeds_sharded = jax.device_put(video_embeds, spec)
1478-
audio_embeds_sharded = jax.device_put(audio_embeds, spec)
1503+
video_embeds_sharded = jax.device_put(video_embeds_sharded, spec)
1504+
audio_embeds_sharded = jax.device_put(audio_embeds_sharded, spec)
14791505

14801506
timesteps_jax = jnp.array(timesteps, dtype=jnp.float32)
1507+
guidance_rep = (
1508+
NamedSharding(self.mesh, P())
1509+
if (do_cfg and hasattr(self, "mesh") and self.mesh is not None)
1510+
else None
1511+
)
1512+
14811513
for i in range(len(timesteps_jax)):
14821514
t = timesteps_jax[i]
14831515

14841516
# Isolate input sharding to scan_layers=False to avoid affecting the standard path
14851517
latents_jax_sharded = latents_jax
14861518
audio_latents_jax_sharded = audio_latents_jax
14871519

1488-
if not self.transformer.scan_layers:
1520+
if not self.transformer.scan_layers and not do_cfg:
14891521
activation_axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_embed"))
14901522
latents_jax_sharded = jax.lax.with_sharding_constraint(latents_jax, activation_axis_names)
14911523
audio_latents_jax_sharded = jax.lax.with_sharding_constraint(audio_latents_jax, activation_axis_names)
@@ -1511,6 +1543,10 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
15111543
use_cross_timestep=use_cross_timestep,
15121544
)
15131545

1546+
if guidance_rep is not None:
1547+
noise_pred = jax.device_put(noise_pred, guidance_rep)
1548+
noise_pred_audio = jax.device_put(noise_pred_audio, guidance_rep)
1549+
15141550
do_cfg = guidance_scale > 1.0
15151551
do_stg = stg_scale > 0.0
15161552
sigma_t = sigmas[i]

0 commit comments

Comments (0)