Add spatio-temporal guidance (STG) and audio/modality guidance scales to the LTX2 pipeline

prishajain1 · prishajain1 · commit 665024204ddd · 2026-04-11T12:04:03.000+05:30
diff --git a/src/maxdiffusion/configs/ltx2_3_video.yml b/src/maxdiffusion/configs/ltx2_3_video.yml
@@ -28,8 +28,12 @@ sampler: "from_checkpoint"
 global_batch_size_to_train_on: 1
 num_inference_steps: 40
 guidance_scale: 3.0
-stg_scale: 0.0
-spatio_temporal_guidance_blocks: []
+audio_guidance_scale: 7.0
+stg_scale: 1.0
+audio_stg_scale: 1.0
+modality_scale: 3.0
+audio_modality_scale: 3.0
+spatio_temporal_guidance_blocks: [28]
 fps: 24
 pipeline_type: multi-scale
 prompt: "A man in a brightly lit room talks on a vintage telephone. In a low, heavy voice, he says, 'I understand. I won't call again. Goodbye.' He hangs up the receiver and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is brightly lit by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a dramatic movie."
diff --git a/src/maxdiffusion/generate_ltx2.py b/src/maxdiffusion/generate_ltx2.py
@@ -98,6 +98,11 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
       decode_timestep=getattr(config, "decode_timestep", 0.0),
       decode_noise_scale=getattr(config, "decode_noise_scale", None),
       max_sequence_length=getattr(config, "max_sequence_length", 1024),
+      audio_guidance_scale=getattr(config, "audio_guidance_scale", None),
+      stg_scale=getattr(config, "stg_scale", 0.0),
+      audio_stg_scale=getattr(config, "audio_stg_scale", None),
+      modality_scale=getattr(config, "modality_scale", 1.0),
+      audio_modality_scale=getattr(config, "audio_modality_scale", None),
       dtype=jnp.bfloat16 if getattr(config, "activations_dtype", "bfloat16") == "bfloat16" else jnp.float32,
   )
   return out
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
@@ -108,7 +108,11 @@ def create_sharded_logical_transformer(
     tensors: dict = None,
 ):
   def create_model(rngs: nnx.Rngs, ltx2_config: dict):
-    transformer = LTX2VideoTransformer3DModel(**ltx2_config, rngs=rngs)
+    transformer = LTX2VideoTransformer3DModel(
+        **ltx2_config,
+        spatio_temporal_guidance_blocks=tuple(getattr(config, "spatio_temporal_guidance_blocks", ())),
+        rngs=rngs
+    )
     return transformer
 
   # 1. Load config.