initial cleaning

Serenagu525 · Serenagu525 · commit 972e316601fc · 2025-07-11T23:19:45.000Z
diff --git a/src/maxdiffusion/configs/ltx_video.yml b/src/maxdiffusion/configs/ltx_video.yml
@@ -32,18 +32,10 @@ flow_shift: 5.0
 downscale_factor: 0.6666666
 spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
 prompt_enhancement_words_threshold: 120
-# guidance_scale: [1, 1, 6, 8, 6, 1, 1] #4.5
-# stg_scale: [0, 0, 4, 4, 4, 2, 1] #1.0
-# rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1] #0.7
-# num_inference_steps: 30
-# skip_final_inference_steps: 3
-# skip_initial_inference_steps: 0
-# guidance_timesteps: [1.0, 0.996,  0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
-# skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
 stg_mode: "attention_values"
 decode_timestep: 0.05
 decode_noise_scale: 0.025
-# cfg_star_rescale: True
+models_dir: "/mnt/disks/diffusionproj" # directory where the safetensors model files are stored
 
 
 first_pass:
diff --git a/src/maxdiffusion/generate_ltx_video.py b/src/maxdiffusion/generate_ltx_video.py
@@ -4,16 +4,12 @@
 from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXVideoPipeline
 from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXMultiScalePipeline
 from maxdiffusion import pyconfig
-import jax.numpy as jnp
-from maxdiffusion.models.ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 from maxdiffusion.models.ltx_video.autoencoders.latent_upsampler import LatentUpsampler
 from huggingface_hub import hf_hub_download
 import imageio
 from datetime import datetime
-from maxdiffusion.utils import export_to_video
 
 import os
-import json
 import torch
 from pathlib import Path
 
@@ -96,52 +92,12 @@ def run(config):
     num_frames_padded = ((config.num_frames - 2) // 8 + 1) * 8 + 1
     padding = calculate_padding(
         config.height, config.width, height_padded, width_padded)
-    # prompt_enhancement_words_threshold = config.prompt_enhancement_words_threshold
-    # prompt_word_count = len(config.prompt.split())
-    # enhance_prompt = (
-    #     prompt_enhancement_words_threshold > 0 and prompt_word_count < prompt_enhancement_words_threshold
-    # )
 
-    seed = 10  # change this, generator in pytorch, used in prepare_latents
+    seed = 10 
     generator = torch.Generator().manual_seed(seed)
     pipeline = LTXVideoPipeline.from_pretrained(config, enhance_prompt = False)
-    if config.pipeline_type == "multi-scale":   #move this to pipeline file??
-        spatial_upscaler_model_name_or_path = config.spatial_upscaler_model_path
-    
-        if spatial_upscaler_model_name_or_path and not os.path.isfile(
-            spatial_upscaler_model_name_or_path
-        ):
-            spatial_upscaler_model_path = hf_hub_download(
-                repo_id="Lightricks/LTX-Video",
-                filename=spatial_upscaler_model_name_or_path,
-                local_dir= "/mnt/disks/diffusionproj",
-                repo_type="model",
-            )
-        else:
-            spatial_upscaler_model_path = spatial_upscaler_model_name_or_path
-        if not config.spatial_upscaler_model_path:
-            raise ValueError(
-                "spatial upscaler model path is missing from pipeline config file and is required for multi-scale rendering"
-            )
-        latent_upsampler = create_latent_upsampler(
-            spatial_upscaler_model_path, "cpu"  #device set to cpu for now
-        )
-        pipeline = LTXMultiScalePipeline(pipeline, latent_upsampler=latent_upsampler)
-    stg_mode = config.stg_mode
-    if stg_mode.lower() == "stg_av" or stg_mode.lower() == "attention_values":
-        skip_layer_strategy = SkipLayerStrategy.AttentionValues
-    elif stg_mode.lower() == "stg_as" or stg_mode.lower() == "attention_skip":
-        skip_layer_strategy = SkipLayerStrategy.AttentionSkip
-    elif stg_mode.lower() == "stg_r" or stg_mode.lower() == "residual":
-        skip_layer_strategy = SkipLayerStrategy.Residual
-    elif stg_mode.lower() == "stg_t" or stg_mode.lower() == "transformer_block":
-        skip_layer_strategy = SkipLayerStrategy.TransformerBlock
-    else:
-        raise ValueError(f"Invalid spatiotemporal guidance mode: {stg_mode}")
-    # images = pipeline(height=height_padded, width=width_padded, num_frames=num_frames_padded,
-    #                   is_video=True, output_type='pt', generator=generator, guidance_scale = config.first_pass.guidance_scale, stg_scale = config.stg_scale, rescaling_scale = config.rescaling_scale, skip_initial_inference_steps= config.skip_initial_inference_steps, skip_final_inference_steps= config.skip_final_inference_steps, num_inference_steps = config.num_inference_steps,
-    #                   guidance_timesteps = config.guidance_timesteps, cfg_star_rescale = config.cfg_star_rescale, skip_layer_strategy = None, skip_block_list=config.skip_block_list).images
-    images = pipeline(height=height_padded, width=width_padded, num_frames=num_frames_padded, is_video=True, output_type='pt', generator=generator, config = config)
+    pipeline = LTXMultiScalePipeline(pipeline)
+    images = pipeline(height=height_padded, width=width_padded, num_frames=num_frames_padded, output_type='pt', generator=generator, config = config)
     (pad_left, pad_right, pad_top, pad_bottom) = padding
     pad_bottom = -pad_bottom
     pad_right = -pad_right
diff --git a/src/maxdiffusion/models/ltx_video/repeatable_layer.py b/src/maxdiffusion/models/ltx_video/repeatable_layer.py
@@ -25,8 +25,7 @@ def __call__(self, carry: Tuple[jax.Array, jax.Array], *block_args) -> Tuple[Tup
 
         mod = self.module(*self.module_init_args, **self.module_init_kwargs)
 
-        # block_args are the static arguments passed to each individual block
-        output_data = mod(index_input, data_input, *block_args) # Pass block_args to the module
+        output_data = mod(index_input, data_input, *block_args) # Pass index_input to facilitate skip layers
 
         next_index = index_input + 1
         new_carry = (output_data, next_index)
@@ -76,14 +75,14 @@ class RepeatableLayer(nn.Module):
     """
 
     @nn.compact
-    def __call__(self, *args): # args is now the full input to RepeatableLayer
+    def __call__(self, *args): 
         if not args:
             raise ValueError("RepeatableLayer expects at least one argument for initial data input.")
 
-        initial_data_input = args[0] # The first element is your main data input
-        static_block_args = args[1:] # Any subsequent elements are static args for each block
+        initial_data_input = args[0]
+        static_block_args = args[1:] 
 
-        initial_index = jnp.array(0, dtype=jnp.int32)
+        initial_index = jnp.array(0, dtype=jnp.int32) #index of current transformer block
 
         scan_kwargs = {}
         if self.pspec_name is not None:
@@ -92,9 +91,6 @@ def __call__(self, *args): # args is now the full input to RepeatableLayer
         initializing = self.is_mutable_collection("params")
         params_spec = self.param_scan_axis if initializing else partitioning.ScanIn(self.param_scan_axis)
 
-        # in_axes for the scanned function (RepeatableCarryBlock.__call__):
-        # 1. The 'carry' tuple ((0, 0))
-        # 2. Then, nn.broadcast for each of the `static_block_args`
         in_axes_for_scan = (nn.broadcast,) * (len(args)-1)
 
         scan_fn = nn.scan(
@@ -117,5 +113,4 @@ def __call__(self, *args): # args is now the full input to RepeatableLayer
         # Call wrapped_function with the initial carry tuple and the static_block_args
         (final_data, final_index), _ = wrapped_function((initial_data_input, initial_index), *static_block_args)
 
-        # Typically, you only want the final data output from the sequence of layers
         return final_data
diff --git a/src/maxdiffusion/pipelines/ltx_video/ltx_video_pipeline.py b/src/maxdiffusion/pipelines/ltx_video/ltx_video_pipeline.py