Commit 0577d3e

pipeline cleaned
1 parent 4c9be69 commit 0577d3e

3 files changed: 149 additions & 157 deletions

src/maxdiffusion/configs/ltx_video.yml

Lines changed: 3 additions & 7 deletions
@@ -19,16 +19,12 @@ frame_rate: 30
 max_sequence_length: 512
 sampler: "from_checkpoint"
 
-
-
-
-
 # Generation parameters
-pipeline_type: None
-prompt: "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie."
+pipeline_type: multi-scale
+prompt: "A woman with light skin, wearing a blue jacket and a black hat with a veil, looks down and to her right, then back up as she speaks; she has brown hair styled in an updo, light brown eyebrows, and is wearing a white collared shirt under her jacket; the camera remains stationary on her face as she speaks; the background is out of focus, but shows trees and people in period clothing; the scene is captured in real-life footage."
 height: 512
 width: 512
-num_frames: 88 #344
+num_frames: 344 #344
 flow_shift: 5.0
 fps: 24
 downscale_factor: 0.6666666
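
For reference, run() in src/maxdiffusion/generate_ltx_video.py pads these values before they reach the pipeline. A quick sanity check of the new settings, using the padding formulas visible in the diff below (a sketch, not repo code):

    # Padding arithmetic from run(), applied to the new config values.
    height, width, num_frames = 512, 512, 344

    height_padded = ((height - 1) // 32 + 1) * 32             # 512 (already a multiple of 32)
    width_padded = ((width - 1) // 32 + 1) * 32               # 512
    num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1   # 345 (frame counts pad to 8n + 1)

    print(height_padded, width_padded, num_frames_padded)     # 512 512 345

So raising num_frames from 88 to 344 roughly quadruples the temporal length the pipeline runs at: 345 padded frames instead of 89.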

src/maxdiffusion/generate_ltx_video.py

Lines changed: 11 additions & 55 deletions
@@ -4,16 +4,12 @@
 from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXVideoPipeline
 from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXMultiScalePipeline
 from maxdiffusion import pyconfig
-import jax.numpy as jnp
 from maxdiffusion.models.ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
-from maxdiffusion.models.ltx_video.autoencoders.latent_upsampler import LatentUpsampler
 from huggingface_hub import hf_hub_download
 import imageio
 from datetime import datetime
-from maxdiffusion.utils import export_to_video
 
 import os
-import json
 import torch
 from pathlib import Path
 
@@ -62,25 +58,19 @@ def convert_prompt_to_filename(text: str, max_len: int = 20) -> str:
   return "-".join(result)
 
 
-def create_latent_upsampler(latent_upsampler_model_path: str, device: str):
-  latent_upsampler = LatentUpsampler.from_pretrained(latent_upsampler_model_path)
-  latent_upsampler.to(device)
-  latent_upsampler.eval()
-  return latent_upsampler
 
 
 def get_unique_filename(
     base: str,
     ext: str,
     prompt: str,
-    seed: int,
     resolution: tuple[int, int, int],
     dir: Path,
     endswith=None,
     index_range=1000,
 ) -> Path:
   base_filename = (
-      f"{base}_{convert_prompt_to_filename(prompt, max_len=30)}_{seed}_{resolution[0]}x{resolution[1]}x{resolution[2]}"
+      f"{base}_{convert_prompt_to_filename(prompt, max_len=30)}_{resolution[0]}x{resolution[1]}x{resolution[2]}"
   )
   for i in range(index_range):
     filename = dir / f"{base_filename}_{i}{endswith if endswith else ''}{ext}"
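
With seed gone from get_unique_filename, output names follow {base}_{prompt-slug}_{height}x{width}x{frames}_{index}{ext}. A minimal illustration of the new pattern (the slug and output directory are hypothetical stand-ins; in the repo the slug comes from convert_prompt_to_filename(prompt, max_len=30)):

    from pathlib import Path

    base, slug = "video_output_0", "a-woman-with-light-skin"  # hypothetical slug
    height, width, num_frames = 512, 512, 344

    base_filename = f"{base}_{slug}_{height}x{width}x{num_frames}"
    print(Path("output") / f"{base_filename}_0.mp4")
    # output/video_output_0_a-woman-with-light-skin_512x512x344_0.mp4

Two runs with different seeds but the same prompt and resolution now produce the same base name, which is why the _{i} uniqueness loop above matters.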
@@ -94,55 +84,23 @@ def run(config):
   width_padded = ((config.width - 1) // 32 + 1) * 32
   num_frames_padded = ((config.num_frames - 2) // 8 + 1) * 8 + 1
   padding = calculate_padding(config.height, config.width, height_padded, width_padded)
-  # prompt_enhancement_words_threshold = config.prompt_enhancement_words_threshold
-  # prompt_word_count = len(config.prompt.split())
-  # enhance_prompt = (
-  #     prompt_enhancement_words_threshold > 0 and prompt_word_count < prompt_enhancement_words_threshold
-  # )
-
-  seed = 10 # change this, generator in pytorch, used in prepare_latents
-  generator = torch.Generator().manual_seed(seed)
-  pipeline = LTXVideoPipeline.from_pretrained(config, enhance_prompt=False)
-  if config.pipeline_type == "multi-scale": # move this to pipeline file??
-    spatial_upscaler_model_name_or_path = config.spatial_upscaler_model_path
-
-    if spatial_upscaler_model_name_or_path and not os.path.isfile(spatial_upscaler_model_name_or_path):
-      spatial_upscaler_model_path = hf_hub_download(
-          repo_id="Lightricks/LTX-Video",
-          filename=spatial_upscaler_model_name_or_path,
-          local_dir="/mnt/disks/diffusionproj",
-          repo_type="model",
-      )
-    else:
-      spatial_upscaler_model_path = spatial_upscaler_model_name_or_path
-    if not config.spatial_upscaler_model_path:
-      raise ValueError(
-          "spatial upscaler model path is missing from pipeline config file and is required for multi-scale rendering"
-      )
-    latent_upsampler = create_latent_upsampler(spatial_upscaler_model_path, "cpu") # device set to cpu for now
-    pipeline = LTXMultiScalePipeline(pipeline, latent_upsampler=latent_upsampler)
-  stg_mode = config.stg_mode
-  if stg_mode.lower() == "stg_av" or stg_mode.lower() == "attention_values":
-    skip_layer_strategy = SkipLayerStrategy.AttentionValues
-  elif stg_mode.lower() == "stg_as" or stg_mode.lower() == "attention_skip":
-    skip_layer_strategy = SkipLayerStrategy.AttentionSkip
-  elif stg_mode.lower() == "stg_r" or stg_mode.lower() == "residual":
-    skip_layer_strategy = SkipLayerStrategy.Residual
-  elif stg_mode.lower() == "stg_t" or stg_mode.lower() == "transformer_block":
-    skip_layer_strategy = SkipLayerStrategy.TransformerBlock
-  else:
-    raise ValueError(f"Invalid spatiotemporal guidance mode: {stg_mode}")
-  # images = pipeline(height=height_padded, width=width_padded, num_frames=num_frames_padded,
-  #     is_video=True, output_type='pt', generator=generator, guidance_scale = config.first_pass.guidance_scale, stg_scale = config.stg_scale, rescaling_scale = config.rescaling_scale, skip_initial_inference_steps= config.skip_initial_inference_steps, skip_final_inference_steps= config.skip_final_inference_steps, num_inference_steps = config.num_inference_steps,
-  #     guidance_timesteps = config.guidance_timesteps, cfg_star_rescale = config.cfg_star_rescale, skip_layer_strategy = None, skip_block_list=config.skip_block_list).images
+  prompt_enhancement_words_threshold = config.prompt_enhancement_words_threshold
+  prompt_word_count = len(config.prompt.split())
+  enhance_prompt = (
+      prompt_enhancement_words_threshold > 0 and prompt_word_count < prompt_enhancement_words_threshold
+  )
+
+  pipeline = LTXVideoPipeline.from_pretrained(config, enhance_prompt=enhance_prompt)
+  if config.pipeline_type == "multi-scale":
+    pipeline = LTXMultiScalePipeline(pipeline)
   images = pipeline(
       height=height_padded,
       width=width_padded,
       num_frames=num_frames_padded,
       is_video=True,
       output_type="pt",
-      generator=generator,
       config=config,
+      enhance_prompt = False
   )
   (pad_left, pad_right, pad_top, pad_bottom) = padding
   pad_bottom = -pad_bottom
@@ -167,7 +125,6 @@ def run(config):
         f"image_output_{i}",
         ".png",
         prompt=config.prompt,
-        seed=seed,
         resolution=(height, width, config.num_frames),
         dir=output_dir,
     )
@@ -177,7 +134,6 @@ def run(config):
         f"video_output_{i}",
         ".mp4",
         prompt=config.prompt,
-        seed=seed,
         resolution=(height, width, config.num_frames),
         dir=output_dir,
     )
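
The hardcoded enhance_prompt=False at load time is replaced by a word-count gate: enhancement is requested only when the configured threshold is positive and the prompt is shorter than it. A standalone sketch of that gate, mirroring the added lines above (threshold value assumed for illustration):

    prompt = "A woman with light skin, wearing a blue jacket and a black hat with a veil, ..."  # abbreviated
    prompt_enhancement_words_threshold = 120  # assumed config value

    prompt_word_count = len(prompt.split())
    enhance_prompt = (
        prompt_enhancement_words_threshold > 0 and prompt_word_count < prompt_enhancement_words_threshold
    )
    print(enhance_prompt)  # True here; always False when the threshold is <= 0

Note that the pipeline call itself still passes enhance_prompt = False, so the gated value only affects LTXVideoPipeline.from_pretrained.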
