Skip to content

Commit 4c9be69

Browse files
committed
baseline pipeline cleaned
1 parent fd9eb11 commit 4c9be69

25 files changed

Lines changed: 4221 additions & 4894 deletions

Whitespace-only changes.

src/maxdiffusion/configs/ltx_video.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ activations_dtype: 'bfloat16'
1010
run_name: ''
1111
output_dir: 'ltx-video-output'
1212
save_config_to_gcs: False
13+
1314
#Checkpoints
1415
text_encoder_model_name_or_path: "ariG23498/t5-v1-1-xxl-flax"
1516
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
@@ -23,12 +24,13 @@ sampler: "from_checkpoint"
2324

2425

2526
# Generation parameters
26-
pipeline_type: multi-scale
27+
pipeline_type: None
2728
prompt: "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie."
2829
height: 512
2930
width: 512
3031
num_frames: 88 #344
3132
flow_shift: 5.0
33+
fps: 24
3234
downscale_factor: 0.6666666
3335
spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
3436
prompt_enhancement_words_threshold: 120

src/maxdiffusion/generate_ltx_video.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,16 @@
44
from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXVideoPipeline
55
from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXMultiScalePipeline
66
from maxdiffusion import pyconfig
7+
import jax.numpy as jnp
8+
from maxdiffusion.models.ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
9+
from maxdiffusion.models.ltx_video.autoencoders.latent_upsampler import LatentUpsampler
10+
from huggingface_hub import hf_hub_download
711
import imageio
812
from datetime import datetime
13+
from maxdiffusion.utils import export_to_video
14+
915
import os
16+
import json
1017
import torch
1118
from pathlib import Path
1219

@@ -55,6 +62,13 @@ def convert_prompt_to_filename(text: str, max_len: int = 20) -> str:
5562
return "-".join(result)
5663

5764

65+
def create_latent_upsampler(latent_upsampler_model_path: str, device: str):
    """Load a pretrained latent upsampler ready for inference.

    Restores the model weights from ``latent_upsampler_model_path``, moves the
    module onto ``device``, and switches it to eval mode so that
    training-only behavior (e.g. dropout) is disabled.

    Returns the prepared ``LatentUpsampler`` instance.
    """
    model = LatentUpsampler.from_pretrained(latent_upsampler_model_path)
    model.to(device)
    model.eval()
    return model
70+
71+
5872
def get_unique_filename(
5973
base: str,
6074
ext: str,
@@ -80,15 +94,52 @@ def run(config):
8094
width_padded = ((config.width - 1) // 32 + 1) * 32
8195
num_frames_padded = ((config.num_frames - 2) // 8 + 1) * 8 + 1
8296
padding = calculate_padding(config.height, config.width, height_padded, width_padded)
97+
# prompt_enhancement_words_threshold = config.prompt_enhancement_words_threshold
98+
# prompt_word_count = len(config.prompt.split())
99+
# enhance_prompt = (
100+
# prompt_enhancement_words_threshold > 0 and prompt_word_count < prompt_enhancement_words_threshold
101+
# )
83102

84-
seed = 10
103+
seed = 10 # change this, generator in pytorch, used in prepare_latents
85104
generator = torch.Generator().manual_seed(seed)
86105
pipeline = LTXVideoPipeline.from_pretrained(config, enhance_prompt=False)
87-
pipeline = LTXMultiScalePipeline(pipeline)
106+
if config.pipeline_type == "multi-scale": # move this to pipeline file??
107+
spatial_upscaler_model_name_or_path = config.spatial_upscaler_model_path
108+
109+
if spatial_upscaler_model_name_or_path and not os.path.isfile(spatial_upscaler_model_name_or_path):
110+
spatial_upscaler_model_path = hf_hub_download(
111+
repo_id="Lightricks/LTX-Video",
112+
filename=spatial_upscaler_model_name_or_path,
113+
local_dir="/mnt/disks/diffusionproj",
114+
repo_type="model",
115+
)
116+
else:
117+
spatial_upscaler_model_path = spatial_upscaler_model_name_or_path
118+
if not config.spatial_upscaler_model_path:
119+
raise ValueError(
120+
"spatial upscaler model path is missing from pipeline config file and is required for multi-scale rendering"
121+
)
122+
latent_upsampler = create_latent_upsampler(spatial_upscaler_model_path, "cpu") # device set to cpu for now
123+
pipeline = LTXMultiScalePipeline(pipeline, latent_upsampler=latent_upsampler)
124+
stg_mode = config.stg_mode
125+
if stg_mode.lower() == "stg_av" or stg_mode.lower() == "attention_values":
126+
skip_layer_strategy = SkipLayerStrategy.AttentionValues
127+
elif stg_mode.lower() == "stg_as" or stg_mode.lower() == "attention_skip":
128+
skip_layer_strategy = SkipLayerStrategy.AttentionSkip
129+
elif stg_mode.lower() == "stg_r" or stg_mode.lower() == "residual":
130+
skip_layer_strategy = SkipLayerStrategy.Residual
131+
elif stg_mode.lower() == "stg_t" or stg_mode.lower() == "transformer_block":
132+
skip_layer_strategy = SkipLayerStrategy.TransformerBlock
133+
else:
134+
raise ValueError(f"Invalid spatiotemporal guidance mode: {stg_mode}")
135+
# images = pipeline(height=height_padded, width=width_padded, num_frames=num_frames_padded,
136+
# is_video=True, output_type='pt', generator=generator, guidance_scale = config.first_pass.guidance_scale, stg_scale = config.stg_scale, rescaling_scale = config.rescaling_scale, skip_initial_inference_steps= config.skip_initial_inference_steps, skip_final_inference_steps= config.skip_final_inference_steps, num_inference_steps = config.num_inference_steps,
137+
# guidance_timesteps = config.guidance_timesteps, cfg_star_rescale = config.cfg_star_rescale, skip_layer_strategy = None, skip_block_list=config.skip_block_list).images
88138
images = pipeline(
89139
height=height_padded,
90140
width=width_padded,
91141
num_frames=num_frames_padded,
142+
is_video=True,
92143
output_type="pt",
93144
generator=generator,
94145
config=config,

src/maxdiffusion/models/ltx_video/autoencoders/__init__.py

Whitespace-only changes.

src/maxdiffusion/models/ltx_video/autoencoders/causal_conv3d.py

Lines changed: 49 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -5,59 +5,54 @@
55

66

77
class CausalConv3d(nn.Module):
    """3D convolution that is causal along the time axis.

    Spatial (H, W) dimensions get symmetric "same" padding inside the
    underlying ``nn.Conv3d``; temporal padding is NOT done by the conv.
    Instead, ``forward`` replicates edge frames explicitly, so with
    ``causal=True`` no output frame ever depends on a future input frame.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size: int = 3,
        stride: Union[int, Tuple[int]] = 1,
        dilation: int = 1,
        groups: int = 1,
        spatial_padding_mode: str = "zeros",
        **kwargs,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        # Cubic kernel: same extent on (time, height, width).
        k_t = k_h = k_w = kernel_size
        self.time_kernel_size = k_t

        # Conv3d pads only spatially; temporal padding happens in forward().
        spatial_padding = (0, k_h // 2, k_w // 2)

        self.conv = nn.Conv3d(
            in_channels,
            out_channels,
            (k_t, k_h, k_w),
            stride=stride,
            dilation=(dilation, 1, 1),  # dilation is applied along time only
            padding=spatial_padding,
            padding_mode=spatial_padding_mode,
            groups=groups,
        )

    def forward(self, x, causal: bool = True):
        """Convolve ``x`` (N, C, T, H, W) with explicit temporal padding.

        causal=True: the first frame is replicated ``k_t - 1`` times in
        front, so each output frame sees only current/past frames.
        causal=False: both temporal edges are replicated ``(k_t - 1) // 2``
        times (centered padding).
        """
        extra_frames = self.time_kernel_size - 1
        if causal:
            head = x[:, :, :1, :, :].repeat((1, 1, extra_frames, 1, 1))
            x = torch.concatenate((head, x), dim=2)
        else:
            head = x[:, :, :1, :, :].repeat((1, 1, extra_frames // 2, 1, 1))
            tail = x[:, :, -1:, :, :].repeat((1, 1, extra_frames // 2, 1, 1))
            x = torch.concatenate((head, x, tail), dim=2)
        return self.conv(x)

    @property
    def weight(self):
        # Expose the underlying conv kernel for callers that inspect weights.
        return self.conv.weight

0 commit comments

Comments
 (0)