Skip to content

Commit 8042df0

Browse files
committed
pipeline cleaned, licence added
1 parent 072982c commit 8042df0

23 files changed

Lines changed: 434 additions & 222 deletions

src/maxdiffusion/configs/ltx_video.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ activations_dtype: 'bfloat16'
88

99

1010
run_name: ''
11-
output_dir: 'ltx-video-output'
11+
output_dir: '/mnt/disks/diffusionproj'
1212
save_config_to_gcs: False
1313

1414
#Checkpoints
@@ -21,19 +21,19 @@ sampler: "from_checkpoint"
2121

2222
# Generation parameters
2323
pipeline_type: multi-scale
24-
prompt: "A woman with light skin, wearing a blue jacket and a black hat with a veil, looks down and to her right, then back up as she speaks; she has brown hair styled in an updo, light brown eyebrows, and is wearing a white collared shirt under her jacket; the camera remains stationary on her face as she speaks; the background is out of focus, but shows trees and people in period clothing; the scene is captured in real-life footage."
24+
prompt: "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie. "
25+
#negative_prompt: "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
2526
height: 512
2627
width: 512
27-
num_frames: 88 #344
28+
num_frames: 88
2829
flow_shift: 5.0
29-
fps: 24
3030
downscale_factor: 0.6666666
3131
spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
3232
prompt_enhancement_words_threshold: 120
3333
stg_mode: "attention_values"
3434
decode_timestep: 0.05
3535
decode_noise_scale: 0.025
36-
models_dir: "/mnt/disks/diffusionproj" #where safetensor file is
36+
seed: 10
3737

3838

3939
first_pass:

src/maxdiffusion/generate_ltx_video.py

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,28 @@
1+
"""
2+
Copyright 2025 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
117
import numpy as np
218
from absl import app
319
from typing import Sequence
420
from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXVideoPipeline
521
from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXMultiScalePipeline
622
from maxdiffusion import pyconfig
7-
from maxdiffusion.models.ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
8-
from huggingface_hub import hf_hub_download
923
import imageio
1024
from datetime import datetime
11-
1225
import os
13-
import torch
1426
import time
1527
from pathlib import Path
1628

@@ -28,9 +40,6 @@ def calculate_padding(
2840
pad_bottom = pad_height - pad_top # Handles odd padding
2941
pad_left = pad_width // 2
3042
pad_right = pad_width - pad_left # Handles odd padding
31-
32-
# Return padded tensor
33-
# Padding format is (left, right, top, bottom)
3443
padding = (pad_left, pad_right, pad_top, pad_bottom)
3544
return padding
3645

@@ -59,8 +68,6 @@ def convert_prompt_to_filename(text: str, max_len: int = 20) -> str:
5968
return "-".join(result)
6069

6170

62-
63-
6471
def get_unique_filename(
6572
base: str,
6673
ext: str,
@@ -70,9 +77,7 @@ def get_unique_filename(
7077
endswith=None,
7178
index_range=1000,
7279
) -> Path:
73-
base_filename = (
74-
f"{base}_{convert_prompt_to_filename(prompt, max_len=30)}_{resolution[0]}x{resolution[1]}x{resolution[2]}"
75-
)
80+
base_filename = f"{base}_{convert_prompt_to_filename(prompt, max_len=30)}_{resolution[0]}x{resolution[1]}x{resolution[2]}"
7681
for i in range(index_range):
7782
filename = dir / f"{base_filename}_{i}{endswith if endswith else ''}{ext}"
7883
if not os.path.exists(filename):
@@ -87,13 +92,23 @@ def run(config):
8792
padding = calculate_padding(config.height, config.width, height_padded, width_padded)
8893
prompt_enhancement_words_threshold = config.prompt_enhancement_words_threshold
8994
prompt_word_count = len(config.prompt.split())
90-
enhance_prompt = (
91-
prompt_enhancement_words_threshold > 0 and prompt_word_count < prompt_enhancement_words_threshold
92-
)
95+
enhance_prompt = prompt_enhancement_words_threshold > 0 and prompt_word_count < prompt_enhancement_words_threshold
9396

9497
pipeline = LTXVideoPipeline.from_pretrained(config, enhance_prompt=enhance_prompt)
95-
if config.pipeline_type == "multi-scale":
98+
if config.pipeline_type == "multi-scale":
9699
pipeline = LTXMultiScalePipeline(pipeline)
100+
# s0 = time.perf_counter()
101+
# images = pipeline(
102+
# height=height_padded,
103+
# width=width_padded,
104+
# num_frames=num_frames_padded,
105+
# is_video=True,
106+
# output_type="pt",
107+
# config=config,
108+
# enhance_prompt=enhance_prompt,
109+
# seed = config.seed
110+
# )
111+
# print("compile time: ", (time.perf_counter() - s0))
97112
s0 = time.perf_counter()
98113
images = pipeline(
99114
height=height_padded,
@@ -102,21 +117,11 @@ def run(config):
102117
is_video=True,
103118
output_type="pt",
104119
config=config,
105-
enhance_prompt = False
106-
)
107-
print("compile time: ", (time.perf_counter() - s0))
108-
s0 = time.perf_counter()
109-
images = pipeline(
110-
height=height_padded,
111-
width=width_padded,
112-
num_frames=num_frames_padded,
113-
is_video=True,
114-
output_type="pt",
115-
config=config,
116-
enhance_prompt = False
120+
enhance_prompt=enhance_prompt,
121+
seed=config.seed,
117122
)
118123
print("generation time: ", (time.perf_counter() - s0))
119-
124+
120125
(pad_left, pad_right, pad_top, pad_bottom) = padding
121126
pad_bottom = -pad_bottom
122127
pad_right = -pad_right
@@ -127,6 +132,7 @@ def run(config):
127132
images = images[:, :, : config.num_frames, pad_top:pad_bottom, pad_left:pad_right]
128133
output_dir = Path(f"outputs/{datetime.today().strftime('%Y-%m-%d')}")
129134
output_dir.mkdir(parents=True, exist_ok=True)
135+
130136
for i in range(images.shape[0]):
131137
# Index one video out of the batch (B, C, F, H, W -> C, F, H, W), then permute to F, H, W, C
132138
video_np = images[i].permute(1, 2, 3, 0).detach().float().numpy()
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main

src/maxdiffusion/models/ltx_video/autoencoders/causal_conv3d.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main
117
from typing import Tuple, Union
218

319
import torch

src/maxdiffusion/models/ltx_video/autoencoders/causal_video_autoencoder.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main
117
import json
218
import os
319
from functools import partial
@@ -218,11 +234,11 @@ def to_json_string(self) -> str:
218234
return json.dumps(self.config.__dict__)
219235

220236
def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
221-
if any([key.startswith("vae.") for key in state_dict.keys()]):
237+
if any([key.startswith("vae.") for key in state_dict.keys()]): # noqa: C419
222238
state_dict = {key.replace("vae.", ""): value for key, value in state_dict.items() if key.startswith("vae.")}
223239
ckpt_state_dict = {key: value for key, value in state_dict.items() if not key.startswith(PER_CHANNEL_STATISTICS_PREFIX)}
224240

225-
model_keys = set(name for name, _ in self.named_modules())
241+
model_keys = set(name for name, _ in self.named_modules()) # noqa: C401
226242

227243
key_mapping = {
228244
".resnets.": ".res_blocks.",

src/maxdiffusion/models/ltx_video/autoencoders/conv_nd_factory.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main
117
from typing import Tuple, Union
218

319
import torch

src/maxdiffusion/models/ltx_video/autoencoders/dual_conv3d.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main
117
import math
218
from typing import Tuple, Union
319

src/maxdiffusion/models/ltx_video/autoencoders/latent_upsampler.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main
117
from typing import Optional, Union
218
from pathlib import Path
319
import os

src/maxdiffusion/models/ltx_video/autoencoders/pixel_norm.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main
117
import torch
218
from torch import nn
319

src/maxdiffusion/models/ltx_video/autoencoders/pixel_shuffle.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright 2025 Lightricks Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This implementation is based on the Torch version available at:
16+
# https://github.com/Lightricks/LTX-Video/tree/main
117
import torch.nn as nn
218
from einops import rearrange
319

0 commit comments

Comments
 (0)