Skip to content

Commit 634591b

Browse files
committed
save now
1 parent 7af151a commit 634591b

22 files changed

Lines changed: 5953 additions & 21 deletions

src/maxdiffusion/configs/ltx_video.yml

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ sampler: "from_checkpoint"
2424

2525
# Generation parameters
2626
pipeline_type: multi-scale
27-
prompt: ["A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie.", "A man walks towards a window, looks out, and then turns around. He has short, dark hair, dark skin, and is wearing a brown coat over a red and gray scarf. He walks from left to right towards a window, his gaze fixed on something outside. The camera follows him from behind at a medium distance. The room is brightly lit, with white walls and a large window covered by a white curtain. As he approaches the window, he turns his head slightly to the left, then back to the right. He then turns his entire body to the right, facing the window. The camera remains stationary as he stands in front of the window. The scene is captured in real-life footage."]
27+
prompt: "A man walks towards a window, looks out, and then turns around. He has short, dark hair, dark skin, and is wearing a brown coat over a red and gray scarf. He walks from left to right towards a window, his gaze fixed on something outside. The camera follows him from behind at a medium distance. The room is brightly lit, with white walls and a large window covered by a white curtain. As he approaches the window, he turns his head slightly to the left, then back to the right. He then turns his entire body to the right, facing the window. The camera remains stationary as he stands in front of the window. The scene is captured in real-life footage."
2828
height: 512
2929
width: 512
3030
num_frames: 88 #344
@@ -68,34 +68,29 @@ second_pass:
6868
skip_final_inference_steps: 0
6969
cfg_star_rescale: True
7070

71-
#Parallelism
72-
mesh_axes: ['data', 'fsdp', 'tensor', 'fsdp_transpose', 'expert', 'tensor_transpose', 'tensor_sequence', 'sequence']
71+
# Parallelism
72+
mesh_axes: ['data', 'fsdp', 'tensor']
7373
logical_axis_rules: [
7474
['batch', 'data'],
75+
['activation_heads', 'fsdp'],
7576
['activation_batch', ['data','fsdp']],
76-
['activation_heads', 'tensor'],
7777
['activation_kv', 'tensor'],
7878
['mlp','tensor'],
7979
['embed','fsdp'],
8080
['heads', 'tensor'],
81+
['norm', 'fsdp'],
8182
['conv_batch', ['data','fsdp']],
8283
['out_channels', 'tensor'],
8384
['conv_out', 'fsdp'],
85+
['conv_in', 'fsdp']
8486
]
85-
data_sharding: [['data', 'fsdp', 'tensor', 'fsdp_transpose', 'expert', 'tensor_transpose', 'tensor_sequence', 'sequence']]
87+
data_sharding: [['data', 'fsdp', 'tensor']]
8688
dcn_data_parallelism: 1 # NOTE: fixed to 1 here; the auto-sharded (-1) DCN axis is dcn_fsdp_parallelism below
8789
dcn_fsdp_parallelism: -1
8890
dcn_tensor_parallelism: 1
89-
90-
ici_data_parallelism: -1
91-
ici_fsdp_parallelism: 1 # recommended ICI axis to be auto-sharded
91+
ici_data_parallelism: 1
92+
ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded
9293
ici_tensor_parallelism: 1
93-
ici_fsdp_transpose_parallelism: 1
94-
ici_sequence_parallelism: 1
95-
ici_tensor_transpose_parallelism: 1
96-
ici_expert_parallelism: 1
97-
ici_sequence_parallelism: 1
98-
9994

10095

10196

src/maxdiffusion/models/ltx_video/autoencoders/__init__.py

Whitespace-only changes.
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from typing import Tuple, Union
2+
3+
import torch
4+
import torch.nn as nn
5+
6+
7+
class CausalConv3d(nn.Module):
    """3D convolution whose time axis is padded by replicating edge frames.

    Spatial (height/width) padding is delegated to the wrapped ``nn.Conv3d``
    using ``spatial_padding_mode``. The wrapped conv applies no temporal
    padding of its own; ``forward`` prepends (and, in non-causal mode, also
    appends) copies of the first/last frame before running the convolution.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Edge length of the cubic kernel (time, height, width).
        stride: Stride forwarded unchanged to ``nn.Conv3d``.
        dilation: Dilation applied along the time axis only; spatial
            dilation is fixed to 1.
        groups: Number of groups forwarded to ``nn.Conv3d``.
        spatial_padding_mode: ``padding_mode`` forwarded to ``nn.Conv3d``.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size: int = 3,
        stride: Union[int, Tuple[int, int, int]] = 1,
        dilation: int = 1,
        groups: int = 1,
        spatial_padding_mode: str = "zeros",
        **kwargs,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        kernel_size = (kernel_size, kernel_size, kernel_size)
        self.time_kernel_size = kernel_size[0]

        # Only the temporal axis is dilated.
        dilation = (dilation, 1, 1)

        # "Same"-style spatial padding; the time axis is padded manually in
        # forward(), so the conv itself gets 0 there.
        height_pad = kernel_size[1] // 2
        width_pad = kernel_size[2] // 2
        padding = (0, height_pad, width_pad)

        self.conv = nn.Conv3d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding,
            padding_mode=spatial_padding_mode,
            groups=groups,
        )

    def forward(self, x, causal: bool = True):
        """Pad ``x`` along the time axis (dim 2) and apply the convolution.

        Args:
            x: 5-D tensor; dim 2 is treated as time.
            causal: If True, all temporal padding is placed before the first
                frame (copies of the first frame). If False, the padding is
                split between the front (first-frame copies) and the back
                (last-frame copies).

        Returns:
            The output of the wrapped ``nn.Conv3d`` on the padded input.
        """
        # The temporal receptive field spans dilation * (kernel - 1) + 1
        # frames, so the padding has to scale with the temporal dilation.
        time_dilation = self.conv.dilation[0]
        total_pad = time_dilation * (self.time_kernel_size - 1)

        if causal:
            front_pad, back_pad = total_pad, 0
        else:
            front_pad = back_pad = total_pad // 2

        segments = []
        if front_pad > 0:
            segments.append(x[:, :, :1, :, :].repeat((1, 1, front_pad, 1, 1)))
        segments.append(x)
        if back_pad > 0:
            segments.append(x[:, :, -1:, :, :].repeat((1, 1, back_pad, 1, 1)))

        if len(segments) > 1:
            x = torch.cat(segments, dim=2)
        return self.conv(x)

    @property
    def weight(self):
        """Weight parameter of the wrapped ``nn.Conv3d``."""
        return self.conv.weight

0 commit comments

Comments
 (0)