Skip to content

Commit a56787c

Browse files
committed
config file added
1 parent bb7e8f0 commit a56787c

2 files changed

Lines changed: 107 additions & 1 deletion

File tree

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#hardware
2+
hardware: 'tpu'
3+
skip_jax_distributed_system: False
4+
attention: 'flash'
5+
attention_sharding_uniform: True
6+
7+
jax_cache_dir: ''
8+
weights_dtype: 'bfloat16'
9+
activations_dtype: 'bfloat16'
10+
11+
12+
run_name: ''
13+
output_dir: ''
14+
config_path: ''
15+
save_config_to_gcs: False
16+
17+
#Checkpoints
18+
text_encoder_model_name_or_path: "ariG23498/t5-v1-1-xxl-flax"
19+
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
20+
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
21+
frame_rate: 30
22+
max_sequence_length: 512
23+
sampler: "from_checkpoint"
24+
25+
# Generation parameters
26+
pipeline_type: multi-scale
27+
prompt: "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie."
28+
#negative_prompt: "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
29+
height: 512
30+
width: 512
31+
num_frames: 88
32+
flow_shift: 5.0
33+
downscale_factor: 0.6666666
34+
spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
35+
prompt_enhancement_words_threshold: 120
36+
stg_mode: "attention_values"
37+
decode_timestep: 0.05
38+
decode_noise_scale: 0.025
39+
seed: 10
40+
conditioning_media_paths: None #["IMAGE_PATH"]  # NOTE(review): YAML parses bare `None` as the string "None", not a null — use `null` if a true null is intended; verify how the config loader handles this
41+
conditioning_start_frames: [0]
42+
43+
44+
first_pass:
45+
guidance_scale: [1, 1, 6, 8, 6, 1, 1]
46+
stg_scale: [0, 0, 4, 4, 4, 2, 1]
47+
rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
48+
guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
49+
skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
50+
num_inference_steps: 30
51+
skip_final_inference_steps: 3
52+
skip_initial_inference_steps: 0
53+
cfg_star_rescale: True
54+
55+
second_pass:
56+
guidance_scale: [1]
57+
stg_scale: [1]
58+
rescaling_scale: [1]
59+
guidance_timesteps: [1.0]
60+
skip_block_list: [27]
61+
num_inference_steps: 30
62+
skip_initial_inference_steps: 17
63+
skip_final_inference_steps: 0
64+
cfg_star_rescale: True
65+
66+
#parallelism
67+
mesh_axes: ['data', 'fsdp', 'context', 'tensor']
68+
logical_axis_rules: [
69+
['batch', 'data'],
70+
['activation_heads', 'fsdp'],
71+
['activation_batch', 'data'],
72+
['activation_kv', 'tensor'],
73+
['mlp','tensor'],
74+
['embed','fsdp'],
75+
['heads', 'tensor'],
76+
['norm', 'fsdp'],
77+
['conv_batch', ['data','fsdp']],
78+
['out_channels', 'tensor'],
79+
['conv_out', 'fsdp'],
80+
['conv_in', 'fsdp']
81+
]
82+
data_sharding: [['data', 'fsdp', 'context', 'tensor']]
83+
dcn_data_parallelism: 1 # set explicitly; -1 would mark this DCN axis for auto-sharding
84+
dcn_fsdp_parallelism: -1 # recommended DCN axis to be auto-sharded
85+
dcn_context_parallelism: 1
86+
dcn_tensor_parallelism: 1
87+
ici_data_parallelism: 1
88+
ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded
89+
ici_context_parallelism: 1
90+
ici_tensor_parallelism: 1
91+
92+
allow_split_physical_axes: False
93+
learning_rate_schedule_steps: -1
94+
max_train_steps: 500
95+
pretrained_model_name_or_path: ''
96+
unet_checkpoint: ''
97+
dataset_name: 'diffusers/pokemon-gpt4-captions'
98+
train_split: 'train'
99+
dataset_type: 'tf'
100+
cache_latents_text_encoder_outputs: True
101+
per_device_batch_size: 1
102+
compile_topology_num_slices: -1
103+
quantization_local_shard_count: -1
104+
use_qwix_quantization: False
105+
jit_initializers: True
106+
enable_single_replica_ckpt_restoring: False

src/maxdiffusion/tests/ltx_2_transformer_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def setUp(self):
2121
# Initialize config and mesh for sharding
2222
# using standard MaxDiffusion pattern
2323
pyconfig.initialize(
24-
[None, os.path.join(os.path.dirname(__file__), "..", "configs", "ltx_video.yml")],
24+
[None, os.path.join(os.path.dirname(__file__), "..", "configs", "ltx2_video.yml")],
2525
unittest=True,
2626
)
2727
self.config = pyconfig.config

0 commit comments

Comments
 (0)