@@ -25,15 +25,13 @@ class Transformer3DModel(nn.Module):
     only_cross_attention: bool = False
     double_self_attention: bool = False
     upcast_attention: bool = False
-    # 'single_scale_shift' or 'single_scale'
-    adaptive_norm: str = "single_scale_shift"
+    adaptive_norm: str = "single_scale_shift"  # 'single_scale_shift' or 'single_scale'
     standardization_norm: str = "layer_norm"  # 'layer_norm' or 'rms_norm'
     norm_elementwise_affine: bool = True
     norm_eps: float = 1e-5
     attention_type: str = "default"
     caption_channels: int = None
-    # if True uses the TPU attention offload ('flash attention')
-    use_tpu_flash_attention: bool = True
+    use_tpu_flash_attention: bool = True  # if True uses the TPU attention offload ('flash attention')
     qk_norm: Optional[str] = None
     positional_embedding_type: str = "rope"
     positional_embedding_theta: Optional[float] = None
@@ -98,7 +96,7 @@ def scale_shift_table_init(key):
         self.transformer_blocks = RepeatableLayer(
             RemattedBasicTransformerBlock,
             num_layers=self.num_layers,
-            module_init_kwargs=dict(
+            module_init_kwargs=dict(  # noqa C408
                 dim=self.inner_dim,
                 num_attention_heads=self.num_attention_heads,
                 attention_head_dim=self.attention_head_dim,
@@ -139,46 +137,30 @@ def scale_shift_table_init(key):
             matmul_precision=self.matmul_precision,
         )

-    def init_weights(self, key, batch_size, text_tokens, num_tokens, features, eval_only=True):
-
-        # bookkeeping, for convenient changes later
-        latents_shape = (batch_size, num_tokens, features)
-        fractional_cords_shape = (batch_size, 3, num_tokens)
-        prompt_embeds_shape = (batch_size, text_tokens, features)
-        noise_cond_shape = (batch_size, 1)
-        latents_dtype = jnp.bfloat16
-        fractional_coords_dtype = jnp.bfloat16
-        prompt_embeds_dtype = jnp.bfloat16
-        noise_cond_dtype = jnp.bfloat16
-
-        # initialize to random
-        key, split_key = jax.random.split(key)
-        prompt_embeds = jax.random.normal(split_key, shape=prompt_embeds_shape, dtype=latents_dtype)
-        key, split_key = jax.random.split(key)
-        fractional_coords = jax.random.normal(split_key, shape=fractional_cords_shape, dtype=fractional_coords_dtype)
-        key, split_key = jax.random.split(key)
-        latents = jax.random.normal(split_key, shape=latents_shape, dtype=prompt_embeds_dtype)
-        key, split_key = jax.random.split(key)
-        noise_cond = jax.random.normal(split_key, shape=noise_cond_shape, dtype=noise_cond_dtype)
-
-        key, split_key = jax.random.split(key)
+    def init_weights(self, in_channels, key, caption_channels, eval_only=True):
+        example_inputs = {}
+        batch_size, num_tokens = 4, 256
+        input_shapes = {
+            "hidden_states": (batch_size, num_tokens, in_channels),
+            "indices_grid": (batch_size, 3, num_tokens),
+            "encoder_hidden_states": (batch_size, 128, caption_channels),
+            "timestep": (batch_size, 256),
+            "segment_ids": (batch_size, 256),
+            "encoder_attention_segment_ids": (batch_size, 128),
+        }
+        for name, shape in input_shapes.items():
+            example_inputs[name] = jnp.ones(
+                shape, dtype=jnp.float32 if name not in ["attention_mask", "encoder_attention_mask"] else jnp.bool
+            )
+
         if eval_only:
             return jax.eval_shape(
                 self.init,
-                rngs={"params": split_key},
-                hidden_states=latents,
-                indices_grid=fractional_coords,
-                encoder_hidden_states=prompt_embeds,
-                timestep=noise_cond,
+                key,
+                **example_inputs,
             )["params"]
         else:
-            return self.init(
-                rngs={"params": split_key},
-                hidden_states=latents,
-                indices_grid=fractional_coords,
-                encoder_hidden_states=prompt_embeds,
-                timestep=noise_cond,
-            )["params"]
+            return self.init(key, **example_inputs)["params"]

     def __call__(
         self,
@@ -271,8 +253,7 @@ def get_fractional_positions(self, indices_grid: jax.Array) -> jax.Array:
     @nn.compact
     def __call__(self, indices_grid: jax.Array) -> Tuple[jax.Array, jax.Array]:
         source_dtype = indices_grid.dtype
-        # We need full precision in the freqs_cis computation.
-        dtype = jnp.float32
+        dtype = jnp.float32  # We need full precision in the freqs_cis computation.
         dim = self.inner_dim
         theta = self.positional_embedding_theta

@@ -294,8 +275,7 @@ def __call__(self, indices_grid: jax.Array) -> Tuple[jax.Array, jax.Array]:
         indices = indices * jnp.pi / 2

         freqs = (indices * (jnp.expand_dims(fractional_positions, axis=-1) * 2 - 1)).swapaxes(-1, -2)
-        # Flatten along axis 2
-        freqs = freqs.reshape(freqs.shape[0], freqs.shape[1], -1)
+        freqs = freqs.reshape(freqs.shape[0], freqs.shape[1], -1)  # Flatten along axis 2

         cos_freq = jnp.cos(freqs).repeat(2, axis=-1)
         sin_freq = jnp.sin(freqs).repeat(2, axis=-1)
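The core of this refactor is that init_weights now fabricates its dummy inputs internally and, when eval_only=True, traces self.init through jax.eval_shape, so parameter shapes are recovered without allocating or initializing any real weights. A minimal standalone sketch of that pattern, assuming only Flax and JAX; the TinyBlock module and the free-standing init_weights helper below are illustrative stand-ins, not code from this repository:

import jax
import jax.numpy as jnp
import flax.linen as nn


class TinyBlock(nn.Module):
    # Illustrative stand-in for Transformer3DModel; not from this repository.
    features: int = 32

    @nn.compact
    def __call__(self, hidden_states):
        return nn.Dense(self.features)(hidden_states)


def init_weights(model, key, in_channels, eval_only=True):
    # Same pattern as the diff: build dummy inputs from fixed shapes, then
    # either initialize for real or only trace shapes with jax.eval_shape.
    example_inputs = {"hidden_states": jnp.ones((4, 256, in_channels), dtype=jnp.float32)}
    if eval_only:
        # jax.eval_shape returns jax.ShapeDtypeStruct leaves: the shapes and
        # dtypes of the would-be parameters, with no device memory allocated.
        return jax.eval_shape(model.init, key, **example_inputs)["params"]
    return model.init(key, **example_inputs)["params"]


shapes = init_weights(TinyBlock(), jax.random.PRNGKey(0), in_channels=128)
print(jax.tree_util.tree_map(lambda x: x.shape, shapes))

Because eval_shape never materializes arrays, this also makes eval_only=True cheap enough to call on a host that could not hold the full parameter set, which is presumably why the diff routes the default path through it.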