Add mixed precision for better video generation.

entrpn · entrpn · commit c95fc1a81d3d · 2025-10-15T17:33:07.000Z
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -332,33 +332,33 @@ def __call__(
       rngs: nnx.Rngs = None,
   ):
     shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
-        (self.adaln_scale_shift_table + temb), 6, axis=1
+        (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
     )
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
     encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", None))
 
     # 1. Self-attention
-    norm_hidden_states = (self.norm1(hidden_states) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
+    norm_hidden_states = (self.norm1(hidden_states.astype(jnp.float32)) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
     attn_output = self.attn1(
         hidden_states=norm_hidden_states,
         encoder_hidden_states=norm_hidden_states,
         rotary_emb=rotary_emb,
         deterministic=deterministic,
         rngs=rngs,
     )
-    hidden_states = (hidden_states + attn_output * gate_msa).astype(hidden_states.dtype)
+    hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
 
     # 2. Cross-attention
-    norm_hidden_states = self.norm2(hidden_states)
+    norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32)).astype(hidden_states.dtype)
     attn_output = self.attn2(
         hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs
     )
     hidden_states = hidden_states + attn_output
 
     # 3. Feed-forward
-    norm_hidden_states = (self.norm3(hidden_states) * (1 + c_scale_msa) + c_shift_msa).astype(hidden_states.dtype)
+    norm_hidden_states = (self.norm3(hidden_states.astype(jnp.float32)) * (1 + c_scale_msa) + c_shift_msa).astype(hidden_states.dtype)
     ff_output = self.ffn(norm_hidden_states, deterministic=deterministic, rngs=rngs)
-    hidden_states = (hidden_states + ff_output * c_gate_msa).astype(hidden_states.dtype)
+    hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(hidden_states.dtype)
     return hidden_states
 
 
@@ -526,7 +526,7 @@ def scan_fn(carry, block):
 
     shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
 
-    hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).astype(hidden_states.dtype)
+    hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)
     hidden_states = self.proj_out(hidden_states)
 
     hidden_states = hidden_states.reshape(
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -39,7 +39,28 @@
 import torch
 import qwix
 
-
+def cast_with_exclusion(path, x, dtype_to_cast):
+  """Cast a parameter leaf to `dtype_to_cast` unless its path is precision-sensitive.
+
+  Leaves whose tree path contains "norm", "condition_embedder" or
+  "scale_shift_table" (case-insensitive) are cast to float32 instead.
+
+  Args:
+    path: key path of the leaf, as passed by `jax.tree_util.tree_map_with_path`.
+    x: array leaf; must support `.astype`.
+    dtype_to_cast: dtype applied to every leaf that is not excluded.
+
+  Returns:
+    `x` cast to float32 (excluded paths) or to `dtype_to_cast` (all others).
+  """
+  exclusion_keywords = ("norm", "condition_embedder", "scale_shift_table")
+  # DictKey is unwrapped to its raw key; any other key type falls back to str().
+  path_str = ".".join(str(k.key) if isinstance(k, jax.tree_util.DictKey) else str(k) for k in path)
+  lowered_path = path_str.lower()
+  if any(keyword in lowered_path for keyword in exclusion_keywords):
+    return x.astype(jnp.float32)
+  return x.astype(dtype_to_cast)
+  
 def basic_clean(text):
   if is_ftfy_available():
     import ftfy
@@ -113,7 +134,11 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
     params = load_wan_transformer(
         config.wan_transformer_pretrained_model_name_or_path, params, "cpu", num_layers=wan_config["num_layers"]
     )
-  params = jax.tree_util.tree_map(lambda x: x.astype(config.weights_dtype), params)
+  
+  params = jax.tree_util.tree_map_with_path(
+    lambda path, x: cast_with_exclusion(path, x, dtype_to_cast=config.weights_dtype),
+    params
+  )
   for path, val in flax.traverse_util.flatten_dict(params).items():
     if restored_checkpoint:
       path = path[:-1]
diff --git a/src/maxdiffusion/schedulers/scheduling_unipc_multistep_flax.py b/src/maxdiffusion/schedulers/scheduling_unipc_multistep_flax.py
@@ -674,6 +674,10 @@ def step(
     Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
     the multistep UniPC.
     """
+
+    original_dtype = sample.dtype
+    sample = sample.astype(jnp.float32)
+
     if state.timesteps is None:
       raise ValueError("Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler")