Commit e9eb4ca

wip - adds trainer and attn changes.
1 parent 220f24b commit e9eb4ca

3 files changed

Lines changed: 15 additions & 11 deletions


src/maxdiffusion/data_preprocessing/wan_pusav1_to_tfrecords.py

Lines changed: 5 additions & 3 deletions
@@ -103,9 +103,11 @@ def generate_dataset(config):
     loaded_state_dict = torch.load(pth_path, map_location=torch.device('cpu'))
     prompt_embeds = loaded_state_dict["prompt_emb"]["context"]
     latent = loaded_state_dict["latents"]
-    # Format we want(4, 16, 1, 64, 64)
-    latent = jnp.array(latent.float().numpy(), dtype=config.weights_dtype)
-    prompt_embeds = jnp.array(prompt_embeds.float().numpy(), dtype=config.weights_dtype)
+
+    # Format we want (Batch, Channels, Frames, Height, Width)
+    # Save them as float32 because numpy cannot read bfloat16.
+    latent = jnp.array(latent.float().numpy(), dtype=jnp.float32)
+    prompt_embeds = jnp.array(prompt_embeds.float().numpy(), dtype=jnp.float32)
     writer.write(create_example(latent, prompt_embeds))
     shard_record_count += 1
     global_record_count += 1
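
Note: the change above stops serializing the latents and prompt embeddings in config.weights_dtype and always writes float32, because plain NumPy (which the TFRecord path goes through) has no native bfloat16 dtype; the cast back to the compute dtype now happens in the trainer (see wan_trainer.py below). The following is a rough sketch of that round trip under the assumption of raw-bytes features; the helper names and shapes are illustrative, not the repo's actual create_example/reader API.

import numpy as np
import tensorflow as tf
import jax.numpy as jnp

def _bytes_feature(arr):
  # Store the raw float32 buffer; shapes are assumed fixed by convention.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[arr.tobytes()]))

def make_example(latent, prompt_embeds):
  # float32 on disk because NumPy cannot represent bfloat16.
  return tf.train.Example(features=tf.train.Features(feature={
      "latents": _bytes_feature(np.asarray(latent, dtype=np.float32)),
      "encoder_hidden_states": _bytes_feature(np.asarray(prompt_embeds, dtype=np.float32)),
  }))

def parse_example(record, latent_shape, embed_shape):
  spec = {
      "latents": tf.io.FixedLenFeature([], tf.string),
      "encoder_hidden_states": tf.io.FixedLenFeature([], tf.string),
  }
  parsed = tf.io.parse_single_example(record, spec)
  latents = tf.reshape(tf.io.decode_raw(parsed["latents"], tf.float32), latent_shape)
  embeds = tf.reshape(tf.io.decode_raw(parsed["encoder_hidden_states"], tf.float32), embed_shape)
  return latents, embeds

# At train time the float32 tensors are cast back to the compute dtype, e.g.
# jnp.asarray(latents).astype(jnp.bfloat16), mirroring the .astype(config.weights_dtype)
# added in the trainer below.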

src/maxdiffusion/models/attention_flax.py

Lines changed: 2 additions & 2 deletions
@@ -380,7 +380,7 @@ def _apply_attention(
     )
   else:
     can_use_flash_attention = True
-
+    can_use_flash_attention=True
   if attention_kernel == "dot_product" or use_memory_efficient_attention or not can_use_flash_attention:
     return _apply_attention_dot(
         query, key, value, dtype, heads, dim_head, scale, split_head_dim, float32_qk_product, use_memory_efficient_attention

@@ -509,7 +509,7 @@ def __init__(
     heads: int,
     dim_head: int,
     use_memory_efficient_attention: bool = False,
-    split_head_dim: bool = False,
+    split_head_dim: bool = True,
     float32_qk_product: bool = True,
     axis_names_q: AxisNames = (BATCH, HEAD, LENGTH, D_KV),
     axis_names_kv: AxisNames = (BATCH, HEAD, KV_LENGTH, D_KV),
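
Two things change in this file: the wip line that re-assigns can_use_flash_attention=True after the existing check, and the default of split_head_dim flipping from False to True. Roughly, split_head_dim chooses between folding the attention heads into the batch dimension before the matmul and keeping an explicit heads axis that is contracted with einsum; the explicit layout is usually easier to shard and annotate on TPU. The toy sketch below only illustrates the two layouts and is not the repo's attention code.

import jax
import jax.numpy as jnp

batch, seq_len, heads, dim_head = 2, 128, 8, 64
q = jax.random.normal(jax.random.key(0), (batch, seq_len, heads * dim_head))
k = jax.random.normal(jax.random.key(1), (batch, seq_len, heads * dim_head))

# split_head_dim=False style: fold heads into the leading (batch) dimension.
def fold(x):
  x = x.reshape(batch, seq_len, heads, dim_head)
  return jnp.transpose(x, (0, 2, 1, 3)).reshape(batch * heads, seq_len, dim_head)

scores_folded = jnp.einsum("bqd,bkd->bqk", fold(q), fold(k))      # (batch*heads, seq, seq)

# split_head_dim=True style: keep an explicit heads axis and contract over it.
q_split = q.reshape(batch, seq_len, heads, dim_head)
k_split = k.reshape(batch, seq_len, heads, dim_head)
scores_split = jnp.einsum("bqhd,bkhd->bhqk", q_split, k_split)    # (batch, heads, seq, seq)

# Same numbers, different layout.
assert jnp.allclose(scores_folded.reshape(batch, heads, seq_len, seq_len), scores_split, atol=1e-4)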

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 8 additions & 6 deletions
@@ -165,7 +165,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, data_itera

     state = state.to_pure_dict()
     p_train_step = jax.jit(
-        functools.partial(train_step, scheduler=pipeline.scheduler),
+        functools.partial(train_step, scheduler=pipeline.scheduler, config=self.config),
         donate_argnums=(0,),
     )
     rng = jax.random.key(self.config.seed)

@@ -219,16 +219,18 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, data_itera
     return pipeline


-def train_step(state, graphdef, scheduler_state, data, rng, scheduler):
-  return step_optimizer(graphdef, state, scheduler, scheduler_state, data, rng)
+def train_step(state, graphdef, scheduler_state, data, rng, scheduler, config):
+  return step_optimizer(graphdef, state, scheduler, scheduler_state, data, rng, config)


-def step_optimizer(graphdef, state, scheduler, scheduler_state, data, rng):
+def step_optimizer(graphdef, state, scheduler, scheduler_state, data, rng, config):
   _, new_rng, timestep_rng = jax.random.split(rng, num=3)

   def loss_fn(model):
-    latents = data["latents"]
-    encoder_hidden_states = data["encoder_hidden_states"]
+    latents = data["latents"].astype(config.weights_dtype)
+    encoder_hidden_states = data["encoder_hidden_states"].astype(config.weights_dtype)
+    # TODO - fix tf record conversion.
+    encoder_hidden_states = jax.numpy.squeeze(encoder_hidden_states, axis=1)
     bsz = latents.shape[0]
     timesteps = jax.random.randint(
         timestep_rng,
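
The trainer change threads the run config into the jitted step by binding it with functools.partial. Because config is bound at trace time rather than passed as a traced array, donate_argnums=(0,) still points at the first positional argument of the wrapped callable (the state), and inside loss_fn the float32 tensors read from the TFRecords are cast back to config.weights_dtype; the squeeze works around the extra singleton axis the current TFRecord conversion leaves on encoder_hidden_states (hence the TODO). Below is a minimal, self-contained sketch of the same partial-plus-jit pattern; the model, optimizer, and config here are stand-ins, not the repo's objects.

import functools
from types import SimpleNamespace

import jax
import jax.numpy as jnp
import optax

config = SimpleNamespace(weights_dtype=jnp.bfloat16, learning_rate=1e-4)
tx = optax.sgd(config.learning_rate)

def train_step(params, opt_state, batch, config):
  # Data arrives as float32 from the TFRecords and is cast to the compute dtype,
  # mirroring data["latents"].astype(config.weights_dtype) in the real trainer.
  x = batch["x"].astype(config.weights_dtype)

  def loss_fn(p):
    return jnp.mean((x * p["w"]) ** 2)

  loss, grads = jax.value_and_grad(loss_fn)(params)
  updates, opt_state = tx.update(grads, opt_state, params)
  return optax.apply_updates(params, updates), opt_state, loss

# config is baked in via the partial; donate_argnums=(0,) still refers to the first
# positional argument of the wrapped function (params here, state in the trainer).
p_train_step = jax.jit(functools.partial(train_step, config=config), donate_argnums=(0,))

params = {"w": jnp.ones((4,), jnp.float32)}
opt_state = tx.init(params)
batch = {"x": jnp.ones((2, 4), jnp.float32)}
params, opt_state, loss = p_train_step(params, opt_state, batch)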
