Support Wan transformers for nnx.scan.

entrpn · entrpn · commit 34968e059ee2 · 2025-07-23T20:18:57.000Z
diff --git a/src/maxdiffusion/data_preprocessing/wan_pusav1_to_tfrecords.py b/src/maxdiffusion/data_preprocessing/wan_pusav1_to_tfrecords.py
@@ -101,7 +101,7 @@ def generate_dataset(config):
       video_name = row[0]
       pth_path = os.path.join(config.train_data_dir,"train", f"{video_name}.tensors.pth")
       loaded_state_dict = torch.load(pth_path, map_location=torch.device('cpu'))
-      prompt_embeds = loaded_state_dict["prompt_emb"]["context"]
+      prompt_embeds = loaded_state_dict["prompt_emb"]["context"].squeeze()
       latent = loaded_state_dict["latents"]
       
       # Format we want(Batch, channels, Frames, Height, Width)
diff --git a/src/maxdiffusion/models/wan/wan_utils.py b/src/maxdiffusion/models/wan/wan_utils.py
@@ -83,9 +83,20 @@ def load_fusionx_transformer(pretrained_model_name_or_path: str, eval_shapes: di
 
         pt_tuple_key = tuple(renamed_pt_key.split("."))
 
+        if "blocks" in pt_tuple_key:
+          new_key = ("blocks",) + pt_tuple_key[2:]
+          block_index = int(pt_tuple_key[1])
+          pt_tuple_key = new_key
         flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict, model_type=WAN_MODEL)
         flax_key = rename_for_nnx(flax_key)
         flax_key = _tuple_str_to_int(flax_key)
+
+        if "blocks" in flax_key:
+          if flax_key in flax_state_dict:
+            new_tensor = flax_state_dict[flax_key]
+          else:
+            new_tensor = jnp.zeros((40,) + flax_tensor.shape)
+          flax_tensor = new_tensor.at[block_index].set(flax_tensor)
         flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
       validate_flax_state_dict(eval_shapes, flax_state_dict)
       flax_state_dict = unflatten_dict(flax_state_dict)
@@ -118,9 +129,21 @@ def load_causvid_transformer(pretrained_model_name_or_path: str, eval_shapes: di
 
         pt_tuple_key = tuple(renamed_pt_key.split("."))
 
+        if "blocks" in pt_tuple_key:
+          new_key = ("blocks",) + pt_tuple_key[2:]
+          block_index = int(pt_tuple_key[1])
+          pt_tuple_key = new_key
         flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict, model_type=WAN_MODEL)
         flax_key = rename_for_nnx(flax_key)
         flax_key = _tuple_str_to_int(flax_key)
+
+        
+        if "blocks" in flax_key:
+          if flax_key in flax_state_dict:
+            new_tensor = flax_state_dict[flax_key]
+          else:
+            new_tensor = jnp.zeros((40,) + flax_tensor.shape)
+          flax_tensor = new_tensor.at[block_index].set(flax_tensor)
         flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
       validate_flax_state_dict(eval_shapes, flax_state_dict)
       flax_state_dict = unflatten_dict(flax_state_dict)
diff --git a/src/maxdiffusion/trainers/wan_trainer.py b/src/maxdiffusion/trainers/wan_trainer.py
@@ -229,8 +229,6 @@ def step_optimizer(graphdef, state, scheduler, scheduler_state, data, rng, confi
   def loss_fn(model):
     latents = data["latents"].astype(config.weights_dtype)
     encoder_hidden_states = data["encoder_hidden_states"].astype(config.weights_dtype)
-    # TODO - fix tf record conversion.
-    encoder_hidden_states = jax.numpy.squeeze(encoder_hidden_states, axis=1)
     bsz = latents.shape[0]
     timesteps = jax.random.randint(
         timestep_rng,