Commit f78d7e1

Add Wan-Animate inference pipeline, image processor, and transformer sharding
- wan_pipeline_animate.py: Full JAX/Flax port of the diffusers WanAnimatePipeline supporting both "animate" (pose+face) and "replace" (background+mask) modes, segmented inference with overlap conditioning, and optional CFG.
- image_processor.py: WanAnimateImageProcessor with letterbox resize (vs center-crop) and vae_scale_factor*spatial_patch_size-aligned dimensions for the reference character image.
- transformer_wan_animate.py: Added nnx.with_partitioning sharding annotations to patch_embedding, pose_patch_embedding, proj_out, scale_shift_table, and all FlaxWanAnimateFaceBlockCrossAttention projections; added nn.with_logical_constraint on the input hidden_states.
- wan_utils.py: Added load_wan_animate_transformer with motion-encoder-aware weight loading (skip weight→kernel rename and transpose for FlaxMotionConv2d/FlaxMotionLinear; map activation.bias→act_fn.bias for FusedLeakyReLU).
1 parent 94f9c21 commit f78d7e1

6 files changed

Lines changed: 1311 additions & 12 deletions
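
The new wan_pipeline_animate.py and image_processor.py sources are not included in the hunks below. As a rough sketch of the letterbox resize with vae_scale_factor * spatial_patch_size alignment described in the commit message (the function name, Pillow resampling choice, and default values here are illustrative assumptions, not the actual WanAnimateImageProcessor API):

```python
from PIL import Image, ImageOps


def letterbox_to_aligned(image: Image.Image, target_h: int, target_w: int,
                         vae_scale_factor: int = 8, spatial_patch_size: int = 2) -> Image.Image:
  """Resize with preserved aspect ratio, then pad (letterbox) to multiple-aligned dims."""
  multiple = vae_scale_factor * spatial_patch_size
  # Snap the requested size to the nearest multiple the VAE and patchifier can consume.
  target_h = (target_h // multiple) * multiple
  target_w = (target_w // multiple) * multiple
  # Scale so the whole reference image fits inside the target box (no cropping).
  scale = min(target_w / image.width, target_h / image.height)
  new_w, new_h = round(image.width * scale), round(image.height * scale)
  resized = image.resize((new_w, new_h), Image.LANCZOS)
  # Pad the remainder symmetrically: the letterbox alternative to center-crop.
  pad_w, pad_h = target_w - new_w, target_h - new_h
  return ImageOps.expand(resized, border=(pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2))
```
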

src/maxdiffusion/models/wan/transformers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -13,3 +13,5 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+
+from .transformer_wan_animate import NNXWanAnimateTransformer3DModel

src/maxdiffusion/models/wan/transformers/transformer_wan_animate.py

Lines changed: 91 additions & 12 deletions
@@ -19,6 +19,7 @@
 import math
 import jax
 import jax.numpy as jnp
+import flax.linen as nn
 from flax import nnx
 from .... import common_types
 from ...modeling_flax_utils import FlaxModelMixin
@@ -81,6 +82,13 @@ def __call__(self, x: jax.Array, channel_dim: int = 1) -> jax.Array:
 
 
 class FlaxMotionConv2d(nnx.Module):
+  """2-D convolution with EqualizedLR scaling and optional FusedLeakyReLU.
+
+  Weights are stored in PyTorch OIHW format (out, in, k, k) as raw nnx.Param
+  so that the weight-loading code in wan_utils.py can map them without
+  transposing. No sharding annotations are applied because this module is
+  part of the small motion encoder network.
+  """
 
   def __init__(
       self,
@@ -123,7 +131,7 @@ def __init__(
     self.blur_kernel = None
 
     key = rngs.params()
-    # Shape: (out_channels, in_channels, kernel, kernel) mapping PyTorch 'OIHW'
+    # Shape: (out_channels, in_channels, kernel, kernel) PyTorch OIHW format.
     self.weight = nnx.Param(jax.random.normal(key, (out_channels, in_channels, kernel_size, kernel_size), dtype=dtype))
     self.scale = 1.0 / math.sqrt(in_channels * kernel_size**2)
 
@@ -156,7 +164,7 @@ def __call__(self, x: jax.Array, channel_dim: int = 1) -> jax.Array:
         x,
         expanded_kernel,
         window_strides=(1, 1),
-        padding=[(pad_h, pad_h), (pad_w, pad_w)],  # Corrected Symmetric Padding
+        padding=[(pad_h, pad_h), (pad_w, pad_w)],
         dimension_numbers=("NCHW", "OIHW", "NCHW"),
         feature_group_count=self.in_channels,
     )
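
The two hunks above show the EqualizedLR idea: the raw OIHW weight is kept unit-scaled at init and multiplied by 1/sqrt(fan_in) at call time, then applied through jax.lax.conv_general_dilated with PyTorch-style dimension numbers. A minimal standalone sketch of that path (omitting the blur kernel, depthwise grouping, and FusedLeakyReLU that FlaxMotionConv2d also handles):

```python
import math

import jax
import jax.numpy as jnp


def equalized_conv2d(x: jax.Array, weight: jax.Array, pad: int) -> jax.Array:
  """x: (N, C_in, H, W); weight: raw OIHW parameter of shape (C_out, C_in, k, k)."""
  fan_in = weight.shape[1] * weight.shape[2] * weight.shape[3]
  scale = 1.0 / math.sqrt(fan_in)  # applied at call time, not baked into the stored weight
  return jax.lax.conv_general_dilated(
      x,
      weight * scale,
      window_strides=(1, 1),
      padding=[(pad, pad), (pad, pad)],
      dimension_numbers=("NCHW", "OIHW", "NCHW"),
  )


x = jnp.ones((1, 3, 8, 8))
w = jax.random.normal(jax.random.PRNGKey(0), (16, 3, 3, 3))
print(equalized_conv2d(x, w, pad=1).shape)  # (1, 16, 8, 8)
```
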
@@ -186,6 +194,11 @@ def __call__(self, x: jax.Array, channel_dim: int = 1) -> jax.Array:
 
 
 class FlaxMotionLinear(nnx.Module):
+  """Equalized-LR linear layer with optional FusedLeakyReLU.
+
+  Weights are stored in PyTorch (out, in) format as raw nnx.Param — same
+  reason as FlaxMotionConv2d. No sharding annotations needed (small layer).
+  """
 
   def __init__(
       self,
@@ -296,6 +309,11 @@ def __call__(self, x: jax.Array, channel_dim: int = 1) -> jax.Array:
 
 
 class FlaxWanAnimateMotionEncoder(nnx.Module):
+  """Encodes a face video frame into a motion vector.
+
+  All weights in this network are small (the largest is 32×512→16) so
+  sharding annotations are not applied.
+  """
 
   def __init__(
       self,
@@ -395,7 +413,6 @@ def __init__(
 
     self.act = jax.nn.silu
 
-    # Added explicit padding="VALID" to exactly mirror PyTorch's padding=0 default
     self.conv1_local = nnx.Conv(
         in_dim,
         hidden_dim * num_heads,
@@ -449,7 +466,15 @@ def __init__(
         dtype=dtype,
     )
 
-    self.out_proj = nnx.Linear(hidden_dim, out_dim, rngs=rngs, dtype=dtype)
+    # hidden_dim (mlp) → out_dim (embed): ("mlp", "embed")
+    self.out_proj = nnx.Linear(
+        hidden_dim,
+        out_dim,
+        rngs=rngs,
+        dtype=dtype,
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", "embed")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+    )
 
     self.padding_tokens = nnx.Param(jnp.zeros((1, 1, 1, out_dim), dtype=dtype))
 
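
nnx.with_partitioning wraps an initializer so that the created Param carries logical sharding metadata ("mlp", "embed", "heads", ...) alongside its value; nnx.get_partition_spec later turns that metadata into jax.sharding.PartitionSpec entries. A minimal sketch with illustrative sizes (not the module above):

```python
from flax import nnx

layer = nnx.Linear(
    512,
    1024,
    kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", "embed")),
    bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
    rngs=nnx.Rngs(0),
)

# The kernel Param now carries sharding metadata; get_partition_spec exposes it
# as PartitionSpec("mlp", "embed") for the kernel and PartitionSpec("embed") for the bias.
print(nnx.get_partition_spec(nnx.state(layer)))
```
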
@@ -510,11 +535,45 @@ def __init__(
     self.pre_norm_q = nnx.LayerNorm(dim, epsilon=eps, use_bias=False, use_scale=False, rngs=rngs, dtype=dtype)
     self.pre_norm_kv = nnx.LayerNorm(dim, epsilon=eps, use_bias=False, use_scale=False, rngs=rngs, dtype=dtype)
 
-    self.to_q = nnx.Linear(dim, self.inner_dim, use_bias=use_bias, rngs=rngs, dtype=dtype)
-    self.to_k = nnx.Linear(dim, self.kv_inner_dim, use_bias=use_bias, rngs=rngs, dtype=dtype)
-    self.to_v = nnx.Linear(dim, self.kv_inner_dim, use_bias=use_bias, rngs=rngs, dtype=dtype)
+    # embed → heads
+    self.to_q = nnx.Linear(
+        dim,
+        self.inner_dim,
+        use_bias=use_bias,
+        rngs=rngs,
+        dtype=dtype,
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("heads",)),
+    )
+    self.to_k = nnx.Linear(
+        dim,
+        self.kv_inner_dim,
+        use_bias=use_bias,
+        rngs=rngs,
+        dtype=dtype,
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("heads",)),
+    )
+    self.to_v = nnx.Linear(
+        dim,
+        self.kv_inner_dim,
+        use_bias=use_bias,
+        rngs=rngs,
+        dtype=dtype,
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("heads",)),
+    )
 
-    self.to_out = nnx.Linear(self.inner_dim, dim, use_bias=use_bias, rngs=rngs, dtype=dtype)
+    # heads → embed
+    self.to_out = nnx.Linear(
+        self.inner_dim,
+        dim,
+        use_bias=use_bias,
+        rngs=rngs,
+        dtype=dtype,
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("heads", "embed")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+    )
 
     self.norm_q = nnx.RMSNorm(dim_head, epsilon=eps, use_scale=True, rngs=rngs, dtype=dtype)
     self.norm_k = nnx.RMSNorm(dim_head, epsilon=eps, use_scale=True, rngs=rngs, dtype=dtype)
@@ -544,14 +603,14 @@ def __call__(
 
     query_S = query.shape[1]
 
-    # Prepare for attention by folding Time into the Batch dimension
+    # Fold Time into the Batch dimension for attention
     query = jnp.reshape(query, (B * T, query_S // T, self.heads, -1))
     key = jnp.reshape(key, (B * T, N, self.heads, -1))
     value = jnp.reshape(value, (B * T, N, self.heads, -1))
 
     attn_output = jax.nn.dot_product_attention(query, key, value)
 
-    # Collapse Time, Seq Length, and Heads straight back to (Batch, Total Sequence, Dim)
+    # Restore (Batch, Total Sequence, Dim)
     attn_output = jnp.reshape(attn_output, (B, query_S, -1))
 
     hidden_states = self.to_out(attn_output)
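
A worked shape example of the fold/unfold in this hunk, with small illustrative sizes (B batch, T face frames, S latent tokens per frame, N motion tokens per frame):

```python
import jax
import jax.numpy as jnp

B, T, S, N, heads, head_dim = 2, 4, 6, 5, 8, 16
dim = heads * head_dim

query = jnp.zeros((B, T * S, dim))        # per-frame queries concatenated along the sequence axis
key = value = jnp.zeros((B, T * N, dim))  # per-frame motion features

q = jnp.reshape(query, (B * T, S, heads, head_dim))  # fold time into batch
k = jnp.reshape(key, (B * T, N, heads, head_dim))
v = jnp.reshape(value, (B * T, N, heads, head_dim))

out = jax.nn.dot_product_attention(q, k, v)          # (B*T, S, heads, head_dim)
out = jnp.reshape(out, (B, T * S, dim))              # back to (batch, total sequence, dim)
print(out.shape)  # (2, 24, 128)
```
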
@@ -624,6 +683,8 @@ def __init__(
     self.gradient_checkpoint = GradientCheckpointType.from_str(remat_policy)
 
     self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+
+    # Patch embeddings — shard output (conv_out) axis across model parallelism.
     self.patch_embedding = nnx.Conv(
         in_channels,
         inner_dim,
@@ -632,6 +693,10 @@
         rngs=rngs,
         dtype=dtype,
         param_dtype=weights_dtype,
+        kernel_init=nnx.with_partitioning(
+            nnx.initializers.xavier_uniform(),
+            (None, None, None, None, "conv_out"),
+        ),
     )
     self.pose_patch_embedding = nnx.Conv(
         latent_channels,
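
The five-entry partition spec matches the kernel layout of a 3-D nnx.Conv, which stores its kernel as (k_t, k_h, k_w, in_features, out_features); only the output-feature axis carries the "conv_out" logical name. A quick check with illustrative sizes:

```python
from flax import nnx

conv = nnx.Conv(
    16,
    32,
    kernel_size=(1, 2, 2),
    kernel_init=nnx.with_partitioning(
        nnx.initializers.xavier_uniform(),
        (None, None, None, None, "conv_out"),
    ),
    rngs=nnx.Rngs(0),
)
print(conv.kernel.value.shape)  # (1, 2, 2, 16, 32); "conv_out" annotates the last axis
```
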
@@ -641,6 +706,10 @@
         rngs=rngs,
         dtype=dtype,
         param_dtype=weights_dtype,
+        kernel_init=nnx.with_partitioning(
+            nnx.initializers.xavier_uniform(),
+            (None, None, None, None, "conv_out"),
+        ),
     )
 
     self.condition_embedder = WanTimeTextImageEmbedding(
@@ -714,15 +783,22 @@ def __init__(
     self.face_adapter = nnx.List(face_adapters)
 
     self.norm_out = FP32LayerNorm(rngs=rngs, dim=inner_dim, eps=eps, elementwise_affine=False)
+
+    # Final projection — embed → output tokens.
     self.proj_out = nnx.Linear(
         rngs=rngs,
         in_features=inner_dim,
         out_features=out_channels * math.prod(patch_size),
         dtype=dtype,
         param_dtype=weights_dtype,
+        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("embed", None)),
     )
+
     key = rngs.params()
-    self.scale_shift_table = nnx.Param(jax.random.normal(key, (1, 2, inner_dim), dtype=dtype) / inner_dim**0.5)
+    self.scale_shift_table = nnx.Param(
+        jax.random.normal(key, (1, 2, inner_dim), dtype=dtype) / inner_dim**0.5,
+        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), (None, None, "embed")),
+    )
 
   def conditional_named_scope(self, name: str):
     return jax.named_scope(name) if self.enable_jax_named_scopes else contextlib.nullcontext()
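
For a raw nnx.Param there is no initializer to wrap; the usual way to attach the same kind of logical sharding metadata is the sharding keyword on the Param itself, which nnx.get_partition_spec also reads. A minimal sketch (the inner_dim of 1536 is illustrative):

```python
import jax.numpy as jnp
from flax import nnx


class Table(nnx.Module):

  def __init__(self):
    # Sharding metadata goes on the Param directly when there is no initializer to wrap.
    self.scale_shift_table = nnx.Param(jnp.zeros((1, 2, 1536)), sharding=(None, None, "embed"))


print(nnx.get_partition_spec(nnx.state(Table())))
```
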
@@ -747,6 +823,9 @@ def __call__(
           f"Pose frames + 1 ({pose_hidden_states.shape[2]} + 1) must equal hidden_states frames ({hidden_states.shape[2]})"
       )
 
+    # Constrain input to batch-sharded layout before any computation.
+    hidden_states = nn.with_logical_constraint(hidden_states, ("batch", None, None, None, None))
+
     batch_size, num_channels, num_frames, height, width = hidden_states.shape
     p_t, p_h, p_w = self.patch_size
     post_patch_num_frames = num_frames // p_t
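
nn.with_logical_constraint resolves the logical "batch" name through the active logical-axis rules and then calls jax.lax.with_sharding_constraint. Written directly against jax.sharding primitives, the constraint above is roughly equivalent to the following sketch (single mesh axis and the axis mapping are illustrative, and the batch size must divide across the available devices):

```python
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()), ("data",))


@jax.jit
def constrain(x):
  # "batch" maps to the "data" mesh axis; the remaining four dims stay unsharded.
  return jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P("data", None, None, None, None)))


x = jnp.zeros((8, 16, 5, 32, 32))  # (batch, channels, frames, height, width)
print(constrain(x).sharding)
```
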
@@ -850,7 +929,7 @@ def encode_chunk_fn(carry, chunk):
            rngs,
        )
 
-        # Face adapter integration: apply after every 5th block (0, 5, 10, 15, ...)
+        # Face adapter integration: apply after every inject_face_latents_blocks-th block
         if motion_vec is not None and block_idx % self.inject_face_latents_blocks == 0:
           face_adapter_block_idx = block_idx // self.inject_face_latents_blocks
           face_adapter_output = self.face_adapter[face_adapter_block_idx](hidden_states, motion_vec)
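
In words: with inject_face_latents_blocks = 5 the adapter fires on blocks 0, 5, 10, ..., and block_idx // inject_face_latents_blocks picks the matching entry of face_adapter. A tiny check of that index mapping (40 blocks, as in the default num_layers):

```python
inject_face_latents_blocks = 5
pairs = [(b, b // inject_face_latents_blocks) for b in range(40) if b % inject_face_latents_blocks == 0]
print(pairs[:4])  # [(0, 0), (5, 1), (10, 2), (15, 3)]
```
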

src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 138 additions & 0 deletions
@@ -300,6 +300,144 @@ def load_base_wan_transformer(
   return flax_state_dict
 
 
+def _is_motion_encoder_custom_weight(pt_key: str) -> bool:
+  """Returns True for FlaxMotionConv2d/FlaxMotionLinear weight keys that must NOT be renamed to kernel."""
+  prefixes = (
+      "motion_encoder.conv_in.",
+      "motion_encoder.conv_out.",
+  )
+  if any(pt_key.startswith(p) for p in prefixes) and pt_key.endswith(".weight"):
+    return True
+  if "motion_encoder.res_blocks." in pt_key and pt_key.endswith(".weight"):
+    return True
+  if "motion_encoder.motion_network." in pt_key and pt_key.endswith(".weight"):
+    return True
+  return False
+
+
+def load_wan_animate_transformer(
+    pretrained_model_name_or_path: str,
+    eval_shapes: dict,
+    device: str,
+    hf_download: bool = True,
+    num_layers: int = 40,
+    scan_layers: bool = True,
+    subfolder: str = "transformer",
+):
+  """Loads WanAnimate transformer weights from a HuggingFace checkpoint.
+
+  Handles the additional key mappings for:
+  - pose_patch_embedding (nnx.Conv3d → kernel)
+  - motion_encoder.* (FlaxMotionConv2d/FlaxMotionLinear → keep as 'weight', no transpose)
+  - activation.bias → act_fn.bias (FusedLeakyReLU bias remapping)
+  - face_encoder.* (nnx.Conv/Linear → standard rename to kernel)
+  - face_adapter.* (nnx.Linear → standard rename to kernel)
+  """
+  device = jax.local_devices(backend=device)[0]
+  filename = "diffusion_pytorch_model.safetensors.index.json"
+  local_files = False
+  if os.path.isdir(pretrained_model_name_or_path):
+    index_file_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
+    if not os.path.isfile(index_file_path):
+      raise FileNotFoundError(f"File {index_file_path} not found for local directory.")
+    local_files = True
+  elif hf_download:
+    index_file_path = hf_hub_download(
+        pretrained_model_name_or_path,
+        subfolder=subfolder,
+        filename=filename,
+    )
+  with jax.default_device(device):
+    with open(index_file_path, "r") as f:
+      index_dict = json.load(f)
+    model_files = set()
+    for key in index_dict["weight_map"].keys():
+      model_files.add(index_dict["weight_map"][key])
+
+    model_files = list(model_files)
+    tensors = {}
+    for model_file in model_files:
+      if local_files:
+        ckpt_shard_path = os.path.join(pretrained_model_name_or_path, subfolder, model_file)
+      else:
+        ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
+      max_logging.log(f"Load and port {pretrained_model_name_or_path} {subfolder} on {device}")
+      if ckpt_shard_path is not None:
+        with safe_open(ckpt_shard_path, framework="pt") as f:
+          for k in f.keys():
+            tensors[k] = torch2jax(f.get_tensor(k))
+
+    flax_state_dict = {}
+    cpu = jax.local_devices(backend="cpu")[0]
+    flattened_dict = flatten_dict(eval_shapes)
+    random_flax_state_dict = {}
+    for key in flattened_dict:
+      string_tuple = tuple([str(item) for item in key])
+      random_flax_state_dict[string_tuple] = flattened_dict[key]
+    del flattened_dict
+
+    for pt_key, tensor in tensors.items():
+      if "norm_added_q" in pt_key:
+        continue
+
+      renamed_pt_key = rename_key(pt_key)
+
+      # --- Standard WAN transformer renames (shared with base transformer) ---
+      if "condition_embedder" in renamed_pt_key:
+        renamed_pt_key = renamed_pt_key.replace("time_embedding_0", "time_embedder.linear_1")
+        renamed_pt_key = renamed_pt_key.replace("time_embedding_2", "time_embedder.linear_2")
+        renamed_pt_key = renamed_pt_key.replace("time_projection_1", "time_proj")
+        renamed_pt_key = renamed_pt_key.replace("text_embedding_0", "text_embedder.linear_1")
+        renamed_pt_key = renamed_pt_key.replace("text_embedding_2", "text_embedder.linear_2")
+
+      if "image_embedder" in renamed_pt_key:
+        if "net.0.proj" in renamed_pt_key:
+          renamed_pt_key = renamed_pt_key.replace("net.0.proj", "net_0")
+        elif "net_0.proj" in renamed_pt_key:
+          renamed_pt_key = renamed_pt_key.replace("net_0.proj", "net_0")
+        if "net.2" in renamed_pt_key:
+          renamed_pt_key = renamed_pt_key.replace("net.2", "net_2")
+        renamed_pt_key = renamed_pt_key.replace("norm1", "norm1.layer_norm")
+        if "norm1" in renamed_pt_key or "norm2" in renamed_pt_key:
+          renamed_pt_key = renamed_pt_key.replace("weight", "scale")
+          renamed_pt_key = renamed_pt_key.replace("kernel", "scale")
+
+      renamed_pt_key = renamed_pt_key.replace("blocks_", "blocks.")
+      renamed_pt_key = renamed_pt_key.replace(".scale_shift_table", ".adaln_scale_shift_table")
+      renamed_pt_key = renamed_pt_key.replace("to_out_0", "proj_attn")
+      renamed_pt_key = renamed_pt_key.replace("ffn.net_2", "ffn.proj_out")
+      renamed_pt_key = renamed_pt_key.replace("ffn.net_0", "ffn.act_fn")
+      renamed_pt_key = renamed_pt_key.replace("norm2", "norm2.layer_norm")
+
+      # --- Animate-specific renames ---
+      # FusedLeakyReLU bias: HuggingFace stores it under "activation.bias",
+      # JAX stores it under "act_fn.bias" within FlaxMotionConv2d/FlaxMotionLinear.
+      renamed_pt_key = renamed_pt_key.replace(".activation.bias", ".act_fn.bias")
+
+      # face_adapter cross-attention: norm_q/norm_k scale renaming
+      # (rename_for_nnx handles norm_k/norm_q -> scale in get_key_and_value)
+
+      pt_tuple_key = tuple(renamed_pt_key.split("."))
+
+      # FlaxMotionConv2d and FlaxMotionLinear store weights as nnx.Param in PyTorch
+      # OIHW / (out, in) format — do NOT rename weight→kernel or transpose.
+      if _is_motion_encoder_custom_weight(renamed_pt_key):
+        flax_key = _tuple_str_to_int(pt_tuple_key)
+        flax_tensor = tensor
+      else:
+        flax_key, flax_tensor = get_key_and_value(
+            pt_tuple_key, tensor, flax_state_dict, random_flax_state_dict, scan_layers, num_layers
+        )
+
+      flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
+
+    validate_flax_state_dict(eval_shapes, flax_state_dict)
+    flax_state_dict = unflatten_dict(flax_state_dict)
+    del tensors
+    jax.clear_caches()
+    return flax_state_dict
+
+
 def load_wan_vae(pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True):
   device = jax.devices(device)[0]
   subfolder = "vae"
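
The loader therefore takes one of two paths per tensor: motion-encoder weights keep their PyTorch name and layout, while everything else goes through the standard get_key_and_value conversion (for a 2-D linear weight that means renaming weight to kernel and transposing (out, in) to (in, out)). A simplified sketch of the split, using a hypothetical port_weight helper, an inlined stand-in for the motion-encoder check, and illustrative shapes:

```python
import numpy as np


def port_weight(pt_key: str, tensor: np.ndarray):
  # Simplified stand-in for _is_motion_encoder_custom_weight above.
  if pt_key.startswith("motion_encoder.") and pt_key.endswith(".weight"):
    # FlaxMotionConv2d / FlaxMotionLinear: keep the "weight" name and the PyTorch layout.
    return tuple(pt_key.split(".")), tensor
  if pt_key.endswith(".weight") and tensor.ndim == 2:
    # Standard nnx.Linear path: rename weight -> kernel and transpose (out, in) -> (in, out).
    return tuple(pt_key.replace(".weight", ".kernel").split(".")), tensor.T
  return tuple(pt_key.split(".")), tensor


print(port_weight("motion_encoder.conv_in.weight", np.zeros((64, 3, 3, 3)))[0][-1])  # 'weight'
print(port_weight("face_adapter.0.to_q.weight", np.zeros((96, 128)))[1].shape)       # (128, 96)
```
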

src/maxdiffusion/pipelines/wan/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -14,4 +14,6 @@
 limitations under the License.
 """
 
+from .image_processor import WanAnimateImageProcessor
 from .wan_pipeline import WanPipeline
+from .wan_pipeline_animate import WanAnimatePipeline
