Commit cde8ab8
fix transformer names
1 parent 1770d86 commit cde8ab8

2 files changed: 36 additions & 35 deletions

src/maxdiffusion/models/wan/transformers/transformer_wan_animate.py

Lines changed: 22 additions & 21 deletions
@@ -48,7 +48,7 @@
 }


-class FlaxFusedLeakyReLU(nnx.Module):
+class FusedLeakyReLU(nnx.Module):
   """
   Fused LeakyRelu with scale factor and channel-wise bias.
   """
@@ -84,7 +84,7 @@ def __call__(self, x: jax.Array, channel_dim: int = 1) -> jax.Array:
     return x


-class FlaxMotionConv2d(nnx.Module):
+class MotionConv2d(nnx.Module):
   """2-D convolution with EqualizedLR scaling and optional FusedLeakyReLU.

   Weights are stored in PyTorch OIHW format (out, in, k, k) as raw nnx.Param
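The EqualizedLR + OIHW convention the docstring references maps directly onto JAX, since lax convolutions accept PyTorch's kernel layout. A hedged sketch, assuming the standard 1/sqrt(fan_in) runtime scale (the actual scale used in this file is outside the hunk):

import jax
import jax.numpy as jnp

def equalized_lr_conv2d(x, weight, stride=1, padding=1):
  """Hypothetical helper: apply an OIHW kernel (out, in, k, k) to NCHW inputs."""
  out_ch, in_ch, k, _ = weight.shape
  scale = 1.0 / jnp.sqrt(in_ch * k * k)  # equalized LR: scale by 1/sqrt(fan_in) at call time
  return jax.lax.conv_general_dilated(
      x,
      weight * scale,
      window_strides=(stride, stride),
      padding=[(padding, padding), (padding, padding)],
      dimension_numbers=("NCHW", "OIHW", "NCHW"),  # matches the raw PyTorch layout
  )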
@@ -148,7 +148,7 @@ def __init__(
       self.bias = None

     if self.use_activation:
-      self.act_fn = FlaxFusedLeakyReLU(
+      self.act_fn = FusedLeakyReLU(
          rngs=rngs, bias_channels=out_channels, dtype=dtype, weights_dtype=weights_dtype
      )
     else:
@@ -205,11 +205,11 @@ def __call__(self, x: jax.Array, channel_dim: int = 1) -> jax.Array:
     return x


-class FlaxMotionLinear(nnx.Module):
+class MotionLinear(nnx.Module):
   """Equalized-LR linear layer with optional FusedLeakyReLU.

   Weights are stored in PyTorch (out, in) format as raw nnx.Param — same
-  reason as FlaxMotionConv2d. No sharding annotations needed (small layer).
+  reason as MotionConv2d. No sharding annotations needed (small layer).
   """

   def __init__(
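The linear layer follows the same equalized-LR pattern with the (out, in) layout its docstring mentions; a short sketch under the same 1/sqrt(fan_in) assumption:

import jax.numpy as jnp

def equalized_lr_linear(x, weight, bias=None):
  """Hypothetical helper: weight is PyTorch (out, in); scale applied per call."""
  out_dim, in_dim = weight.shape
  y = x @ (weight * (1.0 / jnp.sqrt(in_dim))).T
  return y if bias is None else y + bias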
@@ -238,7 +238,7 @@ def __init__(
       self.bias = None

     if self.use_activation:
-      self.act_fn = FlaxFusedLeakyReLU(rngs=rngs, bias_channels=out_dim, dtype=dtype, weights_dtype=weights_dtype)
+      self.act_fn = FusedLeakyReLU(rngs=rngs, bias_channels=out_dim, dtype=dtype, weights_dtype=weights_dtype)
     else:
       self.act_fn = None

@@ -258,7 +258,7 @@ def __call__(self, inputs: jax.Array, channel_dim: int = 1) -> jax.Array:
     return out


-class FlaxMotionEncoderResBlock(nnx.Module):
+class MotionEncoderResBlock(nnx.Module):

   def __init__(
       self,
@@ -276,7 +276,7 @@ def __init__(
     self.dtype = dtype

     # 3 X 3 Conv + fused leaky ReLU
-    self.conv1 = FlaxMotionConv2d(
+    self.conv1 = MotionConv2d(
         rngs,
         in_channels,
         in_channels,
@@ -289,7 +289,7 @@ def __init__(
     )

     # 3 X 3 Conv + downsample 2x + fused leaky ReLU
-    self.conv2 = FlaxMotionConv2d(
+    self.conv2 = MotionConv2d(
         rngs,
         in_channels,
         out_channels,
@@ -303,7 +303,7 @@ def __init__(
     )

     # 1 X 1 Conv + downsample 2x in skip connection
-    self.conv_skip = FlaxMotionConv2d(
+    self.conv_skip = MotionConv2d(
         rngs,
         in_channels,
         out_channels,
@@ -327,7 +327,7 @@ def __call__(self, x: jax.Array, channel_dim: int = 1) -> jax.Array:
     return x_out


-class FlaxWanAnimateMotionEncoder(nnx.Module):
+class WanAnimateMotionEncoder(nnx.Module):
   """Encodes a face video frame into a motion vector.

   All weights in this network are small (the largest is 32×512→16) so
@@ -353,7 +353,7 @@ def __init__(
     if channels is None:
       channels = WAN_ANIMATE_MOTION_ENCODER_CHANNEL_SIZES

-    self.conv_in = FlaxMotionConv2d(
+    self.conv_in = MotionConv2d(
         rngs, 3, channels[str(size)], 1, use_activation=True, dtype=dtype, weights_dtype=weights_dtype
     )

@@ -363,12 +363,12 @@ def __init__(
     for i in range(log_size, 2, -1):
       out_channels = channels[str(2 ** (i - 1))]
       res_blocks.append(
-          FlaxMotionEncoderResBlock(rngs, in_channels, out_channels, dtype=dtype, weights_dtype=weights_dtype)
+          MotionEncoderResBlock(rngs, in_channels, out_channels, dtype=dtype, weights_dtype=weights_dtype)
       )
       in_channels = out_channels
     self.res_blocks = nnx.List(res_blocks)

-    self.conv_out = FlaxMotionConv2d(
+    self.conv_out = MotionConv2d(
         rngs,
         in_channels,
         style_dim,
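The loop above adds one 2x-downsampling res block per octave between `size` and 4. A hedged walkthrough using the small config from this commit's parity test (`size=16`, `channels={"4": 8, "8": 8, "16": 8}`); `log_size = int(math.log2(size))` and the initial `in_channels = channels[str(size)]` are assumptions consistent with conv_in above:

import math

size, channels = 16, {"4": 8, "8": 8, "16": 8}
log_size = int(math.log2(size))    # 4 (assumed definition)
in_channels = channels[str(size)]  # 8, the width conv_in produces
for i in range(log_size, 2, -1):   # i = 4, 3 -> two res blocks
  out_channels = channels[str(2 ** (i - 1))]
  print(f"res block: {in_channels} -> {out_channels} channels, spatial /2")
  in_channels = out_channels
# 16x16 -> 8x8 -> 4x4 before conv_out projects to style_dim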
@@ -382,9 +382,9 @@ def __init__(

     linears = []
     for _ in range(motion_blocks - 1):
-      linears.append(FlaxMotionLinear(rngs, style_dim, style_dim, dtype=dtype, weights_dtype=weights_dtype))
+      linears.append(MotionLinear(rngs, style_dim, style_dim, dtype=dtype, weights_dtype=weights_dtype))

-    linears.append(FlaxMotionLinear(rngs, style_dim, motion_dim, dtype=dtype, weights_dtype=weights_dtype))
+    linears.append(MotionLinear(rngs, style_dim, motion_dim, dtype=dtype, weights_dtype=weights_dtype))
     self.motion_network = nnx.List(linears)

     key = rngs.params()
@@ -417,7 +417,7 @@ def __call__(self, face_image: jax.Array, channel_dim: int = 1) -> jax.Array:
     return motion_vec.astype(original_dtype)


-class FlaxWanAnimateFaceEncoder(nnx.Module):
+class WanAnimateFaceEncoder(nnx.Module):

   def __init__(
       self,
@@ -544,7 +544,7 @@ def __call__(self, x: jax.Array) -> jax.Array:
     return x


-class FlaxWanAnimateFaceBlockCrossAttention(nnx.Module):
+class WanAnimateFaceBlockCrossAttention(nnx.Module):

   def __init__(
       self,
@@ -763,7 +763,7 @@ def __init__(
         weights_dtype=weights_dtype,
     )

-    self.motion_encoder = FlaxWanAnimateMotionEncoder(
+    self.motion_encoder = WanAnimateMotionEncoder(
         rngs=rngs,
         size=motion_encoder_size,
         style_dim=motion_style_dim,
@@ -773,7 +773,7 @@ def __init__(
         dtype=dtype,
         weights_dtype=weights_dtype,
     )
-    self.face_encoder = FlaxWanAnimateFaceEncoder(
+    self.face_encoder = WanAnimateFaceEncoder(
         rngs=rngs,
         in_dim=motion_encoder_dim,
         out_dim=inner_dim,
@@ -840,7 +840,7 @@ def init_block(rngs):
     face_adapters = []
     num_face_adapters = math.ceil(num_layers / inject_face_latents_blocks)
     for _ in range(num_face_adapters):
-      fa = FlaxWanAnimateFaceBlockCrossAttention(
+      fa = WanAnimateFaceBlockCrossAttention(
          rngs=rngs,
          dim=inner_dim,
          heads=num_attention_heads,
@@ -1081,3 +1081,4 @@ def layer_forward(hidden_states):
     if not return_dict:
       return (hidden_states,)
     return {"sample": hidden_states}
+

src/maxdiffusion/tests/wan_animate_module_parity_test.py

Lines changed: 14 additions & 14 deletions
@@ -47,14 +47,14 @@
 from maxdiffusion import pyconfig
 from maxdiffusion.max_utils import create_device_mesh
 from maxdiffusion.models.wan.transformers.transformer_wan_animate import (
-    FlaxFusedLeakyReLU,
-    FlaxMotionConv2d,
-    FlaxMotionEncoderResBlock,
-    FlaxMotionLinear,
-    FlaxWanAnimateFaceBlockCrossAttention,
-    FlaxWanAnimateFaceEncoder,
-    FlaxWanAnimateMotionEncoder,
+    FusedLeakyReLU,
+    MotionConv2d,
+    MotionEncoderResBlock,
+    MotionLinear,
     NNXWanAnimateTransformer3DModel,
+    WanAnimateFaceBlockCrossAttention,
+    WanAnimateFaceEncoder,
+    WanAnimateMotionEncoder,
 )
 from maxdiffusion.models.wan.wan_utils import (
     _rename_wan_animate_pt_tuple_key,
@@ -189,7 +189,7 @@ def setUp(self):

   def test_fused_leaky_relu_parity(self):
     hf_module = HFFusedLeakyReLU(bias_channels=3).eval()
-    max_module = FlaxFusedLeakyReLU(rngs=self.rngs, bias_channels=3)
+    max_module = FusedLeakyReLU(rngs=self.rngs, bias_channels=3)
     copy_fused_leaky_relu_params(max_module, hf_module)

     inputs = torch.randn(2, 3, 4, 5)
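The assertion bodies of these parity tests fall outside the hunks; the comparison presumably follows the usual pattern sketched below (`assert_module_parity` is a hypothetical name; only the copy_* helpers appear in the diff):

import jax.numpy as jnp
import numpy as np
import torch

def assert_module_parity(hf_module, max_module, torch_inputs, atol=1e-5):
  # Same weights (copied beforehand), same inputs, outputs compared within tolerance.
  with torch.no_grad():
    expected = hf_module(torch_inputs).numpy()
  got = np.asarray(max_module(jnp.asarray(torch_inputs.numpy())))
  np.testing.assert_allclose(got, expected, atol=atol)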
@@ -200,7 +200,7 @@ def test_fused_leaky_relu_parity(self):

   def test_motion_conv2d_parity(self):
     hf_module = HFMotionConv2d(3, 5, kernel_size=3, stride=2, padding=0, blur_kernel=(1, 3, 3, 1)).eval()
-    max_module = FlaxMotionConv2d(
+    max_module = MotionConv2d(
         rngs=self.rngs,
         in_channels=3,
         out_channels=5,
@@ -219,7 +219,7 @@ def test_motion_conv2d_parity(self):

   def test_motion_linear_parity(self):
     hf_module = HFMotionLinear(7, 5, use_activation=True).eval()
-    max_module = FlaxMotionLinear(rngs=self.rngs, in_dim=7, out_dim=5, use_activation=True)
+    max_module = MotionLinear(rngs=self.rngs, in_dim=7, out_dim=5, use_activation=True)
     copy_motion_linear_params(max_module, hf_module)

     inputs = torch.randn(4, 7)
@@ -230,7 +230,7 @@ def test_motion_linear_parity(self):

   def test_motion_encoder_resblock_parity(self):
     hf_module = HFMotionEncoderResBlock(8, 10).eval()
-    max_module = FlaxMotionEncoderResBlock(rngs=self.rngs, in_channels=8, out_channels=10)
+    max_module = MotionEncoderResBlock(rngs=self.rngs, in_channels=8, out_channels=10)
     copy_motion_encoder_resblock_params(max_module, hf_module)

     inputs = torch.randn(2, 8, 8, 8)
@@ -249,7 +249,7 @@ def test_motion_encoder_parity(self):
         "channels": {"4": 8, "8": 8, "16": 8},
     }
     hf_module = HFWanAnimateMotionEncoder(**cfg).eval()
-    max_module = FlaxWanAnimateMotionEncoder(rngs=self.rngs, **cfg)
+    max_module = WanAnimateMotionEncoder(rngs=self.rngs, **cfg)
     copy_motion_encoder_params(max_module, hf_module)

     inputs = torch.randn(3, 3, 4, 4)
@@ -260,7 +260,7 @@ def test_motion_encoder_parity(self):

   def test_face_encoder_parity(self):
     hf_module = HFWanAnimateFaceEncoder(in_dim=8, out_dim=12, hidden_dim=16, num_heads=2).eval()
-    max_module = FlaxWanAnimateFaceEncoder(rngs=self.rngs, in_dim=8, out_dim=12, hidden_dim=16, num_heads=2)
+    max_module = WanAnimateFaceEncoder(rngs=self.rngs, in_dim=8, out_dim=12, hidden_dim=16, num_heads=2)
     copy_face_encoder_params(max_module, hf_module)

     inputs = torch.randn(2, 7, 8)
@@ -271,7 +271,7 @@ def test_face_encoder_parity(self):

   def test_face_block_cross_attention_parity(self):
     hf_module = HFWanAnimateFaceBlockCrossAttention(dim=12, heads=3, dim_head=4, cross_attention_dim_head=4).eval()
-    max_module = FlaxWanAnimateFaceBlockCrossAttention(
+    max_module = WanAnimateFaceBlockCrossAttention(
         rngs=self.rngs,
         dim=12,
         heads=3,
