Commit 95e8f37

Format with pyink

1 parent 935b457 commit 95e8f37
3 files changed: 41 additions & 124 deletions
src/maxdiffusion/models/embeddings_flax.py

Lines changed: 2 additions & 6 deletions
@@ -54,13 +54,9 @@ def get_sinusoidal_embeddings(
   scaled_time = scale * emb
 
   if flip_sin_to_cos:
-    signal = jnp.concatenate(
-        [jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=-1
-    )
+    signal = jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=-1)
   else:
-    signal = jnp.concatenate(
-        [jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=-1
-    )
+    signal = jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=-1)
   return signal
 
 
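Note: the collapsed branches are behavior-preserving; flip_sin_to_cos only decides whether the cosine or the sine half leads the concatenation. A minimal standalone sketch of that ordering (simplified: the real get_sinusoidal_embeddings also takes scale and frequency-shift parameters):

import jax.numpy as jnp

def sinusoidal_sketch(timesteps, embedding_dim, flip_sin_to_cos=False):
  # Simplified log-spaced frequency schedule.
  half = embedding_dim // 2
  freqs = jnp.exp(-jnp.log(10000.0) * jnp.arange(half) / half)
  scaled_time = timesteps[:, None] * freqs[None, :]
  if flip_sin_to_cos:
    return jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=-1)
  return jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=-1)

print(sinusoidal_sketch(jnp.arange(4.0), 8, flip_sin_to_cos=True).shape)  # (4, 8)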
src/maxdiffusion/models/wan/autoencoder_kl_wan_2p2.py

Lines changed: 27 additions & 86 deletions
@@ -48,19 +48,14 @@ def _update_cache(cache, idx, value):
 
 
 # Helper to ensure kernel_size, stride, padding are tuples of 3 integers
-def _canonicalize_tuple(
-    x: Union[int, Sequence[int]], rank: int, name: str
-) -> Tuple[int, ...]:
+def _canonicalize_tuple(x: Union[int, Sequence[int]], rank: int, name: str) -> Tuple[int, ...]:
   """Canonicalizes a value to a tuple of integers."""
   if isinstance(x, int):
     return (x,) * rank
   elif isinstance(x, Sequence) and len(x) == rank:
     return tuple(x)
   else:
-    raise ValueError(
-        f"Argument '{name}' must be an integer or a sequence of {rank}"
-        f" integers. Got {x}"
-    )
+    raise ValueError(f"Argument '{name}' must be an integer or a sequence of {rank}" f" integers. Got {x}")
 
 
 class RepSentinel:
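The reformatted helper is self-contained, so its three branches are easy to exercise directly; a standalone copy with example calls:

from typing import Sequence, Tuple, Union

def _canonicalize_tuple(x: Union[int, Sequence[int]], rank: int, name: str) -> Tuple[int, ...]:
  # Same logic as above: broadcast an int, accept a rank-length sequence, else raise.
  if isinstance(x, int):
    return (x,) * rank
  elif isinstance(x, Sequence) and len(x) == rank:
    return tuple(x)
  else:
    raise ValueError(f"Argument '{name}' must be an integer or a sequence of {rank} integers. Got {x}")

print(_canonicalize_tuple(3, 3, "kernel_size"))     # (3, 3, 3)
print(_canonicalize_tuple((1, 2, 2), 3, "stride"))  # (1, 2, 2)
# _canonicalize_tuple((1, 2), 3, "stride")          # raises ValueError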
@@ -69,9 +64,7 @@ def __eq__(self, other):
     return isinstance(other, RepSentinel)
 
 
-tree_util.register_pytree_node(
-    RepSentinel, lambda x: ((), None), lambda _, __: RepSentinel()
-)
+tree_util.register_pytree_node(RepSentinel, lambda x: ((), None), lambda _, __: RepSentinel())
 
 
 class WanPatchify(nnx.Module):
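The one-liner keeps RepSentinel registered as a leafless pytree node, which is what lets sentinel-holding caches pass through JAX tree transformations without being treated as data. A standalone sketch of the same pattern:

import jax

class RepSentinel:
  def __eq__(self, other):
    return isinstance(other, RepSentinel)

# Flatten to no children; unflatten by constructing a fresh sentinel.
jax.tree_util.register_pytree_node(RepSentinel, lambda x: ((), None), lambda _, __: RepSentinel())

cache = [RepSentinel(), 1.0, None]
print(jax.tree_util.tree_leaves(cache))  # [1.0]: the sentinel contributes no leaves
doubled = jax.tree_util.tree_map(lambda v: v * 2, cache)  # sentinel passes through untouched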
@@ -217,9 +210,7 @@ def __init__(
     self.bias = 0
 
   def __call__(self, x: jax.Array) -> jax.Array:
-    normalized = jnp.linalg.norm(
-        x, ord=2, axis=(1 if self.channel_first else -1), keepdims=True
-    )
+    normalized = jnp.linalg.norm(x, ord=2, axis=(1 if self.channel_first else -1), keepdims=True)
     normalized = x / jnp.maximum(normalized, self.eps)
     normalized = normalized * self.scale * self.gamma
     if self.bias:
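As the hunk shows, WanRMS_norm divides by the channel-wise L2 norm (floored at eps) and rescales by self.scale and a learned gamma. A minimal functional sketch of the channel-last path; the eps value and scale = sqrt(dim) are assumptions, since both are set in __init__ outside this hunk:

import jax.numpy as jnp

def wan_rms_norm_sketch(x, gamma, eps=1e-12):
  # Channel-last: normalize over the final axis, then rescale.
  scale = x.shape[-1] ** 0.5  # assumed sqrt(dim), as in common RMS-norm variants
  norm = jnp.linalg.norm(x, ord=2, axis=-1, keepdims=True)
  return x / jnp.maximum(norm, eps) * scale * gamma

x = jnp.ones((2, 4))
print(wan_rms_norm_sketch(x, gamma=jnp.ones((4,))))  # rows of ones come back as ones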
@@ -229,9 +220,7 @@ def __call__(self, x: jax.Array) -> jax.Array:
 
 class WanUpsample(nnx.Module):
 
-  def __init__(
-      self, scale_factor: Tuple[float, float], method: str = "nearest"
-  ):
+  def __init__(self, scale_factor: Tuple[float, float], method: str = "nearest"):
     # scale_factor for (H, W)
     # JAX resize works on spatial dims, H, W assuming (N, D, H, W, C) or (N, H, W, C)
     self.scale_factor = scale_factor
@@ -244,9 +233,7 @@ def __call__(self, x: jax.Array) -> jax.Array:
     n, h, w, c = in_shape
     target_h = int(h * self.scale_factor[0])
     target_w = int(w * self.scale_factor[1])
-    out = jax.image.resize(
-        x.astype(jnp.float32), (n, target_h, target_w, c), method=self.method
-    )
+    out = jax.image.resize(x.astype(jnp.float32), (n, target_h, target_w, c), method=self.method)
     return out.astype(input_dtype)
 
 
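The collapsed jax.image.resize call is the whole of the upsample: cast to float32, resize H and W by the scale factors, cast back to the input dtype. A standalone sketch of the 4D path:

import jax
import jax.numpy as jnp

def upsample_sketch(x, scale_factor=(2.0, 2.0), method="nearest"):
  # x: (N, H, W, C), mirroring the hunk above.
  input_dtype = x.dtype
  n, h, w, c = x.shape
  target_h = int(h * scale_factor[0])
  target_w = int(w * scale_factor[1])
  out = jax.image.resize(x.astype(jnp.float32), (n, target_h, target_w, c), method=method)
  return out.astype(input_dtype)

x = jnp.arange(4.0).reshape(1, 2, 2, 1)
print(upsample_sketch(x).shape)  # (1, 4, 4, 1)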
@@ -282,9 +269,7 @@ def __init__(
         use_bias=True,
         padding=[(0, 1), (0, 1)],
         rngs=rngs,
-        kernel_init=nnx.with_partitioning(
-            nnx.initializers.xavier_uniform(), (None, None, None, None)
-        ),
+        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), (None, None, None, None)),
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
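nnx.with_partitioning wraps an initializer so the created parameter carries sharding metadata alongside its value (here, all four kernel axes unsharded). A small sketch on a plain nnx.Linear, which is a stand-in rather than anything from this file:

from flax import nnx

# Axis names (or None) line up with the kernel's dimensions.
init = nnx.with_partitioning(nnx.initializers.xavier_uniform(), (None, "model"))
layer = nnx.Linear(4, 8, kernel_init=init, rngs=nnx.Rngs(0))
print(layer.kernel.sharding)  # (None, 'model'), attached as variable metadata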
@@ -409,11 +394,7 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0):
         feat_idx += 1
       else:
         cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
-        if (
-            cache_x.shape[1] < 2
-            and feat_cache[idx] is not None
-            and not isinstance(feat_cache[idx], RepSentinel)
-        ):
+        if cache_x.shape[1] < 2 and feat_cache[idx] is not None and not isinstance(feat_cache[idx], RepSentinel):
           # cache last frame of last two chunk
           cache_x = jnp.concatenate(
               [
@@ -422,14 +403,8 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0):
               ],
               axis=1,
           )
-        if (
-            cache_x.shape[1] < 2
-            and feat_cache[idx] is not None
-            and isinstance(feat_cache[idx], RepSentinel)
-        ):
-          cache_x = jnp.concatenate(
-              [jnp.zeros(cache_x.shape), cache_x], axis=1
-          )
+        if cache_x.shape[1] < 2 and feat_cache[idx] is not None and isinstance(feat_cache[idx], RepSentinel):
+          cache_x = jnp.concatenate([jnp.zeros(cache_x.shape), cache_x], axis=1)
         if isinstance(feat_cache[idx], RepSentinel):
           x = self.time_conv(x)
         else:
@@ -453,9 +428,7 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0):
         feat_idx += 1
       else:
         cache_x = jnp.copy(x[:, -1:, :, :, :])
-        x = self.time_conv(
-            jnp.concatenate([feat_cache[idx][:, -1:, :, :, :], x], axis=1)
-        )
+        x = self.time_conv(jnp.concatenate([feat_cache[idx][:, -1:, :, :, :], x], axis=1))
        feat_cache = _update_cache(feat_cache, idx, cache_x)
        feat_idx += 1
 
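The hunks above all serve one streaming pattern: each chunk stashes trailing frames in feat_cache so the next chunk's causal time conv can see them. A simplified sketch of that carry logic, with an identity op standing in for the real time_conv and the RepSentinel/first-chunk special cases omitted:

import jax.numpy as jnp

CACHE_T = 2  # trailing frames carried between chunks

def stream_chunks(chunks, time_op):
  # Prepend the cached tail of the previous chunk, apply the temporal op,
  # then stash this chunk's tail for the next iteration.
  cache = None
  outs = []
  for x in chunks:  # x: (N, D, H, W, C)
    tail = jnp.copy(x[:, -CACHE_T:, :, :, :])
    if cache is None:
      outs.append(time_op(x))
    else:
      outs.append(time_op(jnp.concatenate([cache[:, -1:, :, :, :], x], axis=1)))
    cache = tail
  return outs

chunks = [jnp.ones((1, 4, 2, 2, 3)), jnp.ones((1, 4, 2, 2, 3))]
print([y.shape for y in stream_chunks(chunks, time_op=lambda v: v)])  # [(1, 4, 2, 2, 3), (1, 5, 2, 2, 3)]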
@@ -479,9 +452,7 @@ def __init__(
     self.nonlinearity = get_activation(non_linearity)
 
     # layers
-    self.norm1 = WanRMS_norm(
-        dim=in_dim, rngs=rngs, images=False, channel_first=False
-    )
+    self.norm1 = WanRMS_norm(dim=in_dim, rngs=rngs, images=False, channel_first=False)
     self.conv1 = WanCausalConv3d(
         rngs=rngs,
         in_channels=in_dim,
@@ -493,9 +464,7 @@ def __init__(
         weights_dtype=weights_dtype,
         precision=precision,
     )
-    self.norm2 = WanRMS_norm(
-        dim=out_dim, rngs=rngs, images=False, channel_first=False
-    )
+    self.norm2 = WanRMS_norm(dim=out_dim, rngs=rngs, images=False, channel_first=False)
     self.conv2 = WanCausalConv3d(
         rngs=rngs,
         in_channels=out_dim,
@@ -581,9 +550,7 @@ def __init__(
         out_features=dim * 3,
         kernel_size=(1, 1),
         rngs=rngs,
-        kernel_init=nnx.with_partitioning(
-            nnx.initializers.xavier_uniform(), (None, None, None, "conv_out")
-        ),
+        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), (None, None, None, "conv_out")),
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
@@ -593,9 +560,7 @@ def __init__(
         out_features=dim,
         kernel_size=(1, 1),
         rngs=rngs,
-        kernel_init=nnx.with_partitioning(
-            nnx.initializers.xavier_uniform(), (None, None, "conv_in", None)
-        ),
+        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), (None, None, "conv_in", None)),
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
@@ -709,9 +674,7 @@ def __init__(
     self.factor = self.factor_t * self.factor_s * self.factor_s
     self.group_size = in_channels * self.factor // out_channels
 
-  def __call__(
-      self, x: jax.Array, feat_cache=None, feat_idx=0
-  ) -> Tuple[jax.Array, Any, int]:
+  def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0) -> Tuple[jax.Array, Any, int]:
     if self.factor_t > 1 or self.factor_s > 1:
       n, d, h, w, c = x.shape
       pad_d = (self.factor_t - d % self.factor_t) % self.factor_t
@@ -769,9 +732,7 @@ def __init__(
     self.out_channels = out_channels
     self.repeats = out_channels * self.factor // in_channels
 
-  def __call__(
-      self, x: jax.Array, feat_cache=None, feat_idx=0, first_chunk: bool = False
-  ) -> Tuple[jax.Array, Any, int]:
+  def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0, first_chunk: bool = False) -> Tuple[jax.Array, Any, int]:
     # Duplicate channels to match the expected total channels for upsampling.
     # x: (N, D, H, W, in_channels) -> (N, D, H, W, in_channels * self.repeats)
     x = jnp.repeat(x, repeats=self.repeats, axis=4)
@@ -891,9 +852,7 @@ def __call__(
 
     x_shortcut = None
     if self.avg_shortcut is not None:
-      x_shortcut, feat_cache, feat_idx = self.avg_shortcut(
-          x_main, feat_cache, feat_idx
-      )
+      x_shortcut, feat_cache, feat_idx = self.avg_shortcut(x_main, feat_cache, feat_idx)
       x = x + x_shortcut
 
     if return_shortcut:
@@ -994,9 +953,7 @@ def __call__(
 
     x_shortcut = None
     if self.avg_shortcut is not None:
-      x_shortcut, feat_cache, feat_idx = self.avg_shortcut(
-          x_main, feat_cache, feat_idx, first_chunk
-      )
+      x_shortcut, feat_cache, feat_idx = self.avg_shortcut(x_main, feat_cache, feat_idx, first_chunk)
       x = x + x_shortcut
 
     if return_shortcut:
@@ -1052,9 +1009,7 @@ def __init__(
     self.down_blocks = []
     for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
       if i != len(dim_mult) - 1:
-        downsample_mode = (
-            "downsample3d" if temperal_downsample[i] else "downsample2d"
-        )
+        downsample_mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
       else:
         downsample_mode = None
       self.down_blocks.append(
@@ -1120,9 +1075,7 @@ def __init__(
     )
 
     # output blocks
-    self.norm_out = WanRMS_norm(
-        out_dim, channel_first=False, images=False, rngs=rngs
-    )
+    self.norm_out = WanRMS_norm(out_dim, channel_first=False, images=False, rngs=rngs)
     self.conv_out = WanCausalConv3d(
         rngs=rngs,
         in_channels=out_dim,
@@ -1281,9 +1234,7 @@ def __init__(
     self.up_blocks = nnx.data(self.up_blocks)
 
     # output blocks
-    self.norm_out = WanRMS_norm(
-        dim=out_dim, images=False, rngs=rngs, channel_first=False
-    )
+    self.norm_out = WanRMS_norm(dim=out_dim, images=False, rngs=rngs, channel_first=False)
     self.conv_out = WanCausalConv3d(
         rngs=rngs,
         in_channels=out_dim,
@@ -1297,9 +1248,7 @@ def __init__(
     )
 
   @nnx.jit(static_argnames=("feat_idx", "first_chunk"))
-  def __call__(
-      self, x: jax.Array, feat_cache=None, feat_idx=0, first_chunk: bool = False
-  ):
+  def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0, first_chunk: bool = False):
     if feat_cache is not None:
       idx = feat_idx
       cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
@@ -1553,9 +1502,7 @@ def _encode(self, x: jax.Array, feat_cache: AutoencoderKLWanCache):
     if x.shape[-1] != 3:
       # reshape channel last for JAX
       x = jnp.transpose(x, (0, 2, 3, 4, 1))
-    assert (
-        x.shape[-1] == 3
-    ), f"Expected input shape (N, D, H, W, 3), got {x.shape}"
+    assert x.shape[-1] == 3, f"Expected input shape (N, D, H, W, 3), got {x.shape}"
 
     x = self.patchify(x)
 
@@ -1566,9 +1513,7 @@ def _encode(self, x: jax.Array, feat_cache: AutoencoderKLWanCache):
     for i in range(iter_):
       enc_conv_idx = 0
       if i == 0:
-        out, enc_feat_map, enc_conv_idx = self.encoder(
-            x[:, :1, :, :, :], feat_cache=enc_feat_map, feat_idx=enc_conv_idx
-        )
+        out, enc_feat_map, enc_conv_idx = self.encoder(x[:, :1, :, :, :], feat_cache=enc_feat_map, feat_idx=enc_conv_idx)
       else:
         out_, enc_feat_map, enc_conv_idx = self.encoder(
             x[:, 1 + 4 * (i - 1) : 1 + 4 * i, :, :, :],
@@ -1621,9 +1566,7 @@ def _decode(
             first_chunk=True,
         )
       else:
-        out_, dec_feat_map, conv_idx = self.decoder(
-            x[:, i : i + 1, :, :, :], feat_cache=dec_feat_map, feat_idx=conv_idx
-        )
+        out_, dec_feat_map, conv_idx = self.decoder(x[:, i : i + 1, :, :, :], feat_cache=dec_feat_map, feat_idx=conv_idx)
       out = jnp.concatenate([out, out_], axis=1)
 
     feat_cache._feat_map = dec_feat_map
@@ -1645,9 +1588,7 @@ def decode(
     if z.shape[-1] != self.z_dim:
       # reshape channel last for JAX
       z = jnp.transpose(z, (0, 2, 3, 4, 1))
-    assert (
-        z.shape[-1] == self.z_dim
-    ), f"Expected input shape (N, D, H, W, {self.z_dim}, got {z.shape}"
+    assert z.shape[-1] == self.z_dim, f"Expected input shape (N, D, H, W, {self.z_dim}, got {z.shape}"
     decoded = self._decode(z, feat_cache).sample
     if not return_dict:
       return (decoded,)

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 12 additions & 32 deletions
@@ -380,9 +380,7 @@ def __call__(
     # Support both global [B, 6, dim] and per-token [B, seq_len, 6, dim] temb.
     # Per-token temb is used by TI2V where first-frame tokens have timestep=0.
     if temb.ndim == 4:  # Per-token: [B, seq_len, 6, dim]
-      adaln = jnp.expand_dims(
-          self.adaln_scale_shift_table, 0
-      )  # [1, 1, 6, dim]
+      adaln = jnp.expand_dims(self.adaln_scale_shift_table, 0)  # [1, 1, 6, dim]
       combined = adaln + temb.astype(jnp.float32)  # [B, seq_len, 6, dim]
       parts = jnp.split(combined, 6, axis=2)
       shift_msa = parts[0].squeeze(2)
@@ -392,12 +390,10 @@ def __call__(
       c_scale_msa = parts[4].squeeze(2)
       c_gate_msa = parts[5].squeeze(2)
     else:  # Global: [B, 6, dim]
-      shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
-          jnp.split(
-              (self.adaln_scale_shift_table + temb.astype(jnp.float32)),
-              6,
-              axis=1,
-          )
+      shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
+          (self.adaln_scale_shift_table + temb.astype(jnp.float32)),
+          6,
+          axis=1,
       )
     axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_heads"))
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names)
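Both branches compute the same adaLN modulation: add the learned table to the time embedding and split into six tensors (shift/scale/gate for self-attention, then for the second half of the block). A shape-level sketch of the global branch; the [1, 6, dim] table shape follows from the per-token branch's expand_dims comment:

import jax.numpy as jnp

B, dim = 2, 16
adaln_table = jnp.zeros((1, 6, dim))  # learned scale-shift table
temb = jnp.ones((B, 6, dim))          # global time embedding

combined = adaln_table + temb.astype(jnp.float32)  # [B, 6, dim]
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(combined, 6, axis=1)
print(shift_msa.shape)  # (2, 1, 16); the singleton axis broadcasts over sequence length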
@@ -639,22 +635,14 @@ def __call__(
       # tokens receive timestep=0 (clean) and other tokens receive timestep=t.
       bt, sl = timestep.shape
       t_flat = timestep.reshape(-1)  # [B*seq_len]
-      t_sinusoidal = self.condition_embedder.timesteps_proj(
-          t_flat
-      )  # [B*sl, freq_dim]
+      t_sinusoidal = self.condition_embedder.timesteps_proj(t_flat)  # [B*sl, freq_dim]
       t_sinusoidal = t_sinusoidal.reshape(bt, sl, -1)  # [B, sl, freq_dim]
-      temb = self.condition_embedder.time_embedder(
-          t_sinusoidal
-      )  # [B, sl, dim]
+      temb = self.condition_embedder.time_embedder(t_sinusoidal)  # [B, sl, dim]
       with jax.named_scope("time_proj"):
-        timestep_proj = self.condition_embedder.time_proj(
-            self.condition_embedder.act_fn(temb)
-        )  # [B, sl, dim*6]
+        timestep_proj = self.condition_embedder.time_proj(self.condition_embedder.act_fn(temb))  # [B, sl, dim*6]
       timestep_proj = timestep_proj.reshape(bt, sl, 6, -1)  # [B, sl, 6, dim]
       # Text processing
-      encoder_hidden_states = self.condition_embedder.text_embedder(
-          encoder_hidden_states
-      )
+      encoder_hidden_states = self.condition_embedder.text_embedder(encoder_hidden_states)
       encoder_hidden_states_image = None
       encoder_attention_mask = None
     else:
@@ -664,9 +652,7 @@ def __call__(
         encoder_hidden_states,
         encoder_hidden_states_image,
         encoder_attention_mask,
-      ) = self.condition_embedder(
-          timestep, encoder_hidden_states, encoder_hidden_states_image
-      )
+      ) = self.condition_embedder(timestep, encoder_hidden_states, encoder_hidden_states_image)
       timestep_proj = timestep_proj.reshape(timestep_proj.shape[0], 6, -1)
 
     if encoder_hidden_states_image is not None:
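The per-token branch runs the standard timestep pipeline (sinusoidal features, MLP embedding, activation, a 6*dim projection) on a flattened [B*seq_len] vector and reshapes back, so TI2V can hand first-frame tokens timestep 0 while the rest carry t. A shape-level sketch with random matrices standing in for time_embedder and time_proj, and silu assumed for act_fn:

import jax
import jax.numpy as jnp

B, sl, freq_dim, dim = 2, 8, 32, 16
timestep = jnp.zeros((B, sl)).at[:, 1:].set(5.0)  # first-frame tokens stay at t=0

def sinusoidal(t, n):  # stand-in for condition_embedder.timesteps_proj
  half = n // 2
  freqs = jnp.exp(-jnp.log(10000.0) * jnp.arange(half) / half)
  angles = t[:, None] * freqs[None, :]
  return jnp.concatenate([jnp.sin(angles), jnp.cos(angles)], axis=-1)

k1, k2 = jax.random.split(jax.random.PRNGKey(0))
W_embed = jax.random.normal(k1, (freq_dim, dim))  # stand-in for time_embedder
W_proj = jax.random.normal(k2, (dim, dim * 6))    # stand-in for time_proj

t_sin = sinusoidal(timestep.reshape(-1), freq_dim).reshape(B, sl, -1)  # [B, sl, freq_dim]
temb = t_sin @ W_embed                                                 # [B, sl, dim]
timestep_proj = (jax.nn.silu(temb) @ W_proj).reshape(B, sl, 6, -1)     # [B, sl, 6, dim]
print(timestep_proj.shape)  # (2, 8, 6, 16)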
@@ -745,18 +731,12 @@ def layer_forward(hidden_states):
 
     if per_token_t:
       # temb: [B, seq_len, dim] — per-token modulation for final head
-      combined_head = jnp.expand_dims(
-          self.scale_shift_table, 0
-      ) + jnp.expand_dims(
-          temb, 2
-      )  # [B, sl, 2, dim]
+      combined_head = jnp.expand_dims(self.scale_shift_table, 0) + jnp.expand_dims(temb, 2)  # [B, sl, 2, dim]
       shift, scale = jnp.split(combined_head, 2, axis=2)
       shift = shift.squeeze(2)  # [B, sl, dim]
       scale = scale.squeeze(2)  # [B, sl, dim]
     else:
-      shift, scale = jnp.split(
-          self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1
-      )
+      shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
     hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)
     with jax.named_scope("proj_out"):
       hidden_states = self.proj_out(hidden_states)
