@@ -236,10 +236,10 @@ def __init__(
         )
 
     def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
-        hidden_states = self.act_fn(hidden_states) # Output is (4, 75600, 13824)
+        hidden_states = self.act_fn(hidden_states)  # Output is (4, 75600, 13824)
         hidden_states = checkpoint_name(hidden_states, "ffn_activation")
         hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
-        return self.proj_out(hidden_states) # output is (4, 75600, 5120)
+        return self.proj_out(hidden_states)  # output is (4, 75600, 5120)
 
 
 class WanTransformerBlock(nnx.Module):
@@ -281,7 +281,7 @@ def __init__(
             weights_dtype=weights_dtype,
             precision=precision,
             attention_kernel=attention,
-            dropout=dropout
+            dropout=dropout,
         )
 
         # 1. Cross-attention
@@ -299,7 +299,7 @@ def __init__(
             weights_dtype=weights_dtype,
             precision=precision,
             attention_kernel=attention,
-            dropout=dropout
+            dropout=dropout,
         )
         assert cross_attn_norm is True
         self.norm2 = FP32LayerNorm(rngs=rngs, dim=dim, eps=eps, elementwise_affine=True)
@@ -313,15 +313,24 @@ def __init__(
             dtype=dtype,
             weights_dtype=weights_dtype,
             precision=precision,
-            dropout=dropout
+            dropout=dropout,
         )
         self.norm3 = FP32LayerNorm(rngs=rngs, dim=dim, eps=eps, elementwise_affine=False)
 
         key = rngs.params()
         self.adaln_scale_shift_table = nnx.Param(
-            jax.random.normal(key, (1, 6, dim)) / dim**0.5,)
+            jax.random.normal(key, (1, 6, dim)) / dim**0.5,
+        )
 
-    def __call__(self, hidden_states: jax.Array, encoder_hidden_states: jax.Array, temb: jax.Array, rotary_emb: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None,):
+    def __call__(
+        self,
+        hidden_states: jax.Array,
+        encoder_hidden_states: jax.Array,
+        temb: jax.Array,
+        rotary_emb: jax.Array,
+        deterministic: bool = True,
+        rngs: nnx.Rngs = None,
+    ):
         shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
             (self.adaln_scale_shift_table + temb), 6, axis=1
         )
@@ -331,13 +340,19 @@ def __call__(self, hidden_states: jax.Array, encoder_hidden_states: jax.Array, t
         # 1. Self-attention
         norm_hidden_states = (self.norm1(hidden_states) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
         attn_output = self.attn1(
-            hidden_states=norm_hidden_states, encoder_hidden_states=norm_hidden_states, rotary_emb=rotary_emb, deterministic=deterministic, rngs=rngs
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_hidden_states,
+            rotary_emb=rotary_emb,
+            deterministic=deterministic,
+            rngs=rngs,
         )
         hidden_states = (hidden_states + attn_output * gate_msa).astype(hidden_states.dtype)
 
         # 2. Cross-attention
         norm_hidden_states = self.norm2(hidden_states)
-        attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs)
+        attn_output = self.attn2(
+            hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs
+        )
         hidden_states = hidden_states + attn_output
 
         # 3. Feed-forward
@@ -380,7 +395,7 @@ def __init__(
         attention: str = "dot_product",
         remat_policy: str = "None",
         names_which_can_be_saved: list = [],
-        names_which_can_be_offloaded: list = []
+        names_which_can_be_offloaded: list = [],
     ):
         inner_dim = num_attention_heads * attention_head_dim
         out_channels = out_channels or in_channels
@@ -417,7 +432,7 @@ def __init__(
 
         # 3. Transformer blocks
         @nnx.split_rngs(splits=num_layers)
-        @nnx.vmap(in_axes=0, out_axes=0, transform_metadata={nnx.PARTITION_NAME: "layers_per_stage"} )
+        @nnx.vmap(in_axes=0, out_axes=0, transform_metadata={nnx.PARTITION_NAME: "layers_per_stage"})
         def init_block(rngs):
             return WanTransformerBlock(
                 rngs=rngs,
@@ -496,7 +511,9 @@ def scan_fn(carry, block):
             new_carry = (hidden_states, rngs_carry)
             return new_carry, None
 
-        rematted_block_forward = self.gradient_checkpoint.apply(scan_fn, self.names_which_can_be_saved, self.names_which_can_be_offloaded)
+        rematted_block_forward = self.gradient_checkpoint.apply(
+            scan_fn, self.names_which_can_be_saved, self.names_which_can_be_offloaded
+        )
         initial_carry = (hidden_states, rngs)
         final_carry, _ = nnx.scan(
             rematted_block_forward,
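
As a side note, here is a minimal standalone sketch of the adaLN modulation pattern that WanTransformerBlock.__call__ applies in the diff above: the learned (1, 6, dim) table is offset by the time embedding, split into six shift/scale/gate chunks, and used to modulate the normalized activations before self-attention, with the remaining chunks reserved for the feed-forward branch. The function name, the norm_fn argument, and the example shapes are illustrative assumptions, not part of the patch.

import jax.numpy as jnp

def adaln_modulate(hidden_states, table, temb, norm_fn):
    # table: (1, 6, dim), temb: (batch, 6, dim); broadcasting yields six (batch, 1, dim) chunks
    shift_msa, scale_msa, gate_msa, c_shift, c_scale, c_gate = jnp.split(table + temb, 6, axis=1)
    # scale-and-shift the normalized hidden states, mirroring the self-attention branch above
    modulated = (norm_fn(hidden_states) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
    # gate_msa later scales the attention output on the residual path; the c_* chunks
    # play the same roles (shift, scale, gate) for the feed-forward sub-block
    return modulated, gate_msa, (c_shift, c_scale, c_gate)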