ruff checks: strip trailing whitespace, use dict.fromkeys, drop numpy import in wan_pipeline_i2v_2p1

prishajain1 · prishajain1 · commit 52d415beb96f · 2026-01-16T00:12:32.000+05:30
diff --git a/src/maxdiffusion/checkpointing/wan_checkpointer_i2v_2p1.py b/src/maxdiffusion/checkpointing/wan_checkpointer_i2v_2p1.py
@@ -92,4 +92,4 @@ def config_to_json(model_or_config):
 
     # Save the checkpoint
     self.checkpoint_manager.save(train_step, args=ocp.args.Composite(**items))
-    max_logging.log(f"Checkpoint for step {train_step} saved.")
+    max_logging.log(f"Checkpoint for step {train_step} saved.")
diff --git a/src/maxdiffusion/checkpointing/wan_checkpointer_i2v_2p2.py b/src/maxdiffusion/checkpointing/wan_checkpointer_i2v_2p2.py
@@ -111,4 +111,4 @@ def config_to_json(model_or_config):
 
     # Save the checkpoint
     self.checkpoint_manager.save(train_step, args=ocp.args.Composite(**items))
-    max_logging.log(f"Checkpoint for step {train_step} saved.")
+    max_logging.log(f"Checkpoint for step {train_step} saved.")
diff --git a/src/maxdiffusion/loaders/lora_conversion_utils.py b/src/maxdiffusion/loaders/lora_conversion_utils.py
@@ -391,7 +391,7 @@ def _convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
     ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]
     if not is_sparse:
       # down_weight is copied to each split
-      ait_sd.update({k: down_weight for k in ait_down_keys})
+      ait_sd.update(dict.fromkeys(ait_down_keys, down_weight))
 
       # up_weight is split to each split
       ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))})  # noqa: C416
@@ -534,7 +534,7 @@ def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
     ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]
 
     # down_weight is copied to each split
-    ait_sd.update({k: down_weight for k in ait_down_keys})
+    ait_sd.update(dict.fromkeys(ait_down_keys, down_weight))
 
     # up_weight is split to each split
     ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))})  # noqa: C416
diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py
@@ -1010,7 +1010,7 @@ def __init__(
           dtype=dtype, param_dtype=weights_dtype, precision=precision,
           bias_init=nnx.with_partitioning(
               nnx.initializers.zeros,
-              ("embed",), 
+              ("embed",),
           ),
       )
       self.add_v_proj = nnx.Linear(
@@ -1129,7 +1129,7 @@ def __call__(
         encoder_hidden_states_img = None
         encoder_hidden_states_text = encoder_hidden_states
         encoder_attention_mask_img = None
-      
+
       if self.qk_norm:
         with self.conditional_named_scope("attn_q_norm"):
           query_proj_text = self.norm_q(query_proj_raw)
diff --git a/src/maxdiffusion/models/wan/wan_utils.py b/src/maxdiffusion/models/wan/wan_utils.py
@@ -277,7 +277,7 @@ def load_base_wan_transformer(
           if "norm1" in renamed_pt_key or "norm2" in renamed_pt_key:
               renamed_pt_key = renamed_pt_key.replace("weight", "scale")
               renamed_pt_key = renamed_pt_key.replace("kernel", "scale")
-      
+
       renamed_pt_key = renamed_pt_key.replace("blocks_", "blocks.")
       renamed_pt_key = renamed_pt_key.replace(".scale_shift_table", ".adaln_scale_shift_table")
       renamed_pt_key = renamed_pt_key.replace("to_out_0", "proj_attn")
diff --git a/src/maxdiffusion/pipelines/pipeline_flax_utils.py b/src/maxdiffusion/pipelines/pipeline_flax_utils.py
@@ -473,7 +473,7 @@ def load_module(name, value):
         class_obj = import_flax_or_no_model(pipeline_module, class_name)
 
         importable_classes = ALL_IMPORTABLE_CLASSES
-        class_candidates = {c: class_obj for c in importable_classes.keys()}
+        class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
       else:
         # else we just import it from the library.
 
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -399,7 +399,7 @@ def load_scheduler(cls, config):
         flow_shift=config.flow_shift,  # 5.0 for 720p, 3.0 for 480p
     )
     return scheduler, scheduler_state
-  
+
   def encode_image(self, image: PipelineImageInput, num_videos_per_prompt: int = 1):
       if not isinstance(image, list):
           image = [image]
@@ -516,7 +516,7 @@ def prepare_latents_i2v_base(
       """
       height, width = image.shape[-2:]
       image = image[:, :, jnp.newaxis, :, :]  # [B, C, 1, H, W]
-      
+
       if last_image is None:
           video_condition = jnp.concatenate(
               [image, jnp.zeros((image.shape[0], image.shape[1], num_frames - 1, height, width), dtype=image.dtype)], axis=2
@@ -574,7 +574,7 @@ def _create_common_components(cls, config, vae_only=False, i2v=False):
           "vae": wan_vae, "vae_cache": vae_cache,
           "devices_array": devices_array, "rngs": rngs, "mesh": mesh,
           "tokenizer": None, "text_encoder": None, "scheduler": None, "scheduler_state": None,
-          "image_processor": None, "image_encoder": None 
+          "image_processor": None, "image_encoder": None
       }
 
       if not vae_only:
@@ -621,7 +621,7 @@ def _prepare_model_inputs_i2v(
     # 2. Encode Image (only for WAN 2.1 I2V which uses CLIP image embeddings)
     # WAN 2.2 I2V does not use CLIP image embeddings, it uses VAE latent conditioning instead
     transformer_dtype = self.config.activations_dtype
-    
+
     if self.config.model_name == "wan2.1":
         # WAN 2.1 I2V: Use CLIP image encoder
         if image_embeds is None:
@@ -635,7 +635,7 @@ def _prepare_model_inputs_i2v(
 
         if batch_size > 1:
             image_embeds = jnp.tile(image_embeds, (batch_size, 1, 1))
-        
+
         image_embeds = image_embeds.astype(transformer_dtype)
     else:
         # WAN 2.2 I2V: No CLIP image embeddings, set to None or empty tensor
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py
@@ -19,7 +19,6 @@
 from typing import List, Union, Optional, Tuple
 from ...pyconfig import HyperParameters
 from functools import partial
-import numpy as np
 from flax import nnx
 from flax.linen import partitioning as nn_partitioning
 import jax
@@ -88,7 +87,7 @@ def prepare_latents(
       last_image: Optional[jax.Array] = None,
       num_videos_per_prompt: int = 1,
   ) -> Tuple[jax.Array, jax.Array, Optional[jax.Array]]:
-        
+
         if hasattr(image, "detach"):
             image = image.detach().cpu().numpy()
         image = jnp.array(image)
@@ -97,12 +96,12 @@ def prepare_latents(
             if hasattr(last_image, "detach"):
                 last_image = last_image.detach().cpu().numpy()
             last_image = jnp.array(last_image)
-        
+
         if num_videos_per_prompt > 1:
            image = jnp.repeat(image, num_videos_per_prompt, axis=0)
            if last_image is not None:
               last_image = jnp.repeat(last_image, num_videos_per_prompt, axis=0)
-        
+
         num_channels_latents = self.vae.z_dim
         num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         latent_height = height // self.vae_scale_factor_spatial
@@ -119,16 +118,16 @@ def prepare_latents(
         if last_image is None:
             mask_lat_size = mask_lat_size.at[:, :, 1:, :, :].set(0)
         else:
-            mask_lat_size = mask_lat_size.at[:, :, 1:-1, :, :].set(0)     
+            mask_lat_size = mask_lat_size.at[:, :, 1:-1, :, :].set(0)
         first_frame_mask = mask_lat_size[:, :, 0:1]
         first_frame_mask = jnp.repeat(first_frame_mask, self.vae_scale_factor_temporal, axis=2)
         mask_lat_size = jnp.concatenate([first_frame_mask, mask_lat_size[:, :, 1:]], axis=2)
         mask_lat_size = mask_lat_size.reshape(
-          batch_size, 
+          batch_size,
           1,
-          num_latent_frames, 
-          self.vae_scale_factor_temporal, 
-          latent_height, 
+          num_latent_frames,
+          self.vae_scale_factor_temporal,
+          latent_height,
           latent_width
         )
         mask_lat_size = jnp.transpose(mask_lat_size, (0, 2, 4, 5, 3, 1)).squeeze(-1)
@@ -210,7 +209,7 @@ def _process_image_input(img_input, height, width, num_videos_per_prompt):
     scheduler_state = self.scheduler.set_timesteps(
         self.scheduler_state, num_inference_steps=num_inference_steps, shape=latents.shape
     )
-    
+
     graphdef, state, rest_of_state = nnx.split(self.transformer, nnx.Param, ...)
     data_sharding = NamedSharding(self.mesh, P())
     if self.config.global_batch_size_to_train_on // self.config.per_device_batch_size == 0:
@@ -234,7 +233,7 @@ def _process_image_input(img_input, height, width, num_videos_per_prompt):
         scheduler=self.scheduler,
     )
 
-    
+
     with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
       latents = p_run_inference(
           latents=latents,
@@ -246,7 +245,7 @@ def _process_image_input(img_input, height, width, num_videos_per_prompt):
       )
       latents = jnp.transpose(latents, (0, 4, 1, 2, 3))
       latents = self._denormalize_latents(latents)
-      
+
     if output_type == "latent":
       return latents
     return self._decode_latents_to_video(latents)
@@ -287,5 +286,5 @@ def run_inference_2_1_i2v(
         encoder_hidden_states_image=image_embeds,
     )
     noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
-    latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents, return_dict=False)  
-  return latents
+    latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents, return_dict=False)
+  return latents
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p2.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p2.py
@@ -87,7 +87,7 @@ def prepare_latents(
     last_image: Optional[jax.Array] = None,
     num_videos_per_prompt: int = 1,
 ) -> Tuple[jax.Array, jax.Array, Optional[jax.Array]]:
-    
+
     if hasattr(image, "detach"):
         image = image.detach().cpu().numpy()
     image = jnp.array(image)
@@ -109,12 +109,12 @@ def prepare_latents(
     else:
         latents = latents.astype(dtype)
 
-    latent_condition, _ = self.prepare_latents_i2v_base(image, num_frames, dtype, last_image)    
+    latent_condition, _ = self.prepare_latents_i2v_base(image, num_frames, dtype, last_image)
     mask_lat_size = jnp.ones((batch_size, 1, num_frames, latent_height, latent_width), dtype=dtype)
     if last_image is None:
         mask_lat_size = mask_lat_size.at[:, :, 1:, :, :].set(0)
     else:
-        mask_lat_size = mask_lat_size.at[:, :, 1:-1, :, :].set(0)     
+        mask_lat_size = mask_lat_size.at[:, :, 1:-1, :, :].set(0)
 
     first_frame_mask = mask_lat_size[:, :, 0:1]
     first_frame_mask = jnp.repeat(first_frame_mask, self.vae_scale_factor_temporal, axis=2)
@@ -123,9 +123,9 @@ def prepare_latents(
         batch_size, 1, num_latent_frames, self.vae_scale_factor_temporal, latent_height, latent_width
     )
     mask_lat_size = jnp.transpose(mask_lat_size, (0, 2, 4, 5, 3, 1)).squeeze(-1)
-    condition = jnp.concatenate([mask_lat_size, latent_condition], axis=-1)        
+    condition = jnp.concatenate([mask_lat_size, latent_condition], axis=-1)
     return latents, condition, None
- 
+
   def __call__(
     self,
     prompt: Union[str, List[str]],
@@ -297,7 +297,7 @@ def low_noise_branch(operands):
             latents_input = jnp.concatenate([latents, latents], axis=0)
         latent_model_input = jnp.concatenate([latents_input, condition], axis=-1)
         timestep = jnp.broadcast_to(t, latents_input.shape[0])
-            
+
         use_high_noise = jnp.greater_equal(t, boundary)
         noise_pred, _ = jax.lax.cond(
         use_high_noise,
@@ -307,4 +307,4 @@ def low_noise_branch(operands):
         )
         noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
         latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
-    return latents
+    return latents