Skip to content

Commit 8a18686

Browse files
authored
Merge branch 'main' into elisatsai_disable_unsafe_rng
2 parents f68c7b0 + 7d25dc9 commit 8a18686

9 files changed

Lines changed: 56 additions & 41 deletions

File tree

.github/workflows/UploadDockerImages.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ on:
2121
schedule:
2222
# Run the job daily at 12AM UTC
2323
- cron: '0 0 * * *'
24+
25+
workflow_dispatch:
2426

2527
jobs:
2628
build-image:

maxdiffusion_jax_ai_image_tpu.Dockerfile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ ARG JAX_AI_IMAGE_BASEIMAGE
33
# JAX AI Base Image
44
FROM $JAX_AI_IMAGE_BASEIMAGE
55

6+
ARG JAX_AI_IMAGE_BASEIMAGE
7+
68
ARG COMMIT_HASH
79

810
ENV COMMIT_HASH=$COMMIT_HASH
@@ -18,5 +20,12 @@ COPY . .
1820
# Install Maxdiffusion Jax AI Image requirements
1921
RUN pip install -r /deps/requirements_with_jax_ai_image.txt
2022

23+
# TODO: Remove the flax pin and fsspec overrides once flax stable version releases
24+
RUN if echo "$JAX_AI_IMAGE_BASEIMAGE" | grep -q "nightly"; then \
25+
echo "Nightly build detected: Installing specific Flax commit and fsspec." && \
26+
pip install --upgrade --force-reinstall git+https://github.com/google/flax.git@ef78d6584623511746be4824965cdef42b464583 && \
27+
pip install "fsspec==2025.10.0"; \
28+
fi
29+
2130
# Run the script available in JAX-AI-Image base image to generate the manifest file
2231
RUN bash /jax-ai-image/generate_manifest.sh PREFIX=maxdiffusion COMMIT_HASH=$COMMIT_HASH

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -341,8 +341,10 @@ quantization: ''
341341
quantization_local_shard_count: -1
342342
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.
343343
use_qwix_quantization: False # Whether to use qwix for quantization. If set to True, the transformer of WAN will be quantized using qwix.
344-
# Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
345-
quantization_calibration_method: "absmax"
344+
# Quantization calibration method used for weights, activations and bwd. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
345+
weight_quantization_calibration_method: "absmax"
346+
act_quantization_calibration_method: "absmax"
347+
bwd_quantization_calibration_method: "absmax"
346348
qwix_module_path: ".*"
347349

348350
# Eval model on per eval_every steps. -1 means don't eval.

src/maxdiffusion/models/attention_flax.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,13 +1005,12 @@ def __call__(
10051005
if encoder_hidden_states is None:
10061006
encoder_hidden_states = hidden_states
10071007

1008-
with self.conditional_named_scope("attn_qkv_proj"):
1009-
with self.conditional_named_scope("proj_query"):
1010-
query_proj = self.query(hidden_states)
1011-
with self.conditional_named_scope("proj_key"):
1012-
key_proj = self.key(encoder_hidden_states)
1013-
with self.conditional_named_scope("proj_value"):
1014-
value_proj = self.value(encoder_hidden_states)
1008+
with jax.named_scope("query_proj"):
1009+
query_proj = self.query(hidden_states)
1010+
with jax.named_scope("key_proj"):
1011+
key_proj = self.key(encoder_hidden_states)
1012+
with jax.named_scope("value_proj"):
1013+
value_proj = self.value(encoder_hidden_states)
10151014

10161015
if self.qk_norm:
10171016
with self.conditional_named_scope("attn_q_norm"):
@@ -1031,13 +1030,13 @@ def __call__(
10311030
key_proj = checkpoint_name(key_proj, "key_proj")
10321031
value_proj = checkpoint_name(value_proj, "value_proj")
10331032

1034-
with self.conditional_named_scope("attn_compute"):
1033+
with jax.named_scope("apply_attention"):
10351034
attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
10361035

10371036
attn_output = attn_output.astype(dtype=dtype)
10381037
attn_output = checkpoint_name(attn_output, "attn_output")
10391038

1040-
with self.conditional_named_scope("attn_out_proj"):
1039+
with jax.named_scope("proj_attn"):
10411040
hidden_states = self.proj_attn(attn_output)
10421041
hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
10431042
return hidden_states

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,8 @@ def __call__(
142142
):
143143
timestep = self.timesteps_proj(timestep)
144144
temb = self.time_embedder(timestep)
145-
146-
timestep_proj = self.time_proj(self.act_fn(temb))
145+
with jax.named_scope("time_proj"):
146+
timestep_proj = self.time_proj(self.act_fn(temb))
147147

148148
encoder_hidden_states = self.text_embedder(encoder_hidden_states)
149149
if encoder_hidden_states_image is not None:
@@ -186,7 +186,8 @@ def __init__(
186186
)
187187

188188
def __call__(self, x: jax.Array) -> jax.Array:
189-
x = self.proj(x)
189+
with jax.named_scope("gelu"):
190+
x = self.proj(x)
190191
return nnx.gelu(x)
191192

192193

@@ -244,12 +245,11 @@ def conditional_named_scope(self, name: str):
244245
return jax.named_scope(name) if self.enable_jax_named_scopes else contextlib.nullcontext()
245246

246247
def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
247-
with self.conditional_named_scope("mlp_up_proj_and_gelu"):
248248
hidden_states = self.act_fn(hidden_states) # Output is (4, 75600, 13824)
249249
hidden_states = checkpoint_name(hidden_states, "ffn_activation")
250250
hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
251-
with self.conditional_named_scope("mlp_down_proj"):
252-
return self.proj_out(hidden_states) # output is (4, 75600, 5120)
251+
with jax.named_scope("proj_out"):
252+
return self.proj_out(hidden_states) # output is (4, 75600, 5120)
253253

254254

255255
class WanTransformerBlock(nnx.Module):
@@ -359,10 +359,9 @@ def __call__(
359359
rngs: nnx.Rngs = None,
360360
):
361361
with self.conditional_named_scope("transformer_block"):
362-
with self.conditional_named_scope("adaln"):
363-
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
364-
(self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
365-
)
362+
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
363+
(self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
364+
)
366365
hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
367366
hidden_states = checkpoint_name(hidden_states, "hidden_states")
368367
encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", None))
@@ -558,6 +557,7 @@ def conditional_named_scope(self, name: str):
558557
"""Return a JAX named scope if enabled, otherwise a null context."""
559558
return jax.named_scope(name) if self.enable_jax_named_scopes else contextlib.nullcontext()
560559

560+
@jax.named_scope('WanModel')
561561
def __call__(
562562
self,
563563
hidden_states: jax.Array,
@@ -625,9 +625,8 @@ def layer_forward(hidden_states):
625625
hidden_states = rematted_layer_forward(hidden_states)
626626

627627
shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
628-
with self.conditional_named_scope("output_norm"):
629-
hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)
630-
with self.conditional_named_scope("output_proj"):
628+
hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)
629+
with jax.named_scope("proj_out"):
631630
hidden_states = self.proj_out(hidden_states)
632631

633632
hidden_states = hidden_states.reshape(

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -303,9 +303,9 @@ def get_fp8_config(cls, config: HyperParameters):
303303
act_qtype=jnp.float8_e4m3fn,
304304
bwd_qtype=jnp.float8_e5m2,
305305
disable_channelwise_axes=True, # per_tensor calibration
306-
weight_calibration_method=config.quantization_calibration_method,
307-
act_calibration_method=config.quantization_calibration_method,
308-
bwd_calibration_method=config.quantization_calibration_method,
306+
weight_calibration_method=config.weight_quantization_calibration_method,
307+
act_calibration_method=config.act_quantization_calibration_method,
308+
bwd_calibration_method=config.bwd_quantization_calibration_method,
309309
op_names=("dot_general", "einsum"),
310310
),
311311
qwix.QtRule(
@@ -314,9 +314,9 @@ def get_fp8_config(cls, config: HyperParameters):
314314
act_qtype=jnp.float8_e4m3fn,
315315
bwd_qtype=jnp.float8_e4m3fn,
316316
disable_channelwise_axes=True, # per_tensor calibration
317-
weight_calibration_method=config.quantization_calibration_method,
318-
act_calibration_method=config.quantization_calibration_method,
319-
bwd_calibration_method=config.quantization_calibration_method,
317+
weight_calibration_method=config.weight_quantization_calibration_method,
318+
act_calibration_method=config.act_quantization_calibration_method,
319+
bwd_calibration_method=config.bwd_quantization_calibration_method,
320320
op_names=("conv_general_dilated"),
321321
),
322322
]

src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def _load_and_init(cls, config, restored_checkpoint=None, vae_only=False, load_t
6262
@classmethod
6363
def from_pretrained(cls, config: HyperParameters, vae_only=False, load_transformer=True):
6464
pipeline , transformer = cls._load_and_init(config, None, vae_only, load_transformer)
65-
transformer = cls.quantize_transformer(config, transformer, pipeline, pipeline.mesh)
65+
pipeline.transformer = cls.quantize_transformer(config, transformer, pipeline, pipeline.mesh)
6666
return pipeline
6767

6868
@classmethod

src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ def _load_and_init(cls, config, restored_checkpoint=None, vae_only=False, load_t
7070
@classmethod
7171
def from_pretrained(cls, config: HyperParameters, vae_only=False, load_transformer=True):
7272
pipeline, low_noise_transformer, high_noise_transformer = cls._load_and_init(config, None, vae_only, load_transformer)
73-
low_noise_transformer = cls.quantize_transformer(config, low_noise_transformer, pipeline, pipeline.mesh)
74-
high_noise_transformer = cls.quantize_transformer(config, high_noise_transformer, pipeline, pipeline.mesh)
73+
pipeline.low_noise_transformer = cls.quantize_transformer(config, low_noise_transformer, pipeline, pipeline.mesh)
74+
pipeline.high_noise_transformer = cls.quantize_transformer(config, high_noise_transformer, pipeline, pipeline.mesh)
7575
return pipeline
7676

7777
@classmethod

src/maxdiffusion/tests/wan_transformer_test.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,9 @@ def create_real_rule_instance(*args, **kwargs):
346346
config_fp8_full = Mock(spec=HyperParameters)
347347
config_fp8_full.use_qwix_quantization = True
348348
config_fp8_full.quantization = "fp8_full"
349-
config_fp8_full.quantization_calibration_method = "absmax"
349+
config_fp8_full.weight_quantization_calibration_method = "fixed,-224,224"
350+
config_fp8_full.act_quantization_calibration_method = "fixed,-224,224"
351+
config_fp8_full.bwd_quantization_calibration_method = "absmax"
350352
config_fp8_full.qwix_module_path = ".*"
351353
provider_fp8_full = WanPipeline.get_qt_provider(config_fp8_full)
352354
self.assertIsNotNone(provider_fp8_full)
@@ -357,9 +359,9 @@ def create_real_rule_instance(*args, **kwargs):
357359
act_qtype=jnp.float8_e4m3fn,
358360
bwd_qtype=jnp.float8_e5m2,
359361
disable_channelwise_axes=True, # per_tensor calibration
360-
weight_calibration_method=config_fp8_full.quantization_calibration_method,
361-
act_calibration_method=config_fp8_full.quantization_calibration_method,
362-
bwd_calibration_method=config_fp8_full.quantization_calibration_method,
362+
weight_calibration_method=config_fp8_full.weight_quantization_calibration_method,
363+
act_calibration_method=config_fp8_full.act_quantization_calibration_method,
364+
bwd_calibration_method=config_fp8_full.bwd_quantization_calibration_method,
363365
op_names=("dot_general", "einsum"),
364366
),
365367
call(
@@ -368,9 +370,9 @@ def create_real_rule_instance(*args, **kwargs):
368370
act_qtype=jnp.float8_e4m3fn,
369371
bwd_qtype=jnp.float8_e4m3fn,
370372
disable_channelwise_axes=True, # per_tensor calibration
371-
weight_calibration_method=config_fp8_full.quantization_calibration_method,
372-
act_calibration_method=config_fp8_full.quantization_calibration_method,
373-
bwd_calibration_method=config_fp8_full.quantization_calibration_method,
373+
weight_calibration_method=config_fp8_full.weight_quantization_calibration_method,
374+
act_calibration_method=config_fp8_full.act_quantization_calibration_method,
375+
bwd_calibration_method=config_fp8_full.bwd_quantization_calibration_method,
374376
op_names=("conv_general_dilated"),
375377
),
376378
]
@@ -395,7 +397,9 @@ def test_quantize_transformer_enabled(self, mock_get_dummy_inputs, mock_quantize
395397
mock_config.quantization = "fp8_full"
396398
mock_config.qwix_module_path = ".*"
397399
mock_config.per_device_batch_size = 1
398-
mock_config.quantization_calibration_method = "absmax"
400+
mock_config.weight_quantization_calibration_method = "fixed,-224,224"
401+
mock_config.act_quantization_calibration_method = "fixed,-224,224"
402+
mock_config.bwd_quantization_calibration_method = "absmax"
399403

400404
mock_model = Mock(spec=WanModel)
401405
mock_pipeline = Mock()

0 commit comments

Comments (0)