Commit c1fc264

moving vae logic to config files
1 parent 57aaf0d commit c1fc264

11 files changed: 87 additions & 142 deletions


dependencies/requirements/generated_requirements/requirements.txt

Lines changed: 2 additions & 0 deletions

@@ -2,6 +2,7 @@
 # If you need to modify dependencies, please do so in the host requirements file and run seed-env again.
 
 absl-py>=2.3.1
+accelerate>=1.13.0
 aiofiles>=25.1.0
 aiohappyeyeballs>=2.6.1
 aiohttp>=3.13.3
@@ -80,6 +81,7 @@ isort>=8.0.1
 jaraco-functools>=4.4.0
 jax>=0.9.0
 jaxlib>=0.9.0
+jaxopt>=0.8.5
 jaxtyping>=0.3.9
 jinja2>=3.1.6
 keras>=3.13.1

requirements.txt

Lines changed: 0 additions & 40 deletions
This file was deleted.

requirements_with_jax_ai_image.txt

Lines changed: 0 additions & 41 deletions
This file was deleted.

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 12 additions & 0 deletions

@@ -181,6 +181,18 @@ logical_axis_rules: [
   ['out_channels', 'tensor'],
   ['conv_out', 'context'],
 ]
+vae_logical_axis_rules: [
+  ['activation_batch', 'redundant'],
+  ['activation_length', 'vae_spatial'],
+  ['activation_heads', null],
+  ['activation_kv_length', null],
+  ['embed', null],
+  ['heads', null],
+  ['norm', null],
+  ['conv_batch', 'redundant'],
+  ['out_channels', 'vae_spatial'],
+  ['conv_out', 'vae_spatial'],
+]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]
 
 # One axis for each parallelism type may hold a placeholder (-1)
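The new vae_logical_axis_rules block maps each logical axis name used inside the VAE to a mesh axis ('redundant' or 'vae_spatial'); null means the axis is replicated rather than sharded. A minimal sketch of how such rules resolve to a PartitionSpec via Flax's logical partitioning helpers — the rule subset below is illustrative, not the full config:

# Sketch: resolving logical axis names to mesh axes with Flax's spmd helpers.
from flax import linen as nn

vae_logical_axis_rules = (
    ("activation_batch", "redundant"),
    ("activation_length", "vae_spatial"),
    ("embed", None),  # null in YAML -> None: this axis stays replicated
)

# A VAE activation annotated as (batch, length, embed) resolves to:
spec = nn.logical_to_mesh_axes(
    ("activation_batch", "activation_length", "embed"),
    vae_logical_axis_rules,
)
print(spec)  # PartitionSpec('redundant', 'vae_spatial', None)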

src/maxdiffusion/configs/base_wan_1_3b.yml

Lines changed: 13 additions & 1 deletion

@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2023 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -157,6 +157,18 @@ logical_axis_rules: [
   ['out_channels', 'tensor'],
   ['conv_out', 'context'],
 ]
+vae_logical_axis_rules: [
+  ['activation_batch', 'redundant'],
+  ['activation_length', 'vae_spatial'],
+  ['activation_heads', null],
+  ['activation_kv_length', null],
+  ['embed', null],
+  ['heads', null],
+  ['norm', null],
+  ['conv_batch', 'redundant'],
+  ['out_channels', 'vae_spatial'],
+  ['conv_out', 'vae_spatial'],
+]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]
 
 # One axis for each parallelism type may hold a placeholder (-1)

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 12 additions & 0 deletions

@@ -169,6 +169,18 @@ logical_axis_rules: [
   ['out_channels', 'tensor'],
   ['conv_out', 'context'],
 ]
+vae_logical_axis_rules: [
+  ['activation_batch', 'redundant'],
+  ['activation_length', 'vae_spatial'],
+  ['activation_heads', null],
+  ['activation_kv_length', null],
+  ['embed', null],
+  ['heads', null],
+  ['norm', null],
+  ['conv_batch', 'redundant'],
+  ['out_channels', 'vae_spatial'],
+  ['conv_out', 'vae_spatial'],
+]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]
 
 # One axis for each parallelism type may hold a placeholder (-1)

src/maxdiffusion/configs/base_wan_i2v_14b.yml

Lines changed: 12 additions & 0 deletions

@@ -163,6 +163,18 @@ logical_axis_rules: [
   ['out_channels', 'tensor'],
   ['conv_out', 'context'],
 ]
+vae_logical_axis_rules: [
+  ['activation_batch', 'redundant'],
+  ['activation_length', 'vae_spatial'],
+  ['activation_heads', null],
+  ['activation_kv_length', null],
+  ['embed', null],
+  ['heads', null],
+  ['norm', null],
+  ['conv_batch', 'redundant'],
+  ['out_channels', 'vae_spatial'],
+  ['conv_out', 'vae_spatial'],
+]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]
 
 # One axis for each parallelism type may hold a placeholder (-1)

src/maxdiffusion/configs/base_wan_i2v_27b.yml

Lines changed: 12 additions & 0 deletions

@@ -164,6 +164,18 @@ logical_axis_rules: [
   ['out_channels', 'tensor'],
   ['conv_out', 'context'],
 ]
+vae_logical_axis_rules: [
+  ['activation_batch', 'redundant'],
+  ['activation_length', 'vae_spatial'],
+  ['activation_heads', null],
+  ['activation_kv_length', null],
+  ['embed', null],
+  ['heads', null],
+  ['norm', null],
+  ['conv_batch', 'redundant'],
+  ['out_channels', 'vae_spatial'],
+  ['conv_out', 'vae_spatial'],
+]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]
 
 # One axis for each parallelism type may hold a placeholder (-1)

src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 8 additions & 48 deletions

@@ -212,33 +212,18 @@ def load_base_wan_transformer(
   device = jax.local_devices(backend=device)[0]
   filename = "diffusion_pytorch_model.safetensors.index.json"
   local_files = False
-
-  # Only rank 0 downloads; others wait for cache to be populated
-  process_index = jax.process_index()
   if os.path.isdir(pretrained_model_name_or_path):
     index_file_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
     if not os.path.isfile(index_file_path):
       raise FileNotFoundError(f"File {index_file_path} not found for local directory.")
     local_files = True
   elif hf_download:
-    # Only rank 0 downloads; synchronize across all ranks
-    if process_index == 0:
-      # download the index file for sharded models.
-      index_file_path = hf_hub_download(
-          pretrained_model_name_or_path,
-          subfolder=subfolder,
-          filename=filename,
-      )
-    jax.experimental.multihost_utils.sync_global_devices("model_index_download")
-
-    if process_index != 0:
-      # Non-rank-0 processes wait and use the cached path
-      index_file_path = hf_hub_download(
-          pretrained_model_name_or_path,
-          subfolder=subfolder,
-          filename=filename,
-          force_download=False,  # Use cache, don't download
-      )
+    # download the index file for sharded models.
+    index_file_path = hf_hub_download(
+        pretrained_model_name_or_path,
+        subfolder=subfolder,
+        filename=filename,
+    )
   with jax.default_device(device):
     # open the index file.
     with open(index_file_path, "r") as f:
@@ -253,19 +238,7 @@ def load_base_wan_transformer(
       if local_files:
         ckpt_shard_path = os.path.join(pretrained_model_name_or_path, subfolder, model_file)
       else:
-        # Only rank 0 downloads new files; others use cached versions
-        if process_index == 0:
-          ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
-          jax.experimental.multihost_utils.sync_global_devices(f"model_download_{model_file}")
-
-        if process_index != 0:
-          # Non-rank-0: use cached version
-          ckpt_shard_path = hf_hub_download(
-              pretrained_model_name_or_path,
-              subfolder=subfolder,
-              filename=model_file,
-              force_download=False,  # Use cache
-          )
+        ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
       # now get all the filenames for the model that need downloading
       max_logging.log(f"Load and port {pretrained_model_name_or_path} {subfolder} on {device}")
 
@@ -331,25 +304,12 @@ def load_wan_vae(pretrained_model_name_or_path: str, eval_shapes: dict, device:
   device = jax.devices(device)[0]
   subfolder = "vae"
   filename = "diffusion_pytorch_model.safetensors"
-  process_index = jax.process_index()
-
   if os.path.isdir(pretrained_model_name_or_path):
     ckpt_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
     if not os.path.isfile(ckpt_path):
       raise FileNotFoundError(f"File {ckpt_path} not found for local directory.")
   elif hf_download:
-    # Only rank 0 downloads; others use cache
-    if process_index == 0:
-      ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
-      jax.experimental.multihost_utils.sync_global_devices("vae_download")
-
-    if process_index != 0:
-      ckpt_path = hf_hub_download(
-          pretrained_model_name_or_path,
-          subfolder=subfolder,
-          filename=filename,
-          force_download=False,  # Use cache
-      )
+    ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
   max_logging.log(f"Load and port {pretrained_model_name_or_path} VAE on {device}")
   with jax.default_device(device):
     if ckpt_path is not None:
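With the rank-0 gating removed, every process calls hf_hub_download directly; the call is cache-backed, so a file already present locally resolves to its cached path without a re-download (this relies on huggingface_hub's cache and drops the explicit cross-host synchronization the deleted code provided). A minimal sketch of the index-then-shards flow the remaining code follows; the repo id is an illustrative example, not taken from this commit:

# Sketch of the sharded-checkpoint flow: fetch the index, then each shard.
import json
from huggingface_hub import hf_hub_download

repo = "Wan-AI/Wan2.1-T2V-14B-Diffusers"  # illustrative repo id

index_path = hf_hub_download(
    repo, subfolder="transformer",
    filename="diffusion_pytorch_model.safetensors.index.json",
)
with open(index_path, "r") as f:
    index = json.load(f)

# weight_map maps each tensor name to the shard file that holds it.
shard_files = sorted(set(index["weight_map"].values()))
for shard in shard_files:
    # Repeat calls with the same arguments return the cached local path.
    path = hf_hub_download(repo, subfolder="transformer", filename=shard)
    print(f"{shard} -> {path}")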

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 14 additions & 12 deletions

@@ -635,18 +635,20 @@ def _create_common_components(cls, config, vae_only=False, i2v=False):
     )
 
     # logical axis rules for VAE encoding/decoding
-    vae_logical_axis_rules = (
-        ("activation_batch", "redundant"),
-        ("activation_length", "vae_spatial"),
-        ("activation_heads", None),
-        ("activation_kv_length", None),
-        ("embed", None),
-        ("heads", None),
-        ("norm", None),
-        ("conv_batch", "redundant"),
-        ("out_channels", "vae_spatial"),
-        ("conv_out", "vae_spatial"),
-    )
+    vae_logical_axis_rules = getattr(config, "vae_logical_axis_rules", None)
+    if vae_logical_axis_rules is None:
+      vae_logical_axis_rules = (
+          ("activation_batch", "redundant"),
+          ("activation_length", "vae_spatial"),
+          ("activation_heads", None),
+          ("activation_kv_length", None),
+          ("embed", None),
+          ("heads", None),
+          ("norm", None),
+          ("conv_batch", "redundant"),
+          ("out_channels", "vae_spatial"),
+          ("conv_out", "vae_spatial"),
+      )
 
     rng = jax.random.key(config.seed)
     rngs = nnx.Rngs(rng)
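The pipeline now prefers config-supplied rules and keeps the previous hardcoded tuple as a fallback. One subtlety: YAML parsing yields nested lists with None for null, while the fallback is a tuple of tuples. A hypothetical normalization helper (not part of this commit) shows the two forms coerce to the same shape:

# Hypothetical helper: coerce YAML-sourced rules (lists, null -> None)
# to the tuple-of-tuples shape of the hardcoded fallback.
def normalize_rules(rules):
    return tuple((name, mesh_axis) for name, mesh_axis in rules)

yaml_rules = [["activation_batch", "redundant"], ["embed", None]]  # from config
fallback = (("activation_batch", "redundant"), ("embed", None))    # hardcoded

assert normalize_rules(yaml_rules) == fallback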
