Skip to content

Commit b5d6dd7

Browse files
committed
more changes
1 parent fad8be1 commit b5d6dd7

6 files changed

Lines changed: 59 additions & 16 deletions

File tree

dependencies/requirements/generated_requirements/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ hf-transfer>=0.1.9
6767
hf-xet>=1.4.2 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
6868
httpcore>=1.0.9
6969
httpx>=0.28.1
70-
huggingface-hub>=0.36.2
70+
huggingface-hub>=1.10.1
7171
humanize>=4.15.0
7272
hypothesis>=6.142.1
7373
idna>=3.11

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ ftfy
1414
tensorboard>=2.17.0
1515
tensorboardx>=2.6.2.2
1616
tensorboard-plugin-profile>=2.15.2
17-
tokamax
1817
Jinja2
1918
scikit-image
2019
parameterized

src/maxdiffusion/models/attention_flax.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from maxdiffusion.kernels.splash_attention import splash_attention_mask as tokamax_splash_attention_mask
2828
from maxdiffusion.kernels.splash_attention import splash_attention_kernel as tokamax_splash_attention_kernel
2929
from maxdiffusion.kernels.splash_attention import ring_attention_kernel as tokamax_ring_attention_kernel
30+
from maxdiffusion.kernels.splash_attention import base as tokamax_splash_base
3031
from einops import rearrange
3132
from .. import common_types, max_logging
3233

@@ -363,7 +364,10 @@ def wrap_flash_attention(query, key, value):
363364
# Both are (kv_padded_len,) - element-wise multiplication
364365
kv_segment_ids = (kv_segment_ids * kv_mask_padded).astype(jnp.int32)
365366

366-
segment_ids = splash_attention_kernel.SegmentIds(q=q_segment_ids, kv=kv_segment_ids)
367+
if attention_kernel == "tokamax_ring":
368+
segment_ids = tokamax_splash_base.SegmentIds(q=q_segment_ids, kv=kv_segment_ids)
369+
else:
370+
segment_ids = splash_attention_kernel.SegmentIds(q=q_segment_ids, kv=kv_segment_ids)
367371

368372
# make_splash_mha is wrapped around shardmap and seq and head is already
369373
# sharded based on in_specs, therefore setting head_shards=1 and q_seq_shards=1.
@@ -1954,4 +1958,4 @@ def setup(self):
19541958
def __call__(self, hidden_states, deterministic=True):
19551959
hidden_states = self.proj(hidden_states)
19561960
hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
1957-
return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
1961+
return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)

src/maxdiffusion/models/wan/autoencoder_kl_wan.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -759,7 +759,6 @@ def __init__(
759759
precision=precision,
760760
)
761761

762-
@nnx.jit(static_argnames="feat_idx")
763762
def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0):
764763
if feat_cache is not None:
765764
idx = feat_idx
@@ -908,7 +907,6 @@ def __init__(
908907
precision=precision,
909908
)
910909

911-
@nnx.jit(static_argnames="feat_idx")
912910
def __call__(self, x: jax.Array, feat_cache=None, feat_idx=0):
913911
if feat_cache is not None:
914912
idx = feat_idx
@@ -1104,6 +1102,7 @@ def __init__(
11041102
)
11051103
self.mesh = mesh
11061104

1105+
@nnx.jit
11071106
def _encode(self, x: jax.Array, feat_cache: AutoencoderKLWanCache):
11081107
feat_cache.init_cache()
11091108
if x.shape[-1] != 3:
@@ -1175,6 +1174,7 @@ def encode(
11751174
return (posterior,)
11761175
return FlaxAutoencoderKLOutput(latent_dist=posterior)
11771176

1177+
@nnx.jit
11781178
def _decode(
11791179
self, z: jax.Array, feat_cache: AutoencoderKLWanCache, return_dict: bool = True
11801180
) -> Union[FlaxDecoderOutput, jax.Array]:

src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -212,18 +212,33 @@ def load_base_wan_transformer(
212212
device = jax.local_devices(backend=device)[0]
213213
filename = "diffusion_pytorch_model.safetensors.index.json"
214214
local_files = False
215+
216+
# Only rank 0 downloads; others wait for cache to be populated
217+
process_index = jax.process_index()
215218
if os.path.isdir(pretrained_model_name_or_path):
216219
index_file_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
217220
if not os.path.isfile(index_file_path):
218221
raise FileNotFoundError(f"File {index_file_path} not found for local directory.")
219222
local_files = True
220223
elif hf_download:
221-
# download the index file for sharded models.
222-
index_file_path = hf_hub_download(
223-
pretrained_model_name_or_path,
224-
subfolder=subfolder,
225-
filename=filename,
226-
)
224+
# Only rank 0 downloads; synchronize across all ranks
225+
if process_index == 0:
226+
# download the index file for sharded models.
227+
index_file_path = hf_hub_download(
228+
pretrained_model_name_or_path,
229+
subfolder=subfolder,
230+
filename=filename,
231+
)
232+
jax.experimental.multihost_utils.sync_global_devices("model_index_download")
233+
234+
if process_index != 0:
235+
# Non-rank-0 processes wait and use the cached path
236+
index_file_path = hf_hub_download(
237+
pretrained_model_name_or_path,
238+
subfolder=subfolder,
239+
filename=filename,
240+
force_download=False, # Use cache, don't download
241+
)
227242
with jax.default_device(device):
228243
# open the index file.
229244
with open(index_file_path, "r") as f:
@@ -238,7 +253,19 @@ def load_base_wan_transformer(
238253
if local_files:
239254
ckpt_shard_path = os.path.join(pretrained_model_name_or_path, subfolder, model_file)
240255
else:
241-
ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
256+
# Only rank 0 downloads new files; others use cached versions
257+
if process_index == 0:
258+
ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
259+
jax.experimental.multihost_utils.sync_global_devices(f"model_download_{model_file}")
260+
261+
if process_index != 0:
262+
# Non-rank-0: use cached version
263+
ckpt_shard_path = hf_hub_download(
264+
pretrained_model_name_or_path,
265+
subfolder=subfolder,
266+
filename=model_file,
267+
force_download=False, # Use cache
268+
)
242269
# now get all the filenames for the model that need downloading
243270
max_logging.log(f"Load and port {pretrained_model_name_or_path} {subfolder} on {device}")
244271

@@ -304,12 +331,25 @@ def load_wan_vae(pretrained_model_name_or_path: str, eval_shapes: dict, device:
304331
device = jax.devices(device)[0]
305332
subfolder = "vae"
306333
filename = "diffusion_pytorch_model.safetensors"
334+
process_index = jax.process_index()
335+
307336
if os.path.isdir(pretrained_model_name_or_path):
308337
ckpt_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
309338
if not os.path.isfile(ckpt_path):
310339
raise FileNotFoundError(f"File {ckpt_path} not found for local directory.")
311340
elif hf_download:
312-
ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
341+
# Only rank 0 downloads; others use cache
342+
if process_index == 0:
343+
ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
344+
jax.experimental.multihost_utils.sync_global_devices("vae_download")
345+
346+
if process_index != 0:
347+
ckpt_path = hf_hub_download(
348+
pretrained_model_name_or_path,
349+
subfolder=subfolder,
350+
filename=filename,
351+
force_download=False, # Use cache
352+
)
313353
max_logging.log(f"Load and port {pretrained_model_name_or_path} VAE on {device}")
314354
with jax.default_device(device):
315355
if ckpt_path is not None:

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ def encode_prompt(
492492
num_videos_per_prompt=num_videos_per_prompt,
493493
max_sequence_length=max_sequence_length,
494494
)
495-
prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=jnp.float32)
495+
prompt_embeds = jnp.array(prompt_embeds.detach().float().numpy(), dtype=jnp.float32)
496496

497497
if negative_prompt_embeds is None:
498498
negative_prompt = negative_prompt or ""
@@ -502,7 +502,7 @@ def encode_prompt(
502502
num_videos_per_prompt=num_videos_per_prompt,
503503
max_sequence_length=max_sequence_length,
504504
)
505-
negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=jnp.float32)
505+
negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().float().numpy(), dtype=jnp.float32)
506506

507507
return prompt_embeds, negative_prompt_embeds
508508

0 commit comments

Comments (0)