
Commit fcb1ab1

update unit tests.
1 parent a585a75 commit fcb1ab1

11 files changed

Lines changed: 375 additions & 368 deletions


src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 12 additions & 11 deletions
@@ -55,17 +55,18 @@ from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te

-#flash_block_sizes: {}
-flash_block_sizes: {
-  "block_q" : 3024,
-  "block_kv_compute" : 1024,
-  "block_kv" : 2048,
-  "block_q_dkv" : 3024,
-  "block_kv_dkv" : 2048,
-  "block_kv_dkv_compute" : 2048,
-  "block_q_dq" : 3024,
-  "block_kv_dq" : 2048
-}
+flash_block_sizes: {}
+# Use on v6e
+# flash_block_sizes: {
+#   "block_q" : 3024,
+#   "block_kv_compute" : 1024,
+#   "block_kv" : 2048,
+#   "block_q_dkv" : 3024,
+#   "block_kv_dkv" : 2048,
+#   "block_kv_dkv_compute" : 2048,
+#   "block_q_dq" : 3024,
+#   "block_kv_dq" : 2048
+# }
 # GroupNorm groups
 norm_num_groups: 32

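Aside (not part of this commit): the commented-out v6e block sizes line up one-to-one with the fields of the Pallas TPU splash-attention BlockSizes dataclass. Below is a minimal sketch of how such a config dict could be turned into kernel block sizes; the repo's actual max_utils.get_flash_block_sizes may differ, and an empty flash_block_sizes: {} presumably falls back to the kernel defaults.

# Sketch only; assumes JAX's Pallas TPU splash-attention BlockSizes dataclass.
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel as splash

# Values copied from the commented-out v6e example above.
config_flash_block_sizes = {
    "block_q": 3024,
    "block_kv_compute": 1024,
    "block_kv": 2048,
    "block_q_dkv": 3024,
    "block_kv_dkv": 2048,
    "block_kv_dkv_compute": 2048,
    "block_q_dq": 3024,
    "block_kv_dq": 2048,
}
block_sizes = splash.BlockSizes(**config_flash_block_sizes)  # one keyword per config key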

src/maxdiffusion/max_utils.py

Lines changed: 6 additions & 2 deletions
@@ -281,9 +281,13 @@ def create_device_mesh(config, devices=None, logging=True):
   ici_parallelism = fill_unspecified_mesh_axes(ici_parallelism, num_devices_per_slice, "ICI")
   if multi_slice_env:
     dcn_parallelism = fill_unspecified_mesh_axes(dcn_parallelism, num_slices, "DCN")
-    mesh = mesh_utils.create_hybrid_device_mesh(ici_parallelism, dcn_parallelism, devices, allow_split_physical_axes=config.allow_split_physical_axes)
+    mesh = mesh_utils.create_hybrid_device_mesh(
+        ici_parallelism, dcn_parallelism, devices, allow_split_physical_axes=config.allow_split_physical_axes
+    )
   else:
-    mesh = mesh_utils.create_device_mesh(ici_parallelism, devices, allow_split_physical_axes=config.allow_split_physical_axes)
+    mesh = mesh_utils.create_device_mesh(
+        ici_parallelism, devices, allow_split_physical_axes=config.allow_split_physical_axes
+    )

   if logging:
     max_logging.log(f"Decided on mesh: {mesh}")

src/maxdiffusion/models/attention_flax.py

Lines changed: 2 additions & 2 deletions
@@ -568,8 +568,8 @@ class AttentionOp(nn.Module):
   use_memory_efficient_attention: bool = False
   split_head_dim: bool = False
   float32_qk_product: bool = True
-  axis_names_q: AxisNames = ((BATCH, HEAD, LENGTH, D_KV),)
-  axis_names_kv: AxisNames = ((BATCH, HEAD, KV_LENGTH, D_KV),)
+  axis_names_q: AxisNames = (BATCH, HEAD, LENGTH, D_KV)
+  axis_names_kv: AxisNames = (BATCH, HEAD, KV_LENGTH, D_KV)
   flash_min_seq_length: int = 4096
   flash_block_sizes: BlockSizes = None
   dtype: DType = jnp.float32
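Aside (not part of this commit): the old defaults were 1-tuples containing a 4-tuple, whereas a per-dimension sharding constraint expects one logical axis name per array dimension. A small sketch with placeholder axis-name strings; the real BATCH/HEAD/LENGTH/D_KV constants are defined elsewhere in maxdiffusion and may differ:

import flax.linen as nn
import jax.numpy as jnp

# Placeholder logical-axis names, assumed for illustration only.
BATCH, HEAD, LENGTH, D_KV = "activation_batch", "activation_heads", "activation_length", "activation_kv"

axis_names_q = (BATCH, HEAD, LENGTH, D_KV)  # flat tuple: one name per dimension
query = jnp.zeros((1, 8, 128, 64))          # (batch, heads, seq_len, head_dim)
query = nn.with_logical_constraint(query, axis_names_q)  # no-op unless axis rules and a mesh are active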

src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 59 additions & 58 deletions
@@ -137,10 +137,11 @@ def load_wan_transformer(pretrained_model_name_or_path: str, eval_shapes: dict,
   else:
     return load_base_wan_transformer(pretrained_model_name_or_path, eval_shapes, device, hf_download)

+
 def load_base_wan_transformer(pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True):
   device = jax.devices(device)[0]
-  subfolder="transformer"
-  filename="diffusion_pytorch_model.safetensors.index.json"
+  subfolder = "transformer"
+  filename = "diffusion_pytorch_model.safetensors.index.json"
   local_files = False
   if os.path.isdir(pretrained_model_name_or_path):
     index_file_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
@@ -150,72 +151,72 @@ def load_base_wan_transformer(pretrained_model_name_or_path: str, eval_shapes: d
   elif hf_download:
     # download the index file for sharded models.
     index_file_path = hf_hub_download(
-        pretrained_model_name_or_path, subfolder=subfolder, filename=filename,
+        pretrained_model_name_or_path,
+        subfolder=subfolder,
+        filename=filename,
     )
-    with jax.default_device(device):
-      # open the index file.
-      with open(index_file_path, "r") as f:
-        index_dict = json.load(f)
-      model_files = set()
-      for key in index_dict["weight_map"].keys():
-        model_files.add(index_dict["weight_map"][key])
-
-      model_files = list(model_files)
-      tensors = {}
-      for model_file in model_files:
-        if local_files:
-          ckpt_shard_path = os.path.join(pretrained_model_name_or_path, subfolder, model_file)
-        else:
-          ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
-        # now get all the filenames for the model that need downloading
-        max_logging.log(f"Load and port Wan 2.1 transformer on {device}")
-
-        if ckpt_shard_path is not None:
-          with safe_open(ckpt_shard_path, framework="pt") as f:
-            for k in f.keys():
-              tensors[k] = torch2jax(f.get_tensor(k))
-      flax_state_dict = {}
-      cpu = jax.local_devices(backend="cpu")[0]
-      flattened_dict = flatten_dict(eval_shapes)
-      # turn all block numbers to strings just for matching weights.
-      # Later they will be turned back to ints.
-      random_flax_state_dict = {}
-      for key in flattened_dict:
-        string_tuple = tuple([str(item) for item in key])
-        random_flax_state_dict[string_tuple] = flattened_dict[key]
-      del flattened_dict
-      for pt_key, tensor in tensors.items():
-        renamed_pt_key = rename_key(pt_key)
-        renamed_pt_key = renamed_pt_key.replace("blocks_", "blocks.")
-        renamed_pt_key = renamed_pt_key.replace("to_out_0", "proj_attn")
-        renamed_pt_key = renamed_pt_key.replace("ffn.net_2", "ffn.proj_out")
-        renamed_pt_key = renamed_pt_key.replace("ffn.net_0", "ffn.act_fn")
-        renamed_pt_key = renamed_pt_key.replace("norm2", "norm2.layer_norm")
-        pt_tuple_key = tuple(renamed_pt_key.split("."))
-
-        flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
-        flax_key = rename_for_nnx(flax_key)
-        flax_key = _tuple_str_to_int(flax_key)
-        flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
-      validate_flax_state_dict(eval_shapes, flax_state_dict)
-      flax_state_dict = unflatten_dict(flax_state_dict)
-      del tensors
-      jax.clear_caches()
-      return flax_state_dict
+  with jax.default_device(device):
+    # open the index file.
+    with open(index_file_path, "r") as f:
+      index_dict = json.load(f)
+    model_files = set()
+    for key in index_dict["weight_map"].keys():
+      model_files.add(index_dict["weight_map"][key])
+
+    model_files = list(model_files)
+    tensors = {}
+    for model_file in model_files:
+      if local_files:
+        ckpt_shard_path = os.path.join(pretrained_model_name_or_path, subfolder, model_file)
+      else:
+        ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
+      # now get all the filenames for the model that need downloading
+      max_logging.log(f"Load and port Wan 2.1 transformer on {device}")
+
+      if ckpt_shard_path is not None:
+        with safe_open(ckpt_shard_path, framework="pt") as f:
+          for k in f.keys():
+            tensors[k] = torch2jax(f.get_tensor(k))
+    flax_state_dict = {}
+    cpu = jax.local_devices(backend="cpu")[0]
+    flattened_dict = flatten_dict(eval_shapes)
+    # turn all block numbers to strings just for matching weights.
+    # Later they will be turned back to ints.
+    random_flax_state_dict = {}
+    for key in flattened_dict:
+      string_tuple = tuple([str(item) for item in key])
+      random_flax_state_dict[string_tuple] = flattened_dict[key]
+    del flattened_dict
+    for pt_key, tensor in tensors.items():
+      renamed_pt_key = rename_key(pt_key)
+      renamed_pt_key = renamed_pt_key.replace("blocks_", "blocks.")
+      renamed_pt_key = renamed_pt_key.replace("to_out_0", "proj_attn")
+      renamed_pt_key = renamed_pt_key.replace("ffn.net_2", "ffn.proj_out")
+      renamed_pt_key = renamed_pt_key.replace("ffn.net_0", "ffn.act_fn")
+      renamed_pt_key = renamed_pt_key.replace("norm2", "norm2.layer_norm")
+      pt_tuple_key = tuple(renamed_pt_key.split("."))
+
+      flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
+      flax_key = rename_for_nnx(flax_key)
+      flax_key = _tuple_str_to_int(flax_key)
+      flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
+    validate_flax_state_dict(eval_shapes, flax_state_dict)
+    flax_state_dict = unflatten_dict(flax_state_dict)
+    del tensors
+    jax.clear_caches()
+    return flax_state_dict


 def load_wan_vae(pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True):
   device = jax.devices(device)[0]
-  subfolder="vae"
-  filename="diffusion_pytorch_model.safetensors"
+  subfolder = "vae"
+  filename = "diffusion_pytorch_model.safetensors"
   if os.path.isdir(pretrained_model_name_or_path):
     ckpt_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
     if not os.path.isfile(ckpt_path):
       raise FileNotFoundError(f"File {ckpt_path} not found for local directory.")
   elif hf_download:
-    ckpt_path = hf_hub_download(
-        pretrained_model_name_or_path, subfolder=subfolder, filename=filename
-    )
+    ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
   max_logging.log(f"Load and port Wan 2.1 VAE on {device}")
   with jax.default_device(device):
     if ckpt_path is not None:
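Aside (not part of this commit): the visible change in the large hunk above appears to be indentation only. The with jax.default_device(device): block moves out of the elif hf_download: branch, so the weight-porting loop also runs when pretrained_model_name_or_path points at a local directory. For reference, the shard-loading pattern reduced to its core; torch2jax below is a hypothetical stand-in for the repo's actual converter:

import jax.numpy as jnp
from safetensors import safe_open  # real safetensors API


def torch2jax(t):
  # Hypothetical stand-in: upcast to float32 so bfloat16 weights survive the numpy round-trip.
  return jnp.asarray(t.float().numpy())


def load_shards(shard_paths):
  tensors = {}
  for path in shard_paths:
    with safe_open(path, framework="pt") as f:  # opens one safetensors shard lazily
      for k in f.keys():
        tensors[k] = torch2jax(f.get_tensor(k))
  return tensors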

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 19 additions & 7 deletions
@@ -183,7 +183,7 @@ def load_tokenizer(cls, config: HyperParameters):

   @classmethod
   def load_vae(cls, devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: HyperParameters):
-
+
     def create_model(rngs: nnx.Rngs, config: HyperParameters):
       wan_vae = AutoencoderKLWan.from_config(
           config.pretrained_model_name_or_path,
@@ -194,11 +194,12 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
           weights_dtype=config.weights_dtype,
       )
       return wan_vae
-    # 1. eval shape
+
+    # 1. eval shape
     p_model_factory = partial(create_model, config=config)
     wan_vae = nnx.eval_shape(p_model_factory, rngs=rngs)
     graphdef, state = nnx.split(wan_vae, nnx.Param)
-
+
     # 2. retrieve the state shardings, mapping logical names to mesh axis names.
     logical_state_spec = nnx.get_partition_spec(state)
     logical_state_sharding = nn.logical_to_mesh_sharding(logical_state_spec, mesh, config.logical_axis_rules)
@@ -215,7 +216,7 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
       sharding = logical_state_sharding[path].value
       state[path].value = device_put_replicated(val, sharding)
     state = nnx.from_flat_state(state)
-
+
     wan_vae = nnx.merge(graphdef, state)
     vae_cache = AutoencoderKLWanCache(wan_vae)
     return wan_vae, vae_cache
@@ -463,7 +464,18 @@ def __call__(


 @partial(jax.jit, static_argnames=("do_classifier_free_guidance", "guidance_scale"))
-def transformer_forward_pass(graphdef, sharded_state, rest_of_state, latents, timestep, prompt_embeds, is_uncond, slg_mask, do_classifier_free_guidance, guidance_scale):
+def transformer_forward_pass(
+    graphdef,
+    sharded_state,
+    rest_of_state,
+    latents,
+    timestep,
+    prompt_embeds,
+    is_uncond,
+    slg_mask,
+    do_classifier_free_guidance,
+    guidance_scale,
+):
   wan_transformer = nnx.merge(graphdef, sharded_state, rest_of_state)
   noise_pred = wan_transformer(
       hidden_states=latents, timestep=timestep, encoder_hidden_states=prompt_embeds, is_uncond=is_uncond, slg_mask=slg_mask
@@ -474,7 +486,7 @@ def transformer_forward_pass(graphdef, sharded_state, rest_of_state, latents, ti
     noise_pred = noise_pred[:bsz]
     noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
     latents = latents[:bsz]
-
+
   return noise_pred, latents


@@ -516,7 +528,7 @@ def run_inference(
           is_uncond=jnp.array(True, dtype=jnp.bool_),
           slg_mask=slg_mask,
          do_classifier_free_guidance=do_classifier_free_guidance,
-          guidance_scale=guidance_scale
+          guidance_scale=guidance_scale,
      )

      latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
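Aside (not part of this commit): static_argnames tells jax.jit to treat the classifier-free-guidance flag and guidance scale as compile-time Python values rather than traced arrays, so Python-level branching on them is allowed; passing a different value triggers a retrace. A minimal sketch, unrelated to the pipeline's actual signature:

from functools import partial

import jax
import jax.numpy as jnp


@partial(jax.jit, static_argnames=("do_classifier_free_guidance", "guidance_scale"))
def forward(latents, do_classifier_free_guidance, guidance_scale):
  if do_classifier_free_guidance:  # fine: a static Python bool, not a tracer
    return latents * guidance_scale
  return latents


out = forward(jnp.ones((2, 4)), do_classifier_free_guidance=True, guidance_scale=5.0)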

src/maxdiffusion/pyconfig.py

Lines changed: 2 additions & 0 deletions
@@ -36,9 +36,11 @@ def string_to_bool(s: str) -> bool:
     return False
   raise ValueError(f"Can't convert {s} to bool")

+
 def string_to_list(string_list: str) -> list:
   return ast.literal_eval(string_list)

+
 _yaml_types_to_parser = {str: str, int: int, float: float, bool: string_to_bool, list: string_to_list}

 _config = None
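Aside (not part of this commit): string_to_list relies on ast.literal_eval, which safely parses Python literals supplied as command-line overrides; dict-valued overrides such as the flash_block_sizes string in the removed test further below can be parsed the same way, though the exact routing inside pyconfig is not shown here. For example:

import ast

mesh_axes = ast.literal_eval("['data', 'fsdp', 'tensor']")                  # -> list of strings
block_sizes = ast.literal_eval('{"block_q": 256, "block_kv": 256}')         # dict literals parse identically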

src/maxdiffusion/tests/attention_test.py

Lines changed: 15 additions & 44 deletions
@@ -23,7 +23,6 @@
 from ..models.attention_flax import FlaxAttention
 from .. import max_utils
 from .. import pyconfig
-from maxdiffusion import FlaxUNet2DConditionModel

 THIS_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -73,54 +72,26 @@ def test_splash_attention(self):
     devices_array = max_utils.create_device_mesh(config)
     mesh = Mesh(devices_array, config.mesh_axes)
     flash_block_sizes = max_utils.get_flash_block_sizes(config)
-    splash_attention = FlaxAttention(
-        heads * head_depth,
-        heads,
-        head_depth,
-        split_head_dim=True,
-        attention_kernel="flash",
-        mesh=mesh,
-        dtype=jnp.bfloat16,
-        flash_block_sizes=flash_block_sizes,
-    )
-
-    params = splash_attention.init(key2, x)["params"]
-    p_apply = jax.jit(splash_attention.apply).lower({"params": params}, x).compile()
-    splash_attention_out = p_apply({"params": params}, x)
+    with mesh:
+      splash_attention = FlaxAttention(
+          heads * head_depth,
+          heads,
+          head_depth,
+          split_head_dim=True,
+          attention_kernel="flash",
+          mesh=mesh,
+          dtype=jnp.bfloat16,
+          flash_block_sizes=flash_block_sizes,
+      )
+
+      params = splash_attention.init(key2, x)["params"]
+      p_apply = jax.jit(splash_attention.apply).lower({"params": params}, x).compile()
+      splash_attention_out = p_apply({"params": params}, x)

     diff_norm = jnp.linalg.norm(dot_attention_out - splash_attention_out)

     assert diff_norm < 1.0

-  def test_flash_block_sizes(self):
-    """Test loading flash block sizes from cli."""
-
-    pyconfig.initialize(
-        [
-            None,
-            os.path.join(THIS_DIR, "..", "configs", "base_2_base.yml"),
-            'flash_block_sizes={"block_q" : 256, "block_kv_compute": 256, "block_kv": 256,'
-            '"block_q_dkv": 256, "block_kv_dkv": 256, "block_kv_dkv_compute": 256,'
-            '"block_q_dq": 256, "block_kv_dq": 256}',
-            "attention=flash",
-        ],
-        unittest=True,
-    )
-    config = pyconfig.config
-    devices_array = max_utils.create_device_mesh(config)
-    mesh = Mesh(devices_array, config.mesh_axes)
-    flash_block_sizes = max_utils.get_flash_block_sizes(config)
-    _, _ = FlaxUNet2DConditionModel.from_pretrained(
-        config.pretrained_model_name_or_path,
-        revision=config.revision,
-        subfolder="unet",
-        dtype=jnp.bfloat16,
-        from_pt=config.from_pt,
-        attention_kernel=config.attention,
-        flash_block_sizes=flash_block_sizes,
-        mesh=mesh,
-    )
-

 if __name__ == "__main__":
   absltest.main()
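Aside (not part of this commit): the test now constructs, initializes, and applies FlaxAttention inside with mesh:, which makes the Mesh the ambient context so that sharding constraints used along the flash-attention path can resolve named mesh axes. The general pattern, with hypothetical axis names:

import jax
from jax.experimental import mesh_utils
from jax.sharding import Mesh

device_array = mesh_utils.create_device_mesh((1, len(jax.devices())))
mesh = Mesh(device_array, ("data", "fsdp"))  # hypothetical axis names
with mesh:
  # module construction, init, and apply that rely on named mesh axes go here
  pass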
