
Commit ddbce4a

Merge origin/main into torchax_attention and make ulysses_custom conditional

2 parents: 8cd6da3 + c98002f

15 files changed: 318 additions & 54 deletions

README.md (7 additions, 7 deletions)

```diff
@@ -628,16 +628,16 @@ To generate images, run the following command:
 We added ring attention support for Wan models. Below are the stats for one `720p` (81 frames) video generation (with CFG DP):
 | Accelerator | Model | Attention Type | Inference Steps | Sharding | e2e Generation Time |
 | -- | -- | -- | -- | -- | -- |
-| v7x-8 | WAN 2.1 | Tokamax Flash | 50 | dp2-fsdp1-context4-tp1 | 264.2 |
-| v7x-8 | WAN 2.1 | Tokamax Ring | 50 | dp2-fsdp1-context4-tp1 | **252.4** |
-| v7x-8 | WAN 2.2 | Tokamax Flash | 40 | dp2-fsdp1-context4-tp1 | 212.7 |
-| v7x-8 | WAN 2.2 | Tokamax Ring | 40 | dp2-fsdp1-context4-tp1 | **201.7** |
+| v7x-8 | WAN 2.1 | Tokamax Flash | 50 | dp2-fsdp1-context4-tp1 | **249.3** |
+| v7x-8 | WAN 2.1 | Tokamax Ring | 50 | dp2-fsdp1-context4-tp1 | 252.4 |
+| v7x-8 | WAN 2.2 | Tokamax Flash | 40 | dp2-fsdp1-context4-tp1 | **194.4** |
+| v7x-8 | WAN 2.2 | Tokamax Ring | 40 | dp2-fsdp1-context4-tp1 | 201.7 |
 
 | Accelerator | Model | Attention Type | Inference Steps | Sharding | e2e Generation Time |
 | -- | -- | -- | -- | -- | -- |
-| v7x-16 | WAN 2.1 | Tokamax Flash | 50 | dp2-fsdp1-context8-tp1 | 146.6 |
-| v7x-16 | WAN 2.1 | Tokamax Ring | 50 | dp2-fsdp1-context8-tp1 | **137.2** |
-| v7x-16 | WAN 2.2 | Tokamax Flash | 40 | dp2-fsdp1-context8-tp1 | **117.8** |
+| v7x-16 | WAN 2.1 | Tokamax Flash | 50 | dp2-fsdp1-context8-tp1 | **127.1** |
+| v7x-16 | WAN 2.1 | Tokamax Ring | 50 | dp2-fsdp1-context8-tp1 | 137.2 |
+| v7x-16 | WAN 2.2 | Tokamax Flash | 40 | dp2-fsdp1-context8-tp1 | **106.0** |
 | v7x-16 | WAN 2.2 | Tokamax Ring | 40 | dp2-fsdp1-context8-tp1 | 137.5 |
 
 (* There are some known stability issues for ring attention on 16 TPUs, please use `tokamax_flash` attention instead.)
```
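The `ring`/`tokamax_ring` options benchmarked above shard the sequence across devices: each device's queries eventually see every key/value block as K/V chunks rotate around the ring, while a streaming (online) softmax keeps the result exact without materializing the full score matrix. A single-host sketch of that chunked accumulation, assuming standard scaled dot-product attention (illustrative only, not the Tokamax kernel):

```python
import numpy as np

def chunked_attention(q, k, v, num_chunks=4):
    """Exact attention computed one K/V chunk at a time, ring-attention style.

    A running max `m`, normalizer `l`, and weighted sum `acc` are rescaled
    as each chunk arrives, so the full (seq x seq) score matrix is never built.
    """
    scale = q.shape[-1] ** -0.5
    m = np.full(q.shape[:-1], -np.inf)   # running row-wise max of scores
    l = np.zeros(q.shape[:-1])           # running softmax normalizer
    acc = np.zeros_like(q)               # running weighted sum of V
    for k_c, v_c in zip(np.array_split(k, num_chunks), np.array_split(v, num_chunks)):
        s = (q @ k_c.T) * scale                  # scores against this chunk
        m_new = np.maximum(m, s.max(axis=-1))
        p = np.exp(s - m_new[:, None])
        alpha = np.exp(m - m_new)                # rescale previously seen chunks
        l = l * alpha + p.sum(axis=-1)
        acc = acc * alpha[:, None] + p @ v_c
        m = m_new
    return acc / l[:, None]

def full_attention(q, k, v):
    """Reference: materialize all scores at once."""
    s = (q @ k.T) * (q.shape[-1] ** -0.5)
    p = np.exp(s - s.max(axis=-1, keepdims=True))
    return (p / p.sum(axis=-1, keepdims=True)) @ v

rng = np.random.default_rng(0)
q, k, v = [rng.normal(size=(16, 8)) for _ in range(3)]
assert np.allclose(chunked_attention(q, k, v), full_attention(q, k, v))
```

In the real kernel, each loop iteration corresponds to a neighbor-to-neighbor K/V exchange on the device ring, which is why its end-to-end numbers above track the flash variant closely.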

src/maxdiffusion/configs/base_wan_14b.yml (2 additions, 0 deletions)

```diff
@@ -62,6 +62,8 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses
+use_base2_exp: True
+use_experimental_scheduler: True
 flash_min_seq_length: 0
 
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
```
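The new `use_base2_exp` flag is not documented in this diff; a common reason for such a flag is evaluating the attention softmax with base-2 exponentials, since `exp(x) = 2**(x * log2(e))` and `exp2` is cheaper than `exp` on many accelerators. A minimal NumPy sketch of that identity (function names here are illustrative, not maxdiffusion APIs):

```python
import numpy as np

LOG2_E = 1.4426950408889634  # log2(e)

def softmax_exp(x):
    """Reference softmax using the natural exponential."""
    x = x - x.max(axis=-1, keepdims=True)  # subtract max for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=-1, keepdims=True)

def softmax_exp2(x):
    """Identical softmax via base-2 exponentials: exp(x) = 2**(x * log2(e))."""
    x = x - x.max(axis=-1, keepdims=True)
    e = np.exp2(x * LOG2_E)
    return e / e.sum(axis=-1, keepdims=True)

scores = np.random.default_rng(0).normal(size=(4, 8))
assert np.allclose(softmax_exp(scores), softmax_exp2(scores))
```

Because the log2(e) factor can be folded into the query scaling ahead of time, the rewrite changes the kernel's instruction mix but not its output.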

src/maxdiffusion/configs/base_wan_1_3b.yml (2 additions, 0 deletions)

```diff
@@ -61,6 +61,8 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring, ulysses
+use_base2_exp: True
+use_experimental_scheduler: True
 flash_min_seq_length: 0
 
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
```

src/maxdiffusion/configs/base_wan_27b.yml (2 additions, 0 deletions)

```diff
@@ -62,6 +62,8 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring, ulysses
+use_base2_exp: True
+use_experimental_scheduler: True
 flash_min_seq_length: 4096
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
 # Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
```

src/maxdiffusion/configs/base_wan_i2v_14b.yml (2 additions, 0 deletions)

```diff
@@ -61,6 +61,8 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring, ulysses
+use_base2_exp: True
+use_experimental_scheduler: True
 flash_min_seq_length: 4096
 dropout: 0.0
 
```
src/maxdiffusion/configs/base_wan_i2v_27b.yml (2 additions, 0 deletions)

```diff
@@ -61,6 +61,8 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring, ulysses
+use_base2_exp: True
+use_experimental_scheduler: True
 flash_min_seq_length: 4096
 dropout: 0.0
 
```

src/maxdiffusion/generate_wan.py (1 addition, 8 deletions)

```diff
@@ -302,14 +302,7 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
       f"{'=' * 50}"
   )
 
-  s0 = time.perf_counter()
-  if max_utils.profiler_enabled(config):
-    with max_utils.Profiler(config):
-      videos = call_pipeline(config, pipeline, prompt, negative_prompt)
-  generation_time_with_profiler = time.perf_counter() - s0
-  max_logging.log(f"generation_time_with_profiler: {generation_time_with_profiler}")
-  if writer and jax.process_index() == 0:
-    writer.add_scalar("inference/generation_time_with_profiler", generation_time_with_profiler, global_step=0)
+  videos = call_pipeline(config, pipeline, prompt, negative_prompt)
 
   return saved_video_path
```
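The removed block timed the profiler-wrapped pipeline call with `time.perf_counter` and logged the elapsed seconds; the merge replaces it with an unconditional `call_pipeline`. If that timing pattern is wanted elsewhere, it factors cleanly into a small context manager. A sketch (the `timed` helper and its `log` callback are stand-ins, not maxdiffusion APIs):

```python
import time
from contextlib import contextmanager

@contextmanager
def timed(label, log=print):
    """Time the wrapped block with perf_counter and log the elapsed seconds."""
    start = time.perf_counter()
    try:
        yield
    finally:
        log(f"{label}: {time.perf_counter() - start:.3f}s")

# Usage: wraps a call the way the removed code wrapped call_pipeline(...).
with timed("generation_time_with_profiler"):
    time.sleep(0.01)  # stand-in for the pipeline call
```

The `finally` clause ensures the duration is logged even if the wrapped call raises, which the removed inline version did not guarantee.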
