
Commit fc77dc0

fix some tests.
1 parent 5494644 commit fc77dc0

3 files changed

Lines changed: 31 additions & 17 deletions


src/maxdiffusion/tests/attention_test.py

Lines changed: 14 additions & 5 deletions
@@ -37,7 +37,10 @@ def setUp(self):
   def test_splash_attention(self):
     """Test numerics of splash attention are equivalent to dot_product"""

-    pyconfig.initialize([None, os.path.join(THIS_DIR, "..", "configs", "base21.yml")], unittest=True)
+    pyconfig.initialize([None, os.path.join(THIS_DIR, "..", "configs", "base21.yml"),
+                         'flash_block_sizes={"block_q" : 512, "block_kv_compute": 512, "block_kv": 512,'
+                         '"block_q_dkv": 512, "block_kv_dkv": 512, "block_kv_dkv_compute": 512,'
+                         '"block_q_dq": 512, "block_kv_dq": 512}',], unittest=True)
     config = pyconfig.config

     batch = 8
@@ -47,15 +50,14 @@ def test_splash_attention(self):

     key1, key2 = jax.random.split(jax.random.PRNGKey(0))
     x = jax.random.normal(key1, (batch, length, heads * head_depth))
-
     dot_product_attention = FlaxAttention(
         heads * head_depth,
         heads,
         head_depth,
         split_head_dim=True,
         attention_kernel="dot_product",
         mesh=None,
-        dtype=jnp.bfloat16,
+        dtype=jnp.bfloat16
     )

     params = dot_product_attention.init(key2, x)["params"]
@@ -64,9 +66,16 @@ def test_splash_attention(self):

     devices_array = max_utils.create_device_mesh(config)
     mesh = Mesh(devices_array, config.mesh_axes)
-
+    flash_block_sizes = max_utils.get_flash_block_sizes(config)
     splash_attention = FlaxAttention(
-        heads * head_depth, heads, head_depth, split_head_dim=True, attention_kernel="flash", mesh=mesh, dtype=jnp.bfloat16
+        heads * head_depth,
+        heads,
+        head_depth,
+        split_head_dim=True,
+        attention_kernel="flash",
+        mesh=mesh,
+        dtype=jnp.bfloat16,
+        flash_block_sizes=flash_block_sizes
     )

     params = splash_attention.init(key2, x)["params"]
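
Note: the test now passes `flash_block_sizes` as a config override and reads it back through `max_utils.get_flash_block_sizes(config)`. As a rough sketch of the plumbing involved (the actual helper body in maxdiffusion may differ), such a helper would translate the parsed config dict into the Pallas splash-attention `BlockSizes` container that the flash kernel expects:

# Hedged sketch, not copied from the repo: build a splash-attention
# BlockSizes from the parsed flash_block_sizes config dict. The import
# path is JAX's Pallas TPU splash-attention kernel.
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel

def get_flash_block_sizes(config):
  flash_block_sizes = None
  if len(config.flash_block_sizes) > 0:
    flash_block_sizes = splash_attention_kernel.BlockSizes(
        block_q=config.flash_block_sizes["block_q"],
        block_kv_compute=config.flash_block_sizes["block_kv_compute"],
        block_kv=config.flash_block_sizes["block_kv"],
        block_q_dkv=config.flash_block_sizes["block_q_dkv"],
        block_kv_dkv=config.flash_block_sizes["block_kv_dkv"],
        block_kv_dkv_compute=config.flash_block_sizes["block_kv_dkv_compute"],
        block_q_dq=config.flash_block_sizes["block_q_dq"],
        block_kv_dq=config.flash_block_sizes["block_kv_dq"],
    )
  return flash_block_sizes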

src/maxdiffusion/tests/wan_transformer_test.py

Lines changed: 15 additions & 11 deletions
@@ -163,8 +163,8 @@ def test_wan_block(self):
         mesh=mesh,
         flash_block_sizes=flash_block_sizes,
     )
-
-    dummy_output = wan_block(dummy_hidden_states, dummy_encoder_hidden_states, dummy_temb, dummy_rotary_emb)
+    with mesh:
+      dummy_output = wan_block(dummy_hidden_states, dummy_encoder_hidden_states, dummy_temb, dummy_rotary_emb)
     assert dummy_output.shape == dummy_hidden_states.shape

   def test_wan_attention(self):
@@ -210,10 +210,10 @@ def test_wan_attention(self):

     dummy_hidden_states = jnp.ones(dummy_hidden_states_shape)
     dummy_encoder_hidden_states = jnp.ones(dummy_hidden_states_shape)
-
-    dummy_output = attention(
-        hidden_states=dummy_hidden_states, encoder_hidden_states=dummy_encoder_hidden_states, rotary_emb=dummy_rotary_emb
-    )
+    with mesh:
+      dummy_output = attention(
+          hidden_states=dummy_hidden_states, encoder_hidden_states=dummy_encoder_hidden_states, rotary_emb=dummy_rotary_emb
+      )
     assert dummy_output.shape == dummy_hidden_states_shape

     # dot product
@@ -246,7 +246,7 @@ def test_wan_model(self):
     frames = 21
     height = 90
     width = 160
-    hidden_states_shape = (batch_size, frames, height, width, channels)
+    hidden_states_shape = (batch_size, channels, frames, height, width)
     dummy_hidden_states = jnp.ones(hidden_states_shape)

     key = jax.random.key(0)
@@ -266,10 +266,14 @@ def test_wan_model(self):

     dummy_timestep = jnp.ones((batch_size))
     dummy_encoder_hidden_states = jnp.ones((batch_size, 512, 4096))
-
-    dummy_output = wan_model(
-        hidden_states=dummy_hidden_states, timestep=dummy_timestep, encoder_hidden_states=dummy_encoder_hidden_states
-    )
+    with mesh:
+      dummy_output = wan_model(
+          hidden_states=dummy_hidden_states,
+          timestep=dummy_timestep,
+          encoder_hidden_states=dummy_encoder_hidden_states,
+          is_uncond=jnp.array(True, dtype=jnp.bool_),
+          slg_mask=jnp.zeros(40, dtype=jnp.bool_)
+      )
     assert dummy_output.shape == hidden_states_shape
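
Note: all three call sites are now wrapped in `with mesh:`. A minimal standalone sketch of why, assuming the modules apply sharding constraints internally: a `jax.sharding.Mesh` is a context manager, and sharding annotations expressed as bare `PartitionSpec`s resolve against the active mesh, so the calls fail without one.

# Minimal sketch, not from this repo; mesh shape and axis name are
# illustrative. Entering the Mesh context makes it the ambient mesh for
# sharding annotations issued during the call.
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, PartitionSpec

mesh = Mesh(np.array(jax.devices()), ("data",))  # 1-D device mesh

x = jnp.ones((8, 16))
with mesh:
  # A bare PartitionSpec needs an ambient mesh to resolve "data"; this
  # is the same reason the module calls above now run under `with mesh:`.
  y = jax.lax.with_sharding_constraint(x, PartitionSpec("data", None))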

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 2 additions & 1 deletion
@@ -22,6 +22,7 @@
 import jax
 import jax.tree_util as jtu
 from flax import nnx
+from flax.linen import partitioning as nn_partitioning
 from ..schedulers import FlaxEulerDiscreteScheduler
 from .. import max_utils, max_logging, train_utils, maxdiffusion_utils
 from ..checkpointing.wan_checkpointer import (WanCheckpointer, WAN_CHECKPOINT)
@@ -115,7 +116,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, data):
     for step in np.arange(start_step, self.config.max_train_steps):
       if self.config.enable_profiler and step == first_profiling_step:
         max_utils.activate_profiler(self.config)
-      with jax.profiler.StepTraceAnnotation("train", step_num=step), pipeline.mesh:
+      with jax.profiler.StepTraceAnnotation("train", step_num=step), pipeline.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
        state, train_metric, rng = p_train_step(state, graphdef, data, rng)

       new_time = datetime.datetime.now()
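
Note: the train step now also enters `nn_partitioning.axis_rules(...)`, so Flax logical axis names used inside the model resolve to mesh axes during `p_train_step`. A small sketch of the mechanism, with hypothetical rules standing in for `config.logical_axis_rules`:

# Hedged sketch: axis_rules maps logical axis names to mesh axis names.
# The ("batch", "data") / ("embed", None) rules are hypothetical; the
# trainer takes its rules from config.logical_axis_rules.
from flax.linen import partitioning as nn_partitioning

rules = (("batch", "data"), ("embed", None))

with nn_partitioning.axis_rules(rules):
  # Inside the context, logical names translate to mesh axes:
  # "batch" -> "data" (sharded), "embed" -> None (replicated).
  spec = nn_partitioning.logical_to_mesh_axes(("batch", "embed"))
  print(spec)  # PartitionSpec('data', None)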
