Add sharding constraint to reshape after attention. Use mesh with VAE decode.

jfacevedo-google · jfacevedo-google · commit ce3ee644017e · 2025-07-09T18:08:15.000Z
diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py
@@ -99,7 +99,8 @@ def _reshape_heads_to_head_dim(tensor):
   # This is used to transform the output of flash attention back into the format of other attention outputs
   b, h, s, d = tensor.shape
   tensor = jnp.transpose(tensor, axes=[0, 2, 1, 3])
-  return jnp.reshape(tensor, (b, -1, h * d))
+  reshaped_tensor = jnp.reshape(tensor, (b, -1, h * d))
+  return jax.lax.with_sharding_constraint(reshaped_tensor, PartitionSpec("data", "fsdp", "tensor"))
 
 
 def _unflatten_heads(tensor, heads):
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -434,12 +434,13 @@ def __call__(
             prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds,
         )
-      latents_mean = jnp.array(self.vae.latents_mean).reshape(1, self.vae.z_dim, 1, 1, 1)
-      latents_std = 1.0 / jnp.array(self.vae.latents_std).reshape(1, self.vae.z_dim, 1, 1, 1)
-      latents = latents / latents_std + latents_mean
-      latents = latents.astype(self.config.weights_dtype)
-
-    video = self.vae.decode(latents, self.vae_cache)[0]
+        latents_mean = jnp.array(self.vae.latents_mean).reshape(1, self.vae.z_dim, 1, 1, 1)
+        latents_std = 1.0 / jnp.array(self.vae.latents_std).reshape(1, self.vae.z_dim, 1, 1, 1)
+        latents = latents / latents_std + latents_mean
+        latents = latents.astype(self.config.weights_dtype)
+    
+    with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
+      video = self.vae.decode(latents, self.vae_cache)[0]
 
     video = jnp.transpose(video, (0, 4, 1, 2, 3))
     video = torch.from_numpy(np.array(video.astype(dtype=jnp.float32))).to(dtype=torch.bfloat16)