Implements a working Wan 2.1 pipeline.

jfacevedo-google · jfacevedo-google · commit 5cc2e495ba50 · 2025-05-30T23:00:16.000Z
diff --git a/src/maxdiffusion/generate_wan.py b/src/maxdiffusion/generate_wan.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from typing import Sequence
+import jax
 import time
 from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline
 from maxdiffusion import pyconfig
@@ -21,7 +22,6 @@
 
 def run(config):
   pipeline = WanPipeline.from_pretrained(config)
-
   s0 = time.perf_counter()
   video = pipeline(
     prompt=config.prompt,
@@ -32,17 +32,20 @@ def run(config):
     num_inference_steps=config.num_inference_steps,
     guidance_scale=config.guidance_scale,
   )
+
   print("compile time: ", (time.perf_counter() - s0))
+  export_to_video(video[0], "jax_output.mp4", fps=16)
   s0 = time.perf_counter()
-  video = pipeline(
-    prompt=config.prompt,
-    negative_prompt=config.negative_prompt,
-    height=config.height,
-    width=config.width,
-    num_frames=config.num_frames,
-    num_inference_steps=config.num_inference_steps,
-    guidance_scale=config.guidance_scale,
-  )
+  with jax.profiler.trace("/tmp/trace/"):
+    video = pipeline(
+      prompt=config.prompt,
+      negative_prompt=config.negative_prompt,
+      height=config.height,
+      width=config.width,
+      num_frames=config.num_frames,
+      num_inference_steps=config.num_inference_steps,
+      guidance_scale=config.guidance_scale,
+    )
   print("generation time: ", (time.perf_counter() - s0))
   export_to_video(video[0], "jax_output.mp4", fps=16)
 
@@ -51,5 +54,6 @@ def main(argv: Sequence[str]) -> None:
   pyconfig.initialize(argv)
   run(pyconfig.config)
 
+
 if __name__ == "__main__":
   app.run(main)
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -109,14 +109,6 @@ def __init__(
     self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
     self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
-    self.jitted_decode = jax.jit(
-      partial(
-        self.vae.decode,
-        feat_cache=self.vae_cache,
-        return_dict=False
-      )
-    )
-
     self.p_run_inference = None
 
   @classmethod
@@ -402,8 +394,8 @@ def __call__(
       latents = latents / latents_std + latents_mean
       latents = latents.astype(self.config.weights_dtype)
 
-    with self.mesh:
-      video = self.jitted_decode(latents)[0]
+    video = self.vae.decode(latents, self.vae_cache)[0]
+
     video = jnp.transpose(video, (0, 4, 1, 2, 3))
     video = torch.from_numpy(np.array(video.astype(dtype=jnp.float32))).to(dtype=torch.bfloat16)
     video = self.video_processor.postprocess_video(video, output_type="np")