1+ """
2+ Copyright 2025 Google LLC
3+
4+ Licensed under the Apache License, Version 2.0 (the "License");
5+ you may not use this file except in compliance with the License.
6+ You may obtain a copy of the License at
7+
8+ https://www.apache.org/licenses/LICENSE-2.0
9+
10+ Unless required by applicable law or agreed to in writing, software
11+ distributed under the License is distributed on an "AS IS" BASIS,
12+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+ See the License for the specific language governing permissions and
14+ limitations under the License.
15+ """
16+
17+ from typing import Union , List
18+ from transformers import AutoTokenizer , UMT5EncoderModel
19+ import torch
20+ from ...models .wan .transformers .transformer_flux_wan_nnx import WanModel
21+ from ...models .wan .autoencoder_kl_wan import AutoencoderKLWan
22+ from ..pipeline_flax_utils import FlaxDiffusionPipeline
23+ from ...video_processor import VideoProcessor
24+ from ...schedulers import FlowMatchEulerDiscreteScheduler
25+
class WanPipeline(FlaxDiffusionPipeline):
    """Pipeline for text-to-video generation using the Wan transformer.

    Wires together the UMT5 text encoder, the Wan diffusion transformer,
    the Wan VAE, and a flow-match Euler scheduler, and exposes the prompt
    embedding helper used by the sampling loop.
    """

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: UMT5EncoderModel,
        transformer: WanModel,
        vae: AutoencoderKLWan,
        scheduler: FlowMatchEulerDiscreteScheduler,
    ):
        """Register the pipeline components and derive VAE scale factors.

        Args:
            tokenizer: Tokenizer matching `text_encoder`.
            text_encoder: UMT5 encoder that produces prompt embeddings.
            transformer: Wan denoising transformer.
            vae: Wan video VAE used to decode latents.
            scheduler: Flow-match Euler discrete scheduler.
        """
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            transformer=transformer,
            scheduler=scheduler,
        )

        # `temperal_downsample` ([sic] — attribute name comes from the VAE
        # config) is a per-stage list of booleans; the temporal factor is
        # 2^(number of stages that downsample in time), the spatial factor
        # is 2^(number of stages). Fallbacks (4 / 8) match the Wan defaults
        # when no VAE is registered.
        self.vae_scale_factor_temporal = (
            2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
        )
        self.vae_scale_factor_spatial = (
            2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
        )
        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

    def _get_t5_prompt_embds(
        self,
        prompt: Union[str, List[str]] = None,
        num_videos_per_prompt: int = 1,
        max_sequence_length: int = 226,
    ):
        """Encode `prompt` with the UMT5 text encoder.

        Args:
            prompt: A prompt string or list of prompt strings.
            num_videos_per_prompt: Number of videos generated per prompt;
                embeddings are duplicated accordingly.
            max_sequence_length: Token length to pad/truncate each prompt to.

        Returns:
            Tensor of shape
            `(batch_size * num_videos_per_prompt, max_sequence_length, hidden_dim)`.

        Raises:
            ValueError: If `prompt` is None.
        """
        if prompt is None:
            # Fail loudly instead of the opaque TypeError len(None) raises.
            raise ValueError("`prompt` must be a string or a list of strings.")
        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
        # Number of real (non-padding) tokens per prompt.
        seq_lens = mask.gt(0).sum(dim=1).long()

        prompt_embeds = self.text_encoder(text_input_ids, mask).last_hidden_state
        # Keep only the valid-token embeddings, then re-pad with zeros so
        # padding positions carry exact zeros rather than encoder outputs.
        prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
        prompt_embeds = torch.stack(
            [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds],
            dim=0,
        )

        # Duplicate text embeddings for each generation per prompt, using an
        # mps-friendly method (repeat + view rather than repeat_interleave).
        _, seq_len, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)

        return prompt_embeds