AI-Hypercomputer
diff --git a/‎README.md‎
Lines changed: 8 additions & 0 deletions b/‎README.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/maxdiffusion/configs/base_wan_t2v.yml‎
Lines changed: 1 addition & 3 deletions b/‎src/maxdiffusion/configs/base_wan_t2v.yml‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎src/maxdiffusion/generate_wan.py‎
Lines changed: 189 additions & 2 deletions b/‎src/maxdiffusion/generate_wan.py‎
Lines changed: 189 additions & 2 deletions
diff --git a/‎src/maxdiffusion/image_processor.py‎
Lines changed: 46 additions & 0 deletions b/‎src/maxdiffusion/image_processor.py‎
Lines changed: 46 additions & 0 deletions
@@ -53,6 +53,7 @@ MaxDiffusion supports
   - [Training](#training)
   - [Dreambooth](#dreambooth)
   - [Inference](#inference)
+  - [Wan 2.1](#wan)
   - [Flux](#flux)
     - [Fused Attention for GPU:](#fused-attention-for-gpu)
   - [Hyper SDXL LoRA](#hyper-sdxl-lora)
@@ -171,6 +172,13 @@ To generate images, run the following command:
   ```bash
   python -m src.maxdiffusion.generate src/maxdiffusion/configs/base21.yml run_name="my_run"
   ```
+
+  ## Wan
+
+  To run the Wan 2.1 inference script, use the following command:
+
+  ```bash
+  python src/maxdiffusion/generate_wan.py src/maxdiffusion/configs/base_wan_t2v.yml run_name="wan-test" output_dir="gs://jfacevedo-maxdiffusion" jax_cache_dir="/tmp/"
+  ```
+  
   ## Flux
 
   First make sure you have permissions to access the Flux repos in Huggingface.
 
@@ -23,9 +23,7 @@ gcs_metrics: False
 save_config_to_gcs: False
 log_period: 100
 
-pretrained_model_name_or_path: 'black-forest-labs/FLUX.1-dev'
-clip_model_name_or_path: 'ariG23498/clip-vit-large-patch14-text-flax'
-t5xxl_model_name_or_path: 'ariG23498/t5-v1-1-xxl-flax'
+pretrained_model_name_or_path: 'Wan-AI/Wan2.1-T2V-14B-Diffusers'
 
 # Flux params
 flux_name: "flux-dev"
 
@@ -13,18 +13,205 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-
-from typing import Callable, List, Union, Sequence
+import html
+from typing import Callable, List, Union, Sequence, Optional
+import time
+import torch
+import ftfy
+import regex as re
+import jax
+from jax.sharding import Mesh, PositionalSharding, PartitionSpec as P
 from flax import nnx
 from absl import app
+from transformers import AutoTokenizer, UMT5EncoderModel
 from maxdiffusion import pyconfig, max_logging
 from maxdiffusion.models.wan.transformers.transformer_flux_wan_nnx import WanModel
+from maxdiffusion.pipelines.wan.pipeline_wan import WanPipeline
+
+from maxdiffusion.max_utils import (
+    device_put_replicated,
+    get_memory_allocations,
+    create_device_mesh,
+    get_flash_block_sizes,
+    get_precision,
+    setup_initial_state,
+)
+
+def basic_clean(text):
+    """Run `text` through `ftfy.fix_text`, unescape HTML entities twice, and strip it.
+
+    The unescape is applied twice so that doubly-escaped entities
+    (e.g. `&amp;lt;`) are fully decoded.
+    """
+    fixed = ftfy.fix_text(text)
+    unescaped_once = html.unescape(fixed)
+    unescaped_twice = html.unescape(unescaped_once)
+    return unescaped_twice.strip()
+
+
+def whitespace_clean(text):
+    """Collapse each run of whitespace in `text` to one space and trim both ends."""
+    collapsed = re.sub(r"\s+", " ", text)
+    return collapsed.strip()
+
+
+def prompt_clean(text):
+    """Normalize a prompt: apply `basic_clean`, then `whitespace_clean`."""
+    cleaned = basic_clean(text)
+    return whitespace_clean(cleaned)
+
+def _get_t5_prompt_embeds(
+    tokenizer: AutoTokenizer,
+    text_encoder: UMT5EncoderModel,
+    prompt: Union[str, List[str]] = None,
+    num_videos_per_prompt: int = 1,
+    max_sequence_length: int = 226,
+    device: Optional[torch.device] = None,
+    dtype: Optional[torch.dtype] = None,
+):
+    """Tokenize and encode `prompt`, returning zero-padded text-encoder hidden states.
+
+    Args:
+        tokenizer: Tokenizer called on the cleaned prompts with `max_length` padding and truncation.
+        text_encoder: Encoder whose `last_hidden_state` provides the embeddings.
+        prompt: A single prompt or a list of prompts. Each one is passed through `prompt_clean`.
+        num_videos_per_prompt: How many times each prompt's embedding is duplicated.
+        max_sequence_length: Token length used for padding/truncation and for the re-padded output.
+        device: Device the token ids, mask and embeddings are moved to.
+        dtype: Dtype the embeddings are cast to.
+
+    Returns:
+        A tensor viewed as `(batch_size * num_videos_per_prompt, seq_len, -1)` in which the
+        positions after each prompt's real tokens are zeros.
+    """
+    prompts = [prompt] if isinstance(prompt, str) else prompt
+    prompts = [prompt_clean(entry) for entry in prompts]
+    batch_size = len(prompts)
+
+    tokenized = tokenizer(
+        prompts,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        add_special_tokens=True,
+        return_attention_mask=True,
+        return_tensors="pt",
+    )
+    input_ids = tokenized.input_ids
+    attention_mask = tokenized.attention_mask
+    # Count of positions with a non-zero attention mask, per prompt.
+    valid_lengths = attention_mask.gt(0).sum(dim=1).long()
+
+    hidden_states = text_encoder(input_ids.to(device), attention_mask.to(device)).last_hidden_state
+    hidden_states = hidden_states.to(dtype=dtype, device=device)
+
+    # Keep only the real-token embeddings of each prompt, then pad back to
+    # `max_sequence_length` with zeros so padded positions carry no signal.
+    padded_embeds = []
+    for embed, length in zip(hidden_states, valid_lengths):
+        trimmed = embed[:length]
+        zero_tail = trimmed.new_zeros(max_sequence_length - trimmed.size(0), trimmed.size(1))
+        padded_embeds.append(torch.cat([trimmed, zero_tail]))
+    prompt_embeds = torch.stack(padded_embeds, dim=0)
+
+    # Duplicate the embeddings once per requested video (repeat + view, mps friendly).
+    _, seq_len, _ = prompt_embeds.shape
+    repeated = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+    return repeated.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+def encode_prompt(
+    tokenizer: AutoTokenizer,
+    text_encoder: UMT5EncoderModel,
+    prompt: Union[str, List[str]],
+    negative_prompt: Optional[Union[str, List[str]]] = None,
+    do_classifier_free_guidance: bool = True,
+    num_videos_per_prompt: int = 1,
+    prompt_embeds: Optional[torch.Tensor] = None,
+    negative_prompt_embeds: Optional[torch.Tensor] = None,
+    max_sequence_length: int = 226,
+    device: Optional[torch.device] = None,
+    dtype: Optional[torch.dtype] = None,
+):
+    r"""
+    Encodes the prompt (and, optionally, the negative prompt) into text encoder hidden states.
+
+    Args:
+        tokenizer (`AutoTokenizer`):
+            Tokenizer forwarded to `_get_t5_prompt_embeds`.
+        text_encoder (`UMT5EncoderModel`):
+            Text encoder forwarded to `_get_t5_prompt_embeds`.
+        prompt (`str` or `List[str]`, *optional*):
+            Prompt to be encoded. May be `None` only if `prompt_embeds` is provided.
+        negative_prompt (`str` or `List[str]`, *optional*):
+            The prompt or prompts not to guide the video generation. Defaults to the empty string when
+            omitted. Ignored when `do_classifier_free_guidance` is `False` or when `negative_prompt_embeds`
+            is provided.
+        do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+            Whether to use classifier free guidance or not.
+        num_videos_per_prompt (`int`, *optional*, defaults to 1):
+            Number of videos that should be generated per prompt.
+        prompt_embeds (`torch.Tensor`, *optional*):
+            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+            provided, text embeddings will be generated from `prompt` input argument.
+        negative_prompt_embeds (`torch.Tensor`, *optional*):
+            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+            argument.
+        max_sequence_length (`int`, *optional*, defaults to 226):
+            Maximum token length used when tokenizing and padding the prompts.
+        device: (`torch.device`, *optional*):
+            torch device to place the resulting embeddings on
+        dtype: (`torch.dtype`, *optional*):
+            torch dtype
+
+    Returns:
+        `Tuple[torch.Tensor, Optional[torch.Tensor]]`:
+            `(prompt_embeds, negative_prompt_embeds)`. The second element is `None` when classifier free
+            guidance is disabled and no `negative_prompt_embeds` were passed in.
+
+    Raises:
+        ValueError: If both `prompt` and `prompt_embeds` are `None`, or if `negative_prompt` does not have
+            the same batch size as `prompt`.
+        TypeError: If `negative_prompt` and `prompt` are not of the same type.
+    """
+
+    # Fail early with a clear message instead of an AttributeError on `None.shape` below.
+    if prompt is None and prompt_embeds is None:
+        raise ValueError("Provide either `prompt` or `prompt_embeds`; both are `None`.")
+
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    if prompt is not None:
+        batch_size = len(prompt)
+    else:
+        batch_size = prompt_embeds.shape[0]
+
+    if prompt_embeds is None:
+        prompt_embeds = _get_t5_prompt_embeds(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            prompt=prompt,
+            num_videos_per_prompt=num_videos_per_prompt,
+            max_sequence_length=max_sequence_length,
+            device=device,
+            dtype=dtype,
+        )
+
+    if do_classifier_free_guidance and negative_prompt_embeds is None:
+        # A missing negative prompt is treated as the empty prompt, broadcast to the batch.
+        negative_prompt = negative_prompt or ""
+        negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+        if prompt is not None and type(prompt) is not type(negative_prompt):
+            raise TypeError(
+                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                f" {type(prompt)}."
+            )
+        elif batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )
+
+        negative_prompt_embeds = _get_t5_prompt_embeds(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            prompt=negative_prompt,
+            num_videos_per_prompt=num_videos_per_prompt,
+            max_sequence_length=max_sequence_length,
+            device=device,
+            dtype=dtype,
+        )
+
+    return prompt_embeds, negative_prompt_embeds
 
 def run(config):
+  """Run the Wan 2.1 inference script for the given `config`.
+
+  Steps performed, in order:
+    1. Create the JAX PRNG key and device mesh from `config`.
+    2. Load the tokenizer and UMT5 text encoder from `config.pretrained_model_name_or_path`.
+    3. Encode `config.prompt` / `config.negative_prompt` and log the elapsed time.
+    4. Instantiate `WanModel` seeded with `config.seed`.
+
+  `rng`, `mesh` and `global_batch_size` are computed here but not consumed yet in this function.
+
+  Args:
+    config: The initialized pyconfig object. This function reads `seed`, `mesh_axes`,
+      `per_device_batch_size`, `pretrained_model_name_or_path`, `weights_dtype`, `prompt`
+      and `negative_prompt` from it.
+  """
   max_logging.log("Wan 2.1 inference script")
 
+  rng = jax.random.key(config.seed)
+  devices_array = create_device_mesh(config)
+  mesh = Mesh(devices_array, config.mesh_axes)
+
+  global_batch_size = config.per_device_batch_size * jax.local_device_count()
+
+  # NOTE(review): `dtype` is passed to the tokenizer here while the text encoder below
+  # is loaded without an explicit dtype -- confirm this is intended.
+  tokenizer = AutoTokenizer.from_pretrained(
+    config.pretrained_model_name_or_path, subfolder="tokenizer", dtype=config.weights_dtype
+  )
+  text_encoder = UMT5EncoderModel.from_pretrained(
+    config.pretrained_model_name_or_path, subfolder="text_encoder",
+  )
+  s0 = time.perf_counter()
+  # Inference only: disable autograd so the torch text encoder does not build a graph
+  # and the returned embeddings do not require grad.
+  with torch.no_grad():
+    prompt_embeds, negative_prompt_embeds = encode_prompt(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        prompt=config.prompt,
+        negative_prompt=config.negative_prompt
+    )
+  max_logging.log(f"text encoding time: {(time.perf_counter() - s0)}")
+
+  # pipeline, params = WanPipeline.from_pretrained(
+  #   config.pretrained_model_name_or_path,
+  #   #vae=None,
+  #   #transformer=None
+  # )
+  # breakpoint()
+
   wan_transformer = WanModel(rngs=nnx.Rngs(config.seed))
 
+
 def main(argv: Sequence[str]) -> None:
+  """Initialize `pyconfig` from `argv` and pass the resulting config to `run`."""
   pyconfig.initialize(argv)
   run(pyconfig.config)
 
@@ -35,6 +35,52 @@
     List[torch.FloatTensor],
 ]
 
+def is_valid_image(image) -> bool:
+    r"""
+    Tells whether `image` is a single valid image.
+
+    Accepted inputs:
+    - any `PIL.Image.Image`;
+    - an `np.ndarray` or `torch.Tensor` whose `ndim` is 2 or 3 (grayscale or color image).
+
+    Args:
+        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
+            The candidate image.
+
+    Returns:
+        `bool`:
+            `True` when `image` matches one of the accepted forms, `False` otherwise.
+    """
+    # PIL images are accepted as-is, without a dimensionality check.
+    if isinstance(image, PIL.Image.Image):
+        return True
+    # Arrays and tensors qualify only with 2 or 3 dimensions.
+    if isinstance(image, (np.ndarray, torch.Tensor)):
+        return image.ndim in (2, 3)
+    return False
+
+
+def is_valid_image_imagelist(images):
+    r"""
+    Tells whether `images` is a valid image, batch of images, or list of images.
+
+    Accepted inputs:
+    - a 4D `np.ndarray` or `torch.Tensor` (batch of images);
+    - anything `is_valid_image` accepts (a single image);
+    - a `list` whose elements are all accepted by `is_valid_image`.
+
+    Args:
+        images (`Union[np.ndarray, torch.Tensor, PIL.Image.Image, List]`):
+            The image(s) to check.
+
+    Returns:
+        `bool`:
+            `True` if the input is valid, `False` otherwise.
+    """
+    is_batch = isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4
+    if is_batch or is_valid_image(images):
+        return True
+    if isinstance(images, list):
+        # Every element must itself be a single valid image.
+        return all(is_valid_image(candidate) for candidate in images)
+    return False
+
 
 class VaeImageProcessor(ConfigMixin):
   """