1313 See the License for the specific language governing permissions and
1414 limitations under the License.
1515"""
16+
1617import html
1718from typing import Callable , List , Union , Sequence , Optional
1819import time
3738 setup_initial_state ,
3839)
3940
41+
def basic_clean(text):
    """Repair mojibake and HTML-escaped artifacts in raw prompt text.

    Runs ftfy to fix encoding damage, unescapes HTML entities twice
    (scraped text is often double-escaped, e.g. "&amp;amp;"), and trims
    surrounding whitespace.
    """
    fixed = ftfy.fix_text(text)
    unescaped = html.unescape(html.unescape(fixed))
    return unescaped.strip()
4446
4547
def whitespace_clean(text):
    """Collapse every run of whitespace to a single space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()
5052
5153
def prompt_clean(text):
    """Normalize a user prompt: fix encoding/HTML artifacts, then collapse whitespace."""
    return whitespace_clean(basic_clean(text))
57+
5558
def _get_t5_prompt_embeds(
    tokenizer: AutoTokenizer,
    text_encoder: UMT5EncoderModel,
    prompt: Union[str, List[str]] = None,
    num_videos_per_prompt: int = 1,
    max_sequence_length: int = 512,
    device: Optional[torch.device] = None,
    dtype: Optional[torch.dtype] = None,
):
    # NOTE(review): the middle of this signature was elided in the diff view;
    # parameter names/order are recovered from the keyword call sites in
    # encode_prompt. Defaults (especially max_sequence_length) — TODO confirm
    # against the original file.
    """Tokenize and encode prompts with the UMT5 text encoder.

    Each prompt is cleaned, tokenized to `max_sequence_length` with padding and
    truncation, encoded, cropped to its true (unpadded) length, then re-padded
    with zeros so padded positions carry zero embeddings rather than the
    encoder's pad-token activations.

    Args:
        tokenizer: HuggingFace tokenizer producing `input_ids`/`attention_mask`.
        text_encoder: UMT5 encoder; its `last_hidden_state` is used.
        prompt: A single prompt or a list of prompts.
        num_videos_per_prompt: How many videos to generate per prompt; the
            embeddings are duplicated accordingly.
        max_sequence_length: Fixed token length for padding/truncation.
        device: Device to run the encoder on and place results.
        dtype: Target dtype for the returned embeddings.

    Returns:
        Tensor of shape (batch_size * num_videos_per_prompt, max_sequence_length, hidden_dim).
    """
    prompt = [prompt] if isinstance(prompt, str) else prompt
    prompt = [prompt_clean(u) for u in prompt]
    batch_size = len(prompt)

    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=max_sequence_length,
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
    # True token count per prompt (non-zero attention-mask positions).
    seq_lens = mask.gt(0).sum(dim=1).long()

    prompt_embeds = text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
    # Crop each sequence to its real length, then zero-pad back to the fixed
    # length so padding positions are exactly zero.
    prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
    prompt_embeds = torch.stack(
        [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
    )

    # duplicate text embeddings for each generation per prompt, using mps friendly method
    _, seq_len, _ = prompt_embeds.shape
    prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)

    return prompt_embeds
9599
def encode_prompt(
    tokenizer: AutoTokenizer,
    text_encoder: UMT5EncoderModel,
    prompt: Union[str, List[str]] = None,
    negative_prompt: Union[str, List[str]] = None,
    do_classifier_free_guidance: bool = True,
    num_videos_per_prompt: int = 1,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    max_sequence_length: int = 512,
    device: Optional[torch.device] = None,
    dtype: Optional[torch.dtype] = None,
):
    # NOTE(review): part of this signature was elided in the diff view; the
    # parameter set is recovered from the docstring, the body, and the call
    # site in run(). Defaults — TODO confirm against the original file.
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
            Whether to use classifier free guidance or not.
        num_videos_per_prompt (`int`, *optional*, defaults to 1):
            Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        device: (`torch.device`, *optional*):
            torch device
        dtype: (`torch.dtype`, *optional*):
            torch dtype
    """
    prompt = [prompt] if isinstance(prompt, str) else prompt
    if prompt is not None:
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        prompt_embeds = _get_t5_prompt_embeds(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            prompt=prompt,
            num_videos_per_prompt=num_videos_per_prompt,
            max_sequence_length=max_sequence_length,
            device=device,
            dtype=dtype,
        )

    if do_classifier_free_guidance and negative_prompt_embeds is None:
        # An empty string is a valid "no negative prompt" default for CFG.
        negative_prompt = negative_prompt or ""
        negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

        if prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )

        negative_prompt_embeds = _get_t5_prompt_embeds(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            prompt=negative_prompt,
            num_videos_per_prompt=num_videos_per_prompt,
            max_sequence_length=max_sequence_length,
            device=device,
            dtype=dtype,
        )

    return prompt_embeds, negative_prompt_embeds
183+
180184
181185def run (config ):
182186 max_logging .log ("Wan 2.1 inference script" )
@@ -188,17 +192,15 @@ def run(config):
188192 global_batch_size = config .per_device_batch_size * jax .local_device_count ()
189193
190194 tokenizer = AutoTokenizer .from_pretrained (
191- config .pretrained_model_name_or_path , subfolder = "tokenizer" , dtype = config .weights_dtype
195+ config .pretrained_model_name_or_path , subfolder = "tokenizer" , dtype = config .weights_dtype
192196 )
193197 text_encoder = UMT5EncoderModel .from_pretrained (
194- config .pretrained_model_name_or_path , subfolder = "text_encoder" ,
198+ config .pretrained_model_name_or_path ,
199+ subfolder = "text_encoder" ,
195200 )
196201 s0 = time .perf_counter ()
197202 prompt_embeds , negative_prompt_embeds = encode_prompt (
198- tokenizer = tokenizer ,
199- text_encoder = text_encoder ,
200- prompt = config .prompt ,
201- negative_prompt = config .negative_prompt
203+ tokenizer = tokenizer , text_encoder = text_encoder , prompt = config .prompt , negative_prompt = config .negative_prompt
202204 )
203205 max_logging .log (f"text encoding time: { (time .perf_counter () - s0 )} " )
204206
@@ -209,20 +211,15 @@ def run(config):
209211 # )
210212 # breakpoint()
211213
212- pipeline , params = WanPipeline .from_pretrained (
213- config .pretrained_model_name_or_path ,
214- vae = None ,
215- transformer = None
216- )
217-
218- #wan_transformer = WanModel(rngs=nnx.Rngs(config.seed))
219-
214+ pipeline , params = WanPipeline .from_pretrained (config .pretrained_model_name_or_path , vae = None , transformer = None )
220215
216+ # wan_transformer = WanModel(rngs=nnx.Rngs(config.seed))
221217
222218
def main(argv: Sequence[str]) -> None:
    """CLI entry point: parse config from argv, then run inference."""
    pyconfig.initialize(argv)
    run(pyconfig.config)
226222
223+
if __name__ == "__main__":
    app.run(main)
0 commit comments