# Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py
import math

import numpy as np
import torch
from einops import rearrange
from torch import nn


def get_timestep_embedding(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1,
    scale: float = 1,
    max_period: int = 10000,
):
    """
    Create sinusoidal timestep embeddings. This matches the implementation in Denoising Diffusion
    Probabilistic Models.

    :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
    :param embedding_dim: the dimension of the output.
    :param flip_sin_to_cos: if True, flip the order so the cosine half comes before the sine half.
    :param downscale_freq_shift: shift subtracted from half_dim when normalizing the frequency exponents.
    :param scale: scaling factor applied to the embeddings before taking sin/cos.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x embedding_dim] Tensor of positional embeddings.
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
    exponent = -math.log(max_period) * torch.arange(
        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
    )
    exponent = exponent / (half_dim - downscale_freq_shift)

    emb = torch.exp(exponent)
    emb = timesteps[:, None].float() * emb[None, :]

    # scale embeddings
    emb = scale * emb

    # concat sine and cosine embeddings
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)

    # flip sine and cosine embeddings
    if flip_sin_to_cos:
        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)

    # zero pad
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
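
# Example (a hypothetical usage sketch, not taken from the original call sites):
#
#     t = torch.tensor([0.0, 250.0, 500.0, 999.0])
#     emb = get_timestep_embedding(t, embedding_dim=320)
#     emb.shape  # torch.Size([4, 320])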


def get_3d_sincos_pos_embed(embed_dim, grid, w, h, f):
    """
    embed_dim: output embedding dimension (must be divisible by 3)
    grid: flattened 3-D grid coordinates of shape (3, f*h*w)
    w, h, f: grid width, grid height, and number of frames
    return: pos_embed of shape (f*h*w, embed_dim)
    """
    grid = rearrange(grid, "c (f h w) -> c f h w", h=h, w=w)
    grid = rearrange(grid, "c f h w -> c h w f", h=h, w=w)
    grid = grid.reshape([3, 1, w, h, f])
    pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
    pos_embed = pos_embed.transpose(1, 0, 2, 3)
    return rearrange(pos_embed, "h w f c -> (f h w) c")
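
# Example (a hedged sketch; the original call site is not shown, so the grid
# layout below is an assumption consistent with the rearrange pattern above):
#
#     f, h, w = 4, 8, 8
#     axes = np.meshgrid(
#         np.arange(f, dtype=np.float64),
#         np.arange(h, dtype=np.float64),
#         np.arange(w, dtype=np.float64),
#         indexing="ij",
#     )
#     grid = np.stack(axes, axis=0).reshape(3, -1)  # (3, f*h*w)
#     pos = get_3d_sincos_pos_embed(96, grid, w=w, h=h, f=f)
#     pos.shape  # (256, 96)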


def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
    if embed_dim % 3 != 0:
        raise ValueError("embed_dim must be divisible by 3")

    # use a third of the dimensions to encode each of the frame, height, and width axes
    emb_f = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[0])  # (H*W*T, D/3)
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[1])  # (H*W*T, D/3)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[2])  # (H*W*T, D/3)

    emb = np.concatenate([emb_h, emb_w, emb_f], axis=-1)  # (H*W*T, D)
    return emb
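
# Example (hypothetical shapes, following the reshape in get_3d_sincos_pos_embed,
# which passes a grid of shape (3, 1, w, h, f)):
#
#     grid = np.ones((3, 1, 8, 8, 4))
#     emb = get_3d_sincos_pos_embed_from_grid(96, grid)
#     emb.shape  # (8, 8, 4, 96): a 32-dim embedding per axis, concatenated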


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: positions to be encoded, with a leading singleton dimension, e.g. shape (1, ...)
    out: array of shape pos.shape[1:] + (embed_dim,)
    """
    if embed_dim % 2 != 0:
        raise ValueError("embed_dim must be divisible by 2")

    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos_shape = pos.shape

    pos = pos.reshape(-1)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
    out = out.reshape([*pos_shape, -1])[0]  # restore grid shape, drop the leading singleton dim

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (M, D)
    return emb
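
# Example (hypothetical): encoding 16 scalar positions into 64 dims. Note the
# leading singleton dimension expected by the reshape/[0] above:
#
#     pos = np.arange(16, dtype=np.float64)[None, :]  # (1, 16)
#     emb = get_1d_sincos_pos_embed_from_grid(64, pos)
#     emb.shape  # (16, 64)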


class SinusoidalPositionalEmbedding(nn.Module):
    """Apply positional information to a sequence of embeddings.

    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional
    embeddings to them.

    Args:
        embed_dim (int): Dimension of the positional embedding.
        max_seq_length (int): Maximum sequence length for which positional embeddings are precomputed.
    """

    def __init__(self, embed_dim: int, max_seq_length: int = 32):
        super().__init__()
        position = torch.arange(max_seq_length).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim)
        )
        pe = torch.zeros(1, max_seq_length, embed_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        # register as a buffer so `pe` follows the module's device/dtype but is not a trainable parameter
        self.register_buffer("pe", pe)

    def forward(self, x):
        _, seq_length, _ = x.shape
        x = x + self.pe[:, :seq_length]
        return x
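
# Minimal smoke test (added for illustration; not part of the original module).
# The grid layout passed to get_3d_sincos_pos_embed is an assumption, as noted above.
if __name__ == "__main__":
    t = torch.tensor([0.0, 250.0, 500.0, 999.0])
    assert get_timestep_embedding(t, embedding_dim=320).shape == (4, 320)

    f, h, w = 4, 8, 8
    axes = np.meshgrid(
        np.arange(f, dtype=np.float64),
        np.arange(h, dtype=np.float64),
        np.arange(w, dtype=np.float64),
        indexing="ij",
    )
    grid = np.stack(axes, axis=0).reshape(3, -1)  # (3, f*h*w)
    assert get_3d_sincos_pos_embed(96, grid, w=w, h=h, f=f).shape == (f * h * w, 96)

    pe = SinusoidalPositionalEmbedding(embed_dim=64, max_seq_length=32)
    x = torch.zeros(2, 16, 64)
    assert pe(x).shape == (2, 16, 64)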