Skip to content

Commit 7bd49ec

Browse files
committed
Added dummy attention and RoPE placeholders, verified transformer_ltx2.py, and added unit tests for the transformer.
1 parent e44ddbc commit 7bd49ec

6 files changed

Lines changed: 1185 additions & 38 deletions

File tree

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -521,8 +521,18 @@ def __call__(self, timesteps: jax.Array) -> jax.Array:
521521

522522

523523
class NNXPixArtAlphaCombinedTimestepSizeEmbeddings(nnx.Module):
524-
def __init__(self, rngs: nnx.Rngs, embedding_dim: int, size_emb_dim: int, dtype: jnp.dtype = jnp.float32, weights_dtype: jnp.dtype = jnp.float32):
524+
def __init__(
525+
self,
526+
rngs: nnx.Rngs,
527+
embedding_dim: int,
528+
size_emb_dim: int,
529+
use_additional_conditions: bool = False,
530+
dtype: jnp.dtype = jnp.float32,
531+
weights_dtype: jnp.dtype = jnp.float32
532+
):
525533
self.outdim = size_emb_dim
534+
self.use_additional_conditions = use_additional_conditions
535+
526536
self.time_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
527537
self.timestep_embedder = NNXTimestepEmbedding(
528538
rngs=rngs,
@@ -532,7 +542,49 @@ def __init__(self, rngs: nnx.Rngs, embedding_dim: int, size_emb_dim: int, dtype:
532542
weights_dtype=weights_dtype
533543
)
534544

535-
def __call__(self, timestep: jax.Array, hidden_dtype: jnp.dtype = jnp.float32) -> jax.Array:
545+
if use_additional_conditions:
546+
self.additional_condition_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
547+
self.resolution_embedder = NNXTimestepEmbedding(
548+
rngs=rngs,
549+
in_channels=256,
550+
time_embed_dim=size_emb_dim,
551+
dtype=dtype,
552+
weights_dtype=weights_dtype
553+
)
554+
self.aspect_ratio_embedder = NNXTimestepEmbedding(
555+
rngs=rngs,
556+
in_channels=256,
557+
time_embed_dim=size_emb_dim,
558+
dtype=dtype,
559+
weights_dtype=weights_dtype
560+
)
561+
562+
def __call__(
563+
self,
564+
timestep: jax.Array,
565+
resolution: Optional[jax.Array] = None,
566+
aspect_ratio: Optional[jax.Array] = None,
567+
hidden_dtype: jnp.dtype = jnp.float32
568+
) -> jax.Array:
536569
timesteps_proj = self.time_proj(timestep)
537570
timesteps_emb = self.timestep_embedder(timesteps_proj.astype(hidden_dtype))
538-
return timesteps_emb
571+
572+
if self.use_additional_conditions:
573+
if resolution is None or aspect_ratio is None:
574+
raise ValueError("resolution and aspect_ratio must be provided when use_additional_conditions is True")
575+
576+
resolution_emb = self.additional_condition_proj(resolution.flatten()).astype(hidden_dtype)
577+
resolution_emb = self.resolution_embedder(resolution_emb)
578+
# Reshape to (batch_size, -1) matching PyTorch's reshape(batch_size, -1)
579+
# assuming resolution input was (batch_size, ...) so flatten logic holds.
580+
resolution_emb = resolution_emb.reshape(timestep.shape[0], -1)
581+
582+
aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).astype(hidden_dtype)
583+
aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb)
584+
aspect_ratio_emb = aspect_ratio_emb.reshape(timestep.shape[0], -1)
585+
586+
conditioning = timesteps_emb + jnp.concatenate([resolution_emb, aspect_ratio_emb], axis=1)
587+
else:
588+
conditioning = timesteps_emb
589+
590+
return conditioning

src/maxdiffusion/models/ltx_2/adaln.py

Lines changed: 0 additions & 35 deletions
This file was deleted.
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
2+
from typing import Optional, Tuple, Any
3+
from flax import nnx
4+
import jax
5+
import jax.numpy as jnp
6+
7+
class Attention(nnx.Module):
  """Stub for the LTX-2 attention layer (self/cross attention, audio/video).

  The real kernel is owned by a separate task. This stand-in keeps the final
  constructor/call signature and echoes the query tensor back unchanged so
  that transformer blocks can be wired up and shape-tested against it.
  """

  def __init__(
      self,
      rngs: nnx.Rngs,
      query_dim: int,
      heads: int = 8,
      kv_heads: int = 8,
      dim_head: int = 64,
      dropout: float = 0.0,
      use_bias: bool = True,
      cross_attention_dim: Optional[int] = None,
      out_bias: bool = True,
      qk_norm: str = "rms_norm_across_heads",
      norm_eps: float = 1e-6,
      rope_type: str = "interleaved",
      dtype: jnp.dtype = jnp.float32,
      param_dtype: jnp.dtype = jnp.float32,
  ):
    # Only the head layout is recorded; all other arguments exist purely for
    # signature compatibility with the eventual implementation.
    self.heads = heads
    self.dim_head = dim_head

  def __call__(
      self,
      hidden_states: jax.Array,
      encoder_hidden_states: Optional[jax.Array] = None,
      attention_mask: Optional[jax.Array] = None,
      query_rotary_emb: Optional[Tuple[jax.Array, jax.Array]] = None,
      key_rotary_emb: Optional[Tuple[jax.Array, jax.Array]] = None,
      deterministic: bool = True
  ) -> jax.Array:
    """Identity forward pass.

    Callers add this output to ``hidden_states`` as a residual, so the
    placeholder must produce a tensor of exactly the input's shape; simply
    returning the input satisfies that contract.
    """
    return hidden_states
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
2+
from flax import nnx
3+
from enum import Enum
4+
import jax
5+
import jax.numpy as jnp
6+
from typing import Tuple, Optional, Union, List
7+
8+
class LTXRopeType(Enum):
  """Channel layout for rotary embeddings: interleaved pairs vs. split halves."""

  INTERLEAVED = "interleaved"
  SPLIT = "split"
11+
12+
class LTX2AudioVideoRotaryPosEmbed(nnx.Module):
  """Stub for the LTX-2 rotary position embedding (3D video / 1D audio).

  The real frequency computation is owned by a separate task. This stand-in
  keeps the final constructor signature and returns zero-valued coordinate
  grids and cos/sin tables so dependent modules can be integrated and
  shape-checked.
  """

  def __init__(
      self,
      dim: int,
      patch_size: int,
      patch_size_t: int,
      base_num_frames: int = 128,
      base_height: int = 2048,
      base_width: int = 2048,
      sampling_rate: int = 16000,
      hop_length: int = 160,
      scale_factors: Union[List[int], Tuple[int, ...]] = (8, 32, 32),
      theta: float = 10000.0,
      causal_offset: int = 1,
      modality: str = "video",
      double_precision: bool = True,
      rope_type: str = "interleaved",
      num_attention_heads: int = 32,
      dtype: jnp.dtype = jnp.float32,
  ):
    # Only the fields the placeholder methods actually consult are stored;
    # the remaining parameters exist for signature compatibility.
    self.dim = dim
    self.rope_type = rope_type
    self.dtype = dtype
    self.modality = modality

  def prepare_video_coords(self, batch_size, num_frames, height, width, fps):
    """Return a dummy (batch_size, 1, 1) video coordinate grid of zeros."""
    return jnp.zeros((batch_size, 1, 1), dtype=self.dtype)

  def prepare_audio_coords(self, batch_size, audio_num_frames):
    """Return a dummy (batch_size, 1, 1) audio coordinate grid of zeros."""
    return jnp.zeros((batch_size, 1, 1), dtype=self.dtype)

  def __call__(
      self,
      coords: jax.Array,
  ) -> Tuple[jax.Array, jax.Array]:
    """Return placeholder (cos, sin) frequency tables.

    Each table is shaped (1, 1, dim) so it broadcasts against the
    (batch, seq, head_dim) layouts the attention layer is expected to use.
    """
    zeros = jnp.zeros((1, 1, self.dim), dtype=self.dtype)
    return zeros, zeros
61+
62+
# Helper placeholders if used by attention
63+
def apply_interleaved_rotary_emb(x, freqs):
  """Placeholder for interleaved RoPE application; currently the identity on ``x``.

  ``freqs`` is accepted for signature compatibility and ignored.
  """
  return x
65+
66+
def apply_split_rotary_emb(x, freqs):
67+
return x

0 commit comments

Comments
 (0)