Skip to content

Commit cbb8cfd

Browse files
committed
Add LTX2 Transformer integrated with Attention.
1 parent b7d9c55 commit cbb8cfd

7 files changed

Lines changed: 2828 additions & 162 deletions

File tree

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
#hardware
hardware: 'tpu'
skip_jax_distributed_system: False
attention: 'flash'
attention_sharding_uniform: True

jax_cache_dir: ''
weights_dtype: 'bfloat16'
activations_dtype: 'bfloat16'


run_name: ''
output_dir: ''
config_path: ''
save_config_to_gcs: False

#Checkpoints
text_encoder_model_name_or_path: "ariG23498/t5-v1-1-xxl-flax"
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
frame_rate: 30
max_sequence_length: 512
sampler: "from_checkpoint"

# Generation parameters
pipeline_type: multi-scale
prompt: "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie."
#negative_prompt: "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
height: 512
width: 512
num_frames: 88
flow_shift: 5.0
downscale_factor: 0.6666666
spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
prompt_enhancement_words_threshold: 120
stg_mode: "attention_values"
decode_timestep: 0.05
decode_noise_scale: 0.025
seed: 10
conditioning_media_paths: None #["IMAGE_PATH"]
conditioning_start_frames: [0]


first_pass:
  guidance_scale: [1, 1, 6, 8, 6, 1, 1]
  stg_scale: [0, 0, 4, 4, 4, 2, 1]
  rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
  guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
  skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
  num_inference_steps: 30
  skip_final_inference_steps: 3
  skip_initial_inference_steps: 0
  cfg_star_rescale: True

second_pass:
  guidance_scale: [1]
  stg_scale: [1]
  rescaling_scale: [1]
  guidance_timesteps: [1.0]
  skip_block_list: [27]
  num_inference_steps: 30
  skip_initial_inference_steps: 17
  skip_final_inference_steps: 0
  cfg_star_rescale: True

#parallelism
mesh_axes: ['data', 'fsdp', 'context', 'tensor']
logical_axis_rules: [
  ['batch', 'data'],
  ['activation_heads', 'fsdp'],
  ['activation_batch', 'data'],
  ['activation_kv', 'tensor'],
  ['mlp','tensor'],
  ['embed','fsdp'],
  ['heads', 'tensor'],
  ['norm', 'fsdp'],
  ['conv_batch', ['data','fsdp']],
  ['out_channels', 'tensor'],
  ['conv_out', 'fsdp'],
  ['conv_in', 'fsdp']
]
data_sharding: [['data', 'fsdp', 'context', 'tensor']]
dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded
dcn_fsdp_parallelism: -1
dcn_context_parallelism: 1
dcn_tensor_parallelism: 1
ici_data_parallelism: 1
ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded
ici_context_parallelism: 1
ici_tensor_parallelism: 1

allow_split_physical_axes: False
learning_rate_schedule_steps: -1
max_train_steps: 500
pretrained_model_name_or_path: ''
unet_checkpoint: ''
dataset_name: 'diffusers/pokemon-gpt4-captions'
train_split: 'train'
dataset_type: 'tf'
cache_latents_text_encoder_outputs: True
per_device_batch_size: 1
compile_topology_num_slices: -1
quantization_local_shard_count: -1
use_qwix_quantization: False
jit_initializers: True
enable_single_replica_ckpt_restoring: False

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,3 +501,80 @@ def __call__(self, timestep, guidance, pooled_projection):
501501
conditioning = time_guidance_emb + pooled_projections
502502

503503
return conditioning
504+
505+
506+
class NNXTimesteps(nnx.Module):
  """Stateless sinusoidal timestep projection.

  Thin configuration wrapper around ``get_sinusoidal_embeddings``: the
  projection hyper-parameters are fixed at construction time and reused
  on every call. Holds no learned parameters.
  """

  def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
    # Only configuration is stored; nothing here is trainable state.
    self.scale = scale
    self.downscale_freq_shift = downscale_freq_shift
    self.flip_sin_to_cos = flip_sin_to_cos
    self.num_channels = num_channels

  def __call__(self, timesteps: jax.Array) -> jax.Array:
    """Project ``timesteps`` to ``num_channels``-wide sinusoidal features."""
    embeddings = get_sinusoidal_embeddings(
        timesteps=timesteps,
        embedding_dim=self.num_channels,
        freq_shift=self.downscale_freq_shift,
        flip_sin_to_cos=self.flip_sin_to_cos,
        scale=self.scale,
    )
    return embeddings
522+
523+
524+
class NNXPixArtAlphaCombinedTimestepSizeEmbeddings(nnx.Module):
  """Combined timestep (and optional resolution / aspect-ratio) embedding.

  Mirrors diffusers' PixArt-Alpha conditioning: the timestep is projected
  sinusoidally and passed through an MLP embedder; when
  ``use_additional_conditions`` is set, resolution and aspect-ratio signals
  are embedded the same way and concatenated onto the timestep embedding.
  """

  def __init__(
      self,
      rngs: nnx.Rngs,
      embedding_dim: int,
      size_emb_dim: int,
      use_additional_conditions: bool = False,
      dtype: jnp.dtype = jnp.float32,
      weights_dtype: jnp.dtype = jnp.float32,
  ):
    self.outdim = size_emb_dim
    self.use_additional_conditions = use_additional_conditions

    # Timestep path: 256-wide sinusoidal projection -> MLP embedder.
    self.time_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
    self.timestep_embedder = NNXTimestepEmbedding(
        rngs=rngs, in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, weights_dtype=weights_dtype
    )

    if use_additional_conditions:
      # One shared sinusoidal projection feeds two separate embedders.
      self.additional_condition_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
      self.resolution_embedder = NNXTimestepEmbedding(
          rngs=rngs, in_channels=256, time_embed_dim=size_emb_dim, dtype=dtype, weights_dtype=weights_dtype
      )
      self.aspect_ratio_embedder = NNXTimestepEmbedding(
          rngs=rngs, in_channels=256, time_embed_dim=size_emb_dim, dtype=dtype, weights_dtype=weights_dtype
      )

  def _embed_condition(self, signal: jax.Array, embedder, batch_size: int, hidden_dtype: jnp.dtype) -> jax.Array:
    # Shared path for resolution / aspect-ratio: sinusoidal projection,
    # MLP embedding, then reshape to (batch_size, -1) as in the PyTorch
    # reference implementation.
    projected = self.additional_condition_proj(signal.flatten()).astype(hidden_dtype)
    return embedder(projected).reshape(batch_size, -1)

  def __call__(
      self,
      timestep: jax.Array,
      resolution: Optional[jax.Array] = None,
      aspect_ratio: Optional[jax.Array] = None,
      hidden_dtype: jnp.dtype = jnp.float32,
  ) -> jax.Array:
    """Return the conditioning embedding for ``timestep`` (plus size info if enabled).

    Raises:
      ValueError: if additional conditions are enabled but ``resolution``
        or ``aspect_ratio`` is missing.
    """
    timesteps_emb = self.timestep_embedder(self.time_proj(timestep).astype(hidden_dtype))

    # Fast path: no size conditioning requested.
    if not self.use_additional_conditions:
      return timesteps_emb

    if resolution is None or aspect_ratio is None:
      raise ValueError("resolution and aspect_ratio must be provided when use_additional_conditions is True")

    batch_size = timestep.shape[0]
    resolution_emb = self._embed_condition(resolution, self.resolution_embedder, batch_size, hidden_dtype)
    aspect_ratio_emb = self._embed_condition(aspect_ratio, self.aspect_ratio_embedder, batch_size, hidden_dtype)
    return timesteps_emb + jnp.concatenate([resolution_emb, aspect_ratio_emb], axis=1)

0 commit comments

Comments
 (0)