Fix VAE latents mean/std: store them as nnx.Param so checkpoint values are loaded instead of skipped

prishajain1 · prishajain1 · commit 2724462926c9 · 2026-03-02T13:41:06.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py b/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py
@@ -1197,8 +1197,8 @@ def __init__(
     )
 
     self.scaling_factor = scaling_factor
-    self.latents_mean = tuple([0.0] * latent_channels)
-    self.latents_std = tuple([1.0] * latent_channels)
+    self.latents_mean = nnx.Param(jnp.zeros((latent_channels,), dtype=dtype))
+    self.latents_std = nnx.Param(jnp.ones((latent_channels,), dtype=dtype))
     self.encoder_causal = encoder_causal
     self.decoder_causal = decoder_causal
 
diff --git a/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2_audio.py b/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2_audio.py
@@ -707,8 +707,8 @@ def __init__(
             is_causal=is_causal
         )
         
-        self.latents_mean = tuple([0.0] * base_channels)
-        self.latents_std = tuple([1.0] * base_channels)
+        self.latents_mean = nnx.Param(jnp.zeros((base_channels,), dtype=dtype))
+        self.latents_std = nnx.Param(jnp.ones((base_channels,), dtype=dtype))
 
     def _normalize_latents(self, h: jnp.ndarray) -> jnp.ndarray:
         if self.double_z:
@@ -721,7 +721,7 @@ def _normalize_latents(self, h: jnp.ndarray) -> jnp.ndarray:
         
         # Normalize means ONLY
         means_patched = self.patchifier.patchify(means) 
-        means_normalized = (means_patched - jnp.array(self.latents_mean, dtype=means_patched.dtype)) / jnp.array(self.latents_std, dtype=means_patched.dtype)
+        means_normalized = (means_patched - self.latents_mean.value.astype(means_patched.dtype)) / self.latents_std.value.astype(means_patched.dtype)
         means_normalized = self.patchifier.unpatchify(means_normalized, channels, freq)
 
         if logvars is not None:
@@ -734,7 +734,7 @@ def _denormalize_latents(self, z: jnp.ndarray) -> Tuple[jnp.ndarray, Tuple[int,
         
         # Denormalize latents (which are just means)
         patched_z = self.patchifier.patchify(z)
-        denorm_patched_z = (patched_z * jnp.array(self.latents_std, dtype=patched_z.dtype)) + jnp.array(self.latents_mean, dtype=patched_z.dtype)
+        denorm_patched_z = (patched_z * self.latents_std.value.astype(patched_z.dtype)) + self.latents_mean.value.astype(patched_z.dtype)
         z = self.patchifier.unpatchify(denorm_patched_z, channels, freq)
 
         target_frames = time * LATENT_DOWNSAMPLE_FACTOR
diff --git a/src/maxdiffusion/models/ltx2/ltx2_utils.py b/src/maxdiffusion/models/ltx2/ltx2_utils.py
@@ -271,9 +271,7 @@ def load_vae_weights(
           random_flax_state_dict[string_tuple] = flattened_eval[key]
             
       for pt_key, tensor in tensors.items():
-          # Diffusers saves static tensors for these, but they are defined as static tuples in Flax.
-          if pt_key in ["latents_mean", "latents_std"]:
-              continue
+          # latents_mean and latents_std are nnx.Params and will be loaded correctly.
           renamed_pt_key = rename_key(pt_key)
           renamed_pt_key = renamed_pt_key.replace("nin_shortcut", "conv_shortcut")
           
@@ -529,8 +527,6 @@ def load_audio_vae_weights(
           random_flax_state_dict[string_tuple] = flattened_eval[key]
 
     for pt_key, tensor in tensors.items():
-        if pt_key in ["latents_mean", "latents_std"]:
-            continue
         key = rename_for_ltx2_audio_vae(pt_key)
         
         should_transpose = False
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
@@ -994,8 +994,8 @@ def prepare_latents(
   ) -> jax.Array:
       if latents is not None:
            if latents.ndim == 5:
-              latents_mean = jnp.array(self.vae.latents_mean)
-              latents_std = jnp.array(self.vae.latents_std)
+              latents_mean = self.vae.latents_mean.value
+              latents_std = self.vae.latents_std.value
               scaling_factor = self.vae.config.scaling_factor if hasattr(self.vae.config, "scaling_factor") else 1.0
               
               latents = self._normalize_latents(latents, latents_mean, latents_std, scaling_factor)
@@ -1045,8 +1045,8 @@ def prepare_audio_latents(
           if latents.ndim != 3:
                raise ValueError("Unexpected audio latents shape")
           
-          latents_mean = jnp.array(self.audio_vae.latents_mean)
-          latents_std = jnp.array(self.audio_vae.latents_std)
+          latents_mean = self.audio_vae.latents_mean.value
+          latents_std = self.audio_vae.latents_std.value
 
           latents = self._normalize_audio_latents(latents, latents_mean, latents_std)
           latents = self._create_noised_state(latents, noise_scale, generator)
@@ -1294,8 +1294,8 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
       )
       latents = self._denormalize_latents(
           latents, 
-          jnp.array(self.vae.latents_mean), 
-          jnp.array(self.vae.latents_std), 
+          self.vae.latents_mean.value, 
+          self.vae.latents_std.value, 
           self.vae.config.scaling_factor
       )
       
@@ -1305,8 +1305,8 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
       # Denormalize and Unpack Audio (Order important: Denorm THEN Unpack)
       audio_latents = self._denormalize_audio_latents(
           audio_latents_jax,
-          jnp.array(self.audio_vae.latents_mean),
-          jnp.array(self.audio_vae.latents_std)
+          self.audio_vae.latents_mean.value,
+          self.audio_vae.latents_std.value
       )
       
       num_mel_bins = self.audio_vae.config.mel_bins if getattr(self, "audio_vae", None) is not None else 64

Original file line number	Diff line number	Diff line change
`@@ -1197,8 +1197,8 @@ def __init__(`
`1197`	`1197`	`)`
`1198`	`1198`
`1199`	`1199`	`self.scaling_factor = scaling_factor`
`1200`		`- self.latents_mean = tuple([0.0] * latent_channels)`
`1201`		`- self.latents_std = tuple([1.0] * latent_channels)`
	`1200`	`+ self.latents_mean = nnx.Param(jnp.zeros((latent_channels,), dtype=dtype))`
	`1201`	`+ self.latents_std = nnx.Param(jnp.ones((latent_channels,), dtype=dtype))`
`1202`	`1202`	`self.encoder_causal = encoder_causal`
`1203`	`1203`	`self.decoder_causal = decoder_causal`
`1204`	`1204`