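Compute conditioning (embedding) activations in fp32, keep pipeline inputs (prompt embeddings and latents) in fp32, and run the VAE with fp32 activations and weights.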

entrpn · entrpn · commit cac3fb5718bd · 2025-10-16T03:33:04.000Z
diff --git a/src/maxdiffusion/models/embeddings_flax.py b/src/maxdiffusion/models/embeddings_flax.py
@@ -89,7 +89,7 @@ def __init__(
         in_features=in_channels,
         out_features=time_embed_dim,
         use_bias=sample_proj_bias,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
@@ -121,7 +121,7 @@ def __init__(
         in_features=time_embed_dim,
         out_features=time_embed_dim_out,
         use_bias=sample_proj_bias,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
@@ -269,7 +269,7 @@ def __init__(
         in_features=in_features,
         out_features=hidden_size,
         use_bias=True,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
@@ -288,7 +288,7 @@ def __init__(
         in_features=hidden_size,
         out_features=out_features,
         use_bias=True,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -116,7 +116,7 @@ def __init__(
         rngs=rngs,
         in_features=dim,
         out_features=time_proj_dim,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -234,8 +234,8 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
           subfolder="vae",
           rngs=rngs,
           mesh=mesh,
-          dtype=config.activations_dtype,
-          weights_dtype=config.weights_dtype,
+          dtype=jnp.float32,
+          weights_dtype=jnp.float32,
       )
       return wan_vae
 
@@ -494,7 +494,7 @@ def encode_prompt(
           num_videos_per_prompt=num_videos_per_prompt,
           max_sequence_length=max_sequence_length,
       )
-      prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=self.config.weights_dtype)
+      prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=jnp.float32)
 
     if negative_prompt_embeds is None:
       negative_prompt = negative_prompt or ""
@@ -504,7 +504,7 @@ def encode_prompt(
           num_videos_per_prompt=num_videos_per_prompt,
           max_sequence_length=max_sequence_length,
       )
-      negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=self.config.weights_dtype)
+      negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=jnp.float32)
 
     return prompt_embeds, negative_prompt_embeds
 
@@ -527,7 +527,7 @@ def prepare_latents(
         int(height) // vae_scale_factor_spatial,
         int(width) // vae_scale_factor_spatial,
     )
-    latents = jax.random.normal(rng, shape=shape, dtype=self.config.weights_dtype)
+    latents = jax.random.normal(rng, shape=shape, dtype=jnp.float32)
 
     return latents
 
@@ -617,7 +617,7 @@ def __call__(
         latents_mean = jnp.array(self.vae.latents_mean).reshape(1, self.vae.z_dim, 1, 1, 1)
         latents_std = 1.0 / jnp.array(self.vae.latents_std).reshape(1, self.vae.z_dim, 1, 1, 1)
         latents = latents / latents_std + latents_mean
-        latents = latents.astype(self.config.weights_dtype)
+        latents = latents.astype(jnp.float32)
 
     with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
       video = self.vae.decode(latents, self.vae_cache)[0]