
Commit 4c68d53

Merge branch 'main' into flux_lora
2 parents 9e07358 + 7f0f5bc

4 files changed: 30 additions & 32 deletions


README.md

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 [![Unit Tests](https://github.com/google/maxtext/actions/workflows/UnitTests.yml/badge.svg)](https://github.com/google/maxdiffusion/actions/workflows/UnitTests.yml)
 
 # What's new?
-- **`2025/02/08**: Flux schnell & dev inference.
+- **`2025/02/08`**: Flux schnell & dev inference.
 - **`2024/12/12`**: Load multiple LoRAs for inference.
 - **`2024/10/22`**: LoRA support for Hyper SDXL.
 - **`2024/8/1`**: Orbax is the new default checkpointer. You can still use `pipeline.save_pretrained` after training to save in diffusers format.

src/maxdiffusion/configs/base_flux_schnell.yml

Lines changed: 20 additions & 10 deletions
@@ -54,17 +54,27 @@ precision: "DEFAULT"
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash
-flash_block_sizes: {}
-# Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.
+flash_block_sizes: {
+  "block_q" : 256,
+  "block_kv_compute" : 256,
+  "block_kv" : 256,
+  "block_q_dkv" : 256,
+  "block_kv_dkv" : 256,
+  "block_kv_dkv_compute" : 256,
+  "block_q_dq" : 256,
+  "block_kv_dq" : 256
+}
+
+# Use the following flash_block_sizes on v6e (Trillium).
 # flash_block_sizes: {
-#   "block_q" : 1536,
-#   "block_kv_compute" : 1536,
-#   "block_kv" : 1536,
-#   "block_q_dkv" : 1536,
-#   "block_kv_dkv" : 1536,
-#   "block_kv_dkv_compute" : 1536,
-#   "block_q_dq" : 1536,
-#   "block_kv_dq" : 1536
+#   "block_q" : 2176,
+#   "block_kv_compute" : 2176,
+#   "block_kv" : 2176,
+#   "block_q_dkv" : 2176,
+#   "block_kv_dkv" : 2176,
+#   "block_kv_dkv_compute" : 2176,
+#   "block_q_dq" : 2176,
+#   "block_kv_dq" : 2176
 # }
 # GroupNorm groups
 norm_num_groups: 32
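For context on what these keys configure: they mirror, one-to-one, the fields of the Pallas splash-attention `BlockSizes` dataclass used for TPU flash attention, so a consumer can splat the dict straight into the constructor. A minimal sketch of that wiring, assuming MaxText-style usage (note the `jax.experimental` import path can move between JAX releases):

```python
# Sketch only: maps a flash_block_sizes dict like the one above onto
# Pallas splash-attention block sizes. The import path is experimental.
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel

flash_block_sizes = {
    "block_q": 256,
    "block_kv_compute": 256,
    "block_kv": 256,
    "block_q_dkv": 256,
    "block_kv_dkv": 256,
    "block_kv_dkv_compute": 256,
    "block_q_dq": 256,
    "block_kv_dq": 256,
}

# The dataclass fields match the YAML keys, so the dict splats directly.
# Larger blocks (e.g. 2176 on v6e) trade more vmem for fewer grid steps.
block_sizes = splash_attention_kernel.BlockSizes(**flash_block_sizes)
```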

src/maxdiffusion/generate_flux.py

Lines changed: 3 additions & 15 deletions
@@ -77,7 +77,7 @@ def unpack(x: Array, height: int, width: int) -> Array:
 
 
 def vae_decode(latents, vae, state, config):
-  img = unpack(x=latents.astype(jnp.float32), height=config.resolution, width=config.resolution)
+  img = unpack(x=latents, height=config.resolution, width=config.resolution)
   img = img / vae.config.scaling_factor + vae.config.shift_factor
   img = vae.apply({"params": state.params}, img, deterministic=True, method=vae.decode).sample
   return img
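The behavioral change above is dtype only: `unpack` now runs in the latents' native dtype instead of upcasting to float32. The line after it is the standard Flux latent denormalization, the inverse of the encoder-side `(x - shift) * scale`. A toy illustration with placeholder constants (the real values come from `vae.config`):

```python
import jax.numpy as jnp

# Placeholder constants for illustration; read them from vae.config.
scaling_factor, shift_factor = 0.3611, 0.1159

def denormalize(latents: jnp.ndarray) -> jnp.ndarray:
  # Inverse of the encoder-side normalization (x - shift) * scale.
  return latents / scaling_factor + shift_factor
```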
@@ -135,19 +135,7 @@ def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: flo
 
 
 def run_inference(
-    states,
-    transformer,
-    vae,
-    config,
-    mesh,
-    latents,
-    latent_image_ids,
-    prompt_embeds,
-    txt_ids,
-    vec,
-    guidance_vec,
-    c_ts,
-    p_ts
+    states, transformer, vae, config, mesh, latents, latent_image_ids, prompt_embeds, txt_ids, vec, guidance_vec, c_ts, p_ts
 ):
 
   transformer_state = states["transformer"]
@@ -468,7 +456,7 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
       vec=pooled_prompt_embeds,
       guidance_vec=guidance,
       c_ts=c_ts,
-      p_ts=p_ts
+      p_ts=p_ts,
     ),
     in_shardings=(state_shardings,),
     out_shardings=None,
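The call being edited is a sharded `jax.jit` of `run_inference`; only a trailing comma is added. For readers unfamiliar with the pattern, a minimal self-contained sketch of `in_shardings`/`out_shardings` (the mesh axis and step function here are hypothetical, not maxdiffusion's):

```python
import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Hypothetical 1-D mesh over all available devices.
mesh = Mesh(np.array(jax.devices()), ("data",))

def step(x):
  return x * 2

jitted = jax.jit(
    step,
    # One entry per positional argument, like in_shardings=(state_shardings,).
    in_shardings=(NamedSharding(mesh, P("data")),),
    out_shardings=None,  # let the compiler pick the output sharding
)
```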

src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py

Lines changed: 6 additions & 6 deletions
@@ -144,10 +144,10 @@ def __call__(self, hidden_states, temb, image_rotary_emb=None):
     hidden_states = self.linear2(attn_mlp)
     hidden_states = gate * hidden_states
     hidden_states = residual + hidden_states
-    if hidden_states.dtype == jnp.float16:
+    if hidden_states.dtype == jnp.float16 or hidden_states.dtype == jnp.bfloat16:
       hidden_states = jnp.clip(hidden_states, -65504, 65504)
 
-    return hidden_states
+    return hidden_states, temb, image_rotary_emb
 
 
 class FluxTransformerBlock(nn.Module):
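The clamp bound is exactly float16's largest finite value; bfloat16 has roughly the same range as float32, so extending the clamp to bfloat16 actively narrows those activations to the float16 range rather than merely guarding against overflow. A quick check:

```python
import jax.numpy as jnp

print(jnp.finfo(jnp.float16).max)   # 65504.0
print(jnp.finfo(jnp.bfloat16).max)  # ~3.39e38
```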
@@ -294,9 +294,9 @@ def __call__(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=
 
     context_ff_output = self.txt_mlp(norm_encoder_hidden_states)
     encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
-    if encoder_hidden_states.dtype == jnp.float16:
+    if encoder_hidden_states.dtype == jnp.float16 or encoder_hidden_states.dtype == jnp.bfloat16:
       encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
-    return hidden_states, encoder_hidden_states
+    return hidden_states, encoder_hidden_states, temb, image_rotary_emb
 
 
 @flax_register_to_config
@@ -504,7 +504,7 @@ def __call__(
       image_rotary_emb = nn.with_logical_constraint(image_rotary_emb, ("activation_batch", "activation_embed"))
 
     for double_block in self.double_blocks:
-      hidden_states, encoder_hidden_states = double_block(
+      hidden_states, encoder_hidden_states, temb, image_rotary_emb = double_block(
           hidden_states=hidden_states,
           encoder_hidden_states=encoder_hidden_states,
           temb=temb,
@@ -513,7 +513,7 @@
     hidden_states = jnp.concatenate([encoder_hidden_states, hidden_states], axis=1)
     hidden_states = nn.with_logical_constraint(hidden_states, ("activation_batch", "activation_length", "activation_embed"))
     for single_block in self.single_blocks:
-      hidden_states = single_block(
+      hidden_states, temb, image_rotary_emb = single_block(
           hidden_states=hidden_states, temb=temb, image_rotary_emb=image_rotary_emb
       )
     hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
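A plausible reading of the widened return signatures (not stated in the commit message): every block now returns exactly what it consumes, which is the structure `flax.linen.scan` demands of a carried value when scanning a layer stack. A minimal sketch of that pattern with a toy block; the names here are hypothetical, not maxdiffusion's:

```python
import flax.linen as nn
import jax
import jax.numpy as jnp

class ToyBlock(nn.Module):
  @nn.compact
  def __call__(self, carry, _):
    hidden_states, temb, rope = carry
    hidden_states = hidden_states + nn.Dense(hidden_states.shape[-1])(temb)
    # Carry out the same structure that came in, as nn.scan requires.
    return (hidden_states, temb, rope), None

ScannedBlocks = nn.scan(
    ToyBlock,
    variable_axes={"params": 0},   # stack per-layer params along axis 0
    split_rngs={"params": True},   # separate init RNG per layer
    length=4,
)

h = jnp.ones((1, 16, 64))
temb = jnp.ones((1, 64))
rope = jnp.ones((1, 16, 64))
params = ScannedBlocks().init(jax.random.PRNGKey(0), (h, temb, rope), None)
```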
