Add support for another LoRA format.

jfacevedo-google · ksikiric · commit 719e6dbd1cf8 · 2025-02-18T09:37:43.000+01:00
diff --git a/src/maxdiffusion/configs/base_flux_dev.yml b/src/maxdiffusion/configs/base_flux_dev.yml
@@ -30,7 +30,7 @@ t5xxl_model_name_or_path: 'ariG23498/t5-v1-1-xxl-flax'
 # Flux params
 flux_name: "flux-dev"
 max_sequence_length: 512
-time_shift: False
+time_shift: True
 base_shift: 0.5
 max_shift: 1.15
 # offloads t5 encoder after text encoding to save memory.
diff --git a/src/maxdiffusion/generate_flux.py b/src/maxdiffusion/generate_flux.py
@@ -77,7 +77,7 @@ def unpack(x: Array, height: int, width: int) -> Array:
 
 
 def vae_decode(latents, vae, state, config):
-  img = unpack(x=latents, height=config.resolution, width=config.resolution)
+  img = unpack(x=latents.astype(jnp.float32), height=config.resolution, width=config.resolution)
   img = img / vae.config.scaling_factor + vae.config.shift_factor
   img = vae.apply({"params": state.params}, img, deterministic=True, method=vae.decode).sample
   return img
@@ -115,13 +115,12 @@ def loop_body(
 
 def prepare_latent_image_ids(height, width):
   latent_image_ids = jnp.zeros((height, width, 3))
-  latent_image_ids = latent_image_ids.at[..., 1].set(latent_image_ids[..., 1] + jnp.arange(height)[:, None])
-  latent_image_ids = latent_image_ids.at[..., 2].set(latent_image_ids[..., 2] + jnp.arange(width)[None, :])
+  latent_image_ids = latent_image_ids.at[..., 1].set(jnp.arange(height)[:, None])
+  latent_image_ids = latent_image_ids.at[..., 2].set(jnp.arange(width)[None, :])
 
   latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
 
   latent_image_ids = latent_image_ids.reshape(latent_image_id_height * latent_image_id_width, latent_image_id_channels)
-
   return latent_image_ids.astype(jnp.bfloat16)
 
 
@@ -147,20 +146,10 @@ def run_inference(
     txt_ids,
     vec,
     guidance_vec,
+    c_ts,
+    p_ts
 ):
 
-  timesteps = jnp.linspace(1, 0, config.num_inference_steps + 1)
-  # shifting the schedule to favor high timesteps for higher signal images
-  if config.time_shift:
-    # estimate mu based on linear estimation between two points
-    lin_function = get_lin_function(y1=config.base_shift, y2=config.max_shift)
-    mu = lin_function(latents.shape[1])
-    timesteps = time_shift(mu, 1.0, timesteps).tolist()
-  c_ts = timesteps[:-1]
-  p_ts = timesteps[1:]
-  # jax.debug.print("c_ts: {x}", x=c_ts)
-  # jax.debug.print("p_ts: {x}", x=p_ts)
-
   transformer_state = states["transformer"]
   vae_state = states["vae"]
 
@@ -173,11 +162,10 @@ def run_inference(
       vec=vec,
       guidance_vec=guidance_vec,
   )
-
   vae_decode_p = functools.partial(vae_decode, vae=vae, state=vae_state, config=config)
 
   with mesh, nn_partitioning.axis_rules(config.logical_axis_rules):
-    latents, _, _, _ = jax.lax.fori_loop(0, len(timesteps) - 1, loop_body_p, (latents, transformer_state, c_ts, p_ts))
+    latents, _, _, _ = jax.lax.fori_loop(0, len(c_ts), loop_body_p, (latents, transformer_state, c_ts, p_ts))
   image = vae_decode_p(latents)
   return image
 
@@ -236,8 +224,7 @@ def get_clip_prompt_embeds(
 
   prompt_embeds = text_encoder(text_input_ids, params=text_encoder.params, train=False)
   prompt_embeds = prompt_embeds.pooler_output
-  prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=-1)
-  prompt_embeds = np.reshape(prompt_embeds, (batch_size * num_images_per_prompt, -1))
+  prompt_embeds = jnp.tile(prompt_embeds, (batch_size * num_images_per_prompt, 1))
   return prompt_embeds
 
 
@@ -300,7 +287,7 @@ def encode_prompt(
       max_sequence_length=max_sequence_length,
   )
 
-  text_ids = jnp.zeros((prompt_embeds.shape[0], prompt_embeds.shape[1], 3)).astype(jnp.bfloat16)
+  text_ids = jnp.zeros((prompt_embeds.shape[1], 3)).astype(jnp.bfloat16)
   return prompt_embeds, pooled_prompt_embeds, text_ids
 
 
@@ -397,18 +384,14 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
     print("guidance.shape: ", guidance.shape, guidance.dtype)
     print("pooled_prompt_embeds.shape: ", pooled_prompt_embeds.shape, pooled_prompt_embeds.dtype)
 
-  timesteps = jnp.asarray([1.0] * global_batch_size, dtype=jnp.bfloat16)
   guidance = jnp.asarray([config.guidance_scale] * global_batch_size, dtype=jnp.bfloat16)
 
-  validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timesteps, guidance, pooled_prompt_embeds)
-
   # move inputs to device and shard
   data_sharding = jax.sharding.NamedSharding(mesh, P(*config.data_sharding))
   latents = jax.device_put(latents, data_sharding)
-  latent_image_ids = jax.device_put(latent_image_ids, data_sharding)
+  latent_image_ids = jax.device_put(latent_image_ids)
   prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
-  text_ids = jax.device_put(text_ids, data_sharding)
-  timesteps = jax.device_put(timesteps, data_sharding)
+  text_ids = jax.device_put(text_ids)
   guidance = jax.device_put(guidance, data_sharding)
   pooled_prompt_embeds = jax.device_put(pooled_prompt_embeds, data_sharding)
 
@@ -458,6 +441,19 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
   states["transformer"] = transformer_state
   states["vae"] = vae_state
 
+  # Setup timesteps
+  timesteps = jnp.linspace(1, 0, config.num_inference_steps + 1)
+  # shifting the schedule to favor high timesteps for higher signal images
+  if config.time_shift:
+    # estimate mu based on linear estimation between two points
+    lin_function = get_lin_function(x1=config.max_sequence_length, y1=config.base_shift, y2=config.max_shift)
+    mu = lin_function(latents.shape[1])
+    timesteps = time_shift(mu, 1.0, timesteps)
+  c_ts = timesteps[:-1]
+  p_ts = timesteps[1:]
+
+  validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timesteps, guidance, pooled_prompt_embeds)
+
   p_run_inference = jax.jit(
       functools.partial(
           run_inference,
@@ -471,6 +467,8 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
           txt_ids=text_ids,
           vec=pooled_prompt_embeds,
           guidance_vec=guidance,
+          c_ts=c_ts,
+          p_ts=p_ts
       ),
       in_shardings=(state_shardings,),
       out_shardings=None,
diff --git a/src/maxdiffusion/loaders/flux_lora_pipeline.py b/src/maxdiffusion/loaders/flux_lora_pipeline.py
@@ -53,7 +53,7 @@ def rename_for_interceptor(params_keys, network_alphas, adapter_name):
         new_layer_lora = layer_lora[: layer_lora.index(lora_name)]
         if new_layer_lora not in new_params_keys:
           new_params_keys.append(new_layer_lora)
-          network_alpha = network_alphas[layer_lora]
+          network_alpha = network_alphas.get(layer_lora, None)
           new_network_alphas[new_layer_lora] = network_alpha
     return new_params_keys, new_network_alphas
 
diff --git a/src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py b/src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py
@@ -144,10 +144,10 @@ def __call__(self, hidden_states, temb, image_rotary_emb=None):
     hidden_states = self.linear2(attn_mlp)
     hidden_states = gate * hidden_states
     hidden_states = residual + hidden_states
-    if hidden_states.dtype == jnp.float16 or hidden_states.dtype == jnp.bfloat16:
+    if hidden_states.dtype == jnp.float16:
       hidden_states = jnp.clip(hidden_states, -65504, 65504)
 
-    return hidden_states, temb, image_rotary_emb
+    return hidden_states
 
 
 class FluxTransformerBlock(nn.Module):
@@ -294,9 +294,9 @@ def __call__(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=
 
     context_ff_output = self.txt_mlp(norm_encoder_hidden_states)
     encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
-    if encoder_hidden_states.dtype == jnp.float16 or encoder_hidden_states.dtype == jnp.bfloat16:
+    if encoder_hidden_states.dtype == jnp.float16:
       encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
-    return hidden_states, encoder_hidden_states, temb, image_rotary_emb
+    return hidden_states, encoder_hidden_states
 
 
 @flax_register_to_config
@@ -504,7 +504,7 @@ def __call__(
     image_rotary_emb = nn.with_logical_constraint(image_rotary_emb, ("activation_batch", "activation_embed"))
 
     for double_block in self.double_blocks:
-      hidden_states, encoder_hidden_states, temb, image_rotary_emb = double_block(
+      hidden_states, encoder_hidden_states = double_block(
           hidden_states=hidden_states,
           encoder_hidden_states=encoder_hidden_states,
           temb=temb,
@@ -513,7 +513,7 @@ def __call__(
     hidden_states = jnp.concatenate([encoder_hidden_states, hidden_states], axis=1)
     hidden_states = nn.with_logical_constraint(hidden_states, ("activation_batch", "activation_length", "activation_embed"))
     for single_block in self.single_blocks:
-      hidden_states, temb, image_rotary_emb = single_block(
+      hidden_states = single_block(
           hidden_states=hidden_states, temb=temb, image_rotary_emb=image_rotary_emb
       )
     hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
diff --git a/src/maxdiffusion/models/modeling_flax_pytorch_utils.py b/src/maxdiffusion/models/modeling_flax_pytorch_utils.py
@@ -229,12 +229,21 @@ def convert_flux_lora_pytorch_state_dict_to_flax(config, pt_state_dict, params,
   rank = None
   for pt_key, tensor in pt_state_dict.items():
     renamed_pt_key = rename_key(pt_key)
-    print("renamed_pt_key:", renamed_pt_key)
     renamed_pt_key = renamed_pt_key.replace("lora_unet_", "")
     renamed_pt_key = renamed_pt_key.replace("lora_down", f"lora-{adapter_name}.down")
     renamed_pt_key = renamed_pt_key.replace("lora_up", f"lora-{adapter_name}.up")
 
     if "double_blocks" in renamed_pt_key:
+      renamed_pt_key = renamed_pt_key.replace("double_blocks.", "double_blocks_")
+      renamed_pt_key = renamed_pt_key.replace("processor.proj_lora1.down", f"attn.i_proj.lora-{adapter_name}.down")
+      renamed_pt_key = renamed_pt_key.replace("processor.proj_lora1.up", f"attn.i_proj.lora-{adapter_name}.up")
+      renamed_pt_key = renamed_pt_key.replace("processor.proj_lora2.down", f"attn.e_proj.lora-{adapter_name}.down")
+      renamed_pt_key = renamed_pt_key.replace("processor.proj_lora2.up", f"attn.e_proj.lora-{adapter_name}.up")
+      renamed_pt_key = renamed_pt_key.replace("processor.qkv_lora1.down", f"attn.i_qkv.lora-{adapter_name}.down")
+      renamed_pt_key = renamed_pt_key.replace("processor.qkv_lora1.up", f"attn.i_qkv.lora-{adapter_name}.up")
+      renamed_pt_key = renamed_pt_key.replace("processor.qkv_lora2.down", f"attn.e_qkv.lora-{adapter_name}.down")
+      renamed_pt_key = renamed_pt_key.replace("processor.qkv_lora2.up", f"attn.e_qkv.lora-{adapter_name}.up")
+      
       renamed_pt_key = renamed_pt_key.replace("_img_attn_proj", ".attn.i_proj")
       renamed_pt_key = renamed_pt_key.replace("_img_attn_qkv", ".attn.i_qkv")
       renamed_pt_key = renamed_pt_key.replace("_img_mlp_0", ".img_mlp.layers_0")