Skip to content

Commit d16c020

Browse files
support both dev and schnell loading. Images still incorrect.
1 parent 601f40c commit d16c020

6 files changed

Lines changed: 99 additions & 27 deletions

File tree

src/maxdiffusion/configs/base_flux_dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ clip_model_name_or_path: 'ariG23498/clip-vit-large-patch14-text-flax'
2828
t5xxl_model_name_or_path: 'ariG23498/t5-v1-1-xxl-flax'
2929

3030
# Flux params
31+
flux_name: "flux-dev"
3132
max_sequence_length: 512
3233
time_shift: False
3334
base_shift: 0.5

src/maxdiffusion/configs/base_fux_schnell.yml renamed to src/maxdiffusion/configs/base_flux_schnell.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ gcs_metrics: False
2323
save_config_to_gcs: False
2424
log_period: 100
2525

26-
pretrained_model_name_or_path: 'black-forest-labs/FLUX.1-dev'
26+
pretrained_model_name_or_path: 'black-forest-labs/FLUX.1-schnell'
2727
clip_model_name_or_path: 'ariG23498/clip-vit-large-patch14-text-flax'
2828
t5xxl_model_name_or_path: 'ariG23498/t5-v1-1-xxl-flax'
2929

3030
# Flux params
31+
flux_name: "flux-schnell"
3132
max_sequence_length: 256
3233
time_shift: False
3334
base_shift: 0.5
@@ -208,10 +209,10 @@ prompt: "A magical castle in the middle of a forest, artistic drawing"
208209
prompt_2: "A magical castle in the middle of a forest, artistic drawing"
209210
negative_prompt: "purple, red"
210211
do_classifier_free_guidance: True
211-
guidance_scale: 3.5
212+
guidance_scale: 0.0
212213
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
213214
guidance_rescale: 0.0
214-
num_inference_steps: 20
215+
num_inference_steps: 4
215216

216217
# SDXL Lightning parameters
217218
lightning_from_pt: True

src/maxdiffusion/generate_flux.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def unpack(x: Array, height: int, width: int) -> Array:
5555
ph=2,
5656
pw=2,
5757
)
58-
58+
from einops import rearrange
5959
def vae_decode(latents, vae, state, config):
6060
img = unpack(x=latents, height=config.resolution, width=config.resolution)
6161
img = img / vae.config.scaling_factor + vae.config.shift_factor
@@ -87,6 +87,8 @@ def loop_body(
8787
guidance=guidance_vec,
8888
y=vec
8989
)
90+
jax.debug.print("*****pred max: {x}", x=np.max(pred))
91+
jax.debug.print("*****pred min: {x}", x=np.min(pred))
9092
latents = latents + (t_prev - t_curr) * pred
9193
latents = jnp.array(latents, dtype=latents_dtype)
9294
return latents, state, c_ts, p_ts
@@ -144,6 +146,8 @@ def run_inference(
144146
timesteps = time_shift(mu, 1.0, timesteps).tolist()
145147
c_ts = timesteps[:-1]
146148
p_ts = timesteps[1:]
149+
# jax.debug.print("c_ts: {x}", x=c_ts)
150+
# jax.debug.print("p_ts: {x}", x=p_ts)
147151

148152

149153
transformer_state = states["transformer"]
@@ -162,7 +166,7 @@ def run_inference(
162166
vae_decode_p = functools.partial(vae_decode, vae=vae, state=vae_state, config=config)
163167

164168
with mesh, nn_partitioning.axis_rules(config.logical_axis_rules):
165-
latents, _, _, _ = jax.lax.fori_loop(0, config.num_inference_steps, loop_body_p, (latents, transformer_state, c_ts, p_ts))
169+
latents, _, _, _ = jax.lax.fori_loop(0, len(timesteps) - 1, loop_body_p, (latents, transformer_state, c_ts, p_ts))
166170
image = vae_decode_p(latents)
167171
return image
168172

@@ -293,7 +297,8 @@ def encode_prompt(
293297
prompt=prompt_2,
294298
num_images_per_prompt=num_images_per_prompt,
295299
tokenizer=t5_tokenizer,
296-
text_encoder=t5_text_encoder
300+
text_encoder=t5_text_encoder,
301+
max_sequence_length=max_sequence_length
297302
)
298303

299304
text_ids = jnp.zeros((prompt_embeds.shape[0], prompt_embeds.shape[1], 3)).astype(jnp.bfloat16)
@@ -356,7 +361,7 @@ def run(config):
356361
rng=rng
357362
)
358363

359-
# LOAD TEXT ENCODERS - t5 on cpu
364+
# LOAD TEXT ENCODERS
360365
clip_text_encoder = FlaxCLIPTextModel.from_pretrained(
361366
config.pretrained_model_name_or_path,
362367
subfolder="text_encoder",
@@ -389,7 +394,8 @@ def run(config):
389394
clip_text_encoder=clip_text_encoder,
390395
t5_tokenizer=t5_tokenizer,
391396
t5_text_encoder=t5_encoder,
392-
num_images_per_prompt=global_batch_size
397+
num_images_per_prompt=global_batch_size,
398+
max_sequence_length=config.max_sequence_length
393399
)
394400

395401
def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timesteps, guidance, pooled_prompt_embeds):
@@ -430,12 +436,12 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
430436

431437
get_memory_allocations()
432438
# evaluate shapes
433-
transformer_eval_params = transformer.init_weights(rngs=rng, max_sequence_length=512, eval_only=True)
439+
transformer_eval_params = transformer.init_weights(rngs=rng, max_sequence_length=config.max_sequence_length, eval_only=True)
434440

435441
# loads pretrained weights
436-
transformer_params = load_flow_model("flux-dev", transformer_eval_params, "cpu")
442+
transformer_params = load_flow_model(config.flux_name, transformer_eval_params, "cpu")
437443
# create transformer state
438-
weights_init_fn = functools.partial(transformer.init_weights, rngs=rng, max_sequence_length=512, eval_only=False)
444+
weights_init_fn = functools.partial(transformer.init_weights, rngs=rng, max_sequence_length=config.max_sequence_length, eval_only=False)
439445
transformer_state, transformer_state_shardings = setup_initial_state(
440446
model=transformer,
441447
tx=None,

src/maxdiffusion/models/flux/modules/layers.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,49 @@ def timestep_embedding(
111111
embedding = embedding.astype(t.dtype)
112112

113113
return embedding
114+
import numpy as np
115+
class PixArtAlphaTextProjection(nn.Module):
116+
hidden_dim: int
117+
dtype: jnp.dtype = jnp.float32
118+
weights_dtype: jnp.dtype = jnp.float32
119+
precision: jax.lax.Precision = None
120+
121+
@nn.compact
122+
def __call__(self, x: Array) -> Array:
123+
124+
hidden_states = nn.Dense(
125+
self.hidden_dim,
126+
use_bias=True,
127+
dtype=self.dtype,
128+
param_dtype=self.weights_dtype,
129+
precision=self.precision,
130+
kernel_init=nn.with_logical_partitioning(
131+
nn.initializers.lecun_normal(),
132+
("embed", "heads")
133+
),
134+
name="in_layer"
135+
)(x)
136+
jax.debug.print("PixArtAlphaTextProjection, in_layer min: {x}", x=np.min(hidden_states))
137+
jax.debug.print("PixArtAlphaTextProjection, in_layer max: {x}", x=np.max(hidden_states))
138+
hidden_states = nn.swish(hidden_states)
139+
jax.debug.print("PixArtAlphaTextProjection, act min: {x}", x=np.min(hidden_states))
140+
jax.debug.print("PixArtAlphaTextProjection, act max: {x}", x=np.max(hidden_states))
141+
hidden_states = nn.Dense(
142+
self.hidden_dim,
143+
use_bias=True,
144+
dtype=self.dtype,
145+
param_dtype=self.weights_dtype,
146+
precision=self.precision,
147+
kernel_init=nn.with_logical_partitioning(
148+
nn.initializers.lecun_normal(),
149+
("heads", "embed")
150+
),
151+
name="out_layer"
152+
)(hidden_states)
153+
jax.debug.print("PixArtAlphaTextProjection, out min: {x}", x=np.min(hidden_states))
154+
jax.debug.print("PixArtAlphaTextProjection, out max: {x}", x=np.max(hidden_states))
114155

156+
return hidden_states
115157

116158
class MLPEmbedder(nn.Module):
117159
hidden_dim: int

src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from typing import Dict, Optional, Tuple, Union
1818

1919
from einops import repeat, rearrange
20+
import numpy as np
2021
import jax
2122
import jax.numpy as jnp
2223
import flax.linen as nn
@@ -28,7 +29,8 @@
2829
EmbedND,
2930
DoubleStreamBlock,
3031
SingleStreamBlock,
31-
LastLayer
32+
LastLayer,
33+
PixArtAlphaTextProjection
3234
)
3335
from ...modeling_flax_utils import FlaxModelMixin
3436
from ....configuration_utils import ConfigMixin, flax_register_to_config
@@ -129,6 +131,9 @@ def __call__(
129131
inner_dim = self.num_attention_heads * self.attention_head_dim
130132
pe_dim = inner_dim // self.num_attention_heads
131133

134+
jax.debug.print("pooled_projections value min: {x}", x=np.min(y))
135+
jax.debug.print("pooled_projections value max: {x}", x=np.max(y))
136+
132137
img = nn.Dense(
133138
inner_dim,
134139
dtype=self.dtype,
@@ -140,39 +145,57 @@ def __call__(
140145
),
141146
name="img_in"
142147
)(img)
143-
148+
jax.debug.print("img.min: {x}", x=np.min(img))
149+
jax.debug.print("img.max: {x}", x=np.max(img))
150+
timestep = timestep_embedding(timesteps, 256)
151+
jax.debug.print("timestep.min: {x}", x=np.min(timestep))
152+
jax.debug.print("timestep.max: {x}", x=np.max(timestep))
144153
vec = MLPEmbedder(
145154
hidden_dim=inner_dim,
146155
dtype=self.dtype,
147156
weights_dtype=self.weights_dtype,
148157
precision=self.precision,
149158
name="time_in"
150-
)(timestep_embedding(timesteps, 256))
151-
159+
)(timestep)
160+
jax.debug.print("timestep.vec min: {x}", x=np.min(vec))
161+
jax.debug.print("timestep.vec max: {x}", x=np.max(vec))
162+
print(f"guidance_embeds? {self.guidance_embeds}")
152163
if self.guidance_embeds:
153164
if guidance is None:
154165
raise ValueError(
155166
"Didn't get guidance strength for guidance distilled model."
156167
)
168+
guidance_in = timestep_embedding(guidance, 256)
169+
170+
jax.debug.print("guidance_in.min: {x}", x=np.min(guidance_in))
171+
jax.debug.print("guidance_in.max: {x}", x=np.max(guidance_in))
157172
guidance_in = MLPEmbedder(
158173
hidden_dim=inner_dim,
159174
dtype=self.dtype,
160175
weights_dtype=self.weights_dtype,
161176
precision=self.precision,
162177
name="guidance_in"
163-
)(timestep_embedding(guidance, 256))
164-
else:
165-
guidance_in = Identity(timestep_embedding(guidance, 256))
166-
178+
)(guidance_in)
179+
jax.debug.print("guidance.vec min: {x}", x=np.min(guidance_in))
180+
jax.debug.print("guidance.vec max: {x}", x=np.max(guidance_in))
167181
vec = vec + guidance_in
168-
169-
vec = vec + MLPEmbedder(
182+
jax.debug.print("timestep_guidance.vec min: {x}", x=np.min(vec))
183+
jax.debug.print("timestep_guidance.vec max: {x}", x=np.max(vec))
184+
# else:
185+
# guidance_in = Identity()(timestep_embedding(guidance, 256))
186+
187+
pooled_projections = PixArtAlphaTextProjection(
170188
hidden_dim=inner_dim,
171189
dtype=self.dtype,
172190
weights_dtype=self.weights_dtype,
173191
precision=self.precision,
174192
name="vector_in"
175193
)(y)
194+
jax.debug.print("pooled_projections.min: {x}", x=np.min(pooled_projections))
195+
jax.debug.print("pooled_projections.max: {x}", x=np.max(pooled_projections))
196+
vec = vec + pooled_projections
197+
jax.debug.print("temb.min: {x}", x=np.min(vec))
198+
jax.debug.print("temb.max: {x}", x=np.max(vec))
176199

177200
txt = nn.Dense(
178201
inner_dim,
@@ -185,7 +208,8 @@ def __call__(
185208
),
186209
name="txt_in"
187210
)(txt)
188-
211+
jax.debug.print("txt.min: {x}", x=np.min(txt))
212+
jax.debug.print("txt.max: {x}", x=np.max(txt))
189213
ids = jnp.concatenate((txt_ids, img_ids), axis=1)
190214

191215
#pe_embedder
@@ -194,7 +218,8 @@ def __call__(
194218
theta=10000,
195219
axes_dim=self.axes_dims_rope
196220
)(ids)
197-
# breakpoint()
221+
jax.debug.print("pe.min: {x}", x=np.min(pe))
222+
jax.debug.print("pe.max: {x}", x=np.max(pe))
198223
# img, txt = DoubleStreamBlock(
199224
# hidden_size=inner_dim,
200225
# num_heads=self.num_attention_heads,

src/maxdiffusion/models/flux/util.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,6 @@
1818
)
1919
from maxdiffusion import max_logging
2020

21-
# from jflux.model import Flux, FluxParams
22-
from .port import port_flux
23-
2421
@dataclass
2522
class FluxParams:
2623
in_channels: int
@@ -42,7 +39,7 @@ def torch2jax(torch_tensor: torch.Tensor) -> Array:
4239
is_bfloat16 = torch_tensor.dtype == torch.bfloat16
4340
if is_bfloat16:
4441
# upcast the tensor to fp32
45-
torch_tensor = torch_tensor.to(dtype=torch.float32)
42+
torch_tensor = torch_tensor.float()
4643

4744
if torch.device.type != "cpu":
4845
torch_tensor = torch_tensor.to("cpu")

0 commit comments

Comments (0)