Skip to content

Commit cbc7723

Browse files
jfacevedo-google and ksikiric
authored and committed
working loop, bad generation
1 parent dfe1089 commit cbc7723

4 files changed

Lines changed: 71 additions & 64 deletions

File tree

src/maxdiffusion/configs/base_flux_dev.yml

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,10 @@ clip_model_name_or_path: 'ariG23498/clip-vit-large-patch14-text-flax'
2828
t5xxl_model_name_or_path: 'ariG23498/t5-v1-1-xxl-flax'
2929

3030
# Flux params
31-
flux_name: "flux-dev"
3231
max_sequence_length: 512
33-
time_shift: True
32+
time_shift: False
3433
base_shift: 0.5
3534
max_shift: 1.15
36-
# offloads t5 encoder after text encoding to save memory.
37-
offload_encoders: True
3835

3936

4037
unet_checkpoint: ''
@@ -52,22 +49,10 @@ activations_dtype: 'bfloat16'
5249
precision: "DEFAULT"
5350

5451
# Set true to load weights from pytorch
55-
from_pt: True
52+
from_pt: False
5653
split_head_dim: True
5754
attention: 'flash' # Supported attention: dot_product, flash
58-
5955
flash_block_sizes: {}
60-
# Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.
61-
# flash_block_sizes: {
62-
# "block_q" : 1536,
63-
# "block_kv_compute" : 1536,
64-
# "block_kv" : 1536,
65-
# "block_q_dkv" : 1536,
66-
# "block_kv_dkv" : 1536,
67-
# "block_kv_dkv_compute" : 1536,
68-
# "block_q_dq" : 1536,
69-
# "block_kv_dq" : 1536
70-
# }
7156
# GroupNorm groups
7257
norm_num_groups: 32
7358

@@ -133,7 +118,6 @@ logical_axis_rules: [
133118
['activation_batch', ['data','fsdp']],
134119
['activation_heads', 'tensor'],
135120
['activation_kv', 'tensor'],
136-
['mlp','tensor'],
137121
['embed','fsdp'],
138122
['heads', 'tensor'],
139123
['conv_batch', ['data','fsdp']],
@@ -149,8 +133,8 @@ data_sharding: [['data', 'fsdp', 'tensor']]
149133
dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded
150134
dcn_fsdp_parallelism: -1
151135
dcn_tensor_parallelism: 1
152-
ici_data_parallelism: -1
153-
ici_fsdp_parallelism: 1 # recommended ICI axis to be auto-sharded
136+
ici_data_parallelism: 1
137+
ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded
154138
ici_tensor_parallelism: 1
155139

156140
# Dataset
@@ -226,7 +210,7 @@ do_classifier_free_guidance: True
226210
guidance_scale: 3.5
227211
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
228212
guidance_rescale: 0.0
229-
num_inference_steps: 50
213+
num_inference_steps: 20
230214

231215
# SDXL Lightning parameters
232216
lightning_from_pt: True

src/maxdiffusion/configs/base_flux.yml renamed to src/maxdiffusion/configs/base_fux_schnell.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ pretrained_model_name_or_path: 'black-forest-labs/FLUX.1-dev'
2727
clip_model_name_or_path: 'ariG23498/clip-vit-large-patch14-text-flax'
2828
t5xxl_model_name_or_path: 'ariG23498/t5-v1-1-xxl-flax'
2929

30+
# Flux params
31+
max_sequence_length: 256
32+
time_shift: False
33+
base_shift: 0.5
34+
max_shift: 1.15
35+
3036
unet_checkpoint: ''
3137
revision: 'refs/pr/95'
3238
# This will convert the weights to this dtype.

src/maxdiffusion/generate_flux.py

Lines changed: 55 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
from absl import app
1919
import functools
2020
import math
21+
import time
2122
import numpy as np
23+
from PIL import Image
2224
import jax
2325
from jax.sharding import Mesh, PositionalSharding, PartitionSpec as P
2426
import jax.numpy as jnp
@@ -33,9 +35,8 @@
3335
FlaxT5EncoderModel
3436
)
3537

36-
from maxdiffusion import FlaxAutoencoderKL
38+
from maxdiffusion import FlaxAutoencoderKL, pyconfig, max_logging
3739
from maxdiffusion.models.flux.transformers.transformer_flux_flax import FluxTransformer2DModel
38-
from maxdiffusion import pyconfig
3940
from max_utils import (
4041
device_put_replicated,
4142
get_memory_allocations,
@@ -57,8 +58,8 @@ def unpack(x: Array, height: int, width: int) -> Array:
5758

5859
def vae_decode(latents, vae, state, config):
5960
img = unpack(x=latents, height=config.resolution, width=config.resolution)
60-
img = vae.apply({"params": state.params}, img, deterministic=True, method=vae.decode).sample[0]
61-
breakpoint()
61+
img = img / vae.config.scaling_factor + vae.config.shift_factor
62+
img = vae.apply({"params": state.params}, img, deterministic=True, method=vae.decode).sample
6263
return img
6364

6465
def loop_body(
@@ -107,6 +108,19 @@ def prepare_latent_image_ids(height, width):
107108

108109
return latent_image_ids.astype(jnp.bfloat16)
109110

111+
def time_shift(mu: float, sigma: float, t: Array):
112+
return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
113+
114+
def get_lin_function(
115+
x1: float = 256,
116+
y1: float = 0.5,
117+
x2: float = 4096,
118+
y2: float = 1.15
119+
) -> Callable[[float], float]:
120+
m = (y2 - y1) / (x2 - x1)
121+
b = y1 - m * x1
122+
return lambda x: m * x + b
123+
110124
def run_inference(
111125
states,
112126
transformer,
@@ -120,10 +134,18 @@ def run_inference(
120134
vec,
121135
guidance_vec,
122136
):
137+
123138
timesteps = jnp.linspace(1, 0, config.num_inference_steps + 1)
139+
# shifting the schedule to favor high timesteps for higher signal images
140+
if config.time_shift:
141+
# estimate mu based on linear estimation between two points
142+
lin_function = get_lin_function(y1=config.base_shift, y2=config.max_shift)
143+
mu = lin_function(latents.shape[1])
144+
timesteps = time_shift(mu, 1.0, timesteps).tolist()
124145
c_ts = timesteps[:-1]
125146
p_ts = timesteps[1:]
126147

148+
127149
transformer_state = states["transformer"]
128150
vae_state = states["vae"]
129151

@@ -142,7 +164,6 @@ def run_inference(
142164
with mesh, nn_partitioning.axis_rules(config.logical_axis_rules):
143165
latents, _, _, _ = jax.lax.fori_loop(0, config.num_inference_steps, loop_body_p, (latents, transformer_state, c_ts, p_ts))
144166
image = vae_decode_p(latents)
145-
breakpoint()
146167
return image
147168

148169

@@ -383,6 +404,10 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
383404

384405
timesteps = jnp.asarray([1.0] * global_batch_size, dtype=jnp.bfloat16)
385406
guidance = jnp.asarray([config.guidance_scale] * global_batch_size, dtype=jnp.bfloat16)
407+
408+
# TODO - remove this later and figure out why t5x is returning wrong shape
409+
prompt_embeds = jnp.ones((global_batch_size, 512, 4096))
410+
386411
validate_inputs(
387412
latents,
388413
latent_image_ids,
@@ -393,8 +418,7 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
393418
pooled_prompt_embeds
394419
)
395420

396-
# TODO - remove this later and figure out why t5x is returning wrong shape
397-
prompt_embeds = jnp.ones((global_batch_size, 512, 4096))
421+
398422

399423
# move inputs to device and shard
400424
data_sharding = jax.sharding.NamedSharding(mesh, P(*config.data_sharding))
@@ -420,11 +444,11 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
420444
config=config,
421445
mesh=mesh,
422446
weights_init_fn=weights_init_fn,
423-
#model_params=transformer_params,
424-
model_params=None,
447+
model_params=transformer_params,
448+
#model_params=None,
425449
training=False
426450
)
427-
transformer_state = transformer_state.replace(params=transformer_params)
451+
#transformer_state = transformer_state.replace(params=transformer_params)
428452
get_memory_allocations()
429453

430454
states = {}
@@ -453,37 +477,27 @@ def validate_inputs(latents, latent_image_ids, prompt_embeds, text_ids, timestep
453477
in_shardings=(state_shardings,),
454478
out_shardings=None,
455479
)
456-
457-
img = p_run_inference(states)
458-
459-
460-
461-
462-
# def run_inference(state, transformer):
463-
# img = transformer.apply(
464-
# {"params" : state.params},
465-
# img=latents,
466-
# img_ids=latent_image_ids,
467-
# txt=prompt_embeds,
468-
# txt_ids=text_ids,
469-
# timesteps=timesteps,
470-
# guidance=guidance,
471-
# y=pooled_prompt_embeds
472-
# )
473-
# return img
474-
475-
# p_run_inference = jax.jit(
476-
# functools.partial(
477-
# run_inference,
478-
# transformer=transformer,
479-
# ),
480-
# in_shardings=(transformer_state_shardings,),
481-
# out_shardings=None
482-
# )
483-
484-
img = p_run_inference(transformer_state)
485-
breakpoint()
486-
print("img.shape: ", img.shape)
480+
t0 = time.perf_counter()
481+
p_run_inference(states).block_until_ready()
482+
t1 = time.perf_counter()
483+
max_logging.log(f"Compile time: {t1 - t0:.1f}s.")
484+
485+
t0 = time.perf_counter()
486+
imgs = p_run_inference(states).block_until_ready()
487+
t1 = time.perf_counter()
488+
max_logging.log(f"Inference time: {t1 - t0:.1f}s.")
489+
490+
t0 = time.perf_counter()
491+
imgs = p_run_inference(states).block_until_ready()
492+
imgs = jax.experimental.multihost_utils.process_allgather(imgs, tiled=True)
493+
t1 = time.perf_counter()
494+
max_logging.log(f"Inference time: {t1 - t0:.1f}s.")
495+
imgs = np.array(imgs)
496+
imgs = (imgs * 0.5 + 0.5).clip(0, 1)
497+
imgs = np.transpose(imgs, (0, 2, 3, 1))
498+
imgs = np.uint8(imgs * 255)
499+
for i, image in enumerate(imgs):
500+
Image.fromarray(image).save(f"flux_{i}.png")
487501

488502

489503
def main(argv: Sequence[str]) -> None:

src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,14 +154,17 @@ def __call__(
154154
raise ValueError(
155155
"Didn't get guidance strength for guidance distrilled model."
156156
)
157-
158-
vec = vec + MLPEmbedder(
157+
guidance_in = MLPEmbedder(
159158
hidden_dim=inner_dim,
160159
dtype=self.dtype,
161160
weights_dtype=self.weights_dtype,
162161
precision=self.precision,
163162
name="guidance_in"
164163
)(timestep_embedding(guidance, 256))
164+
else:
165+
guidance_in = Identity(timestep_embedding(guidance, 256))
166+
167+
vec = vec + guidance_in
165168

166169
vec = vec + MLPEmbedder(
167170
hidden_dim=inner_dim,

0 commit comments

Comments
 (0)