@@ -88,14 +88,11 @@ def create_unet_state(self, pipeline, params, checkpoint_item_name, is_training)
8888 config = self .config ,
8989 mesh = self .mesh ,
9090 weights_init_fn = weights_init_fn ,
91- model_params = None ,
91+ model_params = None if self . config . train_new_unet else params . get ( "unet" , None ) ,
9292 checkpoint_manager = self .checkpoint_manager ,
9393 checkpoint_item = checkpoint_item_name ,
9494 training = is_training ,
9595 )
96- if not self .config .train_new_unet :
97- unet_state = unet_state .replace (params = params .get ("unet" , None ))
98- unet_state = jax .device_put (unet_state , state_mesh_shardings )
9996 return unet_state , state_mesh_shardings , learning_rate_scheduler
10097
10198 def create_vae_state (self , pipeline , params , checkpoint_item_name , is_training = False ):
@@ -153,20 +150,18 @@ def create_text_encoder_2_state(self, pipeline, params, checkpoint_item_name, is
153150 input_shape = (self .total_train_batch_size , pipeline .tokenizer .model_max_length ),
154151 )
155152
156- state , state_mesh_shardings = max_utils .setup_initial_state (
153+ # state, state_mesh_shardings =
154+ return max_utils .setup_initial_state (
157155 model = pipeline .text_encoder_2 ,
158156 tx = tx ,
159157 config = self .config ,
160158 mesh = self .mesh ,
161159 weights_init_fn = weights_init_fn ,
162- model_params = None ,
160+ model_params = params . get ( "text_encoder_2" , None ) ,
163161 checkpoint_manager = self .checkpoint_manager ,
164162 checkpoint_item = checkpoint_item_name ,
165163 training = is_training ,
166164 )
167- state = state .replace (params = params .get ("text_encoder_2" , None ))
168- state = jax .device_put (state , state_mesh_shardings )
169- return state , state_mesh_shardings
170165
171166 def restore_data_iterator_state (self , data_iterator ):
172167 if (