@@ -52,6 +52,24 @@ class AttentionOp(nn.Module):
   flash_block_sizes: BlockSizes = None
   dtype: DType = jnp.float32
 
+  def setup(self):
+    if self.attention_kernel == "cudnn_flash_te":
+      from transformer_engine.jax.flax.transformer import DotProductAttention  # pytype: disable=import-error
+      self.dpa_layer = DotProductAttention(
+          head_dim=self.dim_head,
+          num_attention_heads=self.heads,
+          num_gqa_groups=self.heads,
+          attn_mask_type="no_mask",  # 'no_mask', 'padding', 'causal', or 'padding_causal'
+          attn_bias_type="NO_BIAS",  # 'no_bias', 'pre_scale_bias' or 'post_scale_bias'
+          # attention_dropout=self.dropout_rate,
+          dropout_rng_name="aqt",
+          dtype=self.dtype,
+          # float32_logits=self.float32_logits,
+          qkv_layout="BSHD_BSHD_BSHD",  # 'BS3HD', 'BSHD_BS2HD' or 'BSHD_BSHD_BSHD'
+          scale_factor=self.scale,
+          transpose_batch_sequence=False,
+      )
+
   def check_attention_inputs(self, query: Array, key: Array, value: Array) -> None:
     """Check attention inputs."""
 
@@ -64,16 +82,22 @@ def check_attention_inputs(self, query: Array, key: Array, value: Array) -> None
   def apply_attention(self, query: Array, key: Array, value: Array):
     """Routes to different attention kernels."""
     self.check_attention_inputs(query, key, value)
-    can_use_flash_attention = (
-        query.shape[1] >= self.flash_min_seq_length
-        and key.shape[1] >= self.flash_min_seq_length
-        and value.shape[1] >= self.flash_min_seq_length
-    )
+
+    if self.attention_kernel == "flash":
+      can_use_flash_attention = (
+          query.shape[1] >= self.flash_min_seq_length
+          and key.shape[1] >= self.flash_min_seq_length
+          and value.shape[1] >= self.flash_min_seq_length
+      )
+    else:
+      can_use_flash_attention = True
 
     if self.attention_kernel == "dot_product" or self.use_memory_efficient_attention or not can_use_flash_attention:
       return self.apply_attention_dot(query, key, value)
     elif self.attention_kernel == "flash":
       return self.tpu_flash_attention(query, key * self.scale, value)
+    elif self.attention_kernel == "cudnn_flash_te":
+      return self.cudnn_flash_attention(query, key, value)
     else:
       raise ValueError(f"Unexpected attention kernel {self.attention_kernel=}.")
 
@@ -132,6 +156,32 @@ def wrap_flash_attention(query, key, value):
 
     return x
 
+  def cudnn_flash_attention(
+      self,
+      query: Array,
+      key: Array,
+      value: Array,
+  ) -> Array:
+    """CUDNN Flash Attention with Transformer Engine.
+    1. Stable API, supports GQA.
+    2. Supports head_dim up to 128; head_dim=256 support will be added soon.
+    """
+    # These imports are only meant to work in a GPU build.
+    # copied from tpu_flash_attention
+    query = self.reshape_data_for_cudnn_flash(query)
+    key = self.reshape_data_for_cudnn_flash(key)
+    value = self.reshape_data_for_cudnn_flash(value)
+
+    cudnn_flash_axis_names = (BATCH, LENGTH, HEAD, D_KV)
+    axis_names = nn.logical_to_mesh_axes(cudnn_flash_axis_names)
+
+    query = nn.with_logical_constraint(query, axis_names)
+    key = nn.with_logical_constraint(key, axis_names)
+    value = nn.with_logical_constraint(value, axis_names)
+
+    out = self.dpa_layer(query, key, value, mask=None)
+    return self.reshape_data_from_cudnn_flash(out)
+
   def apply_attention_dot(self, query: Array, key: Array, value: Array):
     """Apply Attention."""
     if self.split_head_dim:
@@ -209,6 +259,16 @@ def reshape_batch_dim_to_heads(self, tensor):
     tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
     return tensor
 
+  def reshape_data_for_cudnn_flash(self, tensor):
+    # reshapes from [b, s, h * d] to [b, s, h, d] (input format to cuDNN flash format)
+    batch, seq, heads_and_dim_head = tensor.shape
+    tensor = tensor.reshape(batch, seq, self.heads, heads_and_dim_head // self.heads)
+    return tensor
+
+  def reshape_data_from_cudnn_flash(self, tensor):
+    # reshapes from [b, s, h, d] back to [b, s, h * d]
+    return tensor.reshape(tensor.shape[0], tensor.shape[1], -1)
+
   def reshape_data_for_flash(self, tensor):
     # reshapes from [b, s, h * d] to [b, h, s, d] (input format to flash format)
     batch, seq, heads_and_dim_head = tensor.shape
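
For reference, here is a minimal standalone sketch (not part of the diff) of the layout round-trip that the new `reshape_data_for_cudnn_flash` / `reshape_data_from_cudnn_flash` helpers perform: packed `[b, s, h * d]` to the BSHD layout consumed with `qkv_layout="BSHD_BSHD_BSHD"`, and back. The batch, sequence, head, and head-dim sizes are made up for illustration.

```python
import jax.numpy as jnp

# Illustrative sizes only; the module reads heads/dim_head from its own attributes.
batch, seq, heads, head_dim = 2, 1024, 8, 64

x = jnp.zeros((batch, seq, heads * head_dim))  # packed [b, s, h * d] module layout

# Forward reshape: [b, s, h * d] -> [b, s, h, d] (BSHD), as in reshape_data_for_cudnn_flash.
x_bshd = x.reshape(batch, seq, heads, x.shape[-1] // heads)
assert x_bshd.shape == (2, 1024, 8, 64)

# Reverse reshape after attention: [b, s, h, d] -> [b, s, h * d], as in reshape_data_from_cudnn_flash.
x_packed = x_bshd.reshape(x_bshd.shape[0], x_bshd.shape[1], -1)
assert x_packed.shape == (2, 1024, 512)
```

Note this differs from the existing `reshape_data_for_flash`, which produces `[b, h, s, d]` for the TPU kernel; the cuDNN path keeps the sequence axis second.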