@@ -322,6 +322,177 @@ def chunk_scanner(chunk_idx, _):
 
   return jnp.concatenate(res, axis=-3)  # fuse the chunked result back
 
+class FlaxFluxAttention(nn.Module):
+  query_dim: int
+  heads: int = 8
+  dim_head: int = 64
+  dropout: float = 0.0
+  use_memory_efficient_attention: bool = False
+  split_head_dim: bool = False
+  attention_kernel: str = "dot_product"
+  flash_min_seq_length: int = 4096
+  flash_block_sizes: BlockSizes = None
+  mesh: jax.sharding.Mesh = None
+  dtype: jnp.dtype = jnp.float32
+  weights_dtype: jnp.dtype = jnp.float32
+  query_axis_names: AxisNames = (BATCH, LENGTH, HEAD)
+  key_axis_names: AxisNames = (BATCH, LENGTH, HEAD)
+  value_axis_names: AxisNames = (BATCH, LENGTH, HEAD)
+  out_axis_names: AxisNames = (BATCH, LENGTH, EMBED)
+  precision: jax.lax.Precision = None
+  qkv_bias: bool = False
+
+  def setup(self):
+    if self.attention_kernel in {"flash", "cudnn_flash_te"} and self.mesh is None:
+      raise ValueError(f"The flash attention kernel requires a value for mesh, but mesh is {self.mesh}")
+    inner_dim = self.dim_head * self.heads
+    scale = self.dim_head**-0.5
+
+    self.attention_op = AttentionOp(
+        mesh=self.mesh,
+        attention_kernel=self.attention_kernel,
+        scale=scale,
+        heads=self.heads,
+        dim_head=self.dim_head,
+        flash_min_seq_length=self.flash_min_seq_length,
+        use_memory_efficient_attention=self.use_memory_efficient_attention,
+        split_head_dim=self.split_head_dim,
+        flash_block_sizes=self.flash_block_sizes,
+        dtype=self.dtype,
+        float32_qk_product=False,
+    )
+
+    kernel_axes = ("embed", "heads")
+    qkv_init_kernel = nn.with_logical_partitioning(nn.initializers.lecun_normal(), kernel_axes)
+
+    self.qkv = nn.Dense(
+        inner_dim * 3,
+        kernel_init=qkv_init_kernel,
+        use_bias=self.qkv_bias,
+        bias_init=nn.with_logical_partitioning(nn.initializers.zeros, ("heads",)),
+        dtype=self.dtype,
+        param_dtype=self.weights_dtype,
+        name="i_qkv",
+        precision=self.precision,
+    )
+
+    self.encoder_qkv = nn.Dense(
+        inner_dim * 3,
+        kernel_init=qkv_init_kernel,
+        use_bias=self.qkv_bias,
+        bias_init=nn.with_logical_partitioning(nn.initializers.zeros, ("heads",)),
+        dtype=self.dtype,
+        param_dtype=self.weights_dtype,
+        name="e_qkv",
+        precision=self.precision,
+    )
+
+    self.proj_attn = nn.Dense(
+        self.query_dim,
+        kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), kernel_axes),
+        use_bias=True,
+        bias_init=nn.with_logical_partitioning(nn.initializers.zeros, ("heads",)),
+        dtype=self.dtype,
+        param_dtype=self.weights_dtype,
+        name="i_proj",
+        precision=self.precision,
+    )
+
+    self.encoder_proj_attn = nn.Dense(
+        self.query_dim,
+        kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), kernel_axes),
+        use_bias=True,
+        bias_init=nn.with_logical_partitioning(nn.initializers.zeros, ("heads",)),
+        dtype=self.dtype,
+        param_dtype=self.weights_dtype,
+        name="e_proj",
+        precision=self.precision,
+    )
+
+    self.query_norm = nn.RMSNorm(
+        dtype=self.dtype,
+        scale_init=nn.with_logical_partitioning(nn.initializers.ones, ("heads",)),
+        param_dtype=self.weights_dtype,
+    )
+    self.key_norm = nn.RMSNorm(
+        dtype=self.dtype,
+        scale_init=nn.with_logical_partitioning(nn.initializers.ones, ("heads",)),
+        param_dtype=self.weights_dtype,
+    )
+
+    self.encoder_query_norm = nn.RMSNorm(
+        dtype=self.dtype,
+        scale_init=nn.with_logical_partitioning(nn.initializers.ones, ("heads",)),
+        param_dtype=self.weights_dtype,
+    )
+    self.encoder_key_norm = nn.RMSNorm(
+        dtype=self.dtype,
+        scale_init=nn.with_logical_partitioning(nn.initializers.ones, ("heads",)),
+        param_dtype=self.weights_dtype,
+    )
+
+  def apply_rope(self, xq: Array, xk: Array, freqs_cis: Array) -> tuple[Array, Array]:
+    xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2)
+
+    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+
+    return xq_out.reshape(*xq.shape).astype(xq.dtype), xk_out.reshape(*xk.shape).astype(xk.dtype)
+
+  def __call__(self, hidden_states, encoder_hidden_states=None, attention_mask=None, image_rotary_emb=None):
+
+    qkv_proj = self.qkv(hidden_states)
+    B, L = hidden_states.shape[:2]
+    H, D, K = self.heads, qkv_proj.shape[-1] // (self.heads * 3), 3
+    qkv_proj = qkv_proj.reshape(B, L, K, H, D).transpose(2, 0, 3, 1, 4)
+    query_proj, key_proj, value_proj = qkv_proj
+
+    query_proj = self.query_norm(query_proj)
+
+    key_proj = self.key_norm(key_proj)
+
+    if encoder_hidden_states is not None:
+
+      encoder_qkv_proj = self.encoder_qkv(encoder_hidden_states)
+      B, L = encoder_hidden_states.shape[:2]
+      H, D, K = self.heads, encoder_qkv_proj.shape[-1] // (self.heads * 3), 3
+      encoder_qkv_proj = encoder_qkv_proj.reshape(B, L, K, H, D).transpose(2, 0, 3, 1, 4)
+      encoder_query_proj, encoder_key_proj, encoder_value_proj = encoder_qkv_proj
+
+      encoder_query_proj = self.encoder_query_norm(encoder_query_proj)
+
+      encoder_key_proj = self.encoder_key_norm(encoder_key_proj)
+
+      query_proj = jnp.concatenate((encoder_query_proj, query_proj), axis=2)
+      key_proj = jnp.concatenate((encoder_key_proj, key_proj), axis=2)
+      value_proj = jnp.concatenate((encoder_value_proj, value_proj), axis=2)
+
+    query_proj = nn.with_logical_constraint(query_proj, self.query_axis_names)
+    key_proj = nn.with_logical_constraint(key_proj, self.key_axis_names)
+    value_proj = nn.with_logical_constraint(value_proj, self.value_axis_names)
+
+    image_rotary_emb = rearrange(image_rotary_emb, "n d (i j) -> n d i j", i=2, j=2)
+    query_proj, key_proj = self.apply_rope(query_proj, key_proj, image_rotary_emb)
+
+    query_proj = query_proj.transpose(0, 2, 1, 3).reshape(query_proj.shape[0], query_proj.shape[2], -1)
+    key_proj = key_proj.transpose(0, 2, 1, 3).reshape(key_proj.shape[0], key_proj.shape[2], -1)
+    value_proj = value_proj.transpose(0, 2, 1, 3).reshape(value_proj.shape[0], value_proj.shape[2], -1)
+
+    attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
+    context_attn_output = None
+
+    if encoder_hidden_states is not None:
+      context_attn_output, attn_output = (
+          attn_output[:, : encoder_hidden_states.shape[1]],
+          attn_output[:, encoder_hidden_states.shape[1] :],
+      )
+
+      attn_output = self.proj_attn(attn_output)
+
+      context_attn_output = self.encoder_proj_attn(context_attn_output)
+
+    return attn_output, context_attn_output
 
 class FlaxFluxAttention(nn.Module):
   query_dim: int
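
For reference, a minimal standalone sketch of the rotary-embedding convention that apply_rope assumes (the helper name make_rope_freqs and the concrete shapes are illustrative, not part of this change). After the rearrange in __call__, image_rotary_emb carries one 2x2 rotation matrix per position and per feature pair, and apply_rope rotates the (even, odd) channel pairs of each query and key head with it:

import jax.numpy as jnp

def make_rope_freqs(seq_len: int, head_dim: int, theta: float = 10000.0) -> jnp.ndarray:
  # freqs_cis[n, d] is the 2x2 rotation matrix [[cos, -sin], [sin, cos]] for position n
  # and feature pair d, i.e. the "n d i j" layout produced by the rearrange in __call__.
  freqs = 1.0 / (theta ** (jnp.arange(0, head_dim, 2) / head_dim))   # (head_dim // 2,)
  angles = jnp.outer(jnp.arange(seq_len), freqs)                     # (seq_len, head_dim // 2)
  cos, sin = jnp.cos(angles), jnp.sin(angles)
  row0 = jnp.stack([cos, -sin], axis=-1)
  row1 = jnp.stack([sin, cos], axis=-1)
  return jnp.stack([row0, row1], axis=-2)                            # (seq_len, head_dim // 2, 2, 2)

batch, heads, seq_len, head_dim = 1, 8, 16, 64
q = jnp.ones((batch, heads, seq_len, head_dim))
freqs_cis = make_rope_freqs(seq_len, head_dim)

# Same arithmetic as apply_rope: view the last axis as (head_dim // 2, 1, 2) pairs,
# rotate every pair, then flatten back to (batch, heads, seq_len, head_dim).
q_pairs = q.reshape(*q.shape[:-1], -1, 1, 2)
q_rot = freqs_cis[..., 0] * q_pairs[..., 0] + freqs_cis[..., 1] * q_pairs[..., 1]
q_rot = q_rot.reshape(*q.shape)
assert q_rot.shape == q.shape

The fused QKV projection in __call__ is split into per-head query, key, and value tensors and the heads are merged back into the feature axis before apply_attention; a quick shape check of that reshape/transpose sequence (dimensions chosen arbitrarily):

B, L, H, D = 2, 16, 8, 64
fused = jnp.ones((B, L, 3 * H * D))                              # output of the fused qkv Dense
q, k, v = fused.reshape(B, L, 3, H, D).transpose(2, 0, 3, 1, 4)  # each (B, H, L, D)
q_merged = q.transpose(0, 2, 1, 3).reshape(B, L, -1)             # heads folded back before apply_attention
assert q.shape == (B, H, L, D) and q_merged.shape == (B, L, H * D)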