Commit 91b6af7

sync pyink version to 23.10.0
1 parent 0299786

19 files changed

Lines changed: 386 additions & 454 deletions
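Note: this commit is a mechanical reformat produced by bumping pyink (Google's fork of the Black formatter). The recurring pattern in the hunks below appears to be the "hugging" style for calls whose sole argument is a list or tuple literal: the opening `([` stays on one line and the items lose one indentation level. A minimal before/after sketch of the pattern, using illustrative names that are not from this repo:

    # before (older pyink)
    result = process(
        [
            item_a,
            item_b,
        ]
    )

    # after (pyink 23.10.0)
    result = process([
        item_a,
        item_b,
    ])

The other recurring change is dropping the stray blank line that used to follow some `def` signatures.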

src/maxdiffusion/__init__.py

Lines changed: 182 additions & 196 deletions
Large diffs are not rendered by default.

src/maxdiffusion/kernels/splash_attention/ring_attention_kernel.py

Lines changed: 0 additions & 1 deletion
@@ -79,7 +79,6 @@ def _ring_attention_forward(
     ring_axis: str,
     rotate_segment_ids: bool = True,
 ) -> tuple[jax.Array, tuple[jax.Array, jax.Array]]:
-
   if q.shape[-1] != k.shape[-1]:
     raise NotImplementedError("Queries and keys must have the same head dimension.")

src/maxdiffusion/kernels/splash_attention/splash_attention_kernel.py

Lines changed: 0 additions & 2 deletions
@@ -941,7 +941,6 @@ def _splash_attention_fwd(
     dkv_mask_sparsity: float,
     max_logit_value: jax.Array | None = None,
 ) -> tuple[tuple[jax.Array], base.SplashResidualsType]:
-
   # TODO: add some higher order AD check that isn't save_residuals based.
   # if save_residuals:
   #   raise NotImplementedError("Higher-order AD not supported.")
@@ -1180,7 +1179,6 @@ def init():
     dv_scratch_ref[...] = jnp.zeros_like(dv_scratch_ref)

   def body(i, _, has_partial_mask=False):
-
     slice_k = pl.ds(i * bkv_compute, bkv_compute)
     q = q_ref[...]  # We keep q potentially transposed, since it's always RHS
     if config.use_base2_exp:

src/maxdiffusion/kernels/splash_attention/splash_attention_kernel_test.py

Lines changed: 1 addition & 8 deletions
@@ -290,14 +290,7 @@ def _generate_inputs(
     is_mqa: bool,
     is_segmented: bool,
     use_sinks: bool = False,
-) -> tuple[
-    jax.Array,
-    jax.Array,
-    jax.Array,
-    jax.Array | None,
-    splash.SegmentIds | None,
-    jax.Array,
-]:
+) -> tuple[jax.Array, jax.Array, jax.Array, jax.Array | None, splash.SegmentIds | None, jax.Array,]:
   seed = data.draw(seed_strategy())
   key = random.key(seed)
   k1, k2, k3, k_sinks, k_do = random.split(key, 5)

src/maxdiffusion/kernels/splash_attention/splash_attention_mask.py

Lines changed: 19 additions & 25 deletions
@@ -278,14 +278,12 @@ def __eq__(self, other: object):
     return self.shape == other.shape and self.offset == other.offset and np.array_equal(self.q_sequence, other.q_sequence)

   def __hash__(self):
-    return hash(
-        (
-            type(self),
-            self.shape,
-            self.offset,
-            self.q_sequence.tobytes() if self.q_sequence is not None else None,
-        )
-    )
+    return hash((
+        type(self),
+        self.shape,
+        self.offset,
+        self.q_sequence.tobytes() if self.q_sequence is not None else None,
+    ))


 class ChunkedCausalMask(_ComputableMask):
@@ -340,14 +338,12 @@ def __eq__(self, other: object):
     )

   def __hash__(self):
-    return hash(
-        (
-            type(self),
-            self.shape,
-            self.chunk_size,
-            self.q_sequence.tobytes() if self.q_sequence is not None else None,
-        )
-    )
+    return hash((
+        type(self),
+        self.shape,
+        self.chunk_size,
+        self.q_sequence.tobytes() if self.q_sequence is not None else None,
+    ))


 class LocalMask(_ComputableMask):
@@ -419,15 +415,13 @@ def __eq__(self, other: object):
     )

   def __hash__(self):
-    return hash(
-        (
-            type(self),
-            self.shape,
-            self.window_size,
-            self.offset,
-            self.q_sequence.tobytes() if self.q_sequence is not None else None,
-        )
-    )
+    return hash((
+        type(self),
+        self.shape,
+        self.window_size,
+        self.offset,
+        self.q_sequence.tobytes() if self.q_sequence is not None else None,
+    ))


 @dataclasses.dataclass(slots=True)
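An aside on the `__hash__` bodies reformatted above (context, not part of this commit's changes): numpy arrays are unhashable, which is why each mask folds `q_sequence` into its hash via `tobytes()`, an immutable, content-based key consistent with the `np.array_equal` comparison in `__eq__`. A minimal standalone sketch:

    import numpy as np

    q_sequence = np.arange(8, dtype=np.int32)

    # hash(q_sequence) would raise TypeError: unhashable type: 'numpy.ndarray';
    # tobytes() yields a hashable bytes object derived from the array contents.
    key = hash(((8, 8), 0, q_sequence.tobytes()))

    # Arrays that compare equal element-wise serialize to the same bytes,
    # so equal masks hash equally, matching __eq__.
    assert np.arange(8, dtype=np.int32).tobytes() == q_sequence.tobytes()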

src/maxdiffusion/kernels/splash_attention/splash_attention_mask_info.py

Lines changed: 4 additions & 6 deletions
@@ -446,12 +446,10 @@ def _process_mask(
   # Partial blocks are deduplicated and stored in unique_chunks to save memory.
   for coords in np.ndindex((q_blocks_count, kv_blocks_count)):
     (q_idx, kv_idx) = coords
-    chunk = mask[
-        (
-            slice(q_idx * q_block_size, (q_idx + 1) * q_block_size),
-            slice(kv_idx * kv_block_size, (kv_idx + 1) * kv_block_size),
-        )
-    ]
+    chunk = mask[(
+        slice(q_idx * q_block_size, (q_idx + 1) * q_block_size),
+        slice(kv_idx * kv_block_size, (kv_idx + 1) * kv_block_size),
+    )]
     if chunk.any():
       if chunk.all():
         state_grid[q_idx, kv_idx] = 2
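For readers of the hunk above: `_process_mask` walks the mask block by block and classifies each one, marking all-ones blocks fully active (state 2) so the kernel can skip masking them, and keeping mixed blocks as deduplicated partial masks. A self-contained sketch of that classification, using hypothetical block sizes and assuming 0/1 as the codes for empty/partial blocks:

    import numpy as np

    q_block_size, kv_block_size = 4, 4
    mask = np.tril(np.ones((8, 8), dtype=bool))  # causal mask over a 2x2 block grid

    q_blocks_count = mask.shape[0] // q_block_size
    kv_blocks_count = mask.shape[1] // kv_block_size
    state_grid = np.zeros((q_blocks_count, kv_blocks_count), dtype=np.int32)

    for q_idx, kv_idx in np.ndindex((q_blocks_count, kv_blocks_count)):
      chunk = mask[(
          slice(q_idx * q_block_size, (q_idx + 1) * q_block_size),
          slice(kv_idx * kv_block_size, (kv_idx + 1) * kv_block_size),
      )]
      if chunk.any():
        # 2 = fully active block, 1 = partial block that needs its own mask
        state_grid[q_idx, kv_idx] = 2 if chunk.all() else 1

    print(state_grid)  # [[1 0]
                       #  [2 1]]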

src/maxdiffusion/kernels/splash_attention/splash_attention_mask_test.py

Lines changed: 52 additions & 59 deletions
@@ -374,39 +374,37 @@ def test_lazy_causal_mask_chunking(self, block_size: tuple[int, int], shape: tup
         block_size,
     )

-  @parameterized.parameters(
-      [
-          ((256, 256), (1024, 1024), (128, None), 0),
-          ((256, 128), (1024, 1024), (128, None), 16),
-          ((128, 256), (1024, 1024), (128, None), 16),
-          ((256, 256), (1024, 1024), (128, 256), 0),
-          ((256, 128), (1024, 1024), (128, 256), 0),
-          ((128, 256), (1024, 1024), (128, 256), 16),
-          ((256, 256), (1024, 1024), (None, 256), 0),
-          ((256, 128), (1024, 1024), (None, 256), 32),
-          ((128, 256), (1024, 1024), (None, 256), 32),
-          #
-          ((256, 256), (1024, 2048), (128, None), 0),
-          ((256, 128), (1024, 2048), (128, None), 16),
-          ((128, 256), (1024, 2048), (128, None), 16),
-          ((256, 256), (1024, 2048), (128, 256), 0),
-          ((256, 128), (1024, 2048), (128, 256), 0),
-          ((128, 256), (1024, 2048), (128, 256), 16),
-          ((256, 256), (1024, 2048), (None, 256), 0),
-          ((256, 128), (1024, 2048), (None, 256), 32),
-          ((128, 256), (1024, 2048), (None, 256), 32),
-          #
-          ((256, 256), (2048, 1024), (128, None), 0),
-          ((256, 128), (2048, 1024), (128, None), 16),
-          ((128, 256), (2048, 1024), (128, None), 16),
-          ((256, 256), (2048, 1024), (128, 256), 0),
-          ((256, 128), (2048, 1024), (128, 256), 0),
-          ((128, 256), (2048, 1024), (128, 256), 16),
-          ((256, 256), (2048, 1024), (None, 256), 0),
-          ((256, 128), (2048, 1024), (None, 256), 32),
-          ((128, 256), (2048, 1024), (None, 256), 32),
-      ]
-  )
+  @parameterized.parameters([
+      ((256, 256), (1024, 1024), (128, None), 0),
+      ((256, 128), (1024, 1024), (128, None), 16),
+      ((128, 256), (1024, 1024), (128, None), 16),
+      ((256, 256), (1024, 1024), (128, 256), 0),
+      ((256, 128), (1024, 1024), (128, 256), 0),
+      ((128, 256), (1024, 1024), (128, 256), 16),
+      ((256, 256), (1024, 1024), (None, 256), 0),
+      ((256, 128), (1024, 1024), (None, 256), 32),
+      ((128, 256), (1024, 1024), (None, 256), 32),
+      #
+      ((256, 256), (1024, 2048), (128, None), 0),
+      ((256, 128), (1024, 2048), (128, None), 16),
+      ((128, 256), (1024, 2048), (128, None), 16),
+      ((256, 256), (1024, 2048), (128, 256), 0),
+      ((256, 128), (1024, 2048), (128, 256), 0),
+      ((128, 256), (1024, 2048), (128, 256), 16),
+      ((256, 256), (1024, 2048), (None, 256), 0),
+      ((256, 128), (1024, 2048), (None, 256), 32),
+      ((128, 256), (1024, 2048), (None, 256), 32),
+      #
+      ((256, 256), (2048, 1024), (128, None), 0),
+      ((256, 128), (2048, 1024), (128, None), 16),
+      ((128, 256), (2048, 1024), (128, None), 16),
+      ((256, 256), (2048, 1024), (128, 256), 0),
+      ((256, 128), (2048, 1024), (128, 256), 0),
+      ((128, 256), (2048, 1024), (128, 256), 16),
+      ((256, 256), (2048, 1024), (None, 256), 0),
+      ((256, 128), (2048, 1024), (None, 256), 32),
+      ((128, 256), (2048, 1024), (None, 256), 32),
+  ])
   def test_lazy_local_mask_chunking(
       self,
       block_size: tuple[int, int],
@@ -1164,17 +1162,15 @@ def test_two_qseq_shards_causal_local_stacked(self):

     expected_num_active_blocks = np.array([10, 10], dtype=np.int32)

-    expected_partial_mask_blocks = np.stack(
-        [
-            np.tri(*block_shape, dtype=np.int8),
-            np.triu(
-                np.tri(*block_shape, window_size, dtype=np.int8),
-                -window_size,
-            ),
-            np.tri(*block_shape, -window_size, dtype=np.int8),
-            np.triu(np.ones(block_shape, dtype=np.int8), window_size),
-        ]
-    )
+    expected_partial_mask_blocks = np.stack([
+        np.tri(*block_shape, dtype=np.int8),
+        np.triu(
+            np.tri(*block_shape, window_size, dtype=np.int8),
+            -window_size,
+        ),
+        np.tri(*block_shape, -window_size, dtype=np.int8),
+        np.triu(np.ones(block_shape, dtype=np.int8), window_size),
+    ])

     expected_mask_info = mask_info_lib.MaskInfo(
         expected_mask_next,
@@ -1345,20 +1341,18 @@ def test_two_shards_local_wide_local_narrow_stacked(self, q_seq_shards, kv_seq_s

     expected_active_rows_dkv = np.concatenate(
         [
-            np.array(
-                [
-                    0,
-                    0,
-                    1,
-                    1,
-                    1,
-                    2,
-                    2,
-                    2,
-                    3,
-                    3,
-                ]
-            ),
+            np.array([
+                0,
+                0,
+                1,
+                1,
+                1,
+                2,
+                2,
+                2,
+                3,
+                3,
+            ]),
             np.array([0, 0, 1, 1, 2, 2, 3, -1, -1, -1]),
         ],
         axis=0,
@@ -1453,7 +1447,6 @@ def test_causal_two_q_shards_two_kv_shards(self, return_dynamic_grid):
           q_sequence=None,
       )
     else:
-
       expected_mask_info_dkv = mask_info_lib.MaskInfo(
           mask_next=np.array(
               [0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
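As a reading aid for the expected blocks above (not part of the diff): `np.triu(np.tri(n, n, w), -w)` keeps exactly the entries with offset j - i in [-w, w], i.e. a local attention band of radius w within one block, while plain `np.tri` gives the causal (lower-triangular) block. A quick numpy sketch checking that identity:

    import numpy as np

    n, w = 6, 2
    band = np.triu(np.tri(n, n, w, dtype=np.int8), -w)

    # band[i, j] == 1 exactly when -w <= j - i <= w
    i, j = np.indices((n, n))
    assert np.array_equal(band, ((j - i >= -w) & (j - i <= w)).astype(np.int8))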

src/maxdiffusion/models/attention_flax.py

Lines changed: 0 additions & 4 deletions
@@ -295,7 +295,6 @@ def _tpu_flash_attention(
       check_rep=False,
   )
   def wrap_flash_attention(query, key, value):
-
     uses_fused_kernel = block_sizes.use_fused_bwd_kernel
     block_q_sizes = (
         block_sizes.block_q,
@@ -1251,7 +1250,6 @@ def setup(self):
     )

   def __call__(self, hidden_states, encoder_hidden_states=None, attention_mask=None, image_rotary_emb=None):
-
     qkv_proj = self.qkv(hidden_states)
     B, L = hidden_states.shape[:2]
     H, D, K = self.heads, qkv_proj.shape[-1] // (self.heads * 3), 3
@@ -1263,7 +1261,6 @@ def __call__(self, hidden_states, encoder_hidden_states=None, attention_mask=Non
     key_proj = self.key_norm(key_proj)

     if encoder_hidden_states is not None:
-
       encoder_qkv_proj = self.encoder_qkv(encoder_hidden_states)
       B, L = encoder_hidden_states.shape[:2]
       H, D, K = self.heads, encoder_qkv_proj.shape[-1] // (self.heads * 3), 3
@@ -1357,7 +1354,6 @@ class FlaxAttention(nn.Module):
   quant: Quant = None

   def setup(self):
-
     if self.attention_kernel == "flash" and self.mesh is None:
       raise ValueError(f"The flash attention kernel requires a value for mesh, but mesh is {self.mesh}")
     inner_dim = self.dim_head * self.heads

src/maxdiffusion/models/flux/transformers/transformer_flux_flax.py

Lines changed: 42 additions & 46 deletions
@@ -202,29 +202,27 @@ def setup(self):
         dtype=self.dtype,
         param_dtype=self.weights_dtype,
     )
-    self.img_mlp = nn.Sequential(
-        [
-            nn.Dense(
-                int(self.dim * self.mlp_ratio),
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-            nn.gelu,
-            nn.Dense(
-                self.dim,
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-        ]
-    )
+    self.img_mlp = nn.Sequential([
+        nn.Dense(
+            int(self.dim * self.mlp_ratio),
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+        nn.gelu,
+        nn.Dense(
+            self.dim,
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+    ])

     self.txt_norm2 = nn.LayerNorm(
         use_bias=False,
@@ -233,29 +231,27 @@ def setup(self):
         dtype=self.dtype,
         param_dtype=self.weights_dtype,
     )
-    self.txt_mlp = nn.Sequential(
-        [
-            nn.Dense(
-                int(self.dim * self.mlp_ratio),
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-            nn.gelu,
-            nn.Dense(
-                self.dim,
-                use_bias=True,
-                kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
-                bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
-                dtype=self.dtype,
-                param_dtype=self.weights_dtype,
-                precision=self.precision,
-            ),
-        ]
-    )
+    self.txt_mlp = nn.Sequential([
+        nn.Dense(
+            int(self.dim * self.mlp_ratio),
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+        nn.gelu,
+        nn.Dense(
+            self.dim,
+            use_bias=True,
+            kernel_init=nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("mlp", "embed")),
+            bias_init=nn.with_logical_partitioning(nn.initializers.zeros, (None,)),
+            dtype=self.dtype,
+            param_dtype=self.weights_dtype,
+            precision=self.precision,
+        ),
+    ])

     # let chunk size default to None
     self._chunk_size = None
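To verify the sync locally, the pinned formatter should be a no-op on this commit; assuming the standard PyPI package and its Black-compatible CLI, `pip install pyink==23.10.0` followed by `pyink --check src/maxdiffusion` (which exits non-zero if anything would still be reformatted) should pass.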
