41 | 41 | _splash_attention_bwd = splash_kernel._splash_attention_bwd # pylint: disable=protected-access |
42 | 42 |
43 | 43 |
44 | | -def _dynamic_slice_mask_info( |
45 | | - mask_info: MaskInfo, kv_shard_idx: jax.Array, ring_size: int |
46 | | -) -> MaskInfo: |
| 44 | +def _dynamic_slice_mask_info(mask_info: MaskInfo, kv_shard_idx: jax.Array, ring_size: int) -> MaskInfo: |
47 | 45 | """Slices MaskInfo for the current ring step.""" |
48 | 46 |
49 | 47 | def slice_if_exists(arr: jax.Array | None): |
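As background for reviewers: a minimal sketch of what a per-step mask slice can look like, assuming the KV-block axis is the last one and splits evenly across the ring. The helper name, axis choice, and chunking below are illustrative, not this module's API.

```python
import jax
import jax.numpy as jnp

# Hypothetical per-step slicing: pick the block of mask metadata that
# matches the KV shard currently held. dynamic_slice is required because
# kv_shard_idx is a traced value inside the ring loop.
def slice_for_step(arr: jax.Array | None, kv_shard_idx: jax.Array, ring_size: int):
  if arr is None:
    return None
  chunk = arr.shape[-1] // ring_size  # assumes the last axis splits evenly
  return jax.lax.dynamic_slice_in_dim(arr, kv_shard_idx * chunk, chunk, axis=-1)

blocks = slice_for_step(jnp.arange(12).reshape(3, 4), jnp.int32(1), 2)  # columns 2:4
```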
@@ -83,9 +81,7 @@ def _ring_attention_forward( |
83 | 81 | ) -> tuple[jax.Array, tuple[jax.Array, jax.Array]]: |
84 | 82 |
85 | 83 | if q.shape[-1] != k.shape[-1]: |
86 | | - raise NotImplementedError( |
87 | | - "Queries and keys must have the same head dimension." |
88 | | - ) |
| 84 | + raise NotImplementedError("Queries and keys must have the same head dimension.") |
89 | 85 |
90 | 86 | if sinks is not None: |
91 | 87 | raise NotImplementedError("Sinks aren't supportd yet.") |
@@ -124,13 +120,11 @@ def _ring_attention_forward( |
124 | 120 | l_init = jnp.zeros((o_shape[0], o_shape[1]), jnp.float32) |
125 | 121 | m_init = jnp.full_like(l_init, mask_value, dtype=jnp.float32) |
126 | 122 |
127 | | - def body(carry, i: int)-> tuple[tuple[jax.Array, jax.Array, jax.Array, jax.Array, jax.Array, SegmentIds | None], None]: |
| 123 | + def body(carry, i: int) -> tuple[tuple[jax.Array, jax.Array, jax.Array, jax.Array, jax.Array, SegmentIds | None], None]: |
128 | 124 | m_prev, l_prev, o_prev, k_current, v_current, segment_ids_current = carry |
129 | 125 |
130 | 126 | current_kv_shard_idx = (ring_axis_idx - i) % ring_axis_size |
131 | | - local_fwd_mask_info = _dynamic_slice_mask_info( |
132 | | - fwd_mask_info, current_kv_shard_idx, ring_axis_size |
133 | | - ) |
| 127 | + local_fwd_mask_info = _dynamic_slice_mask_info(fwd_mask_info, current_kv_shard_idx, ring_axis_size) |
134 | 128 | k_next = shift(k_current) |
135 | 129 | v_next = shift(v_current) |
136 | 130 |
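For context, a runnable sketch of the ring schedule this loop implements: `shift` is assumed to be a one-hop `ppermute` along the ring axis, and the `(ring_axis_idx - i) % ring_axis_size` arithmetic tracks which global KV shard each device holds at step `i`. Mesh setup and names below are illustrative, not this module's API.

```python
import jax
import jax.numpy as jnp
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

# 1D mesh over all local devices; "ring" plays the role of `ring_axis`.
mesh = Mesh(jax.devices(), axis_names=("ring",))
ring_size = len(jax.devices())

def ring_schedule(kv):
  ring_idx = jax.lax.axis_index("ring")
  # shift(): pass the local KV block one hop around the ring.
  perm = [(j, (j + 1) % ring_size) for j in range(ring_size)]

  def body(kv_current, i):
    # Same modular arithmetic as `current_kv_shard_idx` above: after i
    # hops, device r holds the shard that started on device r - i.
    current_kv_shard_idx = (ring_idx - i) % ring_size
    kv_next = jax.lax.ppermute(kv_current, "ring", perm)
    return kv_next, current_kv_shard_idx

  _, visited = jax.lax.scan(body, kv, jnp.arange(ring_size))
  return visited  # per device: the shard indices it saw, in step order

kv = jnp.arange(ring_size, dtype=jnp.float32)  # one "shard" per device
visited = jax.jit(
    shard_map(ring_schedule, mesh=mesh, in_specs=P("ring"), out_specs=P("ring"))
)(kv)
```

After `ring_size` steps every device has seen every KV shard exactly once, which is why the loop can accumulate full attention statistics locally.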
@@ -225,9 +219,7 @@ def body(carry, i: int): |
225 | 219 | v_next = shift(v_current) |
226 | 220 |
227 | 221 | current_kv_shard_idx = (ring_axis_idx - i) % ring_axis_size |
228 | | - local_dkv_mask_info = _dynamic_slice_mask_info( |
229 | | - dkv_mask_info, current_kv_shard_idx, ring_axis_size |
230 | | - ) |
| 222 | + local_dkv_mask_info = _dynamic_slice_mask_info(dkv_mask_info, current_kv_shard_idx, ring_axis_size) |
231 | 223 | if segment_ids is not None and rotate_segment_ids: |
232 | 224 | kv_segment_ids_next = shift(segment_ids_current.kv) |
233 | 225 | segment_ids_next = SegmentIds(segment_ids.q, kv_segment_ids_next) |
@@ -255,9 +247,7 @@ def body(carry, i: int): |
255 | 247 | fwd_mask_sparsity=fwd_mask_sparsity, |
256 | 248 | dkv_mask_sparsity=dkv_mask_sparsity, |
257 | 249 | ) |
258 | | - _, _, dq_i, dk_i, dv_i, _, dsinks, _ = attn_bwd( |
259 | | - res=residuals_for_chunk, do=do |
260 | | - ) |
| 250 | + _, _, dq_i, dk_i, dv_i, _, dsinks, _ = attn_bwd(res=residuals_for_chunk, do=do) |
261 | 251 | dv_next = shift(dv_accum + dv_i.astype(dv_accum.dtype)) |
262 | 252 | dk_next = shift(dk_accum + dk_i.astype(dk_accum.dtype)) |
263 | 253 | dq_accum = dq_accum + dq_i.astype(dq_accum.dtype) |
@@ -394,7 +384,7 @@ def _ring_attention_custom( |
394 | 384 | dkv_mask_sparsity: float, |
395 | 385 | save_residuals: bool, |
396 | 386 | ring_axis: str, |
397 | | - rotate_segment_ids: bool , |
| 387 | + rotate_segment_ids: bool, |
398 | 388 | ) -> SplashCustomReturnType: |
399 | 389 | """Performs ring attention with a custom VJP. |
400 | 390 |
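The docstring mentions a custom VJP; as background, here is a minimal `jax.custom_vjp` skeleton of the same shape, with a toy attention-like function standing in for the ring loops. The forward saves residuals and the backward consumes them via `res`, just as `attn_bwd(res=..., do=...)` does above.

```python
import jax
import jax.numpy as jnp

@jax.custom_vjp
def toy_attn(q, k):
  return jnp.tanh(q @ k.T)

def toy_attn_fwd(q, k):
  out = jnp.tanh(q @ k.T)
  return out, (q, k, out)      # residuals saved for the backward pass

def toy_attn_bwd(res, do):
  q, k, out = res
  g = do * (1.0 - out * out)   # d/dx tanh(x) = 1 - tanh(x)^2
  return g @ k, g.T @ q        # cotangents for (q, k)

toy_attn.defvjp(toy_attn_fwd, toy_attn_bwd)

# jax.grad now routes through toy_attn_bwd instead of tracing the forward.
dq = jax.grad(lambda q, k: toy_attn(q, k).sum())(
    jnp.ones((4, 8)), jnp.ones((6, 8))
)
```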
@@ -561,7 +551,7 @@ def __init__( |
561 | 551 | fwd_mask_info: MaskInfo, |
562 | 552 | dkv_mask_info: MaskInfo | None, |
563 | 553 | ring_axis: str, |
564 | | - rotate_segment_ids: bool , |
| 554 | + rotate_segment_ids: bool, |
565 | 555 | **kwargs, |
566 | 556 | ): |
567 | 557 | self.fwd_mask_info = fwd_mask_info |
@@ -590,6 +580,7 @@ def manual_sharding_spec(self): |
590 | 580 | """ |
591 | 581 |
592 | 582 | spec = jax.sharding.PartitionSpec(self.ring_axis) |
| 583 | + |
593 | 584 | def _resolve_spec(x): |
594 | 585 | return spec if x is not None else None |
595 | 586 |
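A small illustration of why `_resolve_spec` exists: manual-sharding specs must line up leaf-for-leaf with the pytree being sharded, so optional fields that are `None` need a `None` spec. The tuple below is a stand-in for a MaskInfo-like structure.

```python
import jax
from jax.sharding import PartitionSpec

spec = PartitionSpec("ring")

def _resolve_spec(x):
  return spec if x is not None else None

# Treat None as a leaf so optional fields map to a None spec.
mask_info_like = (1, None, 3)
specs = jax.tree_util.tree_map(
    _resolve_spec, mask_info_like, is_leaf=lambda x: x is None
)
# specs == (PartitionSpec('ring'), None, PartitionSpec('ring'))
```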
@@ -618,11 +609,7 @@ def tree_flatten(self): |
618 | 609 | @classmethod |
619 | 610 | def tree_unflatten(cls, aux_data, children): |
620 | 611 | fwd_mask_info, dkv_mask_info = children |
621 | | - dkv_mask_info = ( |
622 | | - mask_info_lib.MaskInfo(*dkv_mask_info) |
623 | | - if dkv_mask_info is not None |
624 | | - else None |
625 | | - ) |
| 612 | + dkv_mask_info = mask_info_lib.MaskInfo(*dkv_mask_info) if dkv_mask_info is not None else None |
626 | 613 | return cls( |
627 | 614 | mask_info_lib.MaskInfo(*fwd_mask_info), |
628 | 615 | dkv_mask_info, |
@@ -674,9 +661,7 @@ def make_ring_attention( |
674 | 661 | mask = mask_lib.NumpyMask(mask) |
675 | 662 |
676 | 663 | if not isinstance(mask, (mask_lib.NumpyMask, mask_lib.FullMask)): |
677 | | - raise NotImplementedError( |
678 | | - f"Only NumpyMask and FullMask are supported, but got {type(mask)}." |
679 | | - ) |
| 664 | + raise NotImplementedError(f"Only NumpyMask and FullMask are supported, but got {type(mask)}.") |
680 | 665 |
681 | 666 | if config is None: |
682 | 667 | config = SplashConfig.get_default() |
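Finally, since the `tree_flatten`/`tree_unflatten` hunk above is only a fragment, here is a self-contained sketch of the pytree-registration pattern it belongs to. The class name and fields are illustrative; the real class re-wraps children into `MaskInfo`, while here they stay plain tuples.

```python
import jax

# Mask-info tuples travel as children (visible to JAX transformations);
# static config such as the ring axis rides along as aux_data.
@jax.tree_util.register_pytree_node_class
class RingKernelSketch:
  def __init__(self, fwd_mask_info, dkv_mask_info, ring_axis):
    self.fwd_mask_info = fwd_mask_info
    self.dkv_mask_info = dkv_mask_info
    self.ring_axis = ring_axis

  def tree_flatten(self):
    children = (self.fwd_mask_info, self.dkv_mask_info)
    aux_data = (self.ring_axis,)
    return children, aux_data

  @classmethod
  def tree_unflatten(cls, aux_data, children):
    # dkv_mask_info may be None (no separate backward mask), mirroring
    # the None-guard in the diff above.
    fwd_mask_info, dkv_mask_info = children
    return cls(fwd_mask_info, dkv_mask_info, *aux_data)

kernel = RingKernelSketch((1, 2), None, "ring")
leaves, treedef = jax.tree_util.tree_flatten(kernel)
kernel2 = jax.tree_util.tree_unflatten(treedef, leaves)
```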