Commit 195b451

Fuse normalization in kernel

Signed-off-by: Kunjan Patel <kunjan@ucla.edu>

1 parent 6fd09fe

3 files changed
Lines changed: 2272 additions & 19 deletions
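For context: the "normalization" being fused is the online-softmax merge that the ring loop previously applied on the host after every kernel call (the alpha/beta rescaling removed in ring_attention_kernel.py below). Seeding the kernel's accumulators with the previous step's (m, l, o) computes the same result inside the kernel. Below is a minimal standalone JAX sketch of that merge and a numerical check against a plain softmax; partial_attention and merge are hypothetical helper names, not code from this repository:

import jax
import jax.numpy as jnp

def partial_attention(q, k, v):
  # Unnormalized attention over one key/value block: running max (m),
  # sum of exponentials (l), and exp-weighted values (o).
  s = q @ k.T
  m = s.max(axis=-1)
  p = jnp.exp(s - m[:, None])
  return m, p.sum(axis=-1), p @ v

def merge(m_prev, l_prev, o_prev, m_curr, l_curr, o_curr):
  # The host-side online-softmax merge this commit moves into the kernel.
  m_next = jnp.maximum(m_prev, m_curr)
  alpha = jnp.exp(m_prev - m_next)
  beta = jnp.exp(m_curr - m_next)
  l_next = alpha * l_prev + beta * l_curr
  o_next = alpha[..., None] * o_prev + beta[..., None] * o_curr
  return m_next, l_next, o_next

kq, kk, kv = jax.random.split(jax.random.PRNGKey(0), 3)
q = jax.random.normal(kq, (4, 8))
k = jax.random.normal(kk, (16, 8))
v = jax.random.normal(kv, (16, 16))

ref = jax.nn.softmax(q @ k.T, axis=-1) @ v          # softmax over all keys at once
m1, l1, o1 = partial_attention(q, k[:8], v[:8])     # first key/value block
m2, l2, o2 = partial_attention(q, k[8:], v[8:])     # second key/value block
m, l, o = merge(m1, l1, o1, m2, l2, o2)
print(jnp.allclose(o / l[:, None], ref, atol=1e-5))  # True

This merge is what each ring step used to do on the host; folding the previous accumulators in as m_init/l_init/o_init lets the kernel perform the same accumulation internally.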

src/maxdiffusion/kernels/splash_attention/ring_attention_kernel.py

Lines changed: 32 additions & 13 deletions
@@ -119,7 +119,7 @@ def _ring_attention_forward(
   # Initial accumulator values
   o_shape = q.shape
   o_init = jnp.zeros(o_shape, dtype=jnp.float32)
-  l_init = jnp.zeros((o_shape[0], o_shape[1]), jnp.float32)
+  l_init = jnp.zeros((o_shape[0], o_shape[1], splash_kernel.NUM_LANES), jnp.float32)
   m_init = jnp.full_like(l_init, mask_value, dtype=jnp.float32)

   def body(carry, i: int) -> tuple[tuple[jax.Array, jax.Array, jax.Array, jax.Array, jax.Array, SegmentIds | None], None]:
@@ -143,15 +143,21 @@ def body(carry, i: int) -> tuple[tuple[jax.Array, jax.Array, jax.Array, jax.Array, jax.Array, SegmentIds | None], None]:
         v_current,
         segment_ids=segment_ids_current,
         sinks=sinks,
+        m_init=m_prev,
+        l_init=l_prev,
+        o_init=o_prev
     )
-    m_curr = stats["max_logits"].astype(jnp.float32)
-    l_curr = stats["l_linear"].astype(jnp.float32)
-    o_curr = out_curr.astype(jnp.float32)
-    m_next = jnp.maximum(m_prev, m_curr)
-    alpha = exp_fn(m_prev - m_next)
-    beta = exp_fn(m_curr - m_next)
-    l_next = alpha * l_prev + beta * l_curr
-    o_next = alpha[..., None] * o_prev + beta[..., None] * o_curr
+    m_next = stats["max_logits"].astype(jnp.float32)
+    l_next = stats["l_linear"].astype(jnp.float32)
+    o_next = out_curr.astype(jnp.float32)
+    # m_curr = stats["max_logits"].astype(jnp.float32)
+    # l_curr = stats["l_linear"].astype(jnp.float32)
+    # o_curr = out_curr.astype(jnp.float32)
+    # m_next = jnp.maximum(m_prev, m_curr)
+    # alpha = exp_fn(m_prev - m_next)
+    # beta = exp_fn(m_curr - m_next)
+    # l_next = alpha * l_prev + beta * l_curr
+    # o_next = alpha[..., None] * o_prev + beta[..., None] * o_curr
     return (m_next, l_next, o_next, k_next, v_next, segment_ids_next), None

   # Use lax.scan to get the final carry AND the collected sequence of (k,v)
@@ -165,12 +171,25 @@ def body(carry, i: int) -> tuple[tuple[jax.Array, jax.Array, jax.Array, jax.Array, jax.Array, SegmentIds | None], None]:
       unroll=True,
   ) # type: ignore[arg-type]
   # Final normalization
-  assert l_final.dtype == jnp.float32
-  l_inv = jnp.where(l_final == 0.0, 0.0, 1.0 / l_final)
+  # assert l_final.dtype == jnp.float32
+  # l_inv = jnp.where(l_final == 0.0, 0.0, 1.0 / l_final)
+  # out = (o_final * l_inv[..., None]).astype(q.dtype)
+  # # Final logsumexp for residuals
+  # lse = log_fn(l_final) + m_final
+  # lse = jnp.where(l_final == 0.0, mask_value, lse)
+  # Final normalization (Slice off NUM_LANES down to 2D)
+  l_final_2d = l_final[..., 0]
+  m_final_2d = m_final[..., 0]
+
+  assert l_final_2d.dtype == jnp.float32
+  l_inv = jnp.where(l_final_2d == 0.0, 0.0, 1.0 / l_final_2d)
   out = (o_final * l_inv[..., None]).astype(q.dtype)
+
   # Final logsumexp for residuals
-  lse = log_fn(l_final) + m_final
-  lse = jnp.where(l_final == 0.0, mask_value, lse)
+  lse = log_fn(l_final_2d) + m_final_2d
+  lse = jnp.where(l_final_2d == 0.0, mask_value, lse)
+
+

   return out, (lse, m_final)
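A note on the finalization above: after this change, l_final and m_final carry a trailing splash_kernel.NUM_LANES axis that (by assumption here) holds the same value in every lane, so lane 0 is sliced off before normalizing. A minimal sketch of that final step, with the lane width and mask_value default hedged as assumptions:

import jax.numpy as jnp

NUM_LANES = 128                                         # assumed TPU lane width
mask_value = -0.7 * float(jnp.finfo(jnp.float32).max)   # assumed default mask value

def finalize(o_final, l_final, m_final, out_dtype=jnp.bfloat16):
  # o_final: [heads, q_len, head_dim] fp32 unnormalized output.
  # l_final, m_final: [heads, q_len, NUM_LANES] fp32, replicated across lanes.
  l_2d = l_final[..., 0]
  m_2d = m_final[..., 0]
  # Rows with l == 0 were fully masked: avoid dividing by zero and
  # flag them with mask_value in the returned logsumexp.
  l_inv = jnp.where(l_2d == 0.0, 0.0, 1.0 / l_2d)
  out = (o_final * l_inv[..., None]).astype(out_dtype)
  lse = jnp.where(l_2d == 0.0, mask_value, jnp.log(l_2d) + m_2d)
  return out, lse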

src/maxdiffusion/kernels/splash_attention/splash_attention_kernel.py

Lines changed: 46 additions & 6 deletions
@@ -297,6 +297,9 @@ def flash_attention_kernel(
     mask_ref,
     q_sequence_ref,
     max_logit_value_ref,
+    m_init_ref,
+    l_init_ref,
+    o_init_ref,
     # Outputs
     o_ref,
     logsumexp_ref,
@@ -348,15 +351,24 @@ def flash_attention_kernel(

   @pl.when(should_initialize)
   def init():
-    o_scratch_ref[...] = jnp.zeros_like(o_scratch_ref)
-
+
+    if o_init_ref is not None:
+      o_scratch_ref[...] = o_init_ref[...]
+    else:
+      o_scratch_ref[...] = jnp.zeros_like(o_scratch_ref)
     sink = None
     if sinks_ref is not None:
       sink = sinks_ref[0, h].astype(m_scratch_ref.dtype)

     if sinks_ref is None and max_logit_estimate is None:
-      m_scratch_ref[...] = jnp.full_like(m_scratch_ref, mask_value)
-      l_scratch_ref[...] = jnp.zeros_like(l_scratch_ref)
+      # m_scratch_ref[...] = jnp.full_like(m_scratch_ref, mask_value)
+      # l_scratch_ref[...] = jnp.zeros_like(l_scratch_ref)
+      if m_init_ref is not None and l_init_ref is not None:
+        m_scratch_ref[...] = m_init_ref[...]
+        l_scratch_ref[...] = l_init_ref[...]
+      else:
+        m_scratch_ref[...] = jnp.full_like(m_scratch_ref, mask_value)
+        l_scratch_ref[...] = jnp.zeros_like(l_scratch_ref)
     elif sinks_ref is None and max_logit_estimate is not None:
       m_scratch_ref[...] = jnp.full_like(m_scratch_ref, max_logit_estimate)
       l_scratch_ref[...] = jnp.zeros_like(l_scratch_ref)
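The @pl.when(should_initialize) block above is the standard Pallas idiom for running code only on selected grid steps (here, seeding the scratch accumulators either from the new *_init refs or from the usual zeros/mask_value). A minimal standalone sketch of that idiom, unrelated to this kernel and using interpret mode so it runs without a TPU:

import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl

def kernel(x_ref, o_ref):
  i = pl.program_id(0)

  @pl.when(i == 0)          # body runs only on the first grid step
  def _():
    o_ref[...] = jnp.zeros_like(o_ref)

  @pl.when(i != 0)          # body runs on every other step
  def _():
    o_ref[...] = x_ref[...] * 2.0

out = pl.pallas_call(
    kernel,
    grid=(4,),
    in_specs=[pl.BlockSpec((1, 128), lambda i: (i, 0))],
    out_specs=pl.BlockSpec((1, 128), lambda i: (i, 0)),
    out_shape=jax.ShapeDtypeStruct((4, 128), jnp.float32),
    interpret=True,
)(jnp.ones((4, 128), jnp.float32))

print(out[:, 0])  # [0. 2. 2. 2.]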
@@ -680,6 +692,7 @@ def mask_index_map(h, grid_idx, rows_ref, cols_ref, mask_next_ref=None, *_):
   else:
     in_specs.append(None)

+  in_specs += [None, None, None]
   out_shapes = [
       jax.ShapeDtypeStruct((num_q_heads, q_seq_len, head_dim_v), q.dtype),
   ]
@@ -815,6 +828,9 @@ def _fwd_cost_estimate(
       mask_info.partial_mask_blocks,
       q_sequence,
       max_logit_value,
+      None,  # m_init
+      None,  # l_init
+      None,  # o_init
   )
   out, logsumexp, l_linear, max_logits = all_out

@@ -872,6 +888,9 @@ def _splash_attention_forward_ring_raw(
     mask_function: MaskFunctionType | None,
     fwd_mask_sparsity: float,
     max_logit_value: jax.Array | None = None,
+    m_init: jax.Array = None,
+    l_init: jax.Array = None,
+    o_init: jax.Array = None,
 ) -> tuple[jax.Array, dict[str, jax.Array]]:
   """Ring-specific forward path that returns pre-reciprocal fp32 accumulators.

@@ -1039,6 +1058,19 @@ def mask_index_map(h, grid_idx, rows_ref, cols_ref, mask_next_ref=None, *_):
     in_specs.append(None)

   logsumexp_index_map = unravel(lambda h, i, j, *_: (h, i, 0))
+  init = m_init is not None and l_init is not None and o_init is not None
+  if init:
+    m_index_map = unravel(lambda h, i, j: (h, i, 0))
+    l_index_map = unravel(lambda h, i, j: (h, i, 0))
+    out_init_index_map = out_index_map
+    in_specs += [
+        pl.BlockSpec((None, bq, NUM_LANES), m_index_map),
+        pl.BlockSpec((None, bq, NUM_LANES), l_index_map),
+        pl.BlockSpec((None, bq, head_dim_v), out_init_index_map),
+    ]
+  else:
+    in_specs += [None, None, None]
+
   out_shapes = [
       jax.ShapeDtypeStruct((num_q_heads, q_seq_len, head_dim_v), jnp.float32),
       None,
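The new in_specs entries above follow the usual Pallas convention: a None in the block shape drops the head axis from the kernel's view of the operand, and the index map picks which (head, q-block, lane-block) tile each grid step sees (the real code additionally wraps its index maps with unravel for its flattened grid). A minimal standalone sketch of that convention, with hypothetical tile sizes and names, in interpret mode so it runs without a TPU:

import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl

NUM_LANES = 128   # assumed lane width
bq = 256          # assumed query block size

def copy_kernel(m_init_ref, o_ref):
  # The ref is a (bq, NUM_LANES) tile; the head axis was dropped by the
  # None entry in the block shape.
  o_ref[...] = m_init_ref[...]

def tiled_copy(m_init):
  num_heads, q_seq_len, _ = m_init.shape
  spec = pl.BlockSpec((None, bq, NUM_LANES), lambda h, i: (h, i, 0))
  return pl.pallas_call(
      copy_kernel,
      grid=(num_heads, q_seq_len // bq),
      in_specs=[spec],
      out_specs=spec,
      out_shape=jax.ShapeDtypeStruct(m_init.shape, m_init.dtype),
      interpret=True,
  )(m_init)

x = jnp.arange(2 * 512 * NUM_LANES, dtype=jnp.float32).reshape(2, 512, NUM_LANES)
print(jnp.array_equal(tiled_copy(x), x))  # True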
@@ -1143,6 +1175,9 @@ def _fwd_cost_estimate(
       mask_info.partial_mask_blocks,
       q_sequence,
       max_logit_value,
+      m_init,
+      l_init,
+      o_init,
   )
   out_linear, _, l_linear, max_logits = all_out

@@ -1151,11 +1186,16 @@ def init_if_empty(x: jax.Array, value: float) -> jax.Array:
       return x
     return jnp.where(is_empty_attention_block, value, x)

+  # out_linear = init_if_empty(out_linear, 0.0)
+  # assert l_linear is not None
+  # assert max_logits is not None
   out_linear = init_if_empty(out_linear, 0.0)
   assert l_linear is not None
   assert max_logits is not None
-  l_linear = init_if_empty(l_linear[..., 0], 0.0)
-  max_logits = init_if_empty(max_logits[..., 0], mask_value)
+  l_linear = init_if_empty(l_linear, 0.0)
+  max_logits = init_if_empty(max_logits, mask_value)
+  # l_linear = init_if_empty(l_linear[..., 0], 0.0)
+  # max_logits = init_if_empty(max_logits[..., 0], mask_value)

   stats = {"l_linear": l_linear, "max_logits": max_logits}
   stats = jax.tree.map(jax.lax.stop_gradient, stats)
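Side note on the last context line above: wrapping the stats pytree in jax.lax.stop_gradient presumably lets the ring loop consume l_linear and max_logits as plain forward-pass statistics without adding extra gradient paths. A tiny self-contained illustration of that pattern (toy function, not from this repo):

import jax

def f(x):
  stats = {"l_linear": x * 2.0, "max_logits": x + 1.0}
  stats = jax.tree.map(jax.lax.stop_gradient, stats)  # values pass through, gradients do not
  return x ** 2 + stats["l_linear"]

print(jax.grad(f)(3.0))  # 6.0 (only the x**2 term is differentiated), not 8.0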
