-# Copyright 2025 Google LLC
+# Copyright 2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

 import jax
 import jax.numpy as jnp
-from MaxText.kernels import splash_attention_kernel
+from maxtext.kernels.attention import splash_attention_kernel

 SegmentIds = splash_attention_kernel.SegmentIds

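`SegmentIds` pairs a per-token segment id for the queries with one for the keys/values; the kernel only lets tokens attend within a matching segment. A minimal sketch of constructing one for packed sequences (toy sizes, not from this commit):

```python
# Two sequences of lengths 3 and 5 packed into one row of length 8.
ids = jnp.asarray([[0, 0, 0, 1, 1, 1, 1, 1]])  # (batch, seq_len)
segment_ids = SegmentIds(q=ids, kv=ids)  # self-attention: queries and kv share ids
```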
@@ -77,38 +77,34 @@ def flash_attention_block_masked(
   v_head_dim_size = v.shape[-1]
   data_type = q.dtype
   q_groups = num_q_heads // num_kv_heads
-  q = q.reshape((
-      batch_size,
-      num_kv_heads,
-      q_groups,
-      q_seq_len,
-      qk_head_dim_size,
-  ))
+  q = q.reshape(
+      (
+          batch_size,
+          num_kv_heads,
+          q_groups,
+          q_seq_len,
+          qk_head_dim_size,
+      )
+  )

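The reshape groups the query heads by the key/value head they share, so each kv block can be contracted against all of its query heads at once. A toy shape check (hypothetical sizes, reusing the `jnp` import above):

```python
batch_size, num_q_heads, num_kv_heads = 2, 8, 2
q_seq_len, qk_head_dim_size = 128, 64
q = jnp.zeros((batch_size, num_q_heads, q_seq_len, qk_head_dim_size))
q_groups = num_q_heads // num_kv_heads  # 4 query heads per kv head
q = q.reshape((batch_size, num_kv_heads, q_groups, q_seq_len, qk_head_dim_size))
assert q.shape == (2, 2, 4, 128, 64)
```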
   # Calculate the number of key/value and query blocks.
   num_kv_blocks = kv_seq_len // block_kv
   num_q_blocks = q_seq_len // block_q

   # Before applying the segment mask, broadcast the mask along the batch
   # dimension, since the same logic applies to all batches.
-  mask_full = jnp.broadcast_to(
-      mask[None, :, :], (batch_size, q_seq_len, kv_seq_len)
-  )
+  mask_full = jnp.broadcast_to(mask[None, :, :], (batch_size, q_seq_len, kv_seq_len))

   if segment_ids is not None:
     segment_ids_q = segment_ids.q[:, :, None]
     segment_ids_kv = segment_ids.kv[:, None, :]
     mask_full = jnp.logical_and(mask_full, segment_ids_q == segment_ids_kv)
-  mask_blocked = jax.jit(mask_blocker, static_argnums=[1, 2])(
-      mask_full, block_q, block_kv
-  )
+  mask_blocked = jax.jit(mask_blocker, static_argnums=[1, 2])(mask_full, block_q, block_kv)

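`block_q` and `block_kv` are marked static because `mask_blocker` uses them to build reshape shapes, which must be concrete at trace time; `jit` then specializes per distinct block size. A small sketch (toy sizes, not from this commit):

```python
blocker = jax.jit(mask_blocker, static_argnums=[1, 2])
mask = jnp.ones((1, 8, 8), dtype=jnp.bool_)
counts = blocker(mask, 4, 4)  # traced and compiled for block sizes (4, 4)
counts = blocker(mask, 2, 2)  # new block sizes trigger a re-trace
```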
   # Initialize `l` (logsumexp) and `m` (max_logits) for the online softmax.
   # `l` is initialized to 0 since no blocks have been processed yet and the sum
   # is 0.
-  l = jnp.zeros(
-      (batch_size, num_kv_heads, q_groups, q_seq_len), dtype=data_type
-  )
+  l = jnp.zeros((batch_size, num_kv_heads, q_groups, q_seq_len), dtype=data_type)
   # `m` is initialized to the mask_value so that the first block's maximum logit
   # correctly becomes the running maximum.
   m = jnp.full(
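For reference, the bookkeeping that `l` and `m` implement is the standard streaming ("online") softmax: when a new block raises the running maximum, the previously accumulated sums are rescaled before the new terms are added. A toy sketch checked against a direct softmax (not part of this commit; the kernel seeds `m` with `mask_value` rather than `-inf`, but the identity is the same):

```python
x = jnp.asarray([1.0, 3.0, 2.0, 0.5])
m, l, acc = -jnp.inf, 0.0, 0.0
for block in (x[:2], x[2:]):  # process two "kv blocks" in sequence
  m_new = jnp.maximum(m, block.max())
  correction = jnp.exp(m - m_new)  # rescales sums accumulated under the old max
  p = jnp.exp(block - m_new)
  acc = acc * correction + (p * block).sum()  # running weighted numerator
  l = l * correction + p.sum()  # running softmax denominator
  m = m_new
assert jnp.allclose(acc / l, (jax.nn.softmax(x) * x).sum())
```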
@@ -144,15 +140,9 @@ def inner_loop_body(i, carried_inner):
       # Calculates the attention computation (Q@K.T)@V with online softmax for
       # the current query and key/value blocks.
       def compute_attention_block(output, l, m):
-        output_i_slice = jax.lax.dynamic_slice_in_dim(
-            output, i * block_q, block_q, axis=-2
-        )
-        l_i_slice = jax.lax.dynamic_slice_in_dim(
-            l, i * block_q, block_q, axis=-1
-        )
-        m_i_slice = jax.lax.dynamic_slice_in_dim(
-            m, i * block_q, block_q, axis=-1
-        )
+        output_i_slice = jax.lax.dynamic_slice_in_dim(output, i * block_q, block_q, axis=-2)
+        l_i_slice = jax.lax.dynamic_slice_in_dim(l, i * block_q, block_q, axis=-1)
+        m_i_slice = jax.lax.dynamic_slice_in_dim(m, i * block_q, block_q, axis=-1)
         s_i_j = jnp.einsum(
             "bxhqc,bxkc->bxhqk",
             q_slice,
@@ -183,25 +173,19 @@ def compute_attention_block(output, l, m):
         l_i_new = m_i_difference * l_i_slice + m_i_j_difference * l_i_j

         divider = l_i_new[..., None]
-        numerator = l_i_slice[..., None] * m_i_difference[
+        numerator = l_i_slice[..., None] * m_i_difference[..., None] * output_i_slice + m_i_j_difference[
             ..., None
-        ] * output_i_slice + m_i_j_difference[..., None] * jnp.einsum(
+        ] * jnp.einsum(
             "bxhqk,bxkc->bxhqc",
             p_i_j,
             v_j_slice,
             preferred_element_type=data_type,
         )

         output_i_slice_new = numerator / divider
-        output = jax.lax.dynamic_update_index_in_dim(
-            output, output_i_slice_new, i * block_q, axis=-2
-        )
-        l = jax.lax.dynamic_update_index_in_dim(
-            l, l_i_new, i * block_q, axis=-1
-        )
-        m = jax.lax.dynamic_update_index_in_dim(
-            m, m_i_new, i * block_q, axis=-1
-        )
+        output = jax.lax.dynamic_update_index_in_dim(output, output_i_slice_new, i * block_q, axis=-2)
+        l = jax.lax.dynamic_update_index_in_dim(l, l_i_new, i * block_q, axis=-1)
+        m = jax.lax.dynamic_update_index_in_dim(m, m_i_new, i * block_q, axis=-1)
         return output, l, m

       def identity(output, l, m):
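The loop carries are functional values, so each q-block is read out with `dynamic_slice_in_dim`, rescaled, and written back at the same offset. A standalone sketch of that read-modify-write pattern on toy shapes (names here are illustrative only):

```python
block_q, i = 4, 1
out = jnp.zeros((2, 16, 8))  # (batch, q_seq_len, head_dim)
blk = jax.lax.dynamic_slice_in_dim(out, i * block_q, block_q, axis=-2)
blk = blk + 1.0  # stand-in for the real rescale-and-accumulate update
out = jax.lax.dynamic_update_index_in_dim(out, blk, i * block_q, axis=-2)
assert bool((out[:, 4:8] == 1.0).all()) and bool((out[:, :4] == 0.0).all())
```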
@@ -210,9 +194,7 @@ def identity(output, l, m):
         return output, l, m

       batch_size = mask_blocked.shape[0]
-      mask_i_j_slice = jax.lax.dynamic_slice(
-          mask_blocked, (0, i, j), (batch_size, 1, 1)
-      )
+      mask_i_j_slice = jax.lax.dynamic_slice(mask_blocked, (0, i, j), (batch_size, 1, 1))
       # The compute_attention_block should be executed if at least one element
       # in the slice is non-zero, meaning at least one batch requires work for
       # this block.
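The skip itself sits in lines elided from this diff; the shape of the pattern the comment describes is a `jax.lax.cond` whose two branches must return identically structured carries, which is why the no-op `identity` exists. A hedged sketch (the exact call in the file may differ):

```python
should_compute = jnp.any(mask_i_j_slice > 0)
output, l, m = jax.lax.cond(
    should_compute,
    compute_attention_block,  # some batch has unmasked work in this block
    identity,                 # fully masked block: pass the carries through
    output, l, m,
)
```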
@@ -227,15 +209,11 @@ def identity(output, l, m):

       return output, l, m

-    output, l, m = jax.lax.fori_loop(
-        0, num_q_blocks, inner_loop_body, (output, l, m), unroll=True
-    )
+    output, l, m = jax.lax.fori_loop(0, num_q_blocks, inner_loop_body, (output, l, m), unroll=True)

     return (output, l, m)

-  output, l, m = jax.lax.fori_loop(
-      0, num_kv_blocks, outer_loop_body, (output, l, m), unroll=True
-  )
+  output, l, m = jax.lax.fori_loop(0, num_kv_blocks, outer_loop_body, (output, l, m), unroll=True)

   # Reshape the output to drop the size one dimension at index 2,
   # which corresponds to `num_q_heads // num_kv_heads` when
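Both sweeps thread `(output, l, m)` through every block: the outer loop walks kv blocks, the inner loop walks q blocks. `unroll=True` asks XLA to inline the loop bodies, trading compile time and code size for the removal of loop overhead, a reasonable fit when block counts are small. A minimal sketch of the nesting (hypothetical body functions, toy counts):

```python
def inner(i, carry):
  return carry  # per-(q-block, kv-block) work would happen here

def outer(j, carry):
  return jax.lax.fori_loop(0, 4, inner, carry, unroll=True)  # e.g. num_q_blocks = 4

carry = jax.lax.fori_loop(0, 2, outer, (0.0,), unroll=True)  # e.g. num_kv_blocks = 2
```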
@@ -268,17 +246,11 @@ def mask_blocker(mask: jnp.ndarray, block_q: int, block_kv: int) -> jnp.ndarray:
   batch_size, q_seq_len, kv_seq_len = mask.shape

   if q_seq_len % block_q != 0:
-    raise ValueError(
-        f"q_seq_len {q_seq_len} must be divisible by block_q {block_q}"
-    )
+    raise ValueError(f"q_seq_len {q_seq_len} must be divisible by block_q {block_q}")
   if kv_seq_len % block_kv != 0:
-    raise ValueError(
-        f"kv_seq_len {kv_seq_len} must be divisible by block_kv {block_kv}"
-    )
+    raise ValueError(f"kv_seq_len {kv_seq_len} must be divisible by block_kv {block_kv}")
   q_blocks = q_seq_len // block_q
   kv_blocks = kv_seq_len // block_kv

-  blocked_mask = mask.reshape(
-      batch_size, q_blocks, block_q, kv_blocks, block_kv
-  )
+  blocked_mask = mask.reshape(batch_size, q_blocks, block_q, kv_blocks, block_kv)
   return jnp.count_nonzero(blocked_mask, axis=(2, 4)).astype(jnp.int32)
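A worked example of what `mask_blocker` returns (toy sizes, not from this commit): a causal 8x8 mask split into 4x4 blocks yields the per-block nonzero counts that let the caller skip fully masked (q, kv) block pairs.

```python
causal = jnp.tril(jnp.ones((1, 8, 8), dtype=jnp.bool_))  # batch of 1, causal mask
print(mask_blocker(causal, 4, 4)[0])
# [[10  0]   <- upper-right kv block is fully masked and can be skipped
#  [16 10]]
```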