Skip to content

Commit 525b399

Browse files
Merge pull request #3032 from AI-Hypercomputer:move-maxtext-kernels
PiperOrigin-RevId: 865669628
2 parents 494b3b0 + 05700d5 commit 525b399

14 files changed

Lines changed: 169 additions & 80 deletions

src/MaxText/layers/attention_op.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -68,16 +68,15 @@
6868
Q_LENGTH,
6969
Q_LENGTH_NO_EXP,
7070
)
71-
72-
from MaxText.kernels import jax_flash_attention
73-
from MaxText.kernels.ragged_attention import ragged_gqa
74-
from MaxText.kernels.ragged_attention import ragged_mha
71+
from maxtext.inference import page_manager
72+
from maxtext.inference.kvcache import KVQuant, KVTensor
73+
from maxtext.kernels.attention import jax_flash_attention
74+
from maxtext.kernels.attention.ragged_attention import ragged_gqa
75+
from maxtext.kernels.attention.ragged_attention import ragged_mha
7576
from MaxText.layers import nnx_wrappers
7677
from MaxText.layers.initializers import variable_to_logically_partitioned
7778
from MaxText.layers.quantizations import AqtQuantization as Quant
7879
from MaxText.sharding import logical_to_mesh_axes, maybe_shard_with_name
79-
from maxtext.inference import page_manager
80-
from maxtext.inference.kvcache import KVQuant, KVTensor
8180
from maxtext.utils import max_utils
8281
import numpy as np
8382
from tokamax._src.ops.experimental.tpu.splash_attention import splash_attention_kernel as tokamax_splash_kernel

src/MaxText/layers/deepseek_batchsplit.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,8 +21,8 @@
2121

2222
import jax
2323
import jax.numpy as jnp
24-
from MaxText.kernels import megablox
25-
from MaxText.kernels import sort_activations
24+
from maxtext.kernels import megablox
25+
from maxtext.kernels import sort_activations
2626
from MaxText.layers import attention_op
2727
from MaxText.layers import quantizations
2828

src/MaxText/layers/moe.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
from MaxText import common_types as ctypes
3333
from MaxText.common_types import ShardMode
3434
from MaxText.sharding import maybe_shard_with_logical, create_sharding
35-
from MaxText.kernels import megablox as mblx
35+
from maxtext.kernels import megablox as mblx
3636
from MaxText.sharding import logical_to_mesh_axes
3737
from MaxText.layers import attentions, linears, nnx_wrappers, quantizations
3838
from MaxText.layers.initializers import NdInitializer, default_bias_init, nd_dense_init, variable_to_logically_partitioned
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.

src/MaxText/kernels/jax_flash_attention.py renamed to src/maxtext/kernels/attention/jax_flash_attention.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2025 Google LLC
1+
# Copyright 2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
1717

1818
import jax
1919
import jax.numpy as jnp
20-
from MaxText.kernels import splash_attention_kernel
20+
from maxtext.kernels.attention import splash_attention_kernel
2121

2222
SegmentIds = splash_attention_kernel.SegmentIds
2323

src/MaxText/kernels/ragged_attention.py renamed to src/maxtext/kernels/attention/ragged_attention.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.

src/MaxText/kernels/splash_attention_kernel.py renamed to src/maxtext/kernels/attention/splash_attention_kernel.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# pylint: skip-file
22
from __future__ import annotations
33

4-
# Copyright 2023–2025 Google LLC
4+
# Copyright 2023–2026 Google LLC
55
#
66
# Licensed under the Apache License, Version 2.0 (the "License");
77
# you may not use this file except in compliance with the License.
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -13,4 +13,4 @@
1313
# limitations under the License.
1414
"""Megablox kernel"""
1515

16-
from MaxText.kernels.megablox.ops import gmm
16+
from maxtext.kernels.megablox.ops import gmm
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.

0 commit comments

Comments (0)