Skip to content

Commit be963a5

Browse files
bvandermoon and Google-ML-Automation
authored and committed
PR #3226: Create optimizers directory and move some files to utils
Imported from GitHub PR #3226 # Description * Create optimizers directory * Move some remaining files from `src/MaxText` to `src/maxtext/utils` # Tests ``` python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \ run_name=<run_name> \ base_output_directory=gs://<gcs_bucket> \ dataset_type=synthetic \ steps=10 \ enable_checkpointing=false ``` # Checklist Before submitting this PR, please make sure (put X in square brackets): - [x] I have performed a self-review of my code. For an optional AI review, add the `gemini-review` label. - [x] I have necessary comments in my code, particularly in hard-to-understand areas. - [x] I have run end-to-end tests and provided workload links above if applicable. - [x] I have made or will make corresponding changes to the doc if needed, including adding new documentation pages to the relevant Table of Contents (toctree directive) as explained in our documentation. Copybara import of the project: -- 1e4bdcb by Branden Vandermoon <bvandermoon@google.com>: Create optimizers directory and move some files to utils Merging this change closes #3226 COPYBARA_INTEGRATE_REVIEW=#3226 from AI-Hypercomputer:bvandermoon-restructure-last-files 1e4bdcb PiperOrigin-RevId: 874891637
1 parent 596cf78 commit be963a5

38 files changed

Lines changed: 50 additions & 50 deletions

src/MaxText/generate_param_only_checkpoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@
2929
import jax
3030
from jax import random
3131
from jax.sharding import Mesh
32-
from MaxText import optimizers
3332
from MaxText import pyconfig
3433
from maxtext.common import checkpointing
3534
from maxtext.common.common_types import DecoderBlockType, MODEL_MODE_TRAIN
3635
from maxtext.layers import quantizations
3736
from maxtext.models import models
37+
from maxtext.optimizers import optimizers
3838
from maxtext.utils import gcs_utils
3939
from maxtext.utils import lora_utils
4040
from maxtext.utils import max_logging

src/MaxText/sft_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
from flax.linen import partitioning as nn_partitioning
2929

3030
from MaxText import pyconfig
31-
from MaxText import sharding
3231
from maxtext.trainers.pre_train.train import (
3332
eval_step,
3433
get_first_step,
@@ -48,6 +47,7 @@
4847
from maxtext.utils import max_utils
4948
from maxtext.utils import max_logging
5049
from maxtext.utils import maxtext_utils
50+
from maxtext.utils import sharding
5151
from maxtext.utils import train_utils
5252

5353

src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,13 @@
4242
import jax
4343
from jax import random
4444
from jax.sharding import Mesh
45-
from MaxText import optimizers
4645
from MaxText import pyconfig
4746
from MaxText.globals import MAXTEXT_PKG_DIR
4847
from maxtext.common import checkpointing
4948
from maxtext.common.common_types import MODEL_MODE_TRAIN
5049
from maxtext.layers import quantizations
5150
from maxtext.models.models import transformer_as_linen
51+
from maxtext.optimizers import optimizers
5252
from maxtext.utils import max_logging
5353
from maxtext.utils import max_utils
5454
from maxtext.utils import maxtext_utils

src/maxtext/common/data_loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@
1919
import jax.numpy as jnp
2020
from jax.experimental import checkify
2121

22-
from MaxText.sharding import get_input_data_sharding
2322
from maxtext.common.goodput import (
2423
GoodputEvent,
2524
maybe_record_goodput,
2625
)
27-
from maxtext.utils import exceptions
2826
from maxtext.trainers.diloco import diloco
27+
from maxtext.utils import exceptions
28+
from maxtext.utils.sharding import get_input_data_sharding
2929

3030

3131
class DataLoader:

src/maxtext/experimental/rl/grpo_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@
6767
from ml_goodput_measurement.src.goodput import GoodputRecorder
6868

6969
import MaxText as mt
70-
from MaxText import sharding
7170
from MaxText import pyconfig
7271
from MaxText.globals import EPS
7372
from maxtext.trainers.pre_train.train import get_first_step
@@ -89,6 +88,7 @@
8988
from maxtext.utils import max_logging
9089
from maxtext.utils import max_utils
9190
from maxtext.utils import maxtext_utils
91+
from maxtext.utils import sharding
9292
from maxtext.utils import train_utils
9393

9494
# pylint: disable=too-many-positional-arguments

src/maxtext/inference/paged_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from maxtext.inference import page_manager
3131
from maxtext.inference import paged_attention_kernel_v2
3232
from maxtext.layers.initializers import variable_to_logically_partitioned
33-
from MaxText.sharding import logical_to_mesh_axes
33+
from maxtext.utils.sharding import logical_to_mesh_axes
3434

3535
_use_kernel_v2 = False
3636

src/maxtext/layers/attention_mla.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@
6262
DEFAULT_MASK_VALUE,
6363
)
6464

65-
from MaxText.sharding import create_sharding
6665
from maxtext.layers import nnx_wrappers
6766
from maxtext.layers.attentions import Attention
6867
from maxtext.layers.initializers import nd_dense_init, NdInitializer, variable_to_logically_partitioned
@@ -73,6 +72,7 @@
7372
from maxtext.inference import page_manager
7473
from maxtext.inference import paged_attention
7574
from maxtext.inference.kvcache import KVQuant
75+
from maxtext.utils.sharding import create_sharding
7676

7777

7878
class Indexer(nnx.Module):

src/maxtext/layers/attention_op.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@
7777
from maxtext.layers import nnx_wrappers
7878
from maxtext.layers.initializers import variable_to_logically_partitioned
7979
from maxtext.layers.quantizations import AqtQuantization as Quant
80-
from MaxText.sharding import logical_to_mesh_axes, maybe_shard_with_name
8180
from maxtext.utils import max_utils
81+
from maxtext.utils.sharding import logical_to_mesh_axes, maybe_shard_with_name
8282
import numpy as np
8383
from tokamax._src.ops.experimental.tpu.splash_attention import splash_attention_kernel as tokamax_splash_kernel
8484
from tokamax._src.ops.experimental.tpu.splash_attention import splash_attention_mask as tokamax_splash_mask

src/maxtext/layers/attentions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@
5353
EP_AS_CONTEXT,
5454
AttentionType,
5555
)
56-
from MaxText.sharding import maybe_shard_with_logical, create_sharding
5756
from maxtext.layers import nnx_wrappers
5857
from maxtext.layers.attention_op import AttentionOp
5958
from maxtext.layers.embeddings import (
@@ -71,6 +70,7 @@
7170
from maxtext.layers.quantizations import AqtQuantization as Quant
7271
from maxtext.inference import kvcache, page_manager, paged_attention
7372
from maxtext.inference.kvcache import KVQuant
73+
from maxtext.utils.sharding import maybe_shard_with_logical, create_sharding
7474

7575
# pylint: disable=line-too-long, g-doc-args, g-doc-return-or-yield, bad-continuation, g-inconsistent-quotes
7676
# pytype: disable=attribute-error

src/maxtext/layers/decoders.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
from jax.ad_checkpoint import checkpoint_name
2828
import jax.numpy as jnp
2929
from jax.sharding import Mesh
30-
from MaxText import sharding
3130
from maxtext.common.common_types import Config, DecoderBlockType, EP_AS_CONTEXT, ShardMode
3231
from maxtext.common.common_types import MODEL_MODE_AUTOREGRESSIVE, MODEL_MODE_PREFILL, MODEL_MODE_TRAIN
3332
from maxtext.inference import page_manager
@@ -56,10 +55,11 @@
5655
simple_layer,
5756
)
5857
from maxtext.multimodal import utils as mm_utils
59-
from MaxText.sharding import create_sharding
58+
from maxtext.utils.sharding import create_sharding
6059
from maxtext.utils import max_logging
6160
from maxtext.utils import max_utils
6261
from maxtext.utils import maxtext_utils
62+
from maxtext.utils import sharding
6363

6464
# ------------------------------------------------------------------------------
6565
# The network: Decoder Definitions

0 commit comments

Comments
 (0)