Commit 708c406

Merge pull request #3220 from AI-Hypercomputer:bvandermoon-repo-restructure
PiperOrigin-RevId: 874382885
2 parents 7f0028b + 89b8dd9 commit 708c406

3 files changed

Lines changed: 180 additions & 142 deletions

src/MaxText/train_tokenizer.py

Lines changed: 16 additions & 141 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,150 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-""" Train tokenizer
-Example usage: python3 -m MaxText.train_tokenizer --dataset_path=gs://maxtext-dataset --dataset_name=c4/en:3.0.1
-"""
+"""Shim for `train_tokenizer` in `src/maxtext/trainers/tokenizer`."""

-import os
-import tempfile
-import time
-
-from absl import app
-from absl import flags
 from absl import logging

-from sentencepiece import SentencePieceTrainer
-
-import jax
-
-import tensorflow as tf
-import tensorflow_datasets as tfds
-
-from MaxText.globals import MAXTEXT_ASSETS_ROOT
-
-_DATASET_PATH = flags.DEFINE_string("dataset_path", None, "Path to the dataset", required=True)
-_DATASET_NAME = flags.DEFINE_string("dataset_name", None, "Name to the dataset", required=True)
-_VOCAB_SIZE = flags.DEFINE_integer("vocab_size", 32_768, "Vocab size")
-_MAX_CORPUS_CHARS = flags.DEFINE_integer("max_corpus_chars", 10_000_000, "Max corpus chars")
-_ASSETS_PATH = flags.DEFINE_string("assets_path", MAXTEXT_ASSETS_ROOT, "Name to the dataset")
-_VOCAB_MODEL_NAME = flags.DEFINE_string("vocab_model_name", "tokenizer", "Name to the dataset")
-
-
-def _dump_chars_to_textfile(dataset: tf.data.Dataset, maxchars: int = int(1e7), data_keys=("text",)) -> tuple[str, int]:
-  """Write part of a TFDS sentence dataset to lines in a text file.
-  Args:
-    dataset: tf.dataset containing string-data.
-    maxchars: int: approximate number of characters to save from dataset.
-    data_keys: tuple[str]: what keys in dataset to dump from.
-  Returns:
-    name of temp file with dataset bytes, exact number of characters dumped.
-  """
-  char_count = 0
-  ds_iter = dataset.as_numpy_iterator()
-  temp_dir = tempfile.gettempdir()
-  with tempfile.NamedTemporaryFile(delete=False, prefix=os.path.join(temp_dir, "ds_chars")) as outfp:
-    while char_count < maxchars:
-      example = next(ds_iter)
-      for k in data_keys:
-        line = example[k] + b"\n"
-        char_count += len(line)
-        outfp.write(line)
-  return outfp.name, char_count
-
-
-def _train_sentencepiece(
-    dataset: tf.data.Dataset,
-    *,
-    vocab_size: int,
-    maxchars: int = int(1e7),
-    model_path: str,
-    model_type: str = "unigram",
-    character_coverage: float = 1.0,
-    data_keys=("text",),
-):
-  """Train SentencePiece tokenizer from subset of tf dataset.
-  Args:
-    dataset: tf.dataset
-    vocab_size: int: size of vocab tokens to train.
-    maxchars: int: number of characters to use for sentencepiece training.
-    model_path: str: path of model file to save vocab model to.
-    model_type: str: type of sentencepiece vocab to train.
-    character_coverage: amount of characters covered by the model, good defaults
-      are 0.9995 for languages with rich character set like Japanese or Chinese
-      and 1.0 for other languages with small character set.
-    data_keys: tuple[str]: keys of dataset to use for training.
-  Returns:
-    path to the trained sentencepiece vocabulary model.
-  """
-  if model_path.startswith("gs://"):
-    abs_model_path = model_path
-  else:
-    abs_model_path = os.path.abspath(os.path.expanduser(model_path))
-  fname, _ = _dump_chars_to_textfile(dataset, maxchars=maxchars, data_keys=data_keys)
-  temp_dir = tempfile.gettempdir()
-  with tempfile.NamedTemporaryFile(delete=False, prefix=os.path.join(temp_dir, "sp_tmp")) as model_fp:
-    pass  # we just want a prefix'd tmp-filename
-  argstr = " ".join(
-      [
-          f"--input={fname}",
-          f"--vocab_size={vocab_size}",
-          f"--character_coverage={character_coverage}",
-          f"--model_prefix={model_fp.name}",
-          f"--model_type={model_type}",
-      ]
-  )
-  SentencePieceTrainer.Train(argstr)
-  if jax.process_index() == 0:
-    # Use an intermediate filename that is renamed to the target name to address
-    # create and fill delays.
-    copy_rename_path = abs_model_path + ".rntmp"
-    tf.io.gfile.makedirs(os.path.dirname(abs_model_path))
-    tf.io.gfile.copy(model_fp.name + ".model", copy_rename_path, overwrite=True)
-    tf.io.gfile.rename(copy_rename_path, abs_model_path, overwrite=True)
-    logging.info("copied %s to %s", model_fp.name + ".model", abs_model_path)
-  else:
-    while not tf.io.gfile.exists(abs_model_path):
-      time.sleep(1)
-    time.sleep(1)
-  return abs_model_path
-
-
-def train_tokenizer(
-    dataset: tf.data.Dataset,
-    *,
-    vocab_path: str,
-    vocab_size: int,
-    max_corpus_chars: int,
-    data_keys: tuple[str] = ("text",),
-):
-  """tokenizer training function"""
-  logging.info("SentencePiece vocab not found, building one from data.")
-  vocab_path = _train_sentencepiece(
-      dataset,
-      vocab_size=vocab_size,
-      maxchars=max_corpus_chars,
-      model_path=vocab_path,
-      data_keys=data_keys,
-  )
-  logging.info("Model saved at %s", vocab_path)
-
-
-def main(argv):
-  del argv
-  os.environ["TFDS_DATA_DIR"] = _DATASET_PATH.value
+import importlib
+import sys

-  read_config = tfds.ReadConfig(
-      shuffle_seed=0,
-  )
-  train_ds_builder = tfds.builder(_DATASET_NAME.value)
-  train_ds = train_ds_builder.as_dataset(split="train", read_config=read_config, shuffle_files=True)
-  train_tokenizer(
-      train_ds,
-      vocab_path=os.path.join(_ASSETS_PATH.value, _VOCAB_MODEL_NAME.value),
-      vocab_size=_VOCAB_SIZE.value,
-      max_corpus_chars=_MAX_CORPUS_CHARS.value,
-  )
+from maxtext.utils import max_logging

+OLD_MODULE_PATH = "MaxText.train_tokenizer"
+NEW_MODULE_PATH = "maxtext.trainers.tokenizer.train_tokenizer"

 if __name__ == "__main__":
-  app.run(main)
+  try:
+    logging.set_verbosity(logging.INFO)
+    _new_module = importlib.import_module(NEW_MODULE_PATH)
+    if hasattr(_new_module, "main"):
+      max_logging.warning(f"'{OLD_MODULE_PATH}' is deprecated; use '{NEW_MODULE_PATH}' instead.\n")
+      _new_module.main(sys.argv)
+  except ImportError as e:
+    max_logging.error(f"Shim could not find target module: '{NEW_MODULE_PATH}'\n")
+    raise e
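
Note the division of labor with the relocated module in the next file: the shim calls `_new_module.main(sys.argv)` directly, bypassing `app.run`, which is what normally parses absl flags. That is why the relocated `main` adds an explicit `flags.FLAGS(sys.argv)` call. A minimal sketch of that pattern (the flag name and greeting here are hypothetical, not from this commit):

# Sketch: a main() that may be invoked directly (as the shim invokes it)
# must parse absl flags itself; app.run(main) would otherwise do this.
import sys

from absl import flags

_NAME = flags.DEFINE_string("name", "world", "Who to greet.")


def main(argv):
  del argv
  flags.FLAGS(sys.argv)  # parse flags explicitly, since app.run is bypassed
  print(f"Hello, {_NAME.value}")


if __name__ == "__main__":
  main(sys.argv)  # direct call, mirroring how the shim forwards to main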
src/maxtext/trainers/tokenizer/train_tokenizer.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Train tokenizer
+Example usage: python3 -m MaxText.train_tokenizer --dataset_path=gs://maxtext-dataset --dataset_name=c4/en:3.0.1
+"""
+
+import os
+import sys
+import tempfile
+import time
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from sentencepiece import SentencePieceTrainer
+
+import jax
+
+import tensorflow as tf
+import tensorflow_datasets as tfds
+
+from MaxText.globals import MAXTEXT_ASSETS_ROOT
+
+_DATASET_PATH = flags.DEFINE_string("dataset_path", None, "Path to the dataset", required=True)
+_DATASET_NAME = flags.DEFINE_string("dataset_name", None, "Name of the dataset", required=True)
+_VOCAB_SIZE = flags.DEFINE_integer("vocab_size", 32_768, "Vocab size")
+_MAX_CORPUS_CHARS = flags.DEFINE_integer("max_corpus_chars", 10_000_000, "Max corpus chars")
+_ASSETS_PATH = flags.DEFINE_string("assets_path", MAXTEXT_ASSETS_ROOT, "Path to the assets directory")
+_VOCAB_MODEL_NAME = flags.DEFINE_string("vocab_model_name", "tokenizer", "Name of the vocab model file")
+
+
+def _dump_chars_to_textfile(dataset: tf.data.Dataset, maxchars: int = int(1e7), data_keys=("text",)) -> tuple[str, int]:
+  """Write part of a TFDS sentence dataset to lines in a text file.
+  Args:
+    dataset: tf.dataset containing string-data.
+    maxchars: int: approximate number of characters to save from dataset.
+    data_keys: tuple[str]: what keys in dataset to dump from.
+  Returns:
+    name of temp file with dataset bytes, exact number of characters dumped.
+  """
+  char_count = 0
+  ds_iter = dataset.as_numpy_iterator()
+  temp_dir = tempfile.gettempdir()
+  with tempfile.NamedTemporaryFile(delete=False, prefix=os.path.join(temp_dir, "ds_chars")) as outfp:
+    while char_count < maxchars:
+      example = next(ds_iter)
+      for k in data_keys:
+        line = example[k] + b"\n"
+        char_count += len(line)
+        outfp.write(line)
+  return outfp.name, char_count
+
+
+def _train_sentencepiece(
+    dataset: tf.data.Dataset,
+    *,
+    vocab_size: int,
+    maxchars: int = int(1e7),
+    model_path: str,
+    model_type: str = "unigram",
+    character_coverage: float = 1.0,
+    data_keys=("text",),
+):
+  """Train SentencePiece tokenizer from subset of tf dataset.
+  Args:
+    dataset: tf.dataset
+    vocab_size: int: size of vocab tokens to train.
+    maxchars: int: number of characters to use for sentencepiece training.
+    model_path: str: path of model file to save vocab model to.
+    model_type: str: type of sentencepiece vocab to train.
+    character_coverage: amount of characters covered by the model, good defaults
+      are 0.9995 for languages with rich character set like Japanese or Chinese
+      and 1.0 for other languages with small character set.
+    data_keys: tuple[str]: keys of dataset to use for training.
+  Returns:
+    path to the trained sentencepiece vocabulary model.
+  """
+  if model_path.startswith("gs://"):
+    abs_model_path = model_path
+  else:
+    abs_model_path = os.path.abspath(os.path.expanduser(model_path))
+  fname, _ = _dump_chars_to_textfile(dataset, maxchars=maxchars, data_keys=data_keys)
+  temp_dir = tempfile.gettempdir()
+  with tempfile.NamedTemporaryFile(delete=False, prefix=os.path.join(temp_dir, "sp_tmp")) as model_fp:
+    pass  # we just want a prefix'd tmp-filename
+  argstr = " ".join(
+      [
+          f"--input={fname}",
+          f"--vocab_size={vocab_size}",
+          f"--character_coverage={character_coverage}",
+          f"--model_prefix={model_fp.name}",
+          f"--model_type={model_type}",
+      ]
+  )
+  SentencePieceTrainer.Train(argstr)
+  if jax.process_index() == 0:
+    # Use an intermediate filename that is renamed to the target name to address
+    # create and fill delays.
+    copy_rename_path = abs_model_path + ".rntmp"
+    tf.io.gfile.makedirs(os.path.dirname(abs_model_path))
+    tf.io.gfile.copy(model_fp.name + ".model", copy_rename_path, overwrite=True)
+    tf.io.gfile.rename(copy_rename_path, abs_model_path, overwrite=True)
+    logging.info("copied %s to %s", model_fp.name + ".model", abs_model_path)
+  else:
+    while not tf.io.gfile.exists(abs_model_path):
+      time.sleep(1)
+    time.sleep(1)
+  return abs_model_path
+
+
+def train_tokenizer(
+    dataset: tf.data.Dataset,
+    *,
+    vocab_path: str,
+    vocab_size: int,
+    max_corpus_chars: int,
+    data_keys: tuple[str] = ("text",),
+):
+  """tokenizer training function"""
+  logging.info("SentencePiece vocab not found, building one from data.")
+  vocab_path = _train_sentencepiece(
+      dataset,
+      vocab_size=vocab_size,
+      maxchars=max_corpus_chars,
+      model_path=vocab_path,
+      data_keys=data_keys,
+  )
+  logging.info("Model saved at %s", vocab_path)
+
+
+def main(argv):
+  del argv
+  flags.FLAGS(sys.argv)
+  os.environ["TFDS_DATA_DIR"] = _DATASET_PATH.value
+
+  read_config = tfds.ReadConfig(
+      shuffle_seed=0,
+  )
+  train_ds_builder = tfds.builder(_DATASET_NAME.value)
+  train_ds = train_ds_builder.as_dataset(split="train", read_config=read_config, shuffle_files=True)
+  train_tokenizer(
+      train_ds,
+      vocab_path=os.path.join(_ASSETS_PATH.value, _VOCAB_MODEL_NAME.value),
+      vocab_size=_VOCAB_SIZE.value,
+      max_corpus_chars=_MAX_CORPUS_CHARS.value,
+  )
+
+
+if __name__ == "__main__":
+  app.run(main)
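
At its core, `_train_sentencepiece` dumps a text corpus to a file and hands a single argument string to `SentencePieceTrainer.Train`, which writes `<model_prefix>.model` and `<model_prefix>.vocab`. A self-contained toy run of that call (all paths, corpus text, and sizes are illustrative; very small corpora may need a lower `--vocab_size`):

# Toy illustration of the SentencePieceTrainer.Train call assembled above;
# every value here is made up for the example.
import tempfile

from sentencepiece import SentencePieceProcessor, SentencePieceTrainer

# Dump a tiny corpus to a text file, one sentence per line.
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as corpus:
  corpus.write("the quick brown fox jumps over the lazy dog\n" * 100)

# Train writes /tmp/toy_sp.model and /tmp/toy_sp.vocab.
SentencePieceTrainer.Train(
    f"--input={corpus.name} --model_prefix=/tmp/toy_sp "
    "--vocab_size=32 --model_type=unigram --character_coverage=1.0"
)

# Load the trained model and tokenize a sample sentence.
sp = SentencePieceProcessor(model_file="/tmp/toy_sp.model")
print(sp.encode("the quick brown fox", out_type=str))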

tests/unit/tokenizer_test.py

Lines changed: 1 addition & 1 deletion
@@ -15,9 +15,9 @@
 """Tests for tokenizer"""

 import numpy as np
-from MaxText import train_tokenizer
 from MaxText.globals import MAXTEXT_ASSETS_ROOT
 from maxtext.input_pipeline import input_pipeline_utils
+from maxtext.trainers.tokenizer import train_tokenizer

 import unittest
 import pytest

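The one-line test change follows from the shim design: `MaxText.train_tokenizer` remains importable, but it only forwards `main` at execution time and no longer defines the training API, so imports must target the new package. A hedged sanity check (not part of the commit; assumes both packages are on the import path):

# The shim module no longer carries the training API; the relocated module does.
import importlib

shim = importlib.import_module("MaxText.train_tokenizer")
new = importlib.import_module("maxtext.trainers.tokenizer.train_tokenizer")

assert not hasattr(shim, "train_tokenizer")  # moved out of the shim
assert hasattr(new, "train_tokenizer")  # lives here now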