Skip to content

Commit 5f1717b

Browse files
Merge pull request #3180 from AI-Hypercomputer:anisha-rl-refactor
PiperOrigin-RevId: 872606777
2 parents c6f3bc2 + e99b7ec commit 5f1717b

15 files changed

Lines changed: 165 additions & 63 deletions

File tree

.github/workflows/run_jupyter_notebooks.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ jobs:
9898
9999
for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
100100
filename=$(basename "$notebook")
101+
if [[ "$filename" == "sft_qwen3_demo.ipynb" ]]; then
102+
echo "Skipping $filename"
103+
continue
104+
fi
101105
output_name="${filename%.ipynb}_output.ipynb"
102106
103107
echo "------------------------------------------------------"

codecov.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ ignore:
4040
- "src/maxtext/scratch_code"
4141
- "src/MaxText/distillation" # code moved to src/maxtext/trainers/post_train/distillation
4242
- "src/MaxText/sft" # code moved to src/maxtext/trainers/post_train/sft
43+
- "src/MaxText/rl" # code moved to src/maxtext/trainers/post_train/rl
4344

4445

4546
flags:

docs/tutorials/posttraining/rl.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ export MAXTEXT_CKPT_PATH=<gcs path for MaxText checkpoint> # e.g., gs://my-bucke
161161
Run the following command for GRPO:
162162

163163
```
164-
python3 -m src.MaxText.rl.train_rl src/maxtext/configs/post_train/rl.yml \
164+
python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
165165
model_name=${MODEL} \
166166
tokenizer_path=${TOKENIZER} \
167167
load_parameters_path=${MAXTEXT_CKPT_PATH} \
@@ -185,7 +185,7 @@ The overview of what this run will do is as follows:
185185
Run the following command for GSPO:
186186

187187
```
188-
python3 -m src.MaxText.rl.train_rl src/maxtext/configs/post_train/rl.yml \
188+
python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
189189
model_name=${MODEL} \
190190
tokenizer_path=${TOKENIZER} \
191191
load_parameters_path=${MAXTEXT_CKPT_PATH} \

docs/tutorials/posttraining/rl_on_multi_host.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ xpk workload create-pathways --workload $WORKLOAD \
208208
--tpu-type=$TPU_TYPE --num-slices=1 \
209209
--project=$PROJECT_ID --priority=high \
210210
--command "HF_TOKEN=${HF_TOKEN} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \
211-
python3 -m src.MaxText.rl.train_rl src/maxtext/configs/post_train/rl.yml \
211+
python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
212212
model_name=${MODEL} \
213213
tokenizer_path=${TOKENIZER} \
214214
load_parameters_path=${MAXTEXT_CKPT_PATH} \
@@ -225,7 +225,7 @@ xpk workload create-pathways --workload $WORKLOAD \
225225
--tpu-type=$TPU_TYPE --num-slices=1 \
226226
--project=$PROJECT_ID --priority=high \
227227
--command "HF_TOKEN=${HF_TOKEN} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \
228-
python3 -m src.MaxText.rl.train_rl src/maxtext/configs/post_train/rl.yml \
228+
python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
229229
model_name=${MODEL} \
230230
tokenizer_path=${TOKENIZER} \
231231
load_parameters_path=${MAXTEXT_CKPT_PATH} \

src/maxtext/examples/rl_llama3_demo.ipynb

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@
9494
},
9595
{
9696
"cell_type": "code",
97-
"execution_count": 2,
97+
"execution_count": null,
9898
"metadata": {},
9999
"outputs": [],
100100
"source": [
@@ -148,13 +148,15 @@
148148
"from pathlib import Path\n",
149149
"import MaxText\n",
150150
"from huggingface_hub import login\n",
151+
"from etils import epath\n",
151152
"import jax\n",
152153
"\n",
153-
"from MaxText import max_utils\n",
154-
"from MaxText.rl.train_rl import rl_train, setup_configs_and_devices\n",
154+
"from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices\n",
155155
"\n",
156156
"os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"0\"\n",
157157
"os.environ[\"SKIP_JAX_PRECOMPILE\"] = \"1\" # Faster startup for vLLM\n",
158+
"# Suppress vLLM logging with a severity level below ERROR\n",
159+
"os.environ[\"VLLM_LOGGING_LEVEL\"] = \"ERROR\"\n",
158160
"\n",
159161
"MAXTEXT_PKG_DIR = os.path.dirname(MaxText.__file__)\n",
160162
"MAXTEXT_REPO_ROOT = os.sep.join([\"maxtext\" if p == \"MaxText\" else p for p in MAXTEXT_PKG_DIR.split(os.sep)])\n",
@@ -243,7 +245,7 @@
243245
"metadata": {},
244246
"outputs": [],
245247
"source": [
246-
"if not os.path.exists(MODEL_CHECKPOINT_PATH):\n",
248+
"if not epath.Path(MODEL_CHECKPOINT_PATH).exists():\n",
247249
" # install torch for the conversion script\n",
248250
" !python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu\n",
249251
"\n",
@@ -256,8 +258,8 @@
256258
" scan_layers=true \\\n",
257259
" skip_jax_distributed_system=True\n",
258260
"\n",
259-
"if not os.path.exists(MODEL_CHECKPOINT_PATH):\n",
260-
" raise ValueError(\"Model checkpoint conversion failed. Check the logs above.\")"
261+
" if not epath.Path(MODEL_CHECKPOINT_PATH).exists():\n",
262+
" raise ValueError(\"Model checkpoint conversion failed. Check the logs above.\")"
261263
]
262264
},
263265
{
@@ -276,7 +278,7 @@
276278
"# Load configuration for RL training\n",
277279
"config_argv = [\n",
278280
" \"\",\n",
279-
" f\"{MAXTEXT_PKG_DIR}/configs/rl.yml\",\n",
281+
" f\"{MAXTEXT_PKG_DIR}/configs/post_train/rl.yml\",\n",
280282
" f\"model_name={MODEL_NAME}\",\n",
281283
" f\"tokenizer_path={TOKENIZER_PATH}\",\n",
282284
" f\"run_name={RUN_NAME}\",\n",
@@ -344,13 +346,13 @@
344346
"\n",
345347
"- **CLI Usage**: https://maxtext.readthedocs.io/en/latest/tutorials/rl.html\n",
346348
"- **Configuration**: See `src/maxtext/configs/rl.yml` for all available options\n",
347-
"- **Documentation**: Check `src/MaxText/rl/train_rl.py` for the `rl_train` function implementation"
349+
"- **Documentation**: Check `src/maxtext/trainers/post_train/rl/train_rl.py` for the `rl_train` function implementation"
348350
]
349351
}
350352
],
351353
"metadata": {
352354
"kernelspec": {
353-
"display_name": "Python 3",
355+
"display_name": "maxtext_venv",
354356
"language": "python",
355357
"name": "python3"
356358
},
@@ -364,7 +366,7 @@
364366
"name": "python",
365367
"nbconvert_exporter": "python",
366368
"pygments_lexer": "ipython3",
367-
"version": "3.12.11"
369+
"version": "3.12.12"
368370
}
369371
},
370372
"nbformat": 4,

src/maxtext/examples/sft_qwen3_demo.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@
377377
"config = pyconfig.initialize(\n",
378378
" [\n",
379379
" \"\",\n",
380-
" f\"{MAXTEXT_PKG_DIR}/configs/sft.yml\",\n",
380+
" f\"{MAXTEXT_PKG_DIR}/configs/post_train/sft.yml\",\n",
381381
" f\"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items\",\n",
382382
" f\"model_name={MODEL_NAME}\",\n",
383383
" f\"hf_access_token={HF_TOKEN}\",\n",

src/maxtext/examples/sft_train_and_evaluate.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ def create_vllm_rollout(config, model, mesh, tokenizer):
301301
rollout_vllm_hbm_utilization=0.2,
302302
rollout_vllm_init_with_random_weights=True,
303303
rollout_vllm_tpu_backend_type="jax",
304+
data_type="bfloat16",
304305
),
305306
)
306307

src/maxtext/rl/evaluate_rl.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Shim for RL Evaluation in `src/maxtext/trainers/post_train/rl`."""
16+
17+
import importlib
18+
19+
from maxtext.utils import max_logging
20+
21+
OLD_MODULE_PATH = "MaxText.rl.evaluate_rl"
22+
NEW_MODULE_PATH = "maxtext.trainers.post_train.rl.evaluate_rl"
23+
24+
max_logging.warning(f"'{OLD_MODULE_PATH}' is deprecated; use '{NEW_MODULE_PATH}' instead.\n")
25+
_new_module = importlib.import_module(NEW_MODULE_PATH)
26+
27+
evaluate = _new_module.evaluate
28+
generate_responses = _new_module.generate_responses
29+
score_responses = _new_module.score_responses

src/maxtext/rl/train_rl.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Shim for RL Trainer in `src/maxtext/trainers/post_train/rl`."""
16+
17+
import sys
18+
import importlib
19+
20+
from absl import logging
21+
22+
from maxtext.utils import max_logging
23+
24+
OLD_MODULE_PATH = "MaxText.rl.train_rl"
25+
NEW_MODULE_PATH = "maxtext.trainers.post_train.rl.train_rl"
26+
27+
if __name__ == "__main__":
28+
try:
29+
logging.set_verbosity(logging.INFO)
30+
_new_module = importlib.import_module(NEW_MODULE_PATH)
31+
if hasattr(_new_module, "main"):
32+
max_logging.warning(f"'{OLD_MODULE_PATH}' is deprecated; use '{NEW_MODULE_PATH}' instead.\n")
33+
_new_module.main(sys.argv)
34+
except ImportError as e:
35+
max_logging.error(f"Shim could not find target module: '{NEW_MODULE_PATH}'\n")
36+
raise e

0 commit comments

Comments (0)