12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
15 | | -# pylint: disable=bare-except, consider-using-generator |
16 | | -""" |
17 | | -RL Evaluation Module. |
18 | | -""" |
19 | | -from tqdm.auto import tqdm |
20 | | -from tunix.rl.rollout.base_rollout import RolloutConfig |
| 15 | +"""Shim for RL Evaluation in `src/maxtext/trainers/post_train/rl`.""" |
21 | 16 |
22 | | -from MaxText.rl import utils_rl |
23 | | -from maxtext.utils import max_logging |
24 | | - |
25 | | -# ## Evaluate |
26 | | -# We evaluate it in two ways: |
27 | | -# |
28 | | -# **Quantitative** |
29 | | -# |
30 | | -# * **Answer Accuracy**: percentage of samples for which the model predicts the |
31 | | -# correct final numerical answer |
32 | | -# * **Answer (Partial) Accuracy**: percentage of samples for which the model |
33 | | -# predicts a final numerical answer such that the `model answer / answer`
34 | | -# ratio lies between 0.9 and 1.1. |
35 | | -# * **Format Accuracy**: percentage of samples for which the model outputs the |
36 | | -# correct format, i.e., reasoning between the reasoning special tokens, and the |
37 | | -# final answer between the `<start_answer>`, `<end_answer>` tokens.
38 | | -# |
39 | | -# **Qualitative** |
40 | | -# |
41 | | -# We'll also print outputs for a few given questions so that we can compare the generated output later. |
42 | | -# |
43 | | -# pylint: disable=broad-exception-caught |
44 | | - |
45 | | - |
46 | | -def generate_responses( |
47 | | - tmvp_config, |
48 | | - prompts, |
49 | | - rl_cluster, |
50 | | - num_passes=1, |
51 | | -): |
52 | | - """ |
53 | | - Generate responses for a batch of prompts across potentially multiple passes. |
54 | | -
55 | | - Args: |
56 | | - tmvp_config: Configuration object |
57 | | - prompts: List of prompts to generate responses for |
58 | | - rl_cluster: Model cluster for generation |
59 | | - num_passes: Number of generation passes |
60 | | -
61 | | - Returns: |
62 | | - List of lists containing responses for each prompt across passes |
63 | | - """ |
64 | | - multiple_call_responses = [[] for _ in range(len(prompts))] |
65 | | - eval_strategy = tmvp_config.generation_configs[tmvp_config.eval_sampling_strategy] |
66 | | - |
67 | | - for p in range(num_passes): |
68 | | - responses = rl_cluster.rollout.generate( |
69 | | - prompts, |
70 | | - rollout_config=RolloutConfig( |
71 | | - max_tokens_to_generate=tmvp_config.max_target_length - tmvp_config.max_prefill_predict_length, |
72 | | - temperature=eval_strategy["eval_temperature"], |
73 | | - top_k=eval_strategy["eval_top_k"], |
74 | | - top_p=eval_strategy["eval_top_p"], |
75 | | - ), |
76 | | - ) |
77 | | - responses = responses.text |
78 | | - |
79 | | - if tmvp_config.debug.rl: |
80 | | - max_logging.log(f"Pass {p+1}/{num_passes}, responses: {responses}") |
81 | | - |
82 | | - for idx, response in enumerate(responses): |
83 | | - multiple_call_responses[idx].append(response) |
84 | | - |
85 | | - return multiple_call_responses |
86 | | - |
87 | | - |
88 | | -def score_responses(tmvp_config, question, responses, answer): |
89 | | - """ |
90 | | - Score a set of responses for a single question. |
91 | | -
92 | | - Args: |
93 | | - tmvp_config: Configuration object |
94 | | - question: The evaluation question |
95 | | - responses: List of generated responses for this question |
96 | | - answer: The correct answer |
97 | | -
98 | | - Returns: |
99 | | - Tuple of (is_correct, is_partially_correct, has_correct_format) |
100 | | - """ |
101 | | - match_format = utils_rl.get_match_format_regex(tmvp_config) |
102 | | - match_numbers = utils_rl.get_match_numbers_regex(tmvp_config) |
103 | | - |
104 | | - if tmvp_config.debug.rl: |
105 | | - max_logging.log("========================================") |
106 | | - max_logging.log(f"Evaluation Question: {question}") |
107 | | - max_logging.log(f"Evaluation Answer: {answer}") |
108 | | - max_logging.log(f"Evaluation Responses: {responses}") |
109 | | - max_logging.log("========================================") |
| 17 | +import importlib |
110 | 18 |
111 | | - is_correct = False |
112 | | - is_partially_correct = False |
113 | | - has_correct_format = False |
114 | | - |
115 | | - for response in responses: |
116 | | - # Extract numerical response |
117 | | - extracted_response = guess.group(1) if (guess := match_numbers.search(response)) is not None else "-1000000" |
118 | | - |
119 | | - if tmvp_config.debug.rl: |
120 | | - max_logging.log(f"Evaluation extracted_response: {extracted_response}") |
121 | | - |
122 | | - # Check exact correctness |
123 | | - try: |
124 | | - # Remove ',' and '$' then convert to float |
125 | | - val_extracted = float(extracted_response.replace(",", "").replace("$", "").strip()) |
126 | | - val_answer = float(answer.replace(",", "").replace("$", "").strip()) |
127 | | - is_correct = val_extracted == val_answer |
128 | | - |
129 | | - # Check partial correctness (within 10%) |
130 | | - ratio = val_extracted / val_answer |
131 | | - if 0.9 <= ratio <= 1.1: |
132 | | - is_partially_correct = True |
133 | | - |
134 | | - except Exception as e: |
135 | | - if tmvp_config.debug.rl: |
136 | | - max_logging.log(f"Evaluation Exception: {e}") |
137 | | - max_logging.log("SKIPPED") |
138 | | - |
139 | | - # Check format correctness |
140 | | - if match_format.search(response) is not None: |
141 | | - has_correct_format = True |
142 | | - |
143 | | - # Early exit if all criteria are met |
144 | | - if is_correct and is_partially_correct and has_correct_format: |
145 | | - break |
146 | | - |
147 | | - return is_correct, is_partially_correct, has_correct_format |
148 | | - |
149 | | - |
150 | | -def evaluate( |
151 | | - tmvp_config, |
152 | | - dataset, |
153 | | - rl_cluster, |
154 | | - num_passes=1, |
155 | | - corr_lst=False, |
156 | | - make_lst=False, |
157 | | -): |
158 | | - """ |
159 | | - Computes accuracy and percentage of outputs matching the format. |
160 | | -
161 | | - Args: |
162 | | - tmvp_config: Configuration object |
163 | | - dataset: The evaluation dataset |
164 | | - rl_cluster: Model cluster for generation. |
165 | | - num_passes: Number of generation passes |
166 | | - corr_lst: If True, only include correct responses in the list |
167 | | - make_lst: If True, return a list of (question, answer, responses) |
168 | | -
169 | | - Returns: |
170 | | - Tuple of statistics and optionally the response list |
171 | | - """ |
172 | | - response_lst = [] |
173 | | - corr = 0 |
174 | | - partially_corr = 0 |
175 | | - corr_format = 0 |
176 | | - total = 0 |
177 | | - |
178 | | - for batch in tqdm(dataset): |
179 | | - answers = batch["answer"] |
180 | | - questions = batch["question"] |
181 | | - prompts = batch["prompts"] |
182 | | - |
183 | | - # Generate responses for all prompts in the batch |
184 | | - multiple_call_responses = generate_responses( |
185 | | - tmvp_config=tmvp_config, |
186 | | - prompts=prompts, |
187 | | - rl_cluster=rl_cluster, |
188 | | - num_passes=num_passes, |
189 | | - ) |
190 | | - |
191 | | - # Score each question-answer pair |
192 | | - for question, responses, answer in zip(questions, multiple_call_responses, answers): |
193 | | - is_correct, is_partially_correct, has_correct_format = score_responses( |
194 | | - tmvp_config=tmvp_config, |
195 | | - question=question, |
196 | | - responses=responses, |
197 | | - answer=answer, |
198 | | - ) |
199 | | - |
200 | | - # Update counters |
201 | | - if is_correct: |
202 | | - corr += 1 |
203 | | - if corr_lst and make_lst: |
204 | | - response_lst.append((question, answer, responses)) |
205 | | - else: |
206 | | - if not corr_lst and make_lst: |
207 | | - response_lst.append((question, answer, responses)) |
208 | | - |
209 | | - if is_partially_correct: |
210 | | - partially_corr += 1 |
211 | | - |
212 | | - if has_correct_format: |
213 | | - corr_format += 1 |
214 | | - |
215 | | - total += 1 |
| 19 | +from maxtext.utils import max_logging |
216 | 20 |
217 | | - # Print progress every 10 items |
218 | | - if total % 10 == 0: |
219 | | - max_logging.log( |
220 | | - f"===> {corr=}, {total=}, {corr / total * 100=}, " |
221 | | - f"{partially_corr / total * 100=}, {corr_format / total * 100=}" |
222 | | - ) |
| 21 | +OLD_MODULE_PATH = "MaxText.rl.evaluate_rl" |
| 22 | +NEW_MODULE_PATH = "maxtext.trainers.post_train.rl.evaluate_rl" |
223 | 23 |
224 | | - # Prepare return values |
225 | | - to_return = ( |
226 | | - corr, |
227 | | - total, |
228 | | - corr / total * 100, |
229 | | - partially_corr / total * 100, |
230 | | - corr_format / total * 100, |
231 | | - ) |
| 24 | +max_logging.warning(f"'{OLD_MODULE_PATH}' is deprecated; use '{NEW_MODULE_PATH}' instead.\n") |
| 25 | +_new_module = importlib.import_module(NEW_MODULE_PATH) |
232 | 26 |
233 | | - return to_return, response_lst |
| 27 | +evaluate = _new_module.evaluate |
| 28 | +generate_responses = _new_module.generate_responses |
| 29 | +score_responses = _new_module.score_responses |
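
Since the replacement file is a pure re-export shim, existing call sites keep working and simply log the deprecation warning. A minimal usage sketch under that assumption (the script below is illustrative, not part of the commit):

```python
# Hypothetical downstream script: with the shim in place, both import paths
# resolve to the same function object, because the shim re-exports rather
# than copies. Importing the old path logs the deprecation warning once,
# at first import (the module is cached afterwards).
from MaxText.rl.evaluate_rl import evaluate as evaluate_old
from maxtext.trainers.post_train.rl.evaluate_rl import evaluate as evaluate_new

assert evaluate_old is evaluate_new
```

Migrating a caller is then a one-line import change, with identical behavior either way.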
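A design note for comparison: the shim imports eagerly at module load, so the warning fires as soon as anything touches the old path. An alternative (not what this commit does) is a lazy shim via PEP 562 module-level `__getattr__`, which defers both the import and the warning until an attribute is actually looked up:

```python
# Lazy-shim sketch (PEP 562); NOT the commit's approach. Python calls a
# module-level __getattr__ only when normal attribute lookup fails, so the
# new module is imported on first access (and cached in sys.modules).
import importlib

_NEW_MODULE_PATH = "maxtext.trainers.post_train.rl.evaluate_rl"

def __getattr__(name):
  # Resolve evaluate / generate_responses / score_responses on demand;
  # getattr raises AttributeError for names the new module lacks.
  return getattr(importlib.import_module(_NEW_MODULE_PATH), name)
```

The eager version in the diff is the simpler choice here: only three names are re-exported, and the deprecation warning is guaranteed to appear at import time.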