Skip to content

Commit 0a5cce3

Browse files
Merge pull request #3059 from AI-Hypercomputer:anisha-split-openmath
PiperOrigin-RevId: 874640099
2 parents e228e8a + bc9b464 commit 0a5cce3

5 files changed

Lines changed: 296 additions & 49 deletions

File tree

dependencies/dockerfiles/maxtext_post_training_dependencies.Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ RUN pip install vllm-tpu
3535

3636
RUN pip install --no-deps qwix==0.1.4
3737

38+
RUN pip install math-verify==0.9.0
39+
3840
RUN if [ "$MODE" = "post-training-experimental" ]; then \
3941
pip uninstall -y jax jaxlib libtpu && \
4042
pip install --pre -U jax jaxlib -i https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ && \

dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ RUN pip install -e /tpu-inference --no-cache-dir
4040

4141
RUN pip install --no-deps qwix==0.1.4
4242

43+
RUN pip install math-verify==0.9.0
44+
4345
RUN if [ "$MODE" = "post-training-experimental" ]; then \
4446
echo "MODE=post-training-experimental: Re-installing JAX/libtpu"; \
4547
pip uninstall -y jax jaxlib libtpu && \

src/maxtext/trainers/post_train/rl/evaluate_rl.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""
1717
RL Evaluation Module.
1818
"""
19+
from math_verify import parse
1920
from tqdm.auto import tqdm
2021
from tunix.rl.rollout.base_rollout import RolloutConfig
2122

@@ -99,7 +100,6 @@ def score_responses(tmvp_config, question, responses, answer):
99100
Tuple of (is_correct, is_partially_correct, has_correct_format)
100101
"""
101102
match_format = utils_rl.get_match_format_regex(tmvp_config)
102-
match_numbers = utils_rl.get_match_numbers_regex(tmvp_config)
103103

104104
if tmvp_config.debug.rl:
105105
max_logging.log("========================================")
@@ -114,22 +114,32 @@ def score_responses(tmvp_config, question, responses, answer):
114114

115115
for response in responses:
116116
# Extract numerical response
117-
extracted_response = guess.group(1) if (guess := match_numbers.search(response)) is not None else "-1000000"
118-
117+
extracted_response = guess.group(1) if (guess := match_format.search(response)) is not None else "-1000000"
119118
if tmvp_config.debug.rl:
120119
max_logging.log(f"Evaluation extracted_response: {extracted_response}")
121120

122121
# Check exact correctness
123122
try:
124-
# Remove ',' and '$' then convert to float
125-
val_extracted = float(extracted_response.replace(",", "").replace("$", "").strip())
126-
val_answer = float(answer.replace(",", "").replace("$", "").strip())
127-
is_correct = val_extracted == val_answer
128-
129-
# Check partial correctness (within 10%)
130-
ratio = val_extracted / val_answer
131-
if 0.9 <= ratio <= 1.1:
132-
is_partially_correct = True
123+
# Fix LaTeX escaping issues for both ground truth and extracted answer
124+
norm_answer = utils_rl.fix_latex_escaping(answer)
125+
norm_extracted = utils_rl.fix_latex_escaping(extracted_response)
126+
# Normalize for certain datasets and parse
127+
if "DAPO" in tmvp_config.dataset_name or "OpenMathInstruct" in tmvp_config.dataset_name:
128+
norm_extracted = utils_rl.normalize_final_answer(norm_extracted).strip()
129+
norm_answer = utils_rl.normalize_final_answer(answer).strip()
130+
is_correct = utils_rl.math_verify_func([utils_rl.boxed(norm_answer)], [utils_rl.boxed(norm_extracted)])[0] > 0.1
131+
if tmvp_config.debug.rl:
132+
# is_correct is a tuple; a first value of 1.0 means a match,
133+
# 0.0 means a mismatch. e.g. (0.0, (['3', '3'], ['3/5', '\\frac{3}{5}']))
134+
max_logging.log(f"Result is_correct: {is_correct}")
135+
136+
val_extracted = parse(utils_rl.boxed(norm_extracted))
137+
val_answer = parse(utils_rl.boxed(norm_answer))
138+
139+
# Check partial correctness if values can be extracted (within 10%)
140+
if val_extracted and val_answer:
141+
ratio = (val_extracted[0] + utils_rl.EPSILON) / (val_answer[0] + utils_rl.EPSILON)
142+
is_partially_correct = 0.9 <= ratio <= 1.1
133143

134144
except Exception as e:
135145
if tmvp_config.debug.rl:

src/maxtext/trainers/post_train/rl/train_rl.py

Lines changed: 90 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -304,14 +304,82 @@ def rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices):
304304
model_tokenizer = AutoTokenizer.from_pretrained(trainer_config.tokenizer_path)
305305

306306
# Load datasets
307-
train_dataset = get_dataset(
308-
model_tokenizer,
309-
trainer_config,
310-
train_data_dir,
311-
trainer_config.train_split,
312-
data_files=trainer_config.hf_train_files,
313-
dataset_name=trainer_config.dataset_name,
314-
)
307+
if trainer_config.dataset_name == "huggingface:nvidia/OpenMathInstruct-2":
308+
import datasets # pylint: disable=import-outside-toplevel
309+
310+
def prepare_openinstructmath2_dataset(
311+
split: str = "train_1M",
312+
seed: int = 42,
313+
test_size: float = 0.05,
314+
output_key: str = "expected_answer",
315+
):
316+
"""Load and split the OpenMathInstruct-2 dataset into train and validation sets using HF's train_test_split."""
317+
max_logging.log(
318+
"WARNING: For reproducible experiments, preprocess the dataset once and "
319+
"define your own HfDataset subclass that directly uses the preprocessed datasets."
320+
)
321+
322+
# Load the original dataset
323+
original_ds = datasets.load_dataset(
324+
"parquet",
325+
data_files={trainer_config.train_split: trainer_config.hf_train_files},
326+
split=split,
327+
cache_dir=train_data_dir,
328+
)
329+
330+
# Split into train and validation sets using HF's train_test_split
331+
split_ds = original_ds.train_test_split(test_size=test_size, seed=seed)
332+
333+
return {
334+
"train": split_ds["train"],
335+
"validation": split_ds["test"],
336+
}
337+
338+
split_name = trainer_config.train_split if trainer_config.train_split != "train" else "train_1M"
339+
splits = prepare_openinstructmath2_dataset(split=split_name)
340+
template_config = load_template_from_file(trainer_config.chat_template_path)
341+
342+
train_dataset = (
343+
grain.MapDataset.source(splits["train"])
344+
.shuffle(seed=trainer_config.data_shuffle_seed)
345+
.map(
346+
lambda x: utils_rl.process_data(
347+
trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x
348+
)
349+
)
350+
)
351+
352+
test_dataset = (
353+
grain.MapDataset.source(splits["validation"])
354+
.shuffle(seed=trainer_config.data_shuffle_seed)
355+
.map(
356+
lambda x: utils_rl.process_data(
357+
trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x
358+
)
359+
)
360+
)
361+
else:
362+
train_dataset = get_dataset(
363+
model_tokenizer,
364+
trainer_config,
365+
train_data_dir,
366+
trainer_config.train_split,
367+
data_files=trainer_config.hf_train_files,
368+
dataset_name=trainer_config.dataset_name,
369+
)
370+
371+
eval_dataset_name = getattr(trainer_config, "eval_dataset_name", None)
372+
if not eval_dataset_name:
373+
eval_dataset_name = trainer_config.dataset_name
374+
375+
test_dataset = get_dataset(
376+
model_tokenizer,
377+
trainer_config,
378+
test_data_dir,
379+
trainer_config.eval_split,
380+
data_files=trainer_config.hf_eval_files,
381+
dataset_name=eval_dataset_name,
382+
)
315383

316384
def _filter_long_prompts(x):
317385
tokens = model_tokenizer.tokenize(x["prompts"])
@@ -324,24 +392,24 @@ def _filter_long_prompts(x):
324392

325393
train_dataset = train_dataset.to_iter_dataset().batch(trainer_config.batch_size)
326394

327-
eval_dataset_name = getattr(trainer_config, "eval_dataset_name", None)
328-
if not eval_dataset_name:
329-
eval_dataset_name = trainer_config.dataset_name
330-
331-
test_dataset = get_dataset(
332-
model_tokenizer,
333-
trainer_config,
334-
test_data_dir,
335-
trainer_config.eval_split,
336-
data_files=trainer_config.hf_eval_files,
337-
dataset_name=eval_dataset_name,
338-
)
339-
340395
test_dataset = test_dataset.filter(_filter_long_prompts)
341396
test_dataset = test_dataset[: trainer_config.num_test_batches * trainer_config.batch_size]
342397

343398
test_dataset = test_dataset.to_iter_dataset().batch(trainer_config.batch_size)
344399

400+
if trainer_config.debug.rl:
401+
# Let's see what one batch of the dataset looks like!
402+
if trainer_config.debug.rl:
403+
for i, ele in enumerate(train_dataset):
404+
if i >= 5:
405+
break
406+
pprint(ele)
407+
if trainer_config.debug.rl:
408+
for i, ele in enumerate(test_dataset):
409+
if i >= 5:
410+
break
411+
pprint(ele)
412+
345413
# Load reference model
346414
max_logging.log("Creating reference model and also meshes for reference and rollout")
347415
reference_model, reference_mesh = get_maxtext_model(trainer_config, trainer_devices)
@@ -499,7 +567,7 @@ def _filter_long_prompts(x):
499567
"enable_tunix_perf_metrics is True but tunix.perf modules are not available, skipping Tunix-managed metrics."
500568
)
501569

502-
vllm_config_path = os.path.join(MAXTEXT_CONFIGS_DIR, "inference", "vllm.yml")
570+
vllm_config_path = epath.Path(MAXTEXT_CONFIGS_DIR) / "inference/vllm.yml"
503571
argv_list = ["", str(vllm_config_path), "log_config=False"]
504572
vllm_config = pyconfig.initialize(argv_list)
505573

0 commit comments

Comments
 (0)