Merge pull request #3349 from AI-Hypercomputer:hengtaoguo-test

Google-ML-Automation · Google-ML-Automation · commit 76d9f945ccdf · 2026-03-09T12:29:51.000-07:00
PiperOrigin-RevId: 880983467
diff --git a/dependencies/requirements/base_requirements/requirements.txt b/dependencies/requirements/base_requirements/requirements.txt
@@ -17,6 +17,7 @@ jax
 jaxlib
 jaxtyping
 jsonlines
+math-verify
 ml-collections
 ml-goodput-measurement
 numpy
diff --git a/dependencies/requirements/generated_requirements/tpu-requirements.txt b/dependencies/requirements/generated_requirements/tpu-requirements.txt
@@ -120,6 +120,7 @@ lxml>=6.0.2
 markdown-it-py>=4.0.0
 markdown>=3.10
 markupsafe>=3.0.3
+math-verify>=0.9.0
 matplotlib>=3.10.7
 mccabe>=0.7.0
 mdurl>=0.1.2
diff --git a/src/maxtext/trainers/post_train/rl/evaluate_rl.py b/src/maxtext/trainers/post_train/rl/evaluate_rl.py
@@ -100,6 +100,7 @@ def score_responses(tmvp_config, question, responses, answer):
       Tuple of (is_correct, is_partially_correct, has_correct_format)
   """
   match_format = utils_rl.get_match_format_regex(tmvp_config)
+  answer_fallback = utils_rl.get_answer_fallback_regex(tmvp_config)
 
   if tmvp_config.debug.rl:
     max_logging.log("========================================")
@@ -113,10 +114,19 @@ def score_responses(tmvp_config, question, responses, answer):
   has_correct_format = False
 
   for response in responses:
-    # Extract numerical response
-    extracted_response = guess.group(1) if (guess := match_format.search(response)) is not None else "-1000000"
+    # Extract answer: prefer the full format match; fall back to the last
+    # <answer>...</answer> tag if full format match is not found, so result
+    # scoring is decoupled from format.
+    full_match = match_format.search(response)
+    if full_match is not None:
+      extracted_response = full_match.group(1)
+    else:
+      # Find the *last* occurrence of the answer tag (most likely the final answer).
+      fallback_matches = answer_fallback.findall(response)
+      extracted_response = fallback_matches[-1].strip() if fallback_matches else "-1000000"
     if tmvp_config.debug.rl:
-      max_logging.log(f"Evaluation extracted_response: {extracted_response}")
+      used = "full format" if full_match is not None else "answer-tag fallback"
+      max_logging.log(f"Evaluation extracted_response ({used}): {extracted_response}")
 
     # Check exact correctness
     try:
@@ -146,8 +156,8 @@ def score_responses(tmvp_config, question, responses, answer):
         max_logging.log(f"Evaluation Exception: {e}")
         max_logging.log("SKIPPED")
 
-    # Check format correctness
-    if match_format.search(response) is not None:
+    # Check format correctness (requires the full <reasoning>...</reasoning><answer>...</answer> structure)
+    if full_match is not None:
       has_correct_format = True
 
     # Early exit if all criteria are met
diff --git a/src/maxtext/trainers/post_train/rl/utils_rl.py b/src/maxtext/trainers/post_train/rl/utils_rl.py
@@ -118,6 +118,19 @@ def get_match_format_regex(tmvp_config):
   return match_format
 
 
+def get_answer_fallback_regex(tmvp_config):
+  """Returns a compiled regex matching each answer tag pair; callers take the last match.
+
+  Used as a fallback when the full <reasoning>...</reasoning><answer>...</answer>
+  format is incomplete (e.g. missing the closing reasoning tag), so the result
+  reward can still be computed independently of the format reward.
+  """
+  return re.compile(
+      rf"{re.escape(tmvp_config.solution_start_token)}(.+?){re.escape(tmvp_config.solution_end_token)}",
+      flags=re.MULTILINE | re.DOTALL,
+  )
+
+
 def match_format_exactly(prompts, completions, tmvp_config, **kargs):
   """
   Give the model a reward of tmvp_config.reward_exact_format_match points if the format matches exactly.
diff --git a/tests/unit/rl_utils_test.py b/tests/unit/rl_utils_test.py
@@ -0,0 +1,106 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for RL result parsing and reward scoring (CPU-only)."""
+
+import unittest
+import pytest
+from types import SimpleNamespace
+
+evaluate_rl = pytest.importorskip(
+    "maxtext.trainers.post_train.rl.evaluate_rl",
+    reason="tunix (required by evaluate_rl) is not installed GPU",
+)
+
+
+def _make_config():
+  """Create a minimal config object with the parameters required by score_responses."""
+  return SimpleNamespace(
+      reasoning_start_token="<reasoning>",
+      reasoning_end_token="</reasoning>",
+      solution_start_token="<answer>",
+      solution_end_token="</answer>",
+      reward_exact_format_match=2.0,
+      reward_partial_format_match=0.5,
+      reward_white_space_format_match=1.5,
+      reward_ratio_guess_to_answer_high=1.0,
+      reward_ratio_guess_to_answer_low=0.5,
+      penalty_incorrect_format=-0.5,
+      penalty_incorrect_answer=-0.5,
+      dataset_name="test",
+      debug=SimpleNamespace(rl=False),
+  )
+
+
+class TestScoreResponses(unittest.TestCase):
+  """Tests for evaluate_rl.score_responses parsing and correctness logic."""
+
+  def setUp(self):
+    self.config = _make_config()
+
+  @pytest.mark.cpu_only
+  def test_nested_tags(self):
+    """Response with nested reasoning tags still extracts the correct answer."""
+    is_correct, is_partially_correct, has_correct_format = evaluate_rl.score_responses(
+        tmvp_config=self.config,
+        question="What is 11/3?",
+        responses=[
+            "<reasoning>Need to use <reasoning> and </reasoning>, "
+            "<answer> and </answer></reasoning><answer>11/3</answer>"
+        ],
+        answer="11/3",
+    )
+    self.assertTrue(is_correct)
+    self.assertTrue(is_partially_correct)
+    self.assertTrue(has_correct_format)
+
+  @pytest.mark.cpu_only
+  def test_with_extra_ending_tags(self):
+    """Answer with extra ending tags such as <end_of_turn>."""
+    is_correct, is_partially_correct, has_correct_format = evaluate_rl.score_responses(
+        tmvp_config=self.config,
+        question=(
+            "James buys a new wardrobe.  He buys 10 suits and 10 dress pants.  "
+            "He also buys 3 dress shirts per suit.  The suits cost $750 each and "
+            "the dress pants cost 1/5 that cost.  The dress shirts were $60 each.  "
+            "How much did everything cost?"
+        ),
+        responses=[
+            "<reasoning>This is the sum of the cost of the suits, the pants, and the "
+            "shirts: $7500 + $1500 + $1800 = $10800.\n\n</reasoning>\n"
+            "<answer>10800</answer><end_of_turn>"
+        ],
+        answer="10,800",
+    )
+    self.assertTrue(is_correct)
+    self.assertTrue(is_partially_correct)
+    self.assertTrue(has_correct_format)
+
+  @pytest.mark.cpu_only
+  def test_with_incomplete_reasoning_tags(self):
+    """(1) Incomplete reasoning tags still extract the correct answer;
+    (2) currency symbols work with math_verify."""
+    is_correct, is_partially_correct, has_correct_format = evaluate_rl.score_responses(
+        tmvp_config=self.config,
+        question="What is the price of the item?",
+        responses=["<reasoning>The item costs $16.<answer>$16</answer>"],
+        answer="16",
+    )
+    self.assertTrue(is_correct)
+    self.assertTrue(is_partially_correct)
+    self.assertFalse(has_correct_format)
+
+
+if __name__ == "__main__":
+  unittest.main()