Skip to content

Commit d0eb830

Browse files
committed
RL parsing: Greedy match outermost </reasoning> tag
Remove stop strings
1 parent 7c69cac commit d0eb830

2 files changed

Lines changed: 2 additions & 2 deletions

File tree

src/maxtext/configs/post_train/rl.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ max_num_seqs: null
152152
# If True, enables asynchronous scheduling in vLLM for faster generation
153153
async_scheduling: True
154154
# stop generation when any of these strings is generated
155-
stop_strings: [</answer>]
155+
stop_strings: null
156156

157157
# ====== Checkpoint Configuration ======
158158
enable_checkpointing: True

src/maxtext/trainers/post_train/rl/utils_rl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def get_match_format_regex(tmvp_config):
106106
match_format = re.compile(
107107
(
108108
r"^[\s]{0,}"
109-
rf"{tmvp_config.reasoning_start_token}.+?{tmvp_config.reasoning_end_token}.*?"
109+
rf"{tmvp_config.reasoning_start_token}.+{tmvp_config.reasoning_end_token}.*?"
110110
rf"{tmvp_config.solution_start_token}(.+?){tmvp_config.solution_end_token}"
111111
),
112112
flags=re.MULTILINE | re.DOTALL,

0 commit comments

Comments
 (0)