[Speculate Decoding] Fix bug of reasoning_phase_token_constraint kernel (#7349)

lonelygsh · guanshihui] · web-flow · commit e0a1653b26f6 · 2026-04-14T20:57:11.000+08:00
Co-authored-by: guanshihui] <guanshihui@baidu.com>
diff --git a/custom_ops/gpu_ops/reasoning_phase_token_constraint.cu b/custom_ops/gpu_ops/reasoning_phase_token_constraint.cu
@@ -38,7 +38,7 @@
 //         - In MTP mode, accept_num must be 1 in verify kernel
 //
 //         Transition condition (x = 1 -> x = 2):
-//         - step_idx >= 3
+//         - step_idx >= 4
 //         - pre_ids[-4:] exactly match:
 //               "\n</think>\n\n"
 //
@@ -83,10 +83,10 @@ __global__ void update_reasoning_status_kernel(
   int64_t cur_step = step_idx[tid];
   const int64_t* pre_ids_now =
       token_ids_all + tid * max_seq_len + prompt_lens[tid];
-  int64_t t0 = (cur_step >= 0) ? pre_ids_now[cur_step] : -1;
-  int64_t t1 = (cur_step >= 1) ? pre_ids_now[cur_step - 1] : -1;
-  int64_t t2 = (cur_step >= 2) ? pre_ids_now[cur_step - 2] : -1;
-  int64_t t3 = (cur_step >= 3) ? pre_ids_now[cur_step - 3] : -1;
+  int64_t t0 = (cur_step >= 1) ? pre_ids_now[cur_step - 1] : -1;
+  int64_t t1 = (cur_step >= 2) ? pre_ids_now[cur_step - 2] : -1;
+  int64_t t2 = (cur_step >= 3) ? pre_ids_now[cur_step - 3] : -1;
+  int64_t t3 = (cur_step >= 4) ? pre_ids_now[cur_step - 4] : -1;
 
   int32_t new_status = status;
 
@@ -104,7 +104,7 @@ __global__ void update_reasoning_status_kernel(
   // x = 1 -> x = 2 (include think_end_id)
   // or x = 1 -> x = 3 (not include think_end_id)
   // Here must be serial judge
-  if (new_status == 1 && cur_step >= 3) {
+  if (new_status == 1 && cur_step >= 4) {
     if (t3 == line_break_id && t2 == think_end_id && t1 == line_break_id &&
         t0 == line_break_id) {
       new_status = 2;
diff --git a/tests/operators/test_reasoning_phase_token_constraint.py b/tests/operators/test_reasoning_phase_token_constraint.py
@@ -37,22 +37,24 @@ def setUp(self):
         # token_ids_all
         #
         # batch 0:
-        #   ... \n <think_end> \n \n   → status 1 -> 2
+        #   step_idx=4, pre_ids_now[0..3]
+        #   pattern: \n <think_end> \n \n  → status 1 -> 2
+        #   t3=pre_ids_now[0]=\n, t2=pre_ids_now[1]=<think_end>, t1=pre_ids_now[2]=\n, t0=pre_ids_now[3]=\n
         #
         # batch 1:
-        #   contains think_end, but pattern not complete → status 0 -> 1
+        #   contains think_end at pre_ids_now[2], but pattern not complete → status 0 -> 1
         # ------------------------
         token_ids_all = np.zeros((self.bs, self.max_seq_len), dtype=np.int64)
         self.prompt_lens = paddle.zeros([self.bs, 1], dtype="int64")
 
-        # batch 0
-        token_ids_all[0, 1] = self.line_break_id
-        token_ids_all[0, 2] = self.think_end_id
+        # batch 0: pattern \n <think_end> \n \n at pre_ids_now[0..3]
+        token_ids_all[0, 0] = self.line_break_id
+        token_ids_all[0, 1] = self.think_end_id
+        token_ids_all[0, 2] = self.line_break_id
         token_ids_all[0, 3] = self.line_break_id
-        token_ids_all[0, 4] = self.line_break_id
 
-        # batch 1
-        token_ids_all[1, 3] = self.think_end_id
+        # batch 1: think_end at pre_ids_now[2]
+        token_ids_all[1, 2] = self.think_end_id
 
         self.token_ids_all = paddle.to_tensor(token_ids_all, dtype="int64")
         self.prompt_lens = paddle.zeros([self.bs, 1], dtype="int64")
@@ -167,11 +169,13 @@ def test_status_0_to_1_only(self):
 
         # ------------------------
         # setup: only think_end appears
+        # step_idx=4, pre_ids_now[0..3]
+        # think_end at pre_ids_now[2] (cur_step - 2 = 4 - 2 = 2)
         # ------------------------
         token_ids_all = np.zeros((self.bs, self.max_seq_len), dtype=np.int64)
 
-        # batch 0: think_end at cur_step - 1
-        token_ids_all[0, 3] = self.think_end_id
+        # batch 0: think_end at pre_ids_now[2]
+        token_ids_all[0, 2] = self.think_end_id
 
         # batch 1: no think_end
         token_ids_all[1, :] = 0
@@ -424,13 +428,15 @@ def test_perf_bsz128_vocab100k_status2(self):
 
         # ------------------------
         # token_ids_all: force 1 -> 2 pattern
+        # step_idx=4, pre_ids_now[0..3]
+        # pattern: t3=pre_ids_now[0]=\n, t2=pre_ids_now[1]=<think_end>, t1=pre_ids_now[2]=\n, t0=pre_ids_now[3]=\n
         # ------------------------
         token_ids_all = np.zeros((bs, max_seq_len), dtype=np.int64)
         for i in range(bs):
-            token_ids_all[i, 1] = line_break_id
-            token_ids_all[i, 2] = think_end_id
+            token_ids_all[i, 0] = line_break_id
+            token_ids_all[i, 1] = think_end_id
+            token_ids_all[i, 2] = line_break_id
             token_ids_all[i, 3] = line_break_id
-            token_ids_all[i, 4] = line_break_id
 
         token_ids_all = paddle.to_tensor(token_ids_all, dtype="int64")