Skip to content

Commit 31e2a8b

Browse files
authored
[Speculative Decoding] Support mtp super ultra overlap in pd-split mode with insert_task overlap (#7323)
* support mtp overlap in pd-split mode with insert_task overlap
1 parent 5ddd1af commit 31e2a8b

6 files changed

Lines changed: 351 additions & 122 deletions

File tree

fastdeploy/eplb/async_expert_loader.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,23 @@
2424
import paddle
2525

2626
try:
27-
from cuda import cudart
28-
except ImportError:
27+
import cuda as _cuda_pkg
28+
29+
_cuda_ver = getattr(_cuda_pkg, "__version__", None)
30+
if _cuda_ver is None:
31+
# cuda-python >= 13.x has no top-level __version__; detect the version via the cuda-bindings subpackage instead
32+
import importlib.metadata as _meta
33+
34+
_cuda_ver = _meta.version("cuda-bindings")
35+
_cuda_major = int(_cuda_ver.split(".")[0])
36+
if _cuda_major >= 13:
37+
from cuda.bindings import runtime as cudart
38+
else:
39+
from cuda import cudart
40+
except Exception as _e:
41+
import warnings
42+
43+
warnings.warn(f"cuda-python import failed, async_expert_loader will be unavailable: {_e}")
2944
cudart = None
3045

3146
from fastdeploy.config import EPLBConfig
@@ -98,6 +113,7 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, epl
98113
raise ImportError(
99114
"cuda-python not installed. Install the version matching your CUDA toolkit:\n"
100115
" CUDA 12.x → pip install cuda-python==12.*\n"
116+
" CUDA 13.x → pip install cuda-python cuda-bindings\n"
101117
)
102118

103119
# Register memory with CUDA

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -116,36 +116,33 @@
116116

117117
DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"
118118

119-
if current_platform.is_cuda():
120-
121-
def async_set_value(tgt, src):
122-
if isinstance(src, (int, float, bool)):
123-
src = paddle.full(tgt.shape, fill_value=src, dtype=tgt.dtype)
124-
elif isinstance(src, (list, np.array)):
125-
dtype_str = str(tgt.dtype).split(".")[1]
126-
if isinstance(src, list):
127-
src = np.array(src, dtype=dtype_str if dtype_str != "bfloat16" else "float32")
119+
120+
def async_set_value(tgt, src):
121+
if isinstance(src, (int, float, bool)):
122+
src = paddle.full(tgt.shape, fill_value=src, dtype=tgt.dtype)
123+
elif isinstance(src, (list, np.ndarray)):
124+
dtype_str = str(tgt.dtype).split(".")[1]
125+
if isinstance(src, list):
126+
src = np.array(src, dtype=dtype_str if dtype_str != "bfloat16" else "float32")
127+
if current_platform.is_cuda():
128128
if str(src.dtype) != dtype_str:
129129
srt_tensor = paddle.empty(tgt.shape, dtype=str(src.dtype))
130130
src = custom_numpy_to_tensor(src, srt_tensor)
131131
else:
132132
return custom_numpy_to_tensor(src, tgt)
133-
elif isinstance(src, paddle.Tensor):
134-
pass
135133
else:
136-
raise ValueError("async_set_value unsupported src type: {}".format(type(src)))
137-
if src.shape != tgt.shape:
138-
src = src.reshape(tgt.shape)
139-
if src.dtype != tgt.dtype:
140-
src = src.cast(tgt.dtype)
141-
if src.place != tgt.place:
142-
src = src.to(tgt.place)
143-
tgt.copy_(src, blocking=False)
144-
145-
else:
146-
147-
def async_set_value(*args, **kwargs):
148-
raise RuntimeError("async_set_value is only available on CUDA")
134+
src = paddle.to_tensor(src, dtype=tgt.dtype)
135+
elif isinstance(src, paddle.Tensor):
136+
pass
137+
else:
138+
raise ValueError("async_set_value unsupported src type: {}".format(type(src)))
139+
if src.shape != tgt.shape:
140+
src = src.reshape(tgt.shape)
141+
if src.dtype != tgt.dtype:
142+
src = src.cast(tgt.dtype)
143+
if src.place != tgt.place:
144+
src = src.to(tgt.place)
145+
tgt.copy_(src, blocking=False)
149146

150147

151148
def pre_process(

fastdeploy/model_executor/xpu_pre_and_post_process.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,29 @@
5555
DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"
5656

5757

58+
def async_set_value(tgt, src):
59+
if isinstance(src, (int, float, bool)):
60+
src = paddle.full(tgt.shape, fill_value=src, dtype=tgt.dtype)
61+
elif isinstance(src, (list, np.ndarray)):
62+
dtype_str = str(tgt.dtype).split(".")[1]
63+
np_dtype = dtype_str if dtype_str != "bfloat16" else "float32"
64+
if isinstance(src, list):
65+
src = np.array(src, dtype=np_dtype)
66+
# TODO: support async_numpy_to_tensor
67+
src = paddle.to_tensor(src, dtype=tgt.dtype)
68+
elif isinstance(src, paddle.Tensor):
69+
pass
70+
else:
71+
raise ValueError("async_set_value unsupported src type: {}".format(type(src)))
72+
if src.shape != tgt.shape:
73+
src = src.reshape(tgt.shape)
74+
if src.dtype != tgt.dtype:
75+
src = src.cast(tgt.dtype)
76+
if src.place != tgt.place:
77+
src = src.to(tgt.place)
78+
tgt.copy_(src, blocking=False)
79+
80+
5881
def _build_stream_transfer_data(
5982
output_tokens: paddle.Tensor,
6083
pooler_outputs: List = None,

fastdeploy/spec_decode/mtp.py

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@
4949
share_external_data,
5050
update_attn_mask_offsets,
5151
)
52+
53+
# temporary solution
5254
from fastdeploy.model_executor.xpu_pre_and_post_process import (
55+
async_set_value,
5356
xpu_pre_process,
5457
xpu_process_output,
5558
)
@@ -483,28 +486,32 @@ def insert_tasks_v1(
483486
input_ids = request.prompt_token_ids + request.output_token_ids
484487

485488
self.model_inputs["input_ids_len"][idx] = length - 1
486-
self.model_inputs["pre_ids"][idx : idx + 1] = -1
489+
async_set_value(self.model_inputs["pre_ids"][idx : idx + 1], -1)
487490
self.model_inputs["input_ids"][idx : idx + 1, : length - 1] = self.target_model_inputs["input_ids"][
488491
idx : idx + 1, 1:length
489492
]
490-
self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = self.target_model_inputs[
491-
"input_ids"
492-
][idx : idx + 1, 1:length].cpu()
493+
# TODO: use token_all_ids replace with input_ids_cpu
494+
if getattr(self, "hybrid_mode", False) and "input_ids_cpu" in self.model_inputs:
495+
self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = self.target_model_inputs[
496+
"input_ids"
497+
][idx : idx + 1, 1:length].cpu()
493498
encoder_block_num = len(request.block_tables)
494-
self.model_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
495-
self.model_inputs["block_tables"][idx : idx + 1, :] = -1
496-
self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
497-
request.block_tables, dtype="int32"
499+
async_set_value(self.model_inputs["encoder_block_lens"][idx : idx + 1], encoder_block_num)
500+
async_set_value(self.model_inputs["block_tables"][idx : idx + 1, :], -1)
501+
async_set_value(
502+
self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num], request.block_tables
498503
)
499-
self.model_inputs["stop_flags"][idx : idx + 1] = False
500-
self.model_inputs["batch_drop"][idx : idx + 1] = False
501504

502-
self.model_inputs["seq_lens_encoder"][idx : idx + 1] = length
505+
async_set_value(self.model_inputs["stop_flags"][idx : idx + 1], False)
506+
async_set_value(self.model_inputs["batch_drop"][idx : idx + 1], False)
507+
508+
async_set_value(self.model_inputs["seq_lens_encoder"][idx : idx + 1], length)
503509
self.exist_prefill_flag = True
504-
self.model_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
505-
self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = length
506-
self.model_inputs["step_idx"][idx : idx + 1] = (
507-
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
510+
async_set_value(self.model_inputs["seq_lens_decoder"][idx : idx + 1], prefill_start_index)
511+
async_set_value(self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1], length)
512+
async_set_value(
513+
self.model_inputs["step_idx"][idx : idx + 1],
514+
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0,
508515
)
509516
if self.use_attn_mask_offset:
510517
inputs = request.multimodal_inputs
@@ -522,18 +529,19 @@ def insert_tasks_v1(
522529
if (
523530
self.fd_config.scheduler_config.splitwise_role == "decode"
524531
): # In PD, we continue to decode after P generates first token
525-
self.model_inputs["seq_lens_encoder"][idx : idx + 1] = 0
532+
async_set_value(self.model_inputs["seq_lens_encoder"][idx : idx + 1], 0)
526533
self.exist_prefill_flag = False
527-
self.model_inputs["recompute_token_num"][idx : idx + 1] = 0
528-
self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = length + 1
534+
async_set_value(self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1], length + 1)
529535
# NOTE(liuzichang):
530536
# extra 1 : P-D split need rollback one step
531-
self.model_inputs["mask_rollback"][idx : idx + 1] = 1
537+
538+
async_set_value(self.model_inputs["recompute_token_num"][idx : idx + 1], 0)
539+
async_set_value(self.model_inputs["mask_rollback"][idx : idx + 1], 1)
532540
# has_prefill_task = True
533541
elif request.task_type.value == RequestType.DECODE.value: # decode task
534542
encoder_block_num = len(request.block_tables)
535-
self.model_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
536-
self.model_inputs["block_tables"][idx : idx + 1, :] = -1
543+
async_set_value(self.model_inputs["encoder_block_lens"][idx : idx + 1], encoder_block_num)
544+
async_set_value(self.model_inputs["block_tables"][idx : idx + 1, :], -1)
537545
if current_platform.is_cuda():
538546
async_set_value(
539547
self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num], request.block_tables
@@ -542,16 +550,13 @@ def insert_tasks_v1(
542550
self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
543551
request.block_tables, dtype="int32"
544552
)
545-
# if self.model_inputs["is_block_step"][idx]: # has tasks to continue to decode
546-
# has_decode_task = True
547-
# continue
548553
else:
549-
self.model_inputs["block_tables"][idx : idx + 1, :] = -1
550-
self.model_inputs["stop_flags"][idx : idx + 1] = True
551-
self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = 0
552-
self.model_inputs["seq_lens_decoder"][idx : idx + 1] = 0
553-
self.model_inputs["seq_lens_encoder"][idx : idx + 1] = 0
554-
self.model_inputs["is_block_step"][idx : idx + 1] = False
554+
async_set_value(self.model_inputs["block_tables"][idx : idx + 1, :], -1)
555+
async_set_value(self.model_inputs["stop_flags"][idx : idx + 1], True)
556+
async_set_value(self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1], 0)
557+
async_set_value(self.model_inputs["seq_lens_decoder"][idx : idx + 1], 0)
558+
async_set_value(self.model_inputs["seq_lens_encoder"][idx : idx + 1], 0)
559+
async_set_value(self.model_inputs["is_block_step"][idx : idx + 1], False)
555560
continue
556561

557562
# TODO(liuzichang): Solve splitewise-p bug to restore
@@ -1233,6 +1238,7 @@ def _update_status(self):
12331238
)
12341239

12351240
def _extend_draft_token_with_ngram_match(self):
1241+
# TODO: replace with gpu tensor
12361242
hybrid_mtp_ngram(
12371243
self.model_inputs["input_ids_cpu"].cuda(),
12381244
self.model_inputs["input_ids_len"].cuda(),

0 commit comments

Comments
 (0)