Skip to content

Commit de0c5e6

Browse files
authored
[XPU] Split the block_attn operator into smaller operators (#6798)
* split block_attn into smaller operators
* adapt to latest vLLM
* fix unit tests
* delete MTP + CUDA-graph 4-card test
* fix VL model
* fix MTP
* fix slot mapping
1 parent 6b891da commit de0c5e6

12 files changed

Lines changed: 2891 additions & 131 deletions

File tree

custom_ops/xpu_ops/src/ops/block_attn.cc

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
159159
if (use_neox_rotary_style) {
160160
pos_emb_type = "NEOX";
161161
} else if (rope_head_dim == head_dim / 2) {
162+
// vl model use this
162163
pos_emb_type = "HALF_HEAD_DIM";
163164
} else {
164165
pos_emb_type = "NORMAL";
@@ -984,7 +985,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
984985
return {block_attn_out};
985986
}
986987

987-
std::vector<paddle::Tensor> BlockAttn(
988+
std::vector<paddle::Tensor> BlockAttnFused(
988989
const paddle::Tensor& qkv,
989990
const paddle::Tensor& key_cache,
990991
const paddle::Tensor& value_cache,
@@ -1008,6 +1009,8 @@ std::vector<paddle::Tensor> BlockAttn(
10081009
const paddle::Tensor& decoder_context_len_cache,
10091010
const paddle::Tensor& decoder_batch_map,
10101011
const paddle::Tensor& prefix_len,
1012+
const paddle::Tensor& slot_mapping_enc,
1013+
const paddle::Tensor& slot_mapping_dec,
10111014
const paddle::optional<paddle::Tensor>& k_scales,
10121015
const paddle::optional<paddle::Tensor>& v_scales,
10131016
const paddle::optional<paddle::Tensor>& k_scales_inv,
@@ -1067,7 +1070,7 @@ std::vector<paddle::Tensor> BlockAttn(
10671070
} else if (cache_dtype == paddle::DataType::INT8) {
10681071
APPLY_KERNEL(paddle::bfloat16, int8_t, paddle::bfloat16);
10691072
} else {
1070-
PD_THROW("block_attn not support cache_dtype==%d",
1073+
PD_THROW("block_attn_fused not support cache_dtype==%d",
10711074
static_cast<int>(cache_dtype));
10721075
return {};
10731076
}
@@ -1097,7 +1100,7 @@ std::vector<paddle::DataType> BlockAttnInferDtype(
10971100
return {qkv_dtype};
10981101
}
10991102

1100-
PD_BUILD_STATIC_OP(block_attn)
1103+
PD_BUILD_STATIC_OP(block_attn_fused)
11011104
.Inputs({"qkv",
11021105
"key_cache",
11031106
"value_cache",
@@ -1121,6 +1124,8 @@ PD_BUILD_STATIC_OP(block_attn)
11211124
"decoder_context_len_cache",
11221125
"decoder_batch_map",
11231126
"prefix_len",
1127+
"slot_mapping_enc",
1128+
"slot_mapping_dec",
11241129
paddle::Optional("k_scales"),
11251130
paddle::Optional("v_scales"),
11261131
paddle::Optional("k_scales_inv"),
@@ -1135,6 +1140,6 @@ PD_BUILD_STATIC_OP(block_attn)
11351140
paddle::Optional("cachekv_signal_thread_cpu")})
11361141
.Attrs({"use_neox_rotary_style:bool", "rope_3d:bool"})
11371142
.Outputs({"block_attn_out"})
1138-
.SetKernelFn(PD_KERNEL(BlockAttn))
1143+
.SetKernelFn(PD_KERNEL(BlockAttnFused))
11391144
.SetInferShapeFn(PD_INFER_SHAPE(BlockAttnInferShape))
11401145
.SetInferDtypeFn(PD_INFER_DTYPE(BlockAttnInferDtype));

0 commit comments

Comments (0)