[XPU] glm-4.5-air (#7071)

zhupengyang · web-flow · commit 27b00cf385e5 · 2026-04-14T11:31:49.000+08:00
diff --git a/custom_ops/xpu_ops/download_dependencies.sh b/custom_ops/xpu_ops/download_dependencies.sh
@@ -15,7 +15,7 @@ if [ "$1" == "stable" ]; then
     version_xvllm="20251017"
     version_xtdk="3.4.0.1"
 else
-    version_xvllm="20260407"
+    version_xvllm="latest"
     version_xtdk="3.6.2.1"
 fi
 
diff --git a/custom_ops/xpu_ops/src/ops/block_attn.cc b/custom_ops/xpu_ops/src/ops/block_attn.cc
@@ -156,7 +156,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
     rope_head_dim = rotary_embs.dims()[4];
   }
   std::string pos_emb_type;
-  if (use_neox_rotary_style == true) {
+  if (use_neox_rotary_style) {
     pos_emb_type = "NEOX";
   } else if (rope_head_dim == head_dim / 2) {
     pos_emb_type = "HALF_HEAD_DIM";
@@ -342,12 +342,14 @@ std::vector<paddle::Tensor> BlockAttnKernel(
                   value_cache.data<cdata_t>())),
               vsl.usual_lod_vp,     // seq_lod
               vsl.slot_mapping_vp,  // real_batch
+              prefix_lens_vp,       // start_tokens
               param.batch_size,     // batch_size
               1,                    // emb_batch_size
               rope_max_seqlen,      // max_seqlen
               param.head_num,
               param.kv_head_num,
               param.head_dim,
+              rope_head_dim,
               param.max_batch_size,
               block_size,
               max_block_per_seq,
@@ -586,7 +588,8 @@ std::vector<paddle::Tensor> BlockAttnKernel(
         ret = infer_ops::
             split_neox_cache_kv_encoder<XPU_XType, float, XPU_CType, int>(
                 xpu_ctx->x_context(),
-                reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()),  // qkv
+                reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()) +
+                    total_enc_len * qkv_shape[qkv_shape.size() - 1],  // qkv
                 reinterpret_cast<const float*>(
                     rotary_embs.data<float>()),  // rotary_pos_emb
                 reinterpret_cast<const int*>(
@@ -598,14 +601,16 @@ std::vector<paddle::Tensor> BlockAttnKernel(
                     key_cache.data<cdata_t>())),
                 const_cast<XPU_CType*>(reinterpret_cast<const XPU_CType*>(
                     value_cache.data<cdata_t>())),
-                decoder_seq_lod_vp,    // seq_lod
-                decoder_batch_map_vp,  // real_batch
-                param.batch_size,      // batch_size
-                1,                     // emb_batch_size
-                rope_max_seqlen,       // max_seqlen
+                decoder_seq_lod_vp,            // seq_lod
+                decoder_batch_map_vp,          // real_batch
+                decoder_context_len_cache_vp,  // start_tokens
+                param.batch_size,              // batch_size
+                1,                             // emb_batch_size
+                rope_max_seqlen,               // max_seqlen
                 param.head_num,
                 param.kv_head_num,
                 param.head_dim,
+                rope_head_dim,
                 param.max_batch_size,
                 block_size,
                 max_block_per_seq,
@@ -806,6 +811,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
             param.head_num,
             param.kv_head_num,
             param.head_dim,
+            rope_head_dim,
             param.max_batch_size,
             block_size,
             max_block_per_seq,
diff --git a/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc b/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc
@@ -76,19 +76,19 @@ std::vector<std::vector<int64_t>> FusedNoAuxTcInferShape(
     const float routed_scaling_factor) {
   std::vector<int64_t> topk_ids_shape = {gating_logits_shape[0], top_k};
   std::vector<int64_t> topk_weights_shape = {gating_logits_shape[0], top_k};
-  return {gating_logits_shape, topk_ids_shape, topk_weights_shape};
+  return {gating_logits_shape, topk_weights_shape, topk_ids_shape};
 }
 
 std::vector<paddle::DataType> FusedNoAuxTcInferDtype(
     const paddle::DataType& gating_logits_dtype,
     const paddle::DataType& bias_dtype) {
   return {
-      gating_logits_dtype, paddle::DataType::INT64, paddle::DataType::FLOAT32};
+      gating_logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32};
 }
 
 PD_BUILD_STATIC_OP(fused_noaux_tc)
     .Inputs({"gating_logits", "bias"})
-    .Outputs({"gating_logits_out", "topk_ids", "topk_weights"})
+    .Outputs({"gating_logits_out", "topk_weights", "topk_ids"})
     .Attrs({"n_group: int",
             "topk_group: int",
             "top_k: int",
diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -313,7 +313,7 @@ def apply_tp(
         """
         gate_out = gate(x.cast("float32"))
         if layer.topk_method == "noaux_tc":
-            _, topk_idx, topk_weights = get_moe_scores(
+            _, topk_weights, topk_idx = get_moe_scores(
                 gate_out,
                 layer.n_group,
                 layer.topk_group,
diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
@@ -61,7 +61,8 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         )
 
         if self.model_format == "torch" and "output_dim" in extra_weight_attrs:
-            extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
+            if extra_weight_attrs["output_dim"] is not None:
+                extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
 
         set_weight_attrs(
             layer.weight,
diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -136,6 +136,7 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
             logger.warning(f"Failed to parse quantization config normally ({e}), trying fallback")
             quant_config_name = args.quantization["quantization"]
             quantization_config["quantization"] = quant_config_name
+        model_config.quantization_config = quantization_config
         # Special handling for Ernie models
         if quant_config_name == "wint4" and is_ernie:
             quantization_config["dense_quant_type"] = "wint8"
diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py
@@ -44,7 +44,7 @@ def __call__(self, position_ids):
         inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
         partial_rotary_position_ids = position_ids / self.partial_rotary_factor
         freqs = paddle.einsum("ij,k->ijk", partial_rotary_position_ids.cast("float32"), inv_freq)
-        if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"):
+        if current_platform.is_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"):
             # shape: [B, S, D]
             rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32")
             emb = paddle.stack([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim))
@@ -95,9 +95,14 @@ def __call__(self, position_ids):
         else:
             inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
             freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
-        # shape: [B, S, D/2]
-        rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32")
-        emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2))
+        if current_platform.is_xpu():
+            # shape: [B, S, D]
+            rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32")
+            emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim))
+        else:
+            # shape: [B, S, D/2]
+            rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32")
+            emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2))
         # shape: [B, S, 1, D]
         emb = paddle.unsqueeze(emb, 2)
         rot_emb[0] = paddle.cos(emb)
diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py
@@ -73,7 +73,7 @@ def __init__(
                 fd_config=fd_config,
                 prefix=f"{prefix}.up_gate_proj",
                 input_size=fd_config.model_config.hidden_size,
-                output_size=[intermediate_size, intermediate_size],
+                output_sizes=[intermediate_size, intermediate_size],
                 with_bias=False,
             )
 
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
@@ -990,6 +990,7 @@ def _init_share_inputs(self, max_num_seqs: int):
                 position_ids=tmp_position_ids,
                 base=self.model_config.rope_theta,
                 model_config=self.model_config,
+                partial_rotary_factor=self.model_config.partial_rotary_factor,
             )
 
         # Set block tables

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,8 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):`
`61`	`61`	`)`
`62`	`62`
`63`	`63`	`if self.model_format == "torch" and "output_dim" in extra_weight_attrs:`
`64`		`- extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]`
	`64`	`+ if extra_weight_attrs["output_dim"] is not None:`
	`65`	`+ extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]`
`65`	`66`
`66`	`67`	`set_weight_attrs(`
`67`	`68`	`layer.weight,`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ def __init__(`
`73`	`73`	`fd_config=fd_config,`
`74`	`74`	`prefix=f"{prefix}.up_gate_proj",`
`75`	`75`	`input_size=fd_config.model_config.hidden_size,`
`76`		`- output_size=[intermediate_size, intermediate_size],`
	`76`	`+ output_sizes=[intermediate_size, intermediate_size],`
`77`	`77`	`with_bias=False,`
`78`	`78`	`)`
`79`	`79`
Original file line number	Diff line number	Diff line change
`@@ -990,6 +990,7 @@ def _init_share_inputs(self, max_num_seqs: int):`
`990`	`990`	`position_ids=tmp_position_ids,`
`991`	`991`	`base=self.model_config.rope_theta,`
`992`	`992`	`model_config=self.model_config,`
	`993`	`+ partial_rotary_factor=self.model_config.partial_rotary_factor,`
`993`	`994`	`)`
`994`	`995`
`995`	`996`	`# Set block tables`