Skip to content

Commit 8de7924

Browse files
committed
add mtp attn mask and layer mask
1 parent 86ec329 commit 8de7924

8 files changed

Lines changed: 788 additions & 98 deletions

File tree

download.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
import os
import time
from aistudio_sdk.snapshot_download import snapshot_download

# SECURITY NOTE(review): a real-looking access token is hardcoded here and is now
# committed to version control — it should be rotated and read from the
# environment instead of being assigned in source.
os.environ["AISTUDIO_ACCESS_TOKEN"] = "2803c8bd3c444e6bacc7286c84acee55bc2d4cd5"
# Route outbound HTTP(S) traffic through the internal proxy.
os.environ["http_proxy"] = "http://agent.baidu.com:8188"
os.environ["https_proxy"] = "http://agent.baidu.com:8188"

# Model repo ids to download.
MODEL_LIST = [
    "PaddleFormers/tiny-random-glm4moe-bf16",
]

# Base directory under which each model snapshot is stored.
BASE_DIR = "/home/models/"

# Keyword arguments shared by every snapshot_download call.
COMMON_ARGS = {
    "revision": "master",
}
21+
def download_model(repo_id: str, max_retries: int = 3) -> bool:
    """Download *repo_id* into BASE_DIR, retrying up to *max_retries* times.

    Args:
        repo_id: Model repository id, e.g. "Org/model-name".
        max_retries: Total number of attempts before giving up.

    Returns:
        True when the snapshot download succeeds, False after all retries fail.
    """
    local_dir = os.path.join(BASE_DIR, repo_id)
    os.makedirs(local_dir, exist_ok=True)

    for attempt in range(1, max_retries + 1):
        try:
            snapshot_download(repo_id=repo_id, local_dir=local_dir, **COMMON_ARGS)
            return True
        except Exception as e:  # best-effort: report the failure and retry
            if attempt < max_retries:
                print(f" Retry {attempt}/{max_retries} for {repo_id}: {e}")
                time.sleep(5)
            else:
                print(f" Skip: {repo_id} after {max_retries} retries")
    return False
35+
36+
37+
if __name__ == "__main__":
    # Driver: fetch every configured model, announcing start and end of each.
    for repo in MODEL_LIST:
        print(repo, "download start....")
        download_model(repo)
        print(repo, "download end !")

estimation_output.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"num_train_epochs": 1, "max_steps": 10, "train_tokens": 707, "global_batch_size": 1, "gradient_accumulation_steps": 1, "warmup_steps": 1, "per_device_train_batch_size": 1, "tensor_model_parallel_size": -1, "pipeline_model_parallel_size": -1, "sharding_parallel_size": -1, "seed": 23, "num_samples_each_epoch": 6000000, "max_seq_len": 8192, "valid": true, "train_samples": 10, "estimate_samples": 10, "actual_train_samples": 10, "skip_samples": 0, "num_of_gpus": -1}

examples/config/sft/full.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ template_backend: custom
1212
template: qwen3
1313

1414
### model
15-
model_name_or_path: Qwen/Qwen3-0.6B-Base
15+
model_name_or_path: /home/models/PaddleFormers/tiny-random-glm4moe-bf16/
1616
_attn_implementation: flashmask
1717

1818
### finetuning
@@ -31,7 +31,7 @@ evaluation_strategy: steps
3131
save_steps: 100
3232
save_strategy: steps
3333
logging_steps: 1
34-
gradient_accumulation_steps: 4
34+
gradient_accumulation_steps: 1
3535
logging_dir: ./vdl_log
3636
output_dir: ./checkpoints/qwen3-sft-full
3737
disable_tqdm: true

learn.py

Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
# ===================================================================
2+
# 原始 learn.py 代码(预训练 Dataset 类方法版本,已注释)
3+
# ===================================================================
4+
5+
def sample_independent_mtp_hidden_inputs_mask_(self, mtp_hidden_inputs_mask_tmp, mtp_ids_input_ids, eos_token_id):
    """Zero the hidden-inputs mask (in place) at every EOS position of the shifted MTP sequence."""
    is_eos = mtp_ids_input_ids == eos_token_id
    mtp_hidden_inputs_mask_tmp[0][np.where(is_eos)] = 0
    return mtp_hidden_inputs_mask_tmp
11+
12+
# 数据流在mtp情况下,新增两个输入:
13+
# "mtp_startend_row_indices_all",
14+
# "mtp_hidden_inputs_mask_all",
15+
16+
def get_mtp_inputs_info(self, task_id, random_doc, ids):
    """Build the MTP inputs: (mtp_startend_row_indices_all, mtp_hidden_inputs_mask_all).

    For each of the `self.multi_token_pred_depth` MTP layers, the layer input is
    the original input sequence shifted left by (mtp_idx + 1), with the first
    (mtp_idx + 1) reserved tail tokens appended.  Per layer this produces:
      - startend row indices of shape [1, seqlen-1, 2];
      - a hidden-inputs mask of shape [1, seqlen-1].

    Args:
        task_id: task identifier; document masking only applies when task_id is 0.
        random_doc: random draw compared against self.document_mask_prob_text.
        ids: full token-id sequence; ids[:-1] is used to align with the network input.

    Returns:
        Tuple of np.ndarrays stacked along the depth dimension, or (None, None)
        when multi_token_pred_depth is 0.
    """

    mtp_startend_row_indices_all = []
    mtp_hidden_inputs_mask_all = []
    ids_input = ids[:-1]  # align with the network input (drop the final label token)
    # NOTE: built the same way as ernie5_moe.modeling_pp.MTPmmLayerPipe.forward —
    # when shorter than max_seq_len, mtp_input_id is taken from the original input_ids.
    ids_mtp = ids_input[-self.multi_token_pred_depth :]
    ids_ori = ids_input[: -self.multi_token_pred_depth]

    for mtp_idx in range(self.multi_token_pred_depth):
        # Build mtp_startend_row_indices: initialized as one global causal block
        # (end_row = seqlen-1 for every position, start_row = position index).
        mtp_startend_row_indices_tmp = np.expand_dims(
            np.stack(
                [
                    np.full((self.seqlen - 1,), self.seqlen - 1, dtype=np.int32),
                    np.arange(self.seqlen - 1, dtype=np.int32),
                ],
                axis=1,
            ),
            0,
        )  # [1, seqlen-1, 2]
        # Build mtp_hidden_inputs_mask: all positions valid by default.
        mtp_hidden_inputs_mask_tmp = np.ones([1, self.seqlen - 1], dtype=np.int32)  # [1, seqlen-1]

        # Layer input: original ids shifted left by (mtp_idx + 1) plus reserved tail tokens.
        mtp_ids_input_ids = np.concatenate([ids_ori[(mtp_idx + 1) :], ids_mtp[: (mtp_idx + 1)]])

        if (mtp_ids_input_ids.shape[0] > 0 and self.document_mask_prob_text is not None
            and task_id in [0] and random_doc < self.document_mask_prob_text):
            # Documents must not attend to each other (sample isolation).
            assert self.inbatch_sft is False, "document_mask_prob_text not support inbatch_sft"
            sample_independent_startend_row_indices_(
                mtp_startend_row_indices_tmp,
                mtp_ids_input_ids[:-1] if len(mtp_ids_input_ids) == self.seqlen else mtp_ids_input_ids,
                self.eos_token_id,
            )
            mtp_hidden_inputs_mask_tmp = self.sample_independent_mtp_hidden_inputs_mask_(
                mtp_hidden_inputs_mask_tmp,
                mtp_ids_input_ids[:-1] if len(mtp_ids_input_ids) == self.seqlen else mtp_ids_input_ids,
                self.eos_token_id,
            )
        elif mtp_ids_input_ids.shape[0] > 0 and task_id in [0] and self.inbatch_sft:
            # In-batch SFT: samples must not attend to each other either.
            assert self.document_mask_prob_text is None, f"{self.document_mask_prob_text}"
            sample_independent_startend_row_indices_(
                mtp_startend_row_indices_tmp,
                mtp_ids_input_ids[:-1] if len(mtp_ids_input_ids) == self.seqlen else mtp_ids_input_ids,
                self.eos_token_id,
            )
            mtp_hidden_inputs_mask_tmp = self.sample_independent_mtp_hidden_inputs_mask_(
                mtp_hidden_inputs_mask_tmp,
                mtp_ids_input_ids[:-1] if len(mtp_ids_input_ids) == self.seqlen else mtp_ids_input_ids,
                self.eos_token_id,
            )

        # deepcopy so later in-place edits cannot alias earlier layers.
        mtp_startend_row_indices_all.append(deepcopy(mtp_startend_row_indices_tmp))
        mtp_hidden_inputs_mask_all.append(deepcopy(mtp_hidden_inputs_mask_tmp))
    if len(mtp_startend_row_indices_all) > 0:
        mtp_startend_row_indices_all = np.concatenate(mtp_startend_row_indices_all, axis=0)
        mtp_hidden_inputs_mask_all = np.concatenate(mtp_hidden_inputs_mask_all, axis=0)
    else:
        mtp_startend_row_indices_all = None
        mtp_hidden_inputs_mask_all = None

    return mtp_startend_row_indices_all, mtp_hidden_inputs_mask_all
81+
82+
# 得到的"mtp_startend_row_indices_all", "mtp_hidden_inputs_mask_all" 也需要进行padding:
83+
def pad_mtp_hidden_inputs_mask(mtp_hidden_inputs_mask_all, max_seq_len):
    """Right-pad the MTP hidden-inputs mask to *max_seq_len* along its last axis.

    Args:
        mtp_hidden_inputs_mask_all: int array of shape [1, l]; only mtp depth 1
            was used in pretraining, hence the head == 1 assertion.
        max_seq_len: target length; must satisfy l <= max_seq_len.

    Returns:
        Array of shape [1, max_seq_len]; padding positions are filled with 1.

    Raises:
        ValueError: if the mask is already longer than max_seq_len.
    """
    head, l = mtp_hidden_inputs_mask_all.shape
    assert head == 1, "预训练只训练了mtp=1的情况"
    if l == max_seq_len:
        return mtp_hidden_inputs_mask_all
    if l < max_seq_len:
        pad_l = max_seq_len - l
        # NOTE(review): padding with ones marks padded positions as *valid* —
        # presumably downstream attention masking covers them; confirm.
        padding = np.ones((head, pad_l), dtype=mtp_hidden_inputs_mask_all.dtype)
        return np.concatenate([mtp_hidden_inputs_mask_all, padding], axis=1)
    raise ValueError(
        f"mtp_hidden_inputs_mask length {l} exceeds max_seq_len {max_seq_len}"
    )
95+
96+
97+
# ===================================================================
98+
# collate.py 对应翻译版(独立函数,无 self,对齐 collate.py 风格)
99+
# ===================================================================
100+
#
101+
# 变量对应关系:
102+
# learn.py (self.xxx) collate.py 参数
103+
# ---------------------------------------------------
104+
# self.multi_token_pred_depth <-> mtp_depth
105+
# self.seqlen - 1 <-> total_len(实际 token 数,padding 前)
106+
# max_seq_len(含 MTP 扩展) <-> max_seq_len(已 += mtp_depth)
107+
# self.eos_token_id <-> eos_token_id
108+
# self.inbatch_sft / document <-> not use_global_causal_attn
109+
# ids(单条序列) <-> batch_token_ids(多条打包序列的列表)
110+
#
111+
# 关键差异:
112+
# learn.py 通过 EOS token 检测文档边界;
113+
# collate.py 的 packing 中文档边界已由 batch_token_ids 的分组隐式给出,
114+
# use_global_causal_attn=False 时直接按分组做分块 causal mask,
115+
# 无需再扫描 EOS。gen_mtp_layer_mask 仍保留 eos_token_id 参数以兼容
116+
# 有 EOS 显式标记的场景。
117+
# ===================================================================
118+
119+
import numpy as np
120+
from typing import List
121+
122+
123+
def gen_mtp_attn_mask(
    batch_token_ids: List[List[int]],
    max_seq_len: int,
    mtp_depth: int,
    use_global_causal_attn: bool,
) -> np.ndarray:
    """Generate one dense attention mask per MTP layer.

    The input of MTP layer k is the packed sequence shifted right by (k + 1)
    steps, so every internal document boundary moves left by (k + 1) in that
    layer's view — each layer therefore gets its own block-causal mask.

    Args:
        batch_token_ids: packed sequences; document boundaries are implicit in
            the grouping (no EOS scan needed here).
        max_seq_len: padded sequence length (already includes the MTP extension).
        mtp_depth: number of MTP prediction layers.
        use_global_causal_attn: True → one global causal block;
            False → block-causal with boundaries shifted per layer.

    Returns:
        np.ndarray of shape [mtp_depth, 1, max_seq_len, max_seq_len], float32,
        matching gen_self_attn_mask's layout with dim 0 expanded to mtp_depth.
    """
    total_len = sum(map(len, batch_token_ids))

    # Internal document boundaries (exclusive); total_len itself is excluded.
    internal_boundaries = []
    pos = 0
    for ids in batch_token_ids[:-1]:
        pos += len(ids)
        internal_boundaries.append(pos)

    layers = []
    for depth in range(mtp_depth):
        layer = np.zeros((max_seq_len, max_seq_len), dtype=np.float32)

        if use_global_causal_attn:
            # Whole sequence is one causal block.
            layer[:total_len, :total_len] = np.tril(np.ones([total_len, total_len]))
        else:
            # Boundaries move left by (depth + 1); drop any that fall off the front.
            shift = depth + 1
            bounds = [b - shift for b in internal_boundaries if b - shift > 0]
            # The final block always extends to the end of the packed sequence.
            bounds.append(total_len)

            start = 0
            for end in bounds:
                size = end - start
                if size > 0:
                    # Lower-triangular causal sub-mask within each block.
                    layer[start:end, start:end] = np.tril(np.ones([size, size]))
                start = end

        layers.append(layer)

    # [mtp_depth, max_seq_len, max_seq_len] -> [mtp_depth, 1, max_seq_len, max_seq_len]
    return np.stack(layers, axis=0)[:, None, :, :]
191+
192+
193+
def gen_mtp_attn_mask_startend_row_indices(
    batch_token_ids: List[List[int]],
    max_seq_len: int,
    mtp_depth: int,
    use_global_causal_attn: bool,
) -> np.ndarray:
    """Generate one compressed (startend_row_indices) attention mask per MTP layer.

    Compressed counterpart of gen_mtp_attn_mask, aligned with
    gen_attn_mask_startend_row_indices: each position stores the exclusive
    end_row of its block, and the flash-attention kernel infers causality.
    Layer k's block boundaries equal the original boundaries minus (k + 1).

    Args:
        Same as gen_mtp_attn_mask.

    Returns:
        np.ndarray of shape [mtp_depth, 1, max_seq_len, 1], int32, matching
        gen_attn_mask_startend_row_indices with dim 0 expanded to mtp_depth.
    """
    total_len = sum(map(len, batch_token_ids))

    # Internal document boundaries (exclusive); total_len itself is excluded.
    internal_boundaries = []
    pos = 0
    for ids in batch_token_ids[:-1]:
        pos += len(ids)
        internal_boundaries.append(pos)

    rows = []
    for depth in range(mtp_depth):
        if use_global_causal_attn:
            # Single block: every position's end_row is the packed length.
            row = [total_len] * total_len
        else:
            shift = depth + 1
            bounds = [b - shift for b in internal_boundaries if b - shift > 0]
            bounds.append(total_len)

            row = []
            start = 0
            for end in bounds:
                # All positions inside a block share its exclusive end_row.
                row.extend([end] * (end - start))
                start = end

        # Padding region mirrors gen_attn_mask_startend_row_indices:
        # padding positions get strictly increasing end_rows.
        row.extend(range(total_len, max_seq_len))
        rows.append(row)

    # [mtp_depth, max_seq_len] -> [mtp_depth, 1, max_seq_len, 1]
    return np.array(rows, dtype=np.int32)[:, None, :, None]
249+
250+
251+
def gen_mtp_layer_mask(
    batch_token_ids: List[List[int]],
    max_seq_len: int,
    mtp_depth: int,
    eos_token_id: int = None,
) -> np.ndarray:
    """Generate the per-layer MTP hidden-inputs mask (kept as-is pending further discussion).

    For layer k, the packed tokens are shifted left by (k + 1) with the reserved
    tail tokens appended; positions holding EOS in that shifted view are zeroed.
    Padding positions are filled with 1.

    Returns:
        np.ndarray of shape [mtp_depth, max_seq_len], int32.
    """
    flat = np.concatenate([np.array(ids, dtype=np.int32) for ids in batch_token_ids])
    total_len = len(flat)
    tail = flat[-mtp_depth:]
    body = flat[:-mtp_depth]

    layers = []
    for depth in range(mtp_depth):
        layer_mask = np.ones(total_len, dtype=np.int32)
        if eos_token_id is not None:
            shifted = np.concatenate([body[depth + 1:], tail[:depth + 1]])
            # NOTE(review): `shifted` is mtp_depth shorter than `layer_mask`,
            # so the last mtp_depth positions are never zeroed here — confirm intended.
            layer_mask[np.where(shifted == eos_token_id)[0]] = 0
        if total_len < max_seq_len:
            layer_mask = np.concatenate(
                [layer_mask, np.ones(max_seq_len - total_len, dtype=np.int32)]
            )
        layers.append(layer_mask)

    return np.stack(layers, axis=0)  # [mtp_depth, max_seq_len]

paddleformers/cli/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
"-" * 60
4444
+ "\n"
4545
+ "| Usage: |\n"
46-
+ "| paddleformers-cli train -h: model finetuning |\n"
46+
+ "| paddleformers-cli train -h: modeyl finetuning |\n"
4747
+ "| paddleformers-cli export -h: model export |\n"
4848
+ "| paddleformers-cli version: show version info |\n"
4949
+ "| paddleformers-cli help: show helping info |\n"

0 commit comments

Comments
 (0)