
Commit f8aeead

Merge pull request #2870 from AI-Hypercomputer:mixtral_clean
PiperOrigin-RevId: 848247556
2 parents e61f6fa + cca9ac1 commit f8aeead

7 files changed

Lines changed: 305 additions & 4 deletions


src/MaxText/experimental/agent/ckpt_conversion_agent/README.md

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Checkpoint conversion agent
-The agent is used to automate the model-specific mappings of checkpoint conversion. It is designed to cooperate with the new checkpoint conversion [framework](https://github.com/AI-Hypercomputer/maxtext/tree/main/MaxText/utils/ckpt_conversion).
+The agent is used to automate the model-specific mappings of checkpoint conversion. It is designed to cooperate with the new checkpoint conversion [framework](https://github.com/AI-Hypercomputer/maxtext/tree/main/src/MaxText/utils/ckpt_conversion).

 ## Quick starts
 To begin, you'll need:
@@ -16,7 +16,7 @@ pip install -q -U "google-genai>=1.0.0"

 ## 1. Prepare the context file

-The agent requires context files about the target and source model's parameter names and tensor shapes. You can generate them using the [`save_param.py`](ckpt_conversion/utils/save_param.py) script. The output directory defined by `config.base_output_directory`. The default is `src/MaxText/experimental/agent/ckpt_conversion_agent/context/<model_name>` folder.
+The agent requires context files about the target and source models' parameter names and tensor shapes. You can generate them using the [`save_param.py`](../ckpt_conversion_agent/utils/save_param.py) script. The output directory is defined by `config.base_output_directory`; the default is the `src/MaxText/experimental/agent/ckpt_conversion_agent/context/<model_name>` folder.
 ```bash
 python3 -m MaxText.experimental.agent.ckpt_conversion_agent.utils.save_param src/MaxText/configs/base.yml \
   per_device_batch_size=1 run_name=param_<model_name> model_name=<model_name> scan_layers=false \

src/MaxText/utils/ckpt_conversion/README.md

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ The following models are supported:
 - Gemma2 (2B, 9B, 27B).
 - Gemma3 multimodal (4B, 12B, 27B).
 - Qwen3 (0.6B, 4B, 8B, 14B, 32B).
+- Mixtral (8x7B, 8x22B).

 ## Prerequisites
 - Hugging Face requires Pytorch.

src/MaxText/utils/ckpt_conversion/utils/hf_model_configs.py

Lines changed: 65 additions & 0 deletions
@@ -691,6 +691,69 @@
 },
 )

+
+# from https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/blob/main/config.json
+mixtral_8x7b_dict = {
+  "architectures": ["MixtralForCausalLM"],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mixtral",
+  "num_attention_heads": 32,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "num_local_experts": 8,
+  "output_router_logits": False,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "router_aux_loss_coef": 0.02,
+  "sliding_window": None,
+  "tie_word_embeddings": False,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.36.0.dev0",
+  "use_cache": True,
+  "vocab_size": 32000,
+}
+mixtral_8x7b_config = transformers.MixtralConfig(**mixtral_8x7b_dict)
+
+
+# from https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1/blob/main/config.json
+mixtral_8x22b_dict = {
+  "architectures": ["MixtralForCausalLM"],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 6144,
+  "initializer_range": 0.02,
+  "intermediate_size": 16384,
+  "max_position_embeddings": 65536,
+  "model_type": "mixtral",
+  "num_attention_heads": 48,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 56,
+  "num_key_value_heads": 8,
+  "num_local_experts": 8,
+  "output_router_logits": False,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": None,
+  "tie_word_embeddings": False,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.0",
+  "use_cache": True,
+  "vocab_size": 32768,
+}
+mixtral_8x22b_config = transformers.MixtralConfig(**mixtral_8x22b_dict)
+
+
 # {maxtext model name: hf model config}
 HF_MODEL_CONFIGS = {
   "gemma2-2b": gemma2_2b_config,
@@ -716,4 +779,6 @@
   "gpt-oss-20b": gpt_oss_20b_config,
   "gpt-oss-120b": gpt_oss_120b_config,
   "qwen3-omni-30b-a3b": qwen3_omni_30b_a3b_config,
+  "mixtral-8x7b": mixtral_8x7b_config,
+  "mixtral-8x22b": mixtral_8x22b_config,
 }
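
The two dicts above mirror the upstream `config.json` files verbatim. For a quick look at what a registered config produces, here is a minimal sketch (not part of the commit) that instantiates a randomly initialized model from a deliberately tiny `MixtralConfig` (the sizes below are placeholders, not the real 8x7B values) and prints a few parameter names and shapes:

```python
import transformers

# Hypothetical smoke test: tiny sizes so the model fits in memory; the
# parameter *names* still follow the same pattern as the full 8x7B model.
tiny_cfg = transformers.MixtralConfig(
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    num_local_experts=8,
    num_experts_per_tok=2,
    vocab_size=256,
)
model = transformers.AutoModelForCausalLM.from_config(tiny_cfg)
for name, tensor in list(model.state_dict().items())[:5]:
  print(name, tuple(tensor.shape))
```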

src/MaxText/utils/ckpt_conversion/utils/hf_shape.py

Lines changed: 73 additions & 0 deletions
@@ -581,6 +581,77 @@ def LLAMA31_HF_WEIGHTS_TO_SHAPE(config):
   return mapping


+def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config):
+  """
+  Returns a mapping of Hugging Face parameter names to their tensor shapes.
+
+  Args:
+    config (dict): The model configuration dictionary.
+
+  Returns:
+    A dictionary mapping Hugging Face parameter paths to their tensor shapes.
+  """
+  shapes = {}
+
+  # Embedding and LM Head
+  shapes["model.embed_tokens.weight"] = [config["vocab_size"], config["hidden_size"]]
+  shapes["lm_head.weight"] = [config["vocab_size"], config["hidden_size"]]
+
+  # Final LayerNorm
+  shapes["model.norm.weight"] = [config["hidden_size"]]
+
+  # Calculated dimensions
+  head_dim = config["hidden_size"] // config["num_attention_heads"]
+  kv_dim = config["num_key_value_heads"] * head_dim
+
+  # Decoder Layers
+  for i in range(config["num_hidden_layers"]):
+    # Attention Projections
+    shapes[f"model.layers.{i}.self_attn.q_proj.weight"] = [
+        config["hidden_size"],
+        config["hidden_size"],
+    ]
+    shapes[f"model.layers.{i}.self_attn.k_proj.weight"] = [
+        kv_dim,
+        config["hidden_size"],
+    ]
+    shapes[f"model.layers.{i}.self_attn.v_proj.weight"] = [
+        kv_dim,
+        config["hidden_size"],
+    ]
+    shapes[f"model.layers.{i}.self_attn.o_proj.weight"] = [
+        config["hidden_size"],
+        config["hidden_size"],
+    ]
+
+    # LayerNorms
+    shapes[f"model.layers.{i}.input_layernorm.weight"] = [config["hidden_size"]]
+    shapes[f"model.layers.{i}.post_attention_layernorm.weight"] = [config["hidden_size"]]
+
+    # MOE Gate
+    shapes[f"model.layers.{i}.block_sparse_moe.gate.weight"] = [
+        config["num_local_experts"],
+        config["hidden_size"],
+    ]
+
+    # MOE Experts
+    for j in range(config["num_local_experts"]):
+      shapes[f"model.layers.{i}.block_sparse_moe.experts.{j}.w1.weight"] = [
+          config["intermediate_size"],
+          config["hidden_size"],
+      ]
+      shapes[f"model.layers.{i}.block_sparse_moe.experts.{j}.w2.weight"] = [
+          config["hidden_size"],
+          config["intermediate_size"],
+      ]
+      shapes[f"model.layers.{i}.block_sparse_moe.experts.{j}.w3.weight"] = [
+          config["intermediate_size"],
+          config["hidden_size"],
+      ]
+
+  return shapes
+
+
 # {maxtext model name: {hf weight name: hf shape}}
 HF_SHAPE = {
   "gemma2-2b": GEMMA2_HF_WEIGHTS_TO_SHAPE,
@@ -604,4 +675,6 @@ def LLAMA31_HF_WEIGHTS_TO_SHAPE(config):
   "deepseek3-671b": DEEPSEEK_HF_WEIGHTS_TO_SHAPE,
   "gpt-oss-20b": GPT_OSS_HF_WEIGHTS_TO_SHAPE,
   "gpt-oss-120b": GPT_OSS_HF_WEIGHTS_TO_SHAPE,
+  "mixtral-8x7b": MIXTRAL_HF_WEIGHTS_TO_SHAPE,
+  "mixtral-8x22b": MIXTRAL_HF_WEIGHTS_TO_SHAPE,
 }
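
Because `MIXTRAL_HF_WEIGHTS_TO_SHAPE` is pure dictionary arithmetic, it can be sanity-checked directly against the 8x7B config added above. A minimal sketch, assuming both names are importable from the two modules in this commit:

```python
# Expected shapes for Mixtral-8x7B: hidden=4096, heads=32, kv_heads=8,
# so head_dim = 4096 // 32 = 128 and kv_dim = 8 * 128 = 1024.
shapes = MIXTRAL_HF_WEIGHTS_TO_SHAPE(mixtral_8x7b_dict)

assert shapes["model.embed_tokens.weight"] == [32000, 4096]
assert shapes["model.layers.0.self_attn.k_proj.weight"] == [1024, 4096]  # GQA: kv_dim rows
assert shapes["model.layers.0.block_sparse_moe.gate.weight"] == [8, 4096]
assert shapes["model.layers.0.block_sparse_moe.experts.7.w2.weight"] == [4096, 14336]
# Per layer: 4 attention + 2 norms + 1 gate + 8 experts * 3 = 31 entries,
# plus embed_tokens, lm_head, and the final norm.
assert len(shapes) == 32 * 31 + 3
```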

src/MaxText/utils/ckpt_conversion/utils/param_mapping.py

Lines changed: 156 additions & 0 deletions
@@ -1424,6 +1424,158 @@ def transform_query_kernel(arr):
   return hook_fns


+def MIXTRAL_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False):
+  """
+  Returns the mapping of parameter names from MaxText to Hugging Face for Mixtral.
+  """
+  mapping = {}
+
+  # Top-level, non-layer-specific parameters
+  mapping["params-token_embedder-embedding"] = "model.embed_tokens.weight"
+  mapping["params-decoder-decoder_norm-scale"] = "model.norm.weight"
+  mapping["params-decoder-logits_dense-kernel"] = "lm_head.weight"
+
+  num_experts = maxtext_config.num_experts
+
+  if scan_layers:
+    # Initialize lists for scanned layer weights
+    mapping.update(
+        {
+            "params-decoder-layers-self_attention-query-kernel": [],
+            "params-decoder-layers-self_attention-key-kernel": [],
+            "params-decoder-layers-self_attention-value-kernel": [],
+            "params-decoder-layers-self_attention-out-kernel": [],
+            "params-decoder-layers-pre_self_attention_layer_norm-scale": [],
+            "params-decoder-layers-post_self_attention_layer_norm-scale": [],
+            "params-decoder-layers-MoeBlock_0-gate-kernel": [],
+            "params-decoder-layers-MoeBlock_0-wi_0": [],
+            "params-decoder-layers-MoeBlock_0-wi_1": [],
+            "params-decoder-layers-MoeBlock_0-wo": [],
+        }
+    )
+
+    for i in range(config["num_hidden_layers"]):
+      hf_prefix = f"model.layers.{i}"
+      # Attention weights
+      mapping["params-decoder-layers-self_attention-query-kernel"].append(f"{hf_prefix}.self_attn.q_proj.weight")
+      mapping["params-decoder-layers-self_attention-key-kernel"].append(f"{hf_prefix}.self_attn.k_proj.weight")
+      mapping["params-decoder-layers-self_attention-value-kernel"].append(f"{hf_prefix}.self_attn.v_proj.weight")
+      mapping["params-decoder-layers-self_attention-out-kernel"].append(f"{hf_prefix}.self_attn.o_proj.weight")
+
+      # RMSNorm weights
+      mapping["params-decoder-layers-pre_self_attention_layer_norm-scale"].append(f"{hf_prefix}.input_layernorm.weight")
+      mapping["params-decoder-layers-post_self_attention_layer_norm-scale"].append(
+          f"{hf_prefix}.post_attention_layernorm.weight"
+      )
+
+      # MoE gate
+      mapping["params-decoder-layers-MoeBlock_0-gate-kernel"].append(f"{hf_prefix}.block_sparse_moe.gate.weight")
+
+    # Outer loop over experts, inner loop over layers, to align with the logic in _build_multi_axis_stacked_tensor()
+    for j in range(num_experts):
+      w1_layers = []
+      w3_layers = []
+      w2_layers = []
+
+      for i in range(config["num_hidden_layers"]):
+        hf_prefix = f"model.layers.{i}"
+        w1_layers.append(f"{hf_prefix}.block_sparse_moe.experts.{j}.w1.weight")
+        w3_layers.append(f"{hf_prefix}.block_sparse_moe.experts.{j}.w3.weight")
+        w2_layers.append(f"{hf_prefix}.block_sparse_moe.experts.{j}.w2.weight")
+
+      mapping["params-decoder-layers-MoeBlock_0-wi_0"].append(w1_layers)
+      mapping["params-decoder-layers-MoeBlock_0-wi_1"].append(w3_layers)
+      mapping["params-decoder-layers-MoeBlock_0-wo"].append(w2_layers)
+
+  else:
+    for i in range(config["num_hidden_layers"]):
+      maxtext_prefix = f"params-decoder-layers_{i}"
+      hf_prefix = f"model.layers.{i}"
+
+      # Attention weights
+      mapping[f"{maxtext_prefix}-self_attention-query-kernel"] = f"{hf_prefix}.self_attn.q_proj.weight"
+      mapping[f"{maxtext_prefix}-self_attention-key-kernel"] = f"{hf_prefix}.self_attn.k_proj.weight"
+      mapping[f"{maxtext_prefix}-self_attention-value-kernel"] = f"{hf_prefix}.self_attn.v_proj.weight"
+      mapping[f"{maxtext_prefix}-self_attention-out-kernel"] = f"{hf_prefix}.self_attn.o_proj.weight"
+
+      # RMSNorm weights
+      mapping[f"{maxtext_prefix}-pre_self_attention_layer_norm-scale"] = f"{hf_prefix}.input_layernorm.weight"
+      mapping[f"{maxtext_prefix}-post_self_attention_layer_norm-scale"] = f"{hf_prefix}.post_attention_layernorm.weight"
+
+      # MoE gate
+      mapping[f"{maxtext_prefix}-MoeBlock_0-gate-kernel"] = f"{hf_prefix}.block_sparse_moe.gate.weight"
+
+      # MoE expert weights (1 MaxText param -> 8 HF params)
+      w1_experts = [f"{hf_prefix}.block_sparse_moe.experts.{j}.w1.weight" for j in range(num_experts)]
+      w3_experts = [f"{hf_prefix}.block_sparse_moe.experts.{j}.w3.weight" for j in range(num_experts)]
+      w2_experts = [f"{hf_prefix}.block_sparse_moe.experts.{j}.w2.weight" for j in range(num_experts)]
+
+      mapping[f"{maxtext_prefix}-MoeBlock_0-wi_0"] = w1_experts
+      mapping[f"{maxtext_prefix}-MoeBlock_0-wi_1"] = w3_experts
+      mapping[f"{maxtext_prefix}-MoeBlock_0-wo"] = w2_experts
+
+  return mapping
+
+
+def MIXTRAL_MAXTEXT_TO_HF_PARAM_HOOK_FN(config, maxtext_config, scan_layers=False, saving_to_hf=False):
+  """
+  Generates parameter conversion hooks for Mixtral between MaxText and Hugging Face.
+  """
+  hooks = {}
+
+  def reshape_and_transpose_attention(x, target_shape):
+    """MaxText: [hidden, n_heads, h_dim] <-> HF: [n_heads * h_dim, hidden]"""
+    if saving_to_hf:
+      # (H, N, D) -> (H, N*D) -> (N*D, H)
+      return x.reshape(config["hidden_size"], -1).transpose()
+    else:
+      # (N*D, H) -> (H, N*D) -> (H, N, D)
+      return x.transpose().reshape(target_shape)
+
+  def reshape_kernel(x, target_shape):
+    return x.transpose()
+
+  def scale_query_layer(input_tensor, target_shape):
+    if saving_to_hf:
+      depth_scale = np.dtype("float32").type(np.sqrt(maxtext_config.head_dim))
+      return (input_tensor * depth_scale).astype(input_tensor.dtype)
+    else:
+      depth_scale = np.dtype("float32").type(1 / np.sqrt(maxtext_config.head_dim))
+      return (input_tensor * depth_scale).astype(input_tensor.dtype)
+
+  if scan_layers:
+    plan = [
+        ("params-decoder-layers-self_attention-query-kernel", [reshape_and_transpose_attention, scale_query_layer]),
+        ("params-decoder-layers-self_attention-key-kernel", reshape_and_transpose_attention),
+        ("params-decoder-layers-self_attention-value-kernel", reshape_and_transpose_attention),
+        ("params-decoder-layers-self_attention-out-kernel", reshape_and_transpose_attention),
+        ("params-decoder-layers-MoeBlock_0-wi_0", reshape_kernel),
+        ("params-decoder-layers-MoeBlock_0-wi_1", reshape_kernel),
+        ("params-decoder-layers-MoeBlock_0-wo", reshape_kernel),
+        ("params-decoder-layers-MoeBlock_0-gate-kernel", reshape_kernel),
+    ]
+  else:
+    plan = [
+        ("params-decoder-layers_{i}-self_attention-query-kernel", [reshape_and_transpose_attention, scale_query_layer]),
+        ("params-decoder-layers_{i}-self_attention-key-kernel", reshape_and_transpose_attention),
+        ("params-decoder-layers_{i}-self_attention-value-kernel", reshape_and_transpose_attention),
+        ("params-decoder-layers_{i}-self_attention-out-kernel", reshape_and_transpose_attention),
+        ("params-decoder-layers_{i}-MoeBlock_0-wi_0", reshape_kernel),
+        ("params-decoder-layers_{i}-MoeBlock_0-wi_1", reshape_kernel),
+        ("params-decoder-layers_{i}-MoeBlock_0-wo", reshape_kernel),
+        ("params-decoder-layers_{i}-MoeBlock_0-gate-kernel", reshape_kernel),
+    ]
+  plan.append(("params-decoder-logits_dense-kernel", reshape_kernel))
+
+  for maxtext_pattern, op_func in plan:
+    if "{i}" in maxtext_pattern:
+      for i in range(config["num_hidden_layers"]):
+        hooks[maxtext_pattern.format(i=i)] = op_func
+    else:
+      hooks[maxtext_pattern] = op_func
+  return hooks
+
+
 # {maxtext model name: {maxtext weight name: hf weight name}}
 PARAM_MAPPING = {
   "gemma2-2b": GEMMA2_MAXTEXT_TO_HF_PARAM_MAPPING,
@@ -1448,6 +1600,8 @@ def transform_query_kernel(arr):
   "gpt-oss-20b": GPT_OSS_MAXTEXT_TO_HF_PARAM_MAPPING,
   "gpt-oss-120b": GPT_OSS_MAXTEXT_TO_HF_PARAM_MAPPING,
   "qwen3-omni-30b-a3b": QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_MAPPING,
+  "mixtral-8x7b": MIXTRAL_MAXTEXT_TO_HF_PARAM_MAPPING,
+  "mixtral-8x22b": MIXTRAL_MAXTEXT_TO_HF_PARAM_MAPPING,
 }

 # {maxtext model name: {maxtext weight name: bi-directional transform}}
@@ -1474,6 +1628,8 @@ def transform_query_kernel(arr):
   "gpt-oss-20b": GPT_OSS_TO_HF_PARAM_HOOK_FN,
   "gpt-oss-120b": GPT_OSS_TO_HF_PARAM_HOOK_FN,
   "qwen3-omni-30b-a3b": QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+  "mixtral-8x7b": MIXTRAL_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+  "mixtral-8x22b": MIXTRAL_MAXTEXT_TO_HF_PARAM_HOOK_FN,
 }

 VLLM_HOOK_FNS = {
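
A subtlety in the hooks above: `reshape_and_transpose_attention` is written so the HF-to-MaxText branch exactly inverts the MaxText-to-HF branch, and `scale_query_layer` folds the attention scale into the query weights (multiply by sqrt(head_dim) when saving to HF, by its inverse when loading). A small NumPy round-trip check of the reshape, using toy dimensions rather than real Mixtral sizes:

```python
import numpy as np

# Toy dims: hidden=6, n_heads=2, head_dim=4, so N*D = 8 != hidden.
hidden, n_heads, head_dim = 6, 2, 4
maxtext_q = np.arange(hidden * n_heads * head_dim, dtype=np.float32).reshape(hidden, n_heads, head_dim)

# MaxText -> HF: (H, N, D) -> (H, N*D) -> (N*D, H)
hf_q = maxtext_q.reshape(hidden, -1).transpose()
assert hf_q.shape == (n_heads * head_dim, hidden)

# HF -> MaxText: (N*D, H) -> (H, N*D) -> (H, N, D)
roundtrip = hf_q.transpose().reshape(hidden, n_heads, head_dim)
assert np.array_equal(roundtrip, maxtext_q)
```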

src/MaxText/utils/ckpt_conversion/utils/utils.py

Lines changed: 6 additions & 1 deletion
@@ -76,6 +76,8 @@
   "gpt-oss-20b": "openai/gpt-oss-20b",
   "gpt-oss-120b": "openai/gpt-oss-120b",
   "qwen3-omni-30b-a3b": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+  "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+  "mixtral-8x22b": "mistralai/Mixtral-8x22B-Instruct-v0.1",
 }


@@ -195,7 +197,10 @@ def process_maxtext_param(

   # Case 3 or 4: The source tensor is stacked on a single axis.
   # We determine if it's an unscanned MoE (expert axis) or standard scanned (layer axis).
-  is_unscanned_moe = "moe_block" in maxtext_param_key and any(
+  # `MoeBlock_0-w` matches the expert weights but not the gate: gate values are stacked
+  # along the layer axis only, while expert weights are stacked along both the expert and layer axes.
+  moe_block_list = ["moe_block", "MoeBlock_0-w"]
+  is_unscanned_moe = any(block in maxtext_param_key for block in moe_block_list) and any(
       f"_{i}-" in maxtext_param_key for i in range(maxtext_config.base_num_decoder_layers)
   )
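
The widened match list is easiest to see on concrete keys produced by `MIXTRAL_MAXTEXT_TO_HF_PARAM_MAPPING` for unscanned layers; a minimal sketch:

```python
# Keys as produced by the Mixtral mapping above for unscanned layers.
moe_block_list = ["moe_block", "MoeBlock_0-w"]
gate_key = "params-decoder-layers_3-MoeBlock_0-gate-kernel"
expert_key = "params-decoder-layers_3-MoeBlock_0-wi_0"

# The gate matches neither pattern, so it is treated as stacked on the
# layer axis only; the expert weights match "MoeBlock_0-w" and are also
# treated as stacked on the expert axis.
assert not any(block in gate_key for block in moe_block_list)
assert any(block in expert_key for block in moe_block_list)
```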

tests/forward_pass_logit_checker.py

Lines changed: 2 additions & 1 deletion
@@ -380,7 +380,8 @@ def main(config, test_args): # pylint: disable=W0621
     raise ValueError("run_hf_model requires hf_model_path")
   hf_model = AutoModelForCausalLM.from_pretrained(test_args.hf_model_path, dtype=torch.bfloat16)
   tokenizer = AutoTokenizer.from_pretrained(test_args.hf_model_path)
-  if "Llama-3.1" in test_args.hf_model_path:
+  pad_token_models = ["Llama-3.1", "Mixtral-8x"]
+  if any(model in test_args.hf_model_path for model in pad_token_models):
     tokenizer.pad_token = tokenizer.eos_token

   init_rng = jax.random.PRNGKey(config.init_weights_seed)
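
Mixtral checkpoints, like Llama 3.1, ship a tokenizer without a dedicated pad token, so batched encoding needs a fallback; the check above reuses the EOS token. A standalone sketch of the same behavior, assuming `transformers` is installed and the checkpoint is reachable:

```python
from transformers import AutoTokenizer

hf_model_path = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(hf_model_path)

pad_token_models = ["Llama-3.1", "Mixtral-8x"]
if any(model in hf_model_path for model in pad_token_models):
  tokenizer.pad_token = tokenizer.eos_token  # enables padded batch encoding

batch = tokenizer(["Hello", "A longer prompt"], padding=True, return_tensors="pt")
```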
