
Commit 9edcdba

Merge pull request #2672 from AI-Hypercomputer:rbierneni-qwen3-next-ckpt-conversion
PiperOrigin-RevId: 845423149
2 parents 948fd36 + b6c32b6 commit 9edcdba

13 files changed

Lines changed: 1036 additions & 58 deletions

end_to_end/tpu/qwen/next/qwen3-next-80b-a3b/1_test_qwen3_next_80b_a3b.sh

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@

#!/bin/bash

# This script validates a pre-converted MaxText checkpoint against its original
# HuggingFace counterpart to ensure numerical correctness.

# ---
# Example Usage:
#
# # (Required) Path to the converted MaxText checkpoint
# export MAXTEXT_CHECKPOINT_PATH=gs://path/to/converted_ckpt/0/items/
#
# # (Optional) Override the default HF model
# export HF_MODEL_PATH=MyCustom/Qwen3-variant
#
# bash end_to_end/tpu/qwen/next/qwen3-next-80b-a3b/1_test_qwen3_next_80b_a3b.sh
# ---

set -ex

# --- Configuration & Input Validation ---

if [ -z "${MAXTEXT_CHECKPOINT_PATH}" ]; then
  echo "ERROR: The MAXTEXT_CHECKPOINT_PATH environment variable is not set."
  echo "Please set it to the full GCS path of the pre-converted MaxText checkpoint weights."
  exit 1
fi

# Set a default for the HF model path if it's not provided by the user
if [ -z "${HF_MODEL_PATH}" ]; then
  export HF_MODEL_PATH="Qwen/Qwen3-Next-80B-A3B-Instruct"
  echo "HF_MODEL_PATH is not set, using default: ${HF_MODEL_PATH}"
fi

# Install dependencies required for the logit checker.
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu

# --- Run the Forward Pass Logit Checker ---

echo "Validating MaxText checkpoint at ${MAXTEXT_CHECKPOINT_PATH}"
echo "Against original HF model: ${HF_MODEL_PATH}"

# This command runs the core validation logic.
JAX_PLATFORMS=cpu python3 -m MaxText.tests.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}"/configs/base.yml \
  tokenizer_type=huggingface \
  tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText/assets}}"/qwen3-tokenizer \
  megablox=False \
  sparse_matmul=False \
  load_parameters_path=${MAXTEXT_CHECKPOINT_PATH} \
  model_name=qwen3-next-80b-a3b \
  checkpoint_storage_concurrent_gb=1024 \
  skip_jax_distributed_system=True \
  dtype=float32 \
  weight_dtype=float32 \
  matmul_precision=highest \
  --hf_model_path=${HF_MODEL_PATH} \
  --max_kl_div=0.03 \
  --run_hf_model=True

echo "Validation complete."
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@

Qwen3 Next
==========

Qwen3-Next is Alibaba's 80B-parameter Mixture-of-Experts (MoE) model (activating only 3B parameters per token). It features a novel **hybrid attention** architecture that combines Gated DeltaNet (linear attention) and Gated Attention (full attention) for massive context scaling. This documentation covers the integration of **Qwen3-Next-80B-A3B** into MaxText.

For more details on the architecture, see the [Qwen3 Technical Blog](https://qwen.ai/blog?id=4074cca80393150c248e508aa62983f9cb7d27cd&from=research.latest-advancements-list).
* * * * *

Checkpoint Conversion
---------------------

To get started, you first need a MaxText-compatible checkpoint.

1. **Download the Model**: Download the official model from Hugging Face. For a faster download, you can enable the `hf_transfer` backend for `huggingface-cli`.

```
# Example for Qwen3-Next-80B-A3B-Instruct
pip install huggingface_hub hf_transfer
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download Qwen/Qwen3-Next-80B-A3B-Instruct --local-dir /path/to/qwen3_next_hf_checkpoint
```
2. **Convert the Checkpoint**: Run the `convert_qwen3_next_scanned.py` script to convert the downloaded Hugging Face weights into the Orbax format required by MaxText.

```
python3 -m MaxText.utils.ckpt_scripts.convert_qwen3_next_scanned \
    --base_model_path /path/to/qwen3_next_hf_checkpoint \
    --maxtext_model_path gs://your-gcs-bucket/qwen3_next_maxtext_ckpt \
    --model_size qwen3-next-80b-a3b
```
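The "scanned" in the script name refers to MaxText's layer-stacked parameter layout: matching weights from every decoder layer are stacked along a new leading layer axis so the decoder can be driven by `jax.lax.scan`. The snippet below is a rough illustration of that stacking idea only, with invented shapes; the real script also handles key renaming, per-tensor transposes, and the Orbax save.

```
import numpy as np

# Hypothetical per-layer HF tensors: one (emb_dim, mlp_dim) matrix per decoder layer.
num_layers, emb_dim, mlp_dim = 4, 8, 16
hf_layer_weights = [np.full((emb_dim, mlp_dim), i, dtype=np.float32) for i in range(num_layers)]

# Scanned layout: stack along a leading `layer` axis, scanned over axis 0.
scanned = np.stack(hf_layer_weights, axis=0)
print(scanned.shape)  # (4, 8, 16)
```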
* * * * *

Pre-training and Fine-tuning
----------------------------

After converting the checkpoint, you can use it for fine-tuning or start a pre-training run from scratch. The command below is an example of fine-tuning on a v5p-512 slice (256 chips, which is why `ici_fsdp_parallelism=256`). To pre-train instead, simply remove the `load_parameters_path` argument.

```
python3 -m MaxText.train src/MaxText/configs/base.yml \
    base_output_directory=${BASE_OUTPUT_DIRECTORY} \
    dataset_path=${DATASET_PATH} \
    load_parameters_path=gs://your-gcs-bucket/qwen3_next_maxtext_ckpt/0/items \
    run_name=qwen3_next_finetuning \
    per_device_batch_size=1 \
    model_name=qwen3-next-80b-a3b \
    steps=500 \
    max_target_length=8192 \
    ici_fsdp_parallelism=256 \
    tokenizer_type=huggingface \
    tokenizer_path=src/MaxText/assets/qwen3-tokenizer
```
* * * * *

Correctness Validation
----------------------

To verify that the MaxText implementation is numerically equivalent to the original Hugging Face model, you can run the end-to-end test scripts. These scripts automate the logit comparison test for each model.

Before running, you must set the `MAXTEXT_CHECKPOINT_PATH` environment variable. You can also optionally set `HF_MODEL_PATH` to point to a local copy of the Hugging Face model.

### Qwen3-Next-80B-A3B

```
# Set the required path to your converted MaxText checkpoint
export MAXTEXT_CHECKPOINT_PATH=gs://your-gcs-bucket/qwen3-next-80b-a3b_maxtext_ckpt/0/items/

# (Optional) Set the path to your local Hugging Face checkpoint
# export HF_MODEL_PATH=/path/to/local/qwen3-next-80b-a3b_hf_checkpoint

# Execute the validation script
bash end_to_end/tpu/qwen/next/qwen3-next-80b-a3b/1_test_qwen3_next_80b_a3b.sh
```
Supported MoE Strategies
------------------------

This model implementation supports both **Token Dropping** and **Dropless** strategies for Mixture-of-Experts routing. See the MaxText [documentation](https://github.com/AI-Hypercomputer/maxtext/blob/main/docs/reference/core_concepts/moe_configuration.md) on MoE configuration for the flags to set for each strategy; a brief illustration follows below.
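As a hedged illustration of the difference (the linked doc is authoritative for flag names and defaults), the strategy is typically selected through the MoE matmul and capacity flags; the capacity value below is just an example:

```
# Dropless routing: sparse/grouped matmuls, no per-expert capacity limit
sparse_matmul=True megablox=True capacity_factor=-1

# Token-dropping routing: dense matmuls with a per-expert capacity limit (example value)
sparse_matmul=False capacity_factor=1.25
```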

src/MaxText/configs/models/qwen3-next-80b-a3b.yml

Lines changed: 3 additions & 0 deletions
@@ -46,3 +46,6 @@ gdn_chunk_size: 64
 # RoPE Settings
 rope_max_timescale: 10000000
 partial_rotary_factor: 0.25
+
+# General Model Settings
+enable_dropout: False

src/MaxText/configs/types.py

Lines changed: 6 additions & 6 deletions
@@ -2094,15 +2094,15 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
           f"({self.interleave_moe_layer_step})"
       )
     if self.decoder_block == DecoderBlockType.QWEN3_NEXT:
-      if self.sparse_matmul:
-        logger.warning(
-            "For Qwen3-Next, sparse_matmul must be False for now. The dense path has been verified against reference. "
-            "Forcing to False."
-        )
-        self.sparse_matmul = False
+      if int(self.gdn_num_value_heads) % int(self.gdn_num_key_heads) != 0:
+        raise ValueError("gdn_num_value_heads must be divisible by gdn_num_key_heads")
       rotary_dim = int(self.head_dim * self.partial_rotary_factor)
       if rotary_dim % 2 != 0:
         raise ValueError(f"Calculated rotary dimension ({rotary_dim}) must be a multiple of 2.")
+    else:
+      if self.partial_rotary_factor is not None and self.partial_rotary_factor != 1.0:
+        raise ValueError("`partial_rotary_factor` is only effective when `decoder_block` is set to 'qwen3_next'.")
+
     tokenizer_path = getattr(self, "tokenizer_path", None)
     if (
         tokenizer_path
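As a worked example of the new rotary check: with `partial_rotary_factor: 0.25` from the model yml above and a head dimension of 256 (an assumed value for Qwen3-Next-80B-A3B, not stated in this diff), the computation passes:

```
# head_dim is an assumption for illustration; partial_rotary_factor comes from the yml.
head_dim, partial_rotary_factor = 256, 0.25
rotary_dim = int(head_dim * partial_rotary_factor)
assert rotary_dim % 2 == 0  # rotary_dim == 64, so only a quarter of each head gets RoPE
```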

src/MaxText/layers/attentions.py

Lines changed: 3 additions & 3 deletions
@@ -1104,9 +1104,6 @@ def __call__(
         bidirectional_mask,
         self.sinks,
     )
-    if self.is_qwen3_next:
-      out = out.reshape(batch_size, seq_len, self.config.num_query_heads * self.config.head_dim)
-      out = out * jax.nn.sigmoid(gate)
     if model_mode == MODEL_MODE_PREFILL:
       out = self._maybe_shard_with_logical(out, self.prefill_out_axis_names)
     elif model_mode == MODEL_MODE_TRAIN and self.config.expert_shard_attention_option == EP_AS_CONTEXT:

@@ -1115,6 +1112,9 @@ def __call__(
       out = self._maybe_shard_with_logical(out, self.out_axis_names)
     else:
       out = self._maybe_shard_with_logical(out, self.decode_out_axis_names)
+    if self.is_qwen3_next:
+      out = out.reshape(batch_size, seq_len, self.config.num_query_heads * self.config.head_dim)
+      out = out * jax.nn.sigmoid(gate)
     out = self.out_projection(out, out_sharding=out_sharding)
     out = checkpoint_name(out, "out_proj")
     return out, kv_cache
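The functional effect of this move: for Qwen3-Next, the sigmoid output gate now applies after the mode-dependent sharding annotations, immediately before `out_projection`, instead of before them. A self-contained sketch of the gating itself (shapes and names assumed for illustration, not the module's real signature):

```
import jax
import jax.numpy as jnp

def apply_output_gate(attn_out, gate):
  # attn_out: (batch, seq, num_heads, head_dim); gate: (batch, seq, num_heads * head_dim)
  b, s, h, d = attn_out.shape
  out = attn_out.reshape(b, s, h * d)
  return out * jax.nn.sigmoid(gate)  # element-wise gate on the flattened heads

out = apply_output_gate(jnp.ones((2, 4, 8, 16)), jnp.zeros((2, 4, 128)))
print(out.shape)  # (2, 4, 128); sigmoid(0) = 0.5, so every element is halved
```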

src/MaxText/layers/decoders.py

Lines changed: 6 additions & 2 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-""""Module for decoder layers."""
+"""Module for decoder layers"""
 # pylint: disable=arguments-differ
 # pylint: disable=no-name-in-module

@@ -35,6 +35,7 @@
 from MaxText.sharding import create_sharding
 from MaxText.inference import page_manager
 from MaxText.layers import linears
+from MaxText.layers import normalizations
 from MaxText.layers import quantizations
 from MaxText.layers import pipeline
 from MaxText import maxtext_utils

@@ -473,7 +474,6 @@ def get_norm_layer(self, num_features: int):
         DecoderBlockType.GEMMA3,
         DecoderBlockType.QWEN3,
         DecoderBlockType.QWEN3_MOE,
-        DecoderBlockType.QWEN3_NEXT,
         DecoderBlockType.GPT_OSS,
         DecoderBlockType.SIMPLE,
         DecoderBlockType.SIMPLE_MLP,

@@ -482,6 +482,10 @@ def get_norm_layer(self, num_features: int):
       return functools.partial(rms_norm, num_features=num_features, shard_mode=self.config.shard_mode)
     elif self.config.decoder_block == DecoderBlockType.GPT3:
       return functools.partial(gpt3.gpt3_layer_norm, num_features=num_features, reductions_in_fp32=False, use_bias=True)
+    elif self.config.decoder_block == DecoderBlockType.QWEN3_NEXT:
+      return functools.partial(
+          normalizations.Qwen3NextRMSNormLinen, num_features=num_features, shard_mode=self.config.shard_mode
+      )
     else:
       raise ValueError(f"Incorrect decoder_block name {self.config.decoder_block.value=}")

src/MaxText/layers/normalizations.py

Lines changed: 8 additions & 0 deletions
@@ -196,3 +196,11 @@ def l2norm(x: Array, dim: int = -1, eps: float = 1e-6) -> Array:

   inv_norm = jax.lax.rsqrt((x * x).sum(axis=dim, keepdims=True) + jnp.array(eps, dtype=x.dtype))
   return x * inv_norm
+
+
+Qwen3NextRMSNormLinen = nnx_wrappers.to_linen_class(
+    RMSNorm,
+    base_metadata_fn=variable_to_logically_partitioned,
+    scale_init=linen_initializers.zeros,
+    scale_offset=1.0,
+)
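`scale_init=zeros` together with `scale_offset=1.0` gives a zero-centered gain: the stored scale parameter starts at 0 and the effective multiplier is `scale + 1.0`. A minimal sketch of the effective computation, assuming the wrapped `RMSNorm` follows the standard form and adds `scale_offset` to the learned scale:

```
import jax
import jax.numpy as jnp

def qwen3_next_rms_norm(x, scale, eps=1e-6):
  # scale is stored zero-initialized; the +1.0 offset makes the initial gain identity.
  var = jnp.mean(jnp.square(x.astype(jnp.float32)), axis=-1, keepdims=True)
  normed = x.astype(jnp.float32) * jax.lax.rsqrt(var + eps)
  return (normed * (scale + 1.0)).astype(x.dtype)

x = jnp.ones((2, 4))
print(qwen3_next_rms_norm(x, scale=jnp.zeros((4,))))  # identity-scale output at init
```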

src/MaxText/layers/qwen3.py

Lines changed: 69 additions & 17 deletions
@@ -324,6 +324,7 @@ def __init__(self, config: Config, dtype: DType = jnp.float32, *, rngs: nnx.Rngs
     self.value_dim = self.head_v_dim * self.num_v_heads
     conv_dim = self.key_dim * 2 + self.value_dim
     conv_kernel_size = cfg.gdn_conv_kernel_dim
+    self.v_heads_per_k_head = self.num_v_heads // self.num_k_heads

     # Submodule instantiations
     self.in_proj_qkvz = linears.DenseGeneral(

@@ -381,33 +382,86 @@ def a_log_init(key, shape, dtype=jnp.float32):
     )

   def __call__(self, hidden_states: Array) -> Array:
+    # hidden_states: (B, S, E)
     cfg = self.config
+    batch, seq_len, _ = hidden_states.shape

     # =========================================================================
     # STEP A: Input Projections
     # =========================================================================
-    # hidden_states shape: (B, S, E)
-    # qkvz shape: (B, S, 2*key_dim + 2*value_dim)
+    # qkvz: (B, S, 2 * K_dim + 2 * V_dim)
     qkvz = self.in_proj_qkvz(hidden_states)
-    # ba shape: (B, S, 2*H_v)
+    # ba: (B, S, 2 * H_v)
     ba = self.in_proj_ba(hidden_states)

-    # q shape: (B, S, key_dim), k shape: (B, S, key_dim), v shape: (B, S, value_dim), z shape: (B, S, value_dim)
-    q, k, v, z = jnp.split(qkvz, [self.key_dim, 2 * self.key_dim, 2 * self.key_dim + self.value_dim], axis=-1)
-    # b shape: (B, S, H_v), a shape: (B, S, H_v)
-    b, a = jnp.split(ba, [self.num_v_heads], axis=-1)
+    # QKVZ Reshaping and Splitting
+    # Per-K_head group dim: 2 * D_k + 2 * D_v * V_per_K
+    new_shape_qkvz = (
+        batch,
+        seq_len,
+        self.num_k_heads,  # H_k
+        2 * self.head_k_dim + 2 * self.head_v_dim * self.v_heads_per_k_head,
+    )
+    # mixed_qkvz: (B, S, H_k, 2*D_k + 2*D_v*V_per_K)
+    mixed_qkvz = qkvz.reshape(new_shape_qkvz)
+
+    split_indices_qkvz = [
+        self.head_k_dim,  # D_k
+        2 * self.head_k_dim,  # 2 * D_k
+        2 * self.head_k_dim + (self.v_heads_per_k_head * self.head_v_dim),  # 2 * D_k + V_per_K * D_v
+    ]
+    # query: (B, S, H_k, D_k)
+    # key: (B, S, H_k, D_k)
+    # value_raw: (B, S, H_k, V_per_K * D_v)
+    # z_raw: (B, S, H_k, V_per_K * D_v)
+    query, key, value_raw, z_raw = jnp.split(mixed_qkvz, split_indices_qkvz, axis=3)
+
+    # value: (B, S, H_v, D_v)
+    value = value_raw.reshape(batch, seq_len, self.num_v_heads, self.head_v_dim)
+    # z: (B, S, H_v, D_v)
+    z = z_raw.reshape(batch, seq_len, self.num_v_heads, self.head_v_dim)
+
+    # BA Reshaping and Splitting
+    new_shape_ba = (
+        batch,
+        seq_len,
+        self.num_k_heads,  # H_k
+        2 * self.v_heads_per_k_head,
+    )
+    # mixed_ba: (B, S, H_k, 2 * V_per_K)
+    mixed_ba = ba.reshape(new_shape_ba)
+
+    split_indices_ba = [self.v_heads_per_k_head]
+    # b_raw: (B, S, H_k, V_per_K)
+    # a_raw: (B, S, H_k, V_per_K)
+    b_raw, a_raw = jnp.split(mixed_ba, split_indices_ba, axis=3)
+
+    # b: (B, S, H_v)
+    b = b_raw.reshape(batch, seq_len, self.num_v_heads)
+    # a: (B, S, H_v)
+    a = a_raw.reshape(batch, seq_len, self.num_v_heads)
+
+    # Flatten head dimensions for concatenation before conv
+    # q: (B, S, K_dim)
+    q = query.reshape(batch, seq_len, -1)
+    # k: (B, S, K_dim)
+    k = key.reshape(batch, seq_len, -1)
+    # v: (B, S, V_dim)
+    v = value.reshape(batch, seq_len, -1)

     # =========================================================================
     # STEP B: 1D Convolution
     # =========================================================================
-    # qkv shape: (B, S, conv_dim)
+    # conv_dim = 2 * K_dim + V_dim
+    # qkv: (B, S, 2 * K_dim + V_dim)
     qkv = jnp.concatenate([q, k, v], axis=-1)

     # TODO(parambole): Implement caching logic for conv_state and recurrent_state

     # Input to conv_layer should be (B, S, C)
     # qkv_conv shape: (B, S, conv_dim)
-    qkv_conv = jax.nn.silu(self.conv1d(qkv).astype(jnp.float32)).astype(cfg.dtype)
+    conv_out = self.conv1d(qkv)
+    qkv_conv = jax.nn.silu(conv_out.astype(jnp.float32)).astype(cfg.dtype)
     # q_conv shape: (B, S, key_dim), k_conv shape: (B, S, key_dim), v_conv shape: (B, S, value_dim)
     q_conv, k_conv, v_conv = jnp.split(qkv_conv, [self.key_dim, 2 * self.key_dim], axis=-1)

@@ -450,13 +504,11 @@ def __call__(self, hidden_states: Array) -> Array:
     # =========================================================================
     # STEP D: Final Output Stage
     # =========================================================================
+
     # The normalization and gating is applied per-head on the value dimension.
-    # We first reshape the `z` tensor to match the multi-head structure of `core_attn_out`.
-    # z shape from (B, S, value_dim) -> (B, S, H_v, D_v)
-    z_reshaped = z.reshape(batch, seq_len, self.num_v_heads, self.head_v_dim)

     # Apply the norm and gate. Output shape: (B, S, H_v, D_v)
-    gated_output_reshaped = self.norm(core_attn_out, z_reshaped)
+    gated_output_reshaped = self.norm(core_attn_out, z)

     # Reshape back to a single feature dimension for the final projection.
     # Shape from (B, S, H_v, D_v) -> (B, S, value_dim)

@@ -506,9 +558,9 @@ def __init__(
     cfg = self.config

     scaling_factor = self.config.head_dim**-0.5
+    batch_size, seq_len = max_utils.get_batch_seq_len_for_mode(config, model_mode)
+    dummy_inputs_shape = (batch_size, seq_len, config.emb_dim)

-    inputs_q_shape = (cfg.per_device_batch_size, cfg.max_target_length, cfg.emb_dim)
-    inputs_kv_shape = (cfg.per_device_batch_size, cfg.max_target_length, cfg.emb_dim)
     self.attention = attentions.Attention(
         config=cfg,
         num_query_heads=cfg.num_query_heads,

@@ -517,8 +569,8 @@ def __init__(
         max_target_length=cfg.max_target_length,
         max_prefill_predict_length=cfg.max_prefill_predict_length,
         attention_kernel=cfg.attention,
-        inputs_q_shape=inputs_q_shape,
-        inputs_kv_shape=inputs_kv_shape,
+        inputs_q_shape=dummy_inputs_shape,
+        inputs_kv_shape=dummy_inputs_shape,
         out_axis_names=(BATCH, LENGTH_NO_EXP, EMBED),
         mesh=self.mesh,
         dtype=cfg.dtype,
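The new splitting logic mirrors the Hugging Face reference layout: the fused `in_proj_qkvz` output is grouped per key head, and each of the `H_k` groups packs `[q (D_k), k (D_k), v (V_per_K * D_v), z (V_per_K * D_v)]`. A toy-shape sketch of the reshape-and-split (all dimensions invented for illustration):

```
import jax.numpy as jnp

# Toy dims: H_k=2 key heads, D_k=4, H_v=6 value heads, D_v=8, so V_per_K=3.
B, S, H_k, D_k, H_v, D_v = 1, 3, 2, 4, 6, 8
V_per_K = H_v // H_k

group = 2 * D_k + 2 * D_v * V_per_K  # packed width per key head
qkvz = jnp.zeros((B, S, H_k * group))  # stand-in for the fused projection output

mixed = qkvz.reshape(B, S, H_k, group)
q, k, v_raw, z_raw = jnp.split(mixed, [D_k, 2 * D_k, 2 * D_k + V_per_K * D_v], axis=3)
v = v_raw.reshape(B, S, H_v, D_v)
z = z_raw.reshape(B, S, H_v, D_v)
print(q.shape, k.shape, v.shape, z.shape)
# (1, 3, 2, 4) (1, 3, 2, 4) (1, 3, 6, 8) (1, 3, 6, 8)
```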
