Skip to content

Commit ab84f8e

Browse files
authored
add qwen3-base variants and qwen3-1.7b (#3416)
Add qwen3-base to configs/types and checkpoint_conversion/param_mapping; add qwen3-base configs to checkpoint_conversion/hf_model_configs; run pyink formatting.
1 parent 086c50d commit ab84f8e

9 files changed

Lines changed: 266 additions & 0 deletions

File tree

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,22 @@
226226
torch_dtype="bfloat16",
227227
)
228228

229+
qwen3_1_7b_config = transformers.Qwen3Config(
230+
vocab_size=151936,
231+
hidden_size=2048,
232+
intermediate_size=6144,
233+
num_hidden_layers=28,
234+
num_attention_heads=16,
235+
num_key_value_heads=8,
236+
head_dim=128,
237+
hidden_act="silu",
238+
max_position_embeddings=40960,
239+
rms_norm_eps=1.0e-6,
240+
rope_theta=1000000.0,
241+
tie_word_embeddings=True,
242+
torch_dtype="bfloat16",
243+
)
244+
229245
qwen3_4b_config = transformers.Qwen3Config(
230246
vocab_size=151936,
231247
hidden_size=2560,
@@ -816,16 +832,22 @@
816832
"gemma3-12b": gemma3_12b_config,
817833
"gemma3-27b": gemma3_27b_config,
818834
"qwen3-0.6b": qwen3_0_6b_config,
835+
"qwen3-1.7b": qwen3_1_7b_config,
836+
"qwen3-1.7b-base": qwen3_1_7b_config,
819837
"qwen3-4b": qwen3_4b_config,
838+
"qwen3-4b-base": qwen3_4b_config,
820839
"qwen3-4b-thinking-2507": qwen3_4b_config,
821840
"qwen3-8b": qwen3_8b_config,
841+
"qwen3-8b-base": qwen3_8b_config,
822842
"qwen3-14b": qwen3_14b_config,
843+
"qwen3-14b-base": qwen3_14b_config,
823844
"qwen3-32b": qwen3_32b_config,
824845
"llama3.1-8b": llama31_8b_config,
825846
"llama3.1-8b-Instruct": llama31_8b_config,
826847
"llama3.1-70b": llama31_70b_config,
827848
"llama3.1-405b": llama31_405b_config,
828849
"qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
850+
"qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config,
829851
"qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
830852
"qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
831853
"deepseek3-671b": deepseek3_671b_config,

src/maxtext/checkpoint_conversion/utils/param_mapping.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2333,15 +2333,21 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
23332333
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23342334
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23352335
"qwen3-0.6b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
2336+
"qwen3-1.7b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
2337+
"qwen3-1.7b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23362338
"qwen3-4b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
2339+
"qwen3-4b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23372340
"qwen3-4b-thinking-2507": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23382341
"qwen3-8b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
2342+
"qwen3-8b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23392343
"qwen3-14b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
2344+
"qwen3-14b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23402345
"qwen3-32b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23412346
"llama3.1-8b": LLAMA31_MAXTEXT_TO_HF_PARAM_MAPPING,
23422347
"llama3.1-70b": LLAMA31_MAXTEXT_TO_HF_PARAM_MAPPING,
23432348
"llama3.1-405b": LLAMA31_MAXTEXT_TO_HF_PARAM_MAPPING,
23442349
"qwen3-30b-a3b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
2350+
"qwen3-30b-a3b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23452351
"qwen3-235b-a22b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23462352
"qwen3-coder-480b-a35b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
23472353
"deepseek3-671b": DEEPSEEK_MAXTEXT_TO_HF_PARAM_MAPPING,
@@ -2365,15 +2371,21 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
23652371
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23662372
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23672373
"qwen3-0.6b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2374+
"qwen3-1.7b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2375+
"qwen3-1.7b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23682376
"qwen3-4b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2377+
"qwen3-4b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23692378
"qwen3-4b-thinking-2507": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23702379
"qwen3-8b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2380+
"qwen3-8b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23712381
"qwen3-14b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2382+
"qwen3-14b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23722383
"qwen3-32b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23732384
"llama3.1-8b": LLAMA31_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23742385
"llama3.1-70b": LLAMA31_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23752386
"llama3.1-405b": LLAMA31_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23762387
"qwen3-30b-a3b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2388+
"qwen3-30b-a3b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23772389
"qwen3-235b-a22b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23782390
"qwen3-coder-480b-a35b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
23792391
"deepseek3-671b": DEEPSEEK_MAXTEXT_TO_HF_PARAM_HOOK_FN,
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# model config for qwen3-1.7b-base
16+
17+
base_emb_dim: 2048
18+
base_num_query_heads: 16
19+
base_num_kv_heads: 8
20+
base_mlp_dim: 6144
21+
base_num_decoder_layers: 28
22+
head_dim: 128
23+
mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
24+
vocab_size: 151936
25+
26+
decoder_block: "qwen3"
27+
28+
normalization_layer_epsilon: 1.0e-6
29+
rope_max_timescale: 1000000
30+
31+
use_qk_norm: True
32+
33+
logits_via_embedding: True # from "tie_word_embeddings": true
34+
normalize_embedding_logits: False
35+
enable_dropout: False # deterministic for testing
36+
37+
tokenizer_type: "huggingface"
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# model config for qwen3-1.7b
16+
17+
base_emb_dim: 2048
18+
base_num_query_heads: 16
19+
base_num_kv_heads: 8
20+
base_mlp_dim: 6144
21+
base_num_decoder_layers: 28
22+
head_dim: 128
23+
mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
24+
vocab_size: 151936
25+
26+
decoder_block: "qwen3"
27+
28+
normalization_layer_epsilon: 1.0e-6
29+
rope_max_timescale: 1000000
30+
31+
use_qk_norm: True
32+
33+
logits_via_embedding: True # from "tie_word_embeddings": true
34+
normalize_embedding_logits: False
35+
enable_dropout: False # deterministic for testing
36+
37+
tokenizer_type: "huggingface"
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# model config for qwen3-14b-base
16+
17+
base_emb_dim: 5120
18+
base_num_query_heads: 40
19+
base_num_kv_heads: 8
20+
base_mlp_dim: 17408
21+
base_num_decoder_layers: 40
22+
head_dim: 128
23+
mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
24+
vocab_size: 151936
25+
26+
decoder_block: "qwen3"
27+
28+
normalization_layer_epsilon: 1.0e-6
29+
rope_max_timescale: 1000000
30+
31+
use_qk_norm: True
32+
33+
logits_via_embedding: False # different from 0.6 and 4B variants, "tie_word_embeddings": false
34+
normalize_embedding_logits: False
35+
36+
tokenizer_type: "huggingface"
37+
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Model config for Qwen3-30B-A3B-base
16+
17+
# Core Architectural Parameters
18+
decoder_block: "qwen3_moe"
19+
base_emb_dim: 2048
20+
base_mlp_dim: 768
21+
base_num_query_heads: 32
22+
base_num_kv_heads: 4
23+
base_num_decoder_layers: 48
24+
head_dim: 128
25+
mlp_activations: ["silu", "linear"]
26+
vocab_size: 151936
27+
normalization_layer_epsilon: 1.0e-6
28+
use_qk_norm: True
29+
30+
# MoE Specific Parameters
31+
num_experts: 128
32+
num_experts_per_tok: 8
33+
base_moe_mlp_dim: 768
34+
norm_topk_prob: true
35+
36+
# RoPE Settings
37+
rope_max_timescale: 10_000_000
38+
39+
# General Model Settings
40+
enable_dropout: False
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# model config for qwen3-4b-base
16+
17+
base_emb_dim: 2560
18+
base_num_query_heads: 32
19+
base_num_kv_heads: 8
20+
base_mlp_dim: 9728
21+
base_num_decoder_layers: 36
22+
head_dim: 128
23+
mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
24+
vocab_size: 151936
25+
26+
decoder_block: "qwen3"
27+
28+
normalization_layer_epsilon: 1.0e-6
29+
rope_max_timescale: 1000000
30+
31+
use_qk_norm: True
32+
33+
logits_via_embedding: True # from "tie_word_embeddings": true
34+
normalize_embedding_logits: False
35+
enable_dropout: False # deterministic for testing
36+
37+
tokenizer_type: "huggingface"
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# model config for qwen3-8b-base
16+
17+
base_emb_dim: 4096
18+
base_num_query_heads: 32
19+
base_num_kv_heads: 8
20+
base_mlp_dim: 12288
21+
base_num_decoder_layers: 36
22+
head_dim: 128
23+
mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
24+
vocab_size: 151936
25+
26+
decoder_block: "qwen3"
27+
28+
normalization_layer_epsilon: 1.0e-6
29+
rope_max_timescale: 1000000
30+
31+
use_qk_norm: True
32+
33+
logits_via_embedding: False # different from smaller variants, "tie_word_embeddings": false
34+
normalize_embedding_logits: False
35+
enable_dropout: False # deterministic for testing
36+
37+
tokenizer_type: "huggingface"
38+

src/maxtext/configs/types.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,13 +233,19 @@ class ProfilerType(str, Enum):
233233
"gemma3-12b",
234234
"gemma3-27b",
235235
"qwen3-0.6b",
236+
"qwen3-1.7b",
237+
"qwen3-1.7b-base",
236238
"qwen3-4b",
239+
"qwen3-4b-base",
237240
"qwen3-4b-thinking-2507",
238241
"qwen3-8b",
242+
"qwen3-8b-base",
239243
"qwen3-14b",
244+
"qwen3-14b-base",
240245
"qwen3-32b",
241246
"qwen3-235b-a22b",
242247
"qwen3-30b-a3b",
248+
"qwen3-30b-a3b-base",
243249
"qwen3-480b-a35b",
244250
"qwen3-next-80b-a3b",
245251
"qwen3-omni-30b-a3b",

0 commit comments

Comments
 (0)