# Copyright 2023-2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AllenAI OLMo 3 32B Configuration
# https://huggingface.co/allenai/Olmo-3.1-32B-Instruct/blob/main/config.json

model_name: "olmo3_32b"
decoder_block: "olmo3"

# Model Dimensions
base_emb_dim: 5120
base_num_query_heads: 40
base_num_kv_heads: 8
base_mlp_dim: 27648
base_num_decoder_layers: 64
head_dim: 128

# Activations & Normalization
mlp_activations: ["silu", "linear"]
normalization_layer_epsilon: 1.e-6
use_qk_norm: True

# Attention
# Layers 0,1,2 use sliding window 4096. Layer 3 uses global. Repeats.
sliding_window_size: 4096
inhomogeneous_layer_cycle_interval: 4

# RoPE (YaRN)
rope_type: "yarn"
rope_max_timescale: 500000  # rope_theta
rope_factor: 8.0  # factor so 0.1 * ln(rope_factor) + 1.0 = 1.2079441541679836
original_max_position_embeddings: 8192
beta_fast: 32.0
beta_slow: 1.0
max_position_embeddings: 65536
rope_attention_scaling: True

# Embeddings
vocab_size: 100278
logits_via_embedding: False