# Copyright 2023-2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AllenAI OLMo 3 32B Configuration
# https://huggingface.co/allenai/Olmo-3.1-32B-Instruct/blob/main/config.json

model_name: "olmo3_32b"
decoder_block: "olmo3"

# Model Dimensions
base_emb_dim: 5120
base_num_query_heads: 40
base_num_kv_heads: 8
base_mlp_dim: 27648
base_num_decoder_layers: 64
head_dim: 128

# Activations & Normalization
mlp_activations: ["silu", "linear"]
normalization_layer_epsilon: 1.e-6
use_qk_norm: True

# Attention
# Layers 0,1,2 use sliding window 4096. Layer 3 uses global. Repeats.
sliding_window_size: 4096
inhomogeneous_layer_cycle_interval: 4

# RoPE (YaRN)
rope_type: "yarn"
rope_max_timescale: 500000  # rope_theta
rope_factor: 8.0  # factor so 0.1 * ln(rope_factor) + 1.0 = 1.2079441541679836
original_max_position_embeddings: 8192
beta_fast: 32.0
beta_slow: 1.0
max_position_embeddings: 65536
rope_attention_scaling: True

# Embeddings
vocab_size: 100278
logits_via_embedding: False