Commit 3f928e4 (parent: 1a84208)

Olmo3 checkpoint conversion and refactor of the Olmo3 model to support interleaved RoPE

13 files changed: 320 additions & 24 deletions

src/MaxText/layers/attentions.py

Lines changed: 23 additions & 6 deletions
@@ -67,7 +67,7 @@
 )
 from MaxText.layers.initializers import nd_dense_init, NdInitializer, variable_to_logically_partitioned, default_bias_init
 from MaxText.layers.linears import DenseGeneral, canonicalize_tuple, normalize_axes
-from MaxText.layers.normalizations import RMSNorm, Qwen3NextRMSNorm
+from MaxText.layers.normalizations import RMSNorm, Qwen3NextRMSNorm, GlobalRMSNorm
 from MaxText.layers.quantizations import AqtQuantization as Quant
 from maxtext.inference import kvcache, page_manager, paged_attention
 from maxtext.inference.kvcache import KVQuant
@@ -164,6 +164,7 @@ def attention_as_linen(
     use_mrope: bool = False,
     mrope_section: tuple[int, int, int] | None = None,
     name: str | None = None,
+    rope_type: str | None = None,
 ):
   """A factory function to create an Attention as a Linen module.

@@ -228,6 +229,7 @@ def attention_as_linen(
       use_mrope=use_mrope,
       mrope_section=mrope_section,
       name=name,
+      rope_type=rope_type,
       metadata_fn=variable_to_logically_partitioned,
       abstract_init=False,
   )
@@ -328,6 +330,7 @@ def __init__(
       use_mrope: bool = False,
       mrope_section: tuple[int, int, int] | None = None,
       name: str | None = None,
+      rope_type: str | None = None,
       rngs: Optional[nnx.Rngs] = None,
   ):
     """Initializes the Attention module.
@@ -367,6 +370,8 @@ def __init__(
       is_vision: Whether this is a vision attention layer.
       model_mode: The model's operational mode (e.g., 'train', 'prefill').
       base_kv_cache: Whether to use base (non-MLA) kv cache, if KVCache is used
+      rope_type: Optional override for the RoPE type (e.g., 'default', 'yarn').
+        If provided, this takes precedence over `config.rope_type`.
       rngs: RNG state for initialization, passed by the nnx.to_linen wrapper.
     """

@@ -424,6 +429,8 @@ def __init__(
     self.use_mrope = use_mrope
     self.mrope_section = mrope_section
     self.rngs = rngs
+    # Use the rope type specified in the arguments if provided, otherwise fall back to the one in the config.
+    self.rope_type = (rope_type or self.config.rope_type).lower()

     self.is_qwen3_next = self.config.decoder_block == DecoderBlockType.QWEN3_NEXT

@@ -490,18 +497,28 @@ def __init__(
     self.sinks = None

     is_llama4_decoder_block = self.config.decoder_block == DecoderBlockType.LLAMA4
+
     if self.use_qk_norm and not is_llama4_decoder_block:
-      self.query_norm = RMSNorm(
-          num_features=self.head_dim,
+      # Check if this is Olmo3, which uses a unique "Global" QK Norm strategy.
+      # GlobalRMSNorm flattens (Heads, Dim) to normalize across the entire hidden state.
+      use_global_qk_norm = self.config.model_name.startswith("olmo3")
+      qk_norm_cls = GlobalRMSNorm if use_global_qk_norm else RMSNorm
+
+      # For RMSNorm use `head_dim` (per-head normalization); for GlobalRMSNorm use `num_heads * head_dim` (global normalization).
+      q_features = (self.num_query_heads * self.head_dim) if use_global_qk_norm else self.head_dim
+      k_features = (self.num_kv_heads * self.head_dim) if use_global_qk_norm else self.head_dim
+
+      self.query_norm = qk_norm_cls(
+          num_features=q_features,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           shard_mode=self.config.shard_mode,
           epsilon=self.config.normalization_layer_epsilon,
           kernel_axes=("norm",),
           rngs=self.rngs,
       )
-      self.key_norm = RMSNorm(
-          num_features=self.head_dim,
+      self.key_norm = qk_norm_cls(
+          num_features=k_features,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           shard_mode=self.config.shard_mode,
@@ -726,7 +743,7 @@ def init_rotary_embedding(self):
     else:
       rope_embedding_dims = self.head_dim

-    rope_type = self.config.rope_type.lower()
+    rope_type = self.rope_type
     rope_use_scale = self.config.rope_use_scale
     if self.is_vision:
       if self.config.model_name.startswith("qwen3-omni"):
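For clarity, here is a tiny standalone sketch of the precedence the new rope_type argument introduces (the helper name and the assert examples are hypothetical; only the expression itself mirrors the diff):

def resolve_rope_type(rope_type_arg, config_rope_type):
  # Per-layer override wins; otherwise fall back to config.rope_type. Lower-cased either way.
  return (rope_type_arg or config_rope_type).lower()

assert resolve_rope_type(None, "YaRN") == "yarn"  # no override: the config value is used
assert resolve_rope_type("default", "yarn") == "default"  # Olmo3 sliding-window layers pass "default"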

src/MaxText/layers/decoders.py

Lines changed: 2 additions & 0 deletions
@@ -902,6 +902,8 @@ def __call__(
       layer_kwargs = {"layer_idx": lyr}
       if cfg.decoder_block == DecoderBlockType.GPT_OSS:
         layer_kwargs = {"attention_type": gpt_oss.get_attention_type(layer_id=lyr)}
+      if cfg.decoder_block == DecoderBlockType.OLMO3:
+        layer_kwargs = {"attention_type": olmo3.get_attention_type(layer_id=lyr)}
       layer = RemattedBlockLayer(
           config=cfg, mesh=mesh, name=f"layers_{lyr}", quant=self.quant, model_mode=self.model_mode, **layer_kwargs
       )
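olmo3.get_attention_type itself is not part of this diff. Purely as a hypothetical sketch (the interleaving period is an assumption, not taken from the source), such a helper maps a layer index to an attention type:

# Hypothetical sketch only: the real implementation lives in src/MaxText/layers/olmo3.py and may differ.
# It returns attentions.AttentionType values; plain strings are used here to stay self-contained.
def get_attention_type(layer_id: int, period: int = 4):
  """Assume full (global) attention on every `period`-th layer, sliding-window otherwise."""
  return "global" if (layer_id + 1) % period == 0 else "local_sliding"

# e.g. layers 0..7 -> sliding, sliding, sliding, global, sliding, sliding, sliding, global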

src/MaxText/layers/normalizations.py

Lines changed: 22 additions & 0 deletions
@@ -80,6 +80,28 @@ def __call__(self, x: jnp.ndarray, out_sharding: NamedSharding | None = None) ->
     return jnp.einsum("i...k,...k->i...k", y, effective_scale, out_sharding=out_sharding)


+class GlobalRMSNorm(RMSNorm):
+  """
+  Applies RMSNorm over the last two dimensions (Heads * HeadDim).
+  Used for Olmo3 which normalizes across all heads combined.
+  """
+
+  def __call__(self, x: jnp.ndarray, out_sharding: NamedSharding | None = None) -> jnp.ndarray:
+    # x shape: [..., Heads, HeadDim]
+    input_shape = x.shape
+
+    # Flatten the last two dimensions: [..., Heads * HeadDim]
+    # We use -2 and -1 to ensure we capture the last two dims regardless of rank
+    flattened_shape = input_shape[:-2] + (input_shape[-2] * input_shape[-1],)
+    x_flat = x.reshape(flattened_shape)
+
+    # Apply standard RMSNorm (which normalizes over the last axis)
+    y_flat = super().__call__(x_flat, out_sharding)
+
+    # Reshape back to [..., Heads, HeadDim]
+    return y_flat.reshape(input_shape)
+
+
 def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype: DType, *, rngs: nnx.Rngs):
   """
   Used for input and post attention layernorms
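To make the behavioural difference concrete, here is a minimal sketch in plain JAX (no learned scale, made-up shapes) of per-head RMS normalization versus the global variant GlobalRMSNorm implements:

import jax.numpy as jnp

def rms_normalize(x, eps=1e-6):
  # RMS-normalize over the last axis only (what the base RMSNorm does, minus the learned scale).
  return x * (jnp.mean(jnp.square(x), axis=-1, keepdims=True) + eps) ** -0.5

x = jnp.ones((2, 16, 8, 64))  # [batch, seq, heads, head_dim]; shapes are illustrative

per_head = rms_normalize(x)  # each head normalized independently over head_dim
global_norm = rms_normalize(x.reshape(2, 16, 8 * 64)).reshape(x.shape)  # all heads normalized together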

src/MaxText/layers/olmo3.py

Lines changed: 5 additions & 0 deletions
@@ -98,6 +98,10 @@ def __init__(
         rngs=rngs,
     )

+    current_rope_type = config.rope_type.lower()
+    if self.attention_type == attentions.AttentionType.LOCAL_SLIDING:
+      current_rope_type = "default"
+
     # Self-attention block
     self.attention = Attention(
         config=config,
@@ -121,6 +125,7 @@
         query_pre_attn_scalar=(config.head_dim**-0.5),
         model_mode=model_mode,
         use_qk_norm=config.use_qk_norm,
+        rope_type=current_rope_type,
         rngs=rngs,
     )
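The net effect is interleaved RoPE: sliding-window layers keep plain rotary embeddings, while full-attention layers use whatever config.rope_type specifies. A small illustration, assuming config.rope_type = "yarn" and the hypothetical four-layer interleaving sketched earlier:

# layer_id:        0          1          2          3         4          ...
# attention_type:  sliding    sliding    sliding    global    sliding    ...
# rope_type:       "default"  "default"  "default"  "yarn"    "default"  ...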

src/MaxText/utils/ckpt_conversion/to_maxtext.py

Lines changed: 13 additions & 8 deletions
@@ -111,9 +111,10 @@ class LazyHFLoader:
   can still occur in parallel.
   """

-  def __init__(self, model_id, token):
+  def __init__(self, model_id, token, revision=None):
     self.model_id = model_id
     self.token = token
+    self.revision = revision
     # Whether loads from local directory
     self.is_local = os.path.isdir(self.model_id)
     self.shard_map = {}
@@ -156,7 +157,7 @@ def _initialize_index(self):
     if self.is_local:
       index_path = os.path.join(self.model_id, index_file)
     else:
-      index_path = hf_hub_download(repo_id=self.model_id, filename=index_file, token=self.token)
+      index_path = hf_hub_download(repo_id=self.model_id, filename=index_file, token=self.token, revision=self.revision)
     with open(index_path, "r", encoding="utf-8") as f:
       index_data = json.load(f)
     self.shard_map = index_data["weight_map"]
@@ -186,7 +187,7 @@ def get_tensor(self, key: str) -> np.ndarray:
     else:
       # STEP 1: Download outside the lock.
       # multiple threads can download different shards at the same time.
-      local_path = hf_hub_download(repo_id=self.model_id, filename=shard_name, token=self.token)
+      local_path = hf_hub_download(repo_id=self.model_id, filename=shard_name, token=self.token, revision=self.revision)

      # STEP 2: Lock ONLY the reading into RAM.
      # This prevents multiple threads from simultaneously allocating large chunks of RAM.
@@ -574,7 +575,7 @@ def main(args: Sequence[str], test_args: Sequence[str]) -> None:
   output_directory = config.base_output_directory

   hf_token = config.hf_access_token
-
+  revision = test_args.revision
   use_lazy_load = test_args.lazy_load_tensors

   if use_lazy_load and config.use_multimodal:
@@ -586,14 +587,14 @@ def main(args: Sequence[str], test_args: Sequence[str]) -> None:
   # Define the appropriate tensor getter based on mode
   if use_lazy_load:
     max_logging.log(f"Lazy loading ENABLED. Initializing LazyHFLoader for: {model_id}...")
-    hf_loader = LazyHFLoader(model_id, hf_token)
-    hf_config_obj = AutoConfig.from_pretrained(model_id, token=hf_token)
+    hf_loader = LazyHFLoader(model_id, hf_token, revision=revision)
+    hf_config_obj = AutoConfig.from_pretrained(model_id, token=hf_token, revision=revision)
     print_ram_usage("After LazyLoader init")
     tensor_getter = hf_loader.get_tensor
   else:
     max_logging.log(f"Lazy loading DISABLED. Loading full HuggingFace model: {model_id}...")
-    hf_config_obj = AutoConfig.from_pretrained(model_id, token=hf_token)
-    hf_model = get_hf_model(model_id, token=hf_token)
+    hf_config_obj = AutoConfig.from_pretrained(model_id, token=hf_token, revision=revision)
+    hf_model = get_hf_model(model_id, token=hf_token, revision=revision)
     hf_state_dict_numpy = hf_model.state_dict()
     # Convert all to numpy immediately in eager mode
     for k, v in hf_state_dict_numpy.items():
@@ -729,6 +730,10 @@ def _eager_getter(key):
   # storage: chunk_shape=(151936, 1024) <-- Full layer in one chunk
   parser.add_argument("--simulated_cpu_devices_count", type=int, required=False, default=16)

+  parser.add_argument(
+      "--revision", type=str, required=False, default=None, help="Specific Hugging Face revision (branch/tag/commit)"
+  )
+
   # Parse local arguments
   # Parse known args returns the namespace AND the list of remaining arguments
   local_args, remaining_args = parser.parse_known_args()
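The pinned revision flows from the new --revision flag into LazyHFLoader, AutoConfig.from_pretrained, and get_hf_model. A hypothetical usage sketch of the lazy loader (the repo id, token variable, and weight key below are illustrative examples, not taken from this diff):

loader = LazyHFLoader("allenai/Olmo-3-7B", token=hf_token, revision="main")  # repo id and revision are examples
embedding = loader.get_tensor("model.embed_tokens.weight")  # a typical HF weight key, not from this diff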
