@@ -712,3 +712,87 @@ def __init__(self, ignored_ids, axis=1):
 
   def map(self, element):
     return shift_and_refine(element, ignored_ids=self.ignored_ids, axis=self.axis)
+
+
+@dataclasses.dataclass
+class ComputeQwen3OmniPositions(grain.MapTransform):
+  """Computes 3D position IDs for Qwen3-Omni multimodal sequences.
+
+  This transform replaces the standard 1D sequential positions with 3D
+  positions (temporal, height, width) for multimodal models like Qwen3-Omni.
+
+  For text-only sequences, all 3 dimensions receive the same sequential values.
+  For multimodal sequences with vision/audio, vision tokens get true 3D positions
+  and text tokens continue sequentially from max(vision_pos) + 1.
+
+  The actual position computation is delegated to multimodal_utils.get_rope_index(),
+  which can be tested and modified independently.
+  """
+
+  def __init__(
+      self,
+      data_column: str = "inputs",
+      spatial_merge_size: int = 2,
+      position_id_per_seconds: int = 25,
+      use_audio_in_video: bool = False,
+  ):
+    """Initialize the Qwen3-Omni position computation transform.
+
+    Args:
+      data_column: Name of the data column to compute positions for (default: "inputs").
+      spatial_merge_size: Number of patches merged spatially (e.g., 2 for a 2x2 → 1 merge).
+      position_id_per_seconds: Temporal granularity (tokens per second, typically 25).
+      use_audio_in_video: If True, audio tokens are interleaved with video tokens.
+    """
+    self.data_column = data_column
+    self.spatial_merge_size = spatial_merge_size
+    self.position_id_per_seconds = position_id_per_seconds
+    self.use_audio_in_video = use_audio_in_video
+
+  def map(self, element: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
+    """Compute 3D position IDs for the batch element.
+
+    Args:
+      element: Dictionary containing:
+        - {data_column}: Token IDs with shape (batch, seq_len).
+        - {data_column}_segmentation: Attention mask (1=real, 0=padding).
+        - image_grid_thw: Optional (num_images, 3) array.
+        - video_grid_thw: Optional (num_videos, 3) array.
+        - audio_lengths: Optional (num_audios,) array.
+        - second_per_grids: Optional (num_videos,) array.
+
+    Returns:
+      The element with {data_column}_position set to an array of shape
+      (3, batch, seq_len) (always 3D, even for text-only sequences) and
+      {data_column}_mrope_deltas set to the per-sequence position deltas.
+    """
+
+    # Extract inputs and metadata
+    input_ids = element[self.data_column]
+    attention_mask = element.get(f"{self.data_column}_segmentation")
+
+    # Extract multimodal metadata (if present)
+    image_grid_thw = element.get("image_grid_thw")
+    video_grid_thw = element.get("video_grid_thw")
+    audio_lengths = element.get("audio_lengths")
+    second_per_grids = element.get("second_per_grids")
+
+    # Call the standalone get_rope_index function from multimodal_utils
+    position_ids, mrope_position_deltas = multimodal_utils.get_rope_index(
+        input_ids=input_ids,
+        image_grid_thw=image_grid_thw,
+        video_grid_thw=video_grid_thw,
+        attention_mask=attention_mask,
+        use_audio_in_video=self.use_audio_in_video,
+        audio_lengths=audio_lengths,
+        second_per_grids=second_per_grids,
+        spatial_merge_size=self.spatial_merge_size,
+        position_id_per_seconds=self.position_id_per_seconds,
+    )
+
+    # Update element with 3D positions.
+    # Shape: (3, batch, seq_len), always 3D even for text-only sequences.
+    element[f"{self.data_column}_position"] = position_ids
+    element[f"{self.data_column}_mrope_deltas"] = mrope_position_deltas
+
+    return element
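To make the docstring's position scheme concrete, here is a hand-worked sketch; the Qwen2-VL-style MRoPE layout for the vision block is an assumption, and the temporal scaling by position_id_per_seconds is elided:

# Hypothetical token layout: [T0 T1 | one vision frame, t=1, h=2, w=2 | T2],
# after spatial merging. The h/w ordering of vision tokens is an assumption;
# text resumes at max(vision_pos) + 1, as the class docstring states.
#
#   token:     T0  T1 | V0  V1  V2  V3 | T2
#   temporal:   0   1 |  2   2   2   2 |  4
#   height:     0   1 |  2   2   3   3 |  4
#   width:      0   1 |  2   3   2   3 |  4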
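And a minimal usage sketch (illustrative, not part of this commit) applying the transform to a text-only batch; the toy element and values below are assumptions:

import numpy as np

# Hypothetical text-only batch: no image_grid_thw / video_grid_thw /
# audio_lengths keys are present, so per the docstring all three position
# planes receive the same sequential values.
transform = ComputeQwen3OmniPositions(data_column="inputs")
element = {
    "inputs": np.array([[5, 6, 7, 0]]),               # (batch=1, seq_len=4)
    "inputs_segmentation": np.array([[1, 1, 1, 0]]),  # 1 = real, 0 = padding
}
out = transform.map(element)
# out["inputs_position"] has shape (3, 1, 4): identical sequential values in
# the temporal, height, and width planes for text-only input.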