Skip to content

Commit 5e78bbf

Browse files
eitanporat, hengtaoguo
authored and committed
Add Preprocessing and token placeholders
1 parent 40071fc commit 5e78bbf

11 files changed

Lines changed: 168 additions & 46 deletions

File tree

benchmarks/api_server/maxtext_generator.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -505,15 +505,13 @@ def _preprocess_inputs(self, text, prefill_length, image_path):
505505
num_images=1,
506506
)
507507
processor_output = mm_processor.preprocess_mm_data(self.config)
508-
prefill_length -= mm_processor.get_image_offsets(self.config.model_name, processor_output=processor_output)
508+
prefill_length -= mm_processor.get_image_offsets(config=self.config, processor_output=processor_output)
509509
images = processor_output.pixel_values
510510

511511
tokens, true_length = self.tokenizer.encode(text, is_bos=not self.has_chat_template, prefill_lengths=[prefill_length])
512512
if self.config.use_multimodal and image_path:
513-
tokens = mm_processor.prepare_text_for_image_fusion(
514-
tokens, model_name=self.config.model_name, processor_output=processor_output
515-
)
516-
true_length += mm_processor.get_image_offsets(self.config.model_name, processor_output=processor_output)
513+
tokens = mm_processor.prepare_text_for_image_fusion(tokens, config=self.config, processor_output=processor_output)
514+
true_length += mm_processor.get_image_offsets(config=self.config, processor_output=processor_output)
517515

518516
return tokens, true_length, images
519517

src/maxtext/configs/base.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -992,9 +992,11 @@ dtype_mm: "float32" # Data type for multimodal model's vision encoder
992992
remat_policy_for_vit: "minimal" # Remat policy for multimodal model's vision encoder. Check `remat_policy` for options.
993993
image_size_for_vit: 896 # Default for Gemma3, and should be overwritten by model's config
994994
image_path: "" # Local image path used for decoding, can be multiple paths separated by comma, exp "/path/image1.jpg,/path/image2.jpg"
995-
image_placeholder: "<|image|>"
996995
video_path: "" # Local video path used for decoding, can be multiple paths separated by comma, exp "/path/video1.mp4,/path/video2.mp4"
997996
audio_path: "" # Local audio path used for decoding, can be multiple paths separated by comma, exp "/path/audio1.wav,/path/audio2.wav"
997+
image_placeholder: "<|image|>"
998+
video_placeholder: "<|video|>"
999+
audio_placeholder: "<|audio|>"
9981000
use_audio_in_video: False
9991001
posemb_type_for_vit: "learn"
10001002
# max_num_images_per_example only applies for training when your image column is a list of images.

src/maxtext/configs/models/qwen3-omni-30b-a3b.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,4 +77,4 @@ max_sample_len_for_audio: 10000
7777
# MRoPE Settings (Multi-dimensional RoPE for multimodal)
7878
use_mrope: true
7979
mrope_section: [24, 20, 20]
80-
position_id_per_seconds: 25
80+
position_id_per_seconds: 13

src/maxtext/configs/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1457,6 +1457,8 @@ class MultimodalGeneral(BaseModel):
14571457
)
14581458
video_path: PathStr = Field("", description="Path to a video for decoding.")
14591459
audio_path: PathStr = Field("", description="Path to an audio file for decoding.")
1460+
video_placeholder: str = Field("<|video|>", description="Placeholder string for video in text prompts.")
1461+
audio_placeholder: str = Field("<|audio|>", description="Placeholder string for audio in text prompts.")
14601462
use_audio_in_video: bool = Field(False, description="Extract and use audio from video files.")
14611463
use_mrope: bool = Field(False, description="Enable Multi-dimensional RoPE for Qwen3-Omni models.")
14621464
mrope_section: list[int] = Field([24, 20, 20], description="Dimensions for temporal, height, width in MRoPE.")

src/maxtext/decode.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,16 @@ def main(argv: Sequence[str]) -> None:
104104
processor_outputs = mm_utils.PreprocessorOutput()
105105
if config.use_multimodal:
106106
processor_outputs = mm_processor.preprocess_mm_data(config)
107-
image_offsets = mm_processor.get_image_offsets(config.model_name, processor_output=processor_outputs)
107+
image_offsets = mm_processor.get_image_offsets(config=config, processor_output=processor_outputs)
108108

109109
prefill_length -= image_offsets
110110
text = mm_processor.reformat_prompt(
111111
prompt=config.prompt,
112112
image_placeholder=config.image_placeholder,
113+
video_placeholder=config.video_placeholder,
113114
model_name=config.model_name,
114115
num_images=processor_outputs.num_images,
116+
num_videos=getattr(processor_outputs, 'num_videos', 0),
115117
)
116118

117119
metadata = engine.get_tokenizer()
@@ -135,9 +137,7 @@ def main(argv: Sequence[str]) -> None:
135137
mrope_position_deltas = None
136138

137139
if config.use_multimodal:
138-
tokens = mm_processor.prepare_text_for_image_fusion(
139-
tokens, model_name=config.model_name, processor_output=processor_outputs
140-
)
140+
tokens = mm_processor.prepare_text_for_image_fusion(tokens=tokens, config=config, processor_output=processor_outputs)
141141
true_length += image_offsets
142142

143143
if config.use_mrope:
@@ -148,7 +148,7 @@ def main(argv: Sequence[str]) -> None:
148148
image_grid_thw=processor_outputs.pixel_grid_thw, # pytype: disable=attribute-error
149149
video_grid_thw=processor_outputs.video_grid_thw, # pytype: disable=attribute-error
150150
attention_mask=np.ones_like(tokens),
151-
use_audio_in_video=config.use_audio and processor_outputs.num_videos > 0, # pytype: disable=attribute-error
151+
use_audio_in_video=config.use_audio and getattr(processor_outputs, 'num_videos', 0) > 0,
152152
audio_lengths=processor_outputs.audio_lengths, # pytype: disable=attribute-error
153153
second_per_grids=processor_outputs.video_second_per_grid, # pytype: disable=attribute-error
154154
spatial_merge_size=config.spatial_merge_size_for_vit, # pytype: disable=attribute-error

src/maxtext/input_pipeline/hf_data_processing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def vision_sft_preprocessing_pipeline(
126126
)
127127
dataset = dataset.map(
128128
input_pipeline_utils.prepare_text_for_image_fusion,
129-
fn_kwargs={"column_name": text_columns[0], "model_name": config.model_name},
129+
fn_kwargs={"column_name": text_columns[0], "config": config},
130130
)
131131

132132
dataset = input_pipeline_utils.HFDataSource(

src/maxtext/input_pipeline/input_pipeline_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,10 @@ def _process_image_fn(image):
115115
return example
116116

117117

118-
def prepare_text_for_image_fusion(example, column_name, model_name):
118+
def prepare_text_for_image_fusion(example, column_name, config):
119119
"""prepare text for image fusion for multimodal SFT"""
120120
example[column_name] = mm_processor.prepare_text_for_image_fusion(
121-
example[column_name], model_name, processor_output=example["images"]
121+
tokens=example[column_name], config=config, processor_output=example["images"]
122122
)
123123
return example
124124

src/maxtext/multimodal/processor.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,21 +63,25 @@ def preprocess_image_for_training(image, model_name):
6363
raise ValueError(f"Model {model_name} not supported for image preprocessing.")
6464

6565

66-
def get_image_offsets(model_name, processor_output: mm_utils.PreprocessorOutput | None):
66+
def get_image_offsets(config, processor_output: mm_utils.PreprocessorOutput | None):
6767
"""Get the increase in total token count after inserting image token placeholders"""
68-
if model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
68+
if config.model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
6969
from maxtext.multimodal.processor_gemma3 import get_image_offsets_gemma3 # pylint: disable=import-outside-toplevel
7070

7171
return get_image_offsets_gemma3(processor_output)
72-
elif model_name in ["llama4-17b-16e", "llama4-17b-128e"]:
72+
elif config.model_name in ["llama4-17b-16e", "llama4-17b-128e"]:
7373
from maxtext.multimodal.processor_llama4 import get_image_offsets_llama4 # pylint: disable=import-outside-toplevel
7474

7575
return get_image_offsets_llama4(processor_output)
76+
elif config.model_name in ["qwen3-omni-30b-a3b"]:
77+
from maxtext.multimodal.processor_qwen3_omni import get_mm_offsets_qwen3_omni # pylint: disable=import-outside-toplevel
78+
79+
return get_mm_offsets_qwen3_omni(config, processor_output)
7680
else:
7781
return 0
7882

7983

80-
def reformat_prompt(prompt, image_placeholder, model_name, num_images):
84+
def reformat_prompt(prompt, image_placeholder, model_name, num_images, video_placeholder="<|video|>", num_videos=0):
8185
"""Reformat prompt for different models."""
8286
if model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
8387
from maxtext.multimodal.processor_gemma3 import reformat_prompt_gemma3 # pylint: disable=import-outside-toplevel
@@ -87,6 +91,16 @@ def reformat_prompt(prompt, image_placeholder, model_name, num_images):
8791
from maxtext.multimodal.processor_llama4 import reformat_prompt_llama4 # pylint: disable=import-outside-toplevel
8892

8993
return reformat_prompt_llama4(prompt, image_placeholder, num_images)
94+
elif model_name in ["qwen3-omni-30b-a3b"]:
95+
from maxtext.multimodal.processor_qwen3_omni import reformat_prompt_qwen3_omni # pylint: disable=import-outside-toplevel
96+
97+
return reformat_prompt_qwen3_omni(
98+
prompt=prompt,
99+
image_placeholder=image_placeholder,
100+
num_images=num_images,
101+
video_placeholder=video_placeholder,
102+
num_videos=num_videos,
103+
)
90104
else:
91105
return prompt
92106

@@ -99,22 +113,29 @@ def reformat_response(response, model_name):
99113
elif model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
100114
formatted_response = f"{response}<end_of_turn>"
101115
return formatted_response
116+
elif model_name in ["qwen3-omni-30b-a3b"]:
117+
formatted_response = f"{response}<|im_end|>"
118+
return formatted_response
102119
else:
103120
return response
104121

105122

106-
def prepare_text_for_image_fusion(texts, model_name, processor_output=None):
123+
def prepare_text_for_image_fusion(tokens, config, processor_output=None):
107124
"""Prepare text by adding extra tokens for image fusion based on the model."""
108-
if model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
125+
if config.model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
109126
from maxtext.multimodal.processor_gemma3 import add_extra_tokens_for_images_gemma3 # pylint: disable=import-outside-toplevel
110127

111-
return add_extra_tokens_for_images_gemma3(texts, max_num_images=processor_output.num_images)
112-
elif model_name in ["llama4-17b-16e", "llama4-17b-128e"]:
128+
return add_extra_tokens_for_images_gemma3(tokens, max_num_images=processor_output.num_images)
129+
elif config.model_name in ["llama4-17b-16e", "llama4-17b-128e"]:
113130
from maxtext.multimodal.processor_llama4 import add_extra_tokens_for_images_llama4 # pylint: disable=import-outside-toplevel
114131

115-
return add_extra_tokens_for_images_llama4(texts, processor_output)
132+
return add_extra_tokens_for_images_llama4(tokens, processor_output)
133+
elif config.model_name in ["qwen3-omni-30b-a3b"]:
134+
from maxtext.multimodal.processor_qwen3_omni import add_extra_tokens_for_qwen3_omni # pylint: disable=import-outside-toplevel
135+
136+
return add_extra_tokens_for_qwen3_omni(tokens, config, processor_output)
116137
else:
117-
raise ValueError(f"Model {model_name} does not support multimodal inference.")
138+
raise ValueError(f"Model {config.model_name} does not support multimodal inference.")
118139

119140

120141
def get_dummy_image_shape_for_init(model_name, batch_size=1, num_image_per_sequence=1):

src/maxtext/multimodal/processor_qwen3_omni.py

Lines changed: 79 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,11 @@
5959

6060
# Qwen3OmniMoe-specific processing
6161
QWEN3_OMNI_VISION_START_TOKEN = 151652 # <|vision_start|>
62-
QWEN3_OMNI_VISION_END_TOKEN = 151653 # <|vision_eos|>
62+
QWEN3_OMNI_VISION_END_TOKEN = 151653 # <|vision_end|>
6363
QWEN3_OMNI_IMAGE_TOKEN = 151655 # <|image_pad|>
6464
QWEN3_OMNI_VIDEO_TOKEN = 151656 # <|video_pad|>
6565
QWEN3_OMNI_AUDIO_START_TOKEN = 151669 # <|audio_start|>
66-
QWEN3_OMNI_AUDIO_END_TOKEN = 151648 # <|audio_eos|>
66+
QWEN3_OMNI_AUDIO_END_TOKEN = 151670 # <|audio_end|>
6767
QWEN3_OMNI_AUDIO_TOKEN = 151675 # <|audio_pad|>
6868
QWEN3_TEMPORAL_PATCH_SIZE = 2
6969
QWEN3_OMNI_IMAGE_SIZE = 768
@@ -90,6 +90,7 @@ class Qwen3OmniPreprocessorOutput(mm_utils.PreprocessorOutput):
9090
num_audios: int = 0
9191
audio_values: None | np.ndarray = None
9292
audio_mask: None | np.ndarray = None
93+
audio_lengths: None | np.ndarray = None
9394

9495

9596
def smart_resize(
@@ -477,41 +478,36 @@ def preprocess_mm_data_qwen3_omni(config):
477478
"""Placeholder for multimodal data preprocessing."""
478479
processor_outputs = Qwen3OmniPreprocessorOutput()
479480

480-
if config.image_path is not None:
481+
if config.image_path:
481482
images = [mm_utils.load_image_from_path(p) for p in config.image_path.split(",")]
482483
pixel_values, pixel_grid_thw = pre_process_qwen3_image(images, config)
483484
processor_outputs.pixel_values = pixel_values
484485
processor_outputs.pixel_grid_thw = pixel_grid_thw
485486
processor_outputs.num_images = len(images)
486487

487-
if config.video_path is not None:
488+
if config.video_path:
488489
video_array, _ = _read_video_decord(config.video_path)
489490
video_processed, video_grid_thw = preprocess_video(video_array, config)
490491
processor_outputs.video_values = video_processed
491492
processor_outputs.video_grid_thw = video_grid_thw
492493
processor_outputs.video_second_per_grid = np.asarray([config.temporal_patch_size_for_vit], dtype=np.float32)
493494
processor_outputs.num_videos = 1 # Only one video for now.
494495

495-
if config.video_path is not None and config.use_audio_in_video:
496+
if config.video_path and config.use_audio_in_video:
496497
# TODO(hengtaoguo): add support for separate audio files. Now only extract audio from video files.
497498
mt_audio = mm_utils.load_audio(config.video_path, sample_rate=SAMPLE_RATE)
498499
mt_audio, mt_audio_mask = pre_process_audio_qwen3_omni(mt_audio)
499500
processor_outputs.audio_values = mt_audio
500501
processor_outputs.audio_mask = mt_audio_mask
502+
# Compute audio_lengths from audio_mask
503+
audio_mask_sum = np.sum(mt_audio_mask, axis=-1)
504+
audio_lengths = _get_feat_extract_output_lengths(audio_mask_sum)
505+
processor_outputs.audio_lengths = np.array(audio_lengths, dtype=np.int32)
501506

502507
return processor_outputs
503508

504509

505-
def add_extra_tokens_for_qwen3_omni(
506-
tokens: np.ndarray | list,
507-
image_grid_thw: np.ndarray | None = None,
508-
video_grid_thw: np.ndarray | None = None,
509-
audio_lengths: np.ndarray | None = None,
510-
spatial_merge_size: int = 2,
511-
use_audio_in_video: bool = False,
512-
second_per_grids: np.ndarray | None = None,
513-
position_id_per_seconds: int = 25,
514-
):
510+
def add_extra_tokens_for_qwen3_omni(tokens, config, processor_output):
515511
"""Add extra tokens for Qwen3-Omni multimodal sequences.
516512
517513
Expands special tokens (<|image_pad|>, <|video_pad|>, <|audio_pad|>) into
@@ -532,6 +528,13 @@ def add_extra_tokens_for_qwen3_omni(
532528
Returns:
533529
Expanded token sequence with correct number of image/video/audio tokens.
534530
"""
531+
image_grid_thw = getattr(processor_output, "pixel_grid_thw", None)
532+
video_grid_thw = getattr(processor_output, "video_grid_thw", None)
533+
audio_lengths = getattr(processor_output, "audio_lengths", None)
534+
second_per_grids = getattr(processor_output, "video_second_per_grid", None)
535+
spatial_merge_size = config.spatial_merge_size_for_vit
536+
position_id_per_seconds = config.position_id_per_seconds
537+
535538
if not isinstance(tokens, np.ndarray):
536539
tokens = np.asarray(tokens)
537540

@@ -561,7 +564,7 @@ def add_extra_tokens_for_qwen3_omni(
561564

562565
# Handle audio-in-video: <|vision_start|><|video_pad|><|vision_end|>
563566
elif (
564-
use_audio_in_video
567+
config.use_audio_in_video
565568
and token == QWEN3_OMNI_VISION_START_TOKEN
566569
and i + 2 < len(token_list)
567570
and token_list[i + 1] == QWEN3_OMNI_VIDEO_TOKEN
@@ -1039,3 +1042,63 @@ def get_rope_index(
10391042
mrope_position_deltas = np.array(mrope_position_deltas).reshape(batch_size, 1)
10401043

10411044
return position_ids, mrope_position_deltas
1045+
1046+
1047+
def reformat_prompt_qwen3_omni(
1048+
prompt, image_placeholder="<|image|>", num_images=0, video_placeholder="<|video|>", num_videos=0
1049+
):
1050+
"""Reformat the prompt for Qwen3-Omni model."""
1051+
# Qwen3-Omni vision format: <|vision_start|><|image_pad|><|vision_end|>
1052+
# Qwen3-Omni mm token order: image_pad, video_pad, audio_pad (standalone audios), then text tokens.
1053+
# use_audio_in_video mode: such audio tokens are interleaved within video tokens.
1054+
qwen3_image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
1055+
qwen3_video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
1056+
1057+
if video_placeholder in prompt:
1058+
prompt = prompt.replace(video_placeholder, qwen3_video_placeholder)
1059+
video_placeholder_count = prompt.count(qwen3_video_placeholder)
1060+
if video_placeholder_count < num_videos:
1061+
prompt = qwen3_video_placeholder * (num_videos - video_placeholder_count) + prompt
1062+
1063+
if image_placeholder in prompt:
1064+
prompt = prompt.replace(image_placeholder, qwen3_image_placeholder)
1065+
image_placeholder_count = prompt.count(qwen3_image_placeholder)
1066+
if image_placeholder_count < num_images:
1067+
prompt = qwen3_image_placeholder * (num_images - image_placeholder_count) + prompt
1068+
1069+
# Qwen chat template
1070+
formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
1071+
return formatted_prompt
1072+
1073+
1074+
def get_mm_offsets_qwen3_omni(config, processor_output):
1075+
"""Calculate the token offsets for multimodal tokens in Qwen3-Omni model."""
1076+
# Calculate token expansion for Qwen3-Omni multimodal inputs
1077+
if processor_output is None:
1078+
return 0
1079+
1080+
total_offset = 0
1081+
spatial_merge_size = config.spatial_merge_size_for_vit # Default 2 for Qwen3-Omni
1082+
merge_length = spatial_merge_size**2
1083+
1084+
# Image tokens: <|image_pad|> expands to multiple image tokens
1085+
if processor_output.pixel_grid_thw is not None:
1086+
image_grid_thw = processor_output.pixel_grid_thw
1087+
for grid in image_grid_thw:
1088+
num_image_tokens = int((grid[0] * grid[1] * grid[2]) // merge_length)
1089+
total_offset += num_image_tokens - 1 # -1 for the original <|image_pad|> token
1090+
1091+
# Video tokens: <|video_pad|> expands to multiple video tokens
1092+
if processor_output.video_grid_thw is not None:
1093+
video_grid_thw = processor_output.video_grid_thw
1094+
for grid in video_grid_thw:
1095+
num_video_tokens = int((grid[0] * grid[1] * grid[2]) // merge_length)
1096+
total_offset += num_video_tokens - 1 # -1 for the original <|video_pad|> token
1097+
1098+
# Audio tokens: <|audio_pad|> expands based on audio_lengths
1099+
if processor_output.audio_lengths is not None:
1100+
audio_lengths = processor_output.audio_lengths
1101+
for audio_len in audio_lengths:
1102+
total_offset += int(audio_len) - 1 # -1 for the original <|audio_pad|> token
1103+
1104+
return total_offset

tests/unit/multimodal_utils_test.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@
1313
# limitations under the License.
1414

1515
""" Tests for the common MaxText utilities """
16+
import os
1617
import unittest
1718
import numpy as np
1819

20+
from MaxText import pyconfig
21+
from MaxText.globals import MAXTEXT_REPO_ROOT
1922
from maxtext.multimodal import processor as mm_processor
2023
from maxtext.multimodal import utils as mm_utils
2124
from maxtext.multimodal import processor_gemma3
@@ -195,8 +198,12 @@ def test_post_process_image_tokens(self):
195198
pixel_values=dummy_pixel_values,
196199
aspect_ratios=dummy_aspect_ratios,
197200
)
198-
199-
image_offsets = mm_processor.get_image_offsets(model_name=self.model_name, processor_output=processor_output)
201+
base_config_path = os.path.join(MAXTEXT_REPO_ROOT, "src", "maxtext", "configs", "base.yml")
202+
config = pyconfig.initialize(
203+
["", base_config_path],
204+
model_name="llama4-17b-16e",
205+
)
206+
image_offsets = mm_processor.get_image_offsets(config=config, processor_output=processor_output)
200207
post_processed_tokens = processor_llama4.add_extra_tokens_for_images_llama4(dummy_tokens, processor_output)
201208
self.assertEqual(post_processed_tokens.shape[0], dummy_tokens.shape[0] + image_offsets)
202209

0 commit comments

Comments
 (0)