
Commit 05a4a53

Merge pull request #2923 from AI-Hypercomputer:nicogrande/enable-gpt-oss-attention-vllm
PiperOrigin-RevId: 855388150
2 parents f27ac67 + e6976ba commit 05a4a53

1 file changed

Lines changed: 7 additions & 5 deletions

src/MaxText/layers/attentions.py
@@ -916,17 +916,19 @@ def forward_serve_vllm(
           "vLLM RPA attention ops require the vllm-tpu package. Please install it with `pip install vllm-tpu`."
       ) from e
 
-    if self.config.attention_sink:
-      raise NotImplementedError("Attention sink is not supported in MaxText vLLM RPA attention.")
     if rpa_kv_cache is None or rpa_metadata is None:
       raise ValueError("kv_cache and attention_metadata must be provided when using vLLM.")
 
     query = query.reshape(-1, query.shape[2], query.shape[3])
     key = key.reshape(-1, key.shape[2], key.shape[3])
     value = value.reshape(-1, value.shape[2], value.shape[3])
 
-    attention_chunk_size = self.config.chunk_attn_window_size if self.config.chunk_attn_window_size > 0 else None
+    if self.config.sliding_window_size > 0:
+      attention_chunk_size = self.config.sliding_window_size
+    else:
+      # Chunked attention currently not used in vLLM RPA.
+      attention_chunk_size = None
 
     q_scale, k_scale, v_scale = None, None, None
 
     md = rpa_metadata
@@ -941,7 +943,7 @@ def forward_serve_vllm(
       md.block_tables,
       md.query_start_loc,
       md.request_distribution,
-      None,
+      self.sinks.astype(jnp.float32) if self.sinks is not None else None,
       1.0,
       attention_chunk_size,
       q_scale,
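
For context on the new sink argument passed to the RPA kernel: GPT-OSS style attention sinks are learned per-head logits that join the softmax normalization without contributing a value vector, so a head can route probability mass to "nothing" instead of forcing it onto real tokens. The sketch below is a minimal, dense, single-request illustration of that idea; the helper name attention_with_sinks and its shapes are assumptions made for this example, not the paged vllm-tpu kernel invoked in the diff.

import jax
import jax.numpy as jnp

def attention_with_sinks(q, k, v, sinks=None):
  """Toy dense attention with optional per-head sink logits.

  q: [num_heads, q_len, head_dim]; k, v: [num_heads, kv_len, head_dim];
  sinks: [num_heads] learned logits, or None to recover plain attention.
  """
  scale = q.shape[-1] ** -0.5
  scores = jnp.einsum("hqd,hkd->hqk", q, k) * scale  # [heads, q_len, kv_len]
  if sinks is not None:
    # Append the sink logit as one extra "token" column per head.
    sink_col = jnp.broadcast_to(
        sinks.astype(jnp.float32)[:, None, None], (*scores.shape[:2], 1)
    )
    scores = jnp.concatenate([scores, sink_col], axis=-1)
  probs = jax.nn.softmax(scores, axis=-1)
  if sinks is not None:
    # The sink column only absorbs normalization mass; it has no value vector.
    probs = probs[..., :-1]
  return jnp.einsum("hqk,hkd->hqd", probs, v)

# Tiny usage example.
rng = jax.random.PRNGKey(0)
q = jax.random.normal(rng, (2, 3, 4))
k = jax.random.normal(rng, (2, 5, 4))
v = jax.random.normal(rng, (2, 5, 4))
print(attention_with_sinks(q, k, v, sinks=jnp.zeros(2)).shape)  # (2, 3, 4)

The float32 cast mirrors the diff's self.sinks.astype(jnp.float32); with sinks present, each head's attention weights over real tokens sum to less than one, which is the intended effect.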
