
Commit 96b72fb

Merge pull request #3283 from AI-Hypercomputer:agagik-shared-kv
PiperOrigin-RevId: 878275312
2 parents: 3d6a5de + fd5ae5f

5 files changed

Lines changed: 107 additions & 3 deletions


src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
@@ -331,6 +331,7 @@ param_scan_axis: 1
 # The attention_type parameter determines the variants of attention, e.g. global or local_sliding
 attention: 'autoselected' # Supported attention: autoselected, dot_product, flash, cudnn_flash_te
 attention_type: 'global' # Supported attention_type: global, local_sliding, chunk, mla
+share_kv_projections: False # Note: Not compatible with attention_type='mla'
 attention_bias: False # If True, adds a learnable bias to the query, key, and value projections
 attention_sink: False
 sliding_window_size: 0
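
The new flag defaults to False, so existing configs keep separate key and value projections unless it is turned on explicitly. A minimal sketch of reading the default back out of the YAML file, assuming PyYAML is installed and the repository layout shown in this diff:

import yaml  # third-party PyYAML, assumed available

with open("src/maxtext/configs/base.yml") as f:
  cfg = yaml.safe_load(f)

# Off by default; per the comment above it must stay off when attention_type='mla'.
print(cfg["share_kv_projections"])  # expected: False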

src/maxtext/configs/types.py

Lines changed: 6 additions & 0 deletions
@@ -476,6 +476,7 @@ class Attention(BaseModel):
       "autoselected",
       description="The attention algorithm to use (dot_product, flash, etc).",
   )
+  share_kv_projections: bool = Field(False, description="If True, Key and Value use the same projection.")
   attention_type: Literal["global", "local_sliding", "chunk", "mla", "full"] = Field(
       "global", description="The variant of attention to use."
   )
@@ -2505,6 +2506,11 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
           "Please disable attn_logits_soft_cap when using use_qk_clip."
       )
 
+    if self.share_kv_projections and self.fused_qkv:
+      raise ValueError("`share_kv_projections` is not compatible with `fused_qkv`.")
+    if self.share_kv_projections and self.attention_type == "mla":
+      raise ValueError("`share_kv_projections` is not compatible with `attention_type='mla'`.")
+
     # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
     # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
     if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
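
These checks reject key/value sharing in configurations where it cannot apply: with fused_qkv, Q, K, and V already come from one fused projection, and with MLA the key/value path goes through a latent projection rather than plain per-head KV weights. A small standalone sketch of the same mutual-exclusion rule (plain Python, not the actual pydantic Config model; field names mirror the diff):

from dataclasses import dataclass

@dataclass
class AttentionCfgSketch:
  # Hypothetical stand-in for the relevant Config fields.
  share_kv_projections: bool = False
  fused_qkv: bool = False
  attention_type: str = "global"

  def validate(self) -> None:
    if self.share_kv_projections and self.fused_qkv:
      raise ValueError("`share_kv_projections` is not compatible with `fused_qkv`.")
    if self.share_kv_projections and self.attention_type == "mla":
      raise ValueError("`share_kv_projections` is not compatible with `attention_type='mla'`.")

AttentionCfgSketch(share_kv_projections=True).validate()  # accepted
try:
  AttentionCfgSketch(share_kv_projections=True, attention_type="mla").validate()
except ValueError as err:
  print(err)  # rejected at config-validation time, before any model is built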

src/maxtext/layers/attentions.py

Lines changed: 10 additions & 2 deletions
@@ -130,6 +130,7 @@ def attention_as_linen(
     use_qk_norm: bool = False,
     query_pre_attn_scalar: float | None = None,
     use_bias_in_projections: bool = False, # Set to True will enable bias in q, k, v, o projections
+    share_kv_projections: bool = False, # If true, Key and Value use the same projection
     # Temperature tuning parameters used for Llama4
     temperature_tuning: bool = False,
     temperature_tuning_scale: float = 0.1,
@@ -199,6 +200,7 @@ def attention_as_linen(
       use_qk_norm=use_qk_norm,
       query_pre_attn_scalar=query_pre_attn_scalar,
       use_bias_in_projections=use_bias_in_projections,
+      share_kv_projections=share_kv_projections,
       temperature_tuning=temperature_tuning,
       temperature_tuning_scale=temperature_tuning_scale,
       temperature_tuning_floor_scale=temperature_tuning_floor_scale,
@@ -295,6 +297,7 @@ def __init__(
       use_qk_norm: bool = False,
       query_pre_attn_scalar: float | None = None,
       use_bias_in_projections: bool = False, # Set to True will enable bias in q, k, v, o projections
+      share_kv_projections: bool = False, # If true, Key and Value use the same projection
       # Temperature tuning parameters used for Llama4
       temperature_tuning: bool = False,
       temperature_tuning_scale: float = 0.1,
@@ -399,6 +402,7 @@ def __init__(
     self.use_qk_norm = use_qk_norm
     self.query_pre_attn_scalar = query_pre_attn_scalar
     self.use_bias_in_projections = use_bias_in_projections
+    self.share_kv_projections = share_kv_projections
     self.temperature_tuning = temperature_tuning
     self.temperature_tuning_scale = temperature_tuning_scale
     self.temperature_tuning_floor_scale = temperature_tuning_floor_scale
@@ -559,7 +563,8 @@ def _init_projections(self, inputs_q_shape: Tuple, inputs_kv_shape: Tuple) -> No
     else:
       self.query = self.init_query_w(inputs_q_shape=inputs_q_shape)
       self.key = self.init_kv_w(inputs_kv_shape=inputs_kv_shape)
-      self.value = self.init_kv_w(inputs_kv_shape=inputs_kv_shape)
+      if not self.share_kv_projections:
+        self.value = self.init_kv_w(inputs_kv_shape=inputs_kv_shape)
     self.out = self.init_out_w(output_dim=inputs_q_shape[-1])
 
   def init_query_w(self, inputs_q_shape: Tuple) -> nnx.Module:
@@ -1056,7 +1061,10 @@ def __call__(
     else:
       query = self.query_projection(inputs_q, out_sharding=qkv_sharding)
       key = self.kv_projection(inputs_kv, proj_name="key", out_sharding=qkv_sharding)
-      value = self.kv_projection(inputs_kv, proj_name="value", out_sharding=qkv_sharding)
+      if self.share_kv_projections:
+        value = key
+      else:
+        value = self.kv_projection(inputs_kv, proj_name="value", out_sharding=qkv_sharding)
 
     gate = None
     if self.is_qwen3_next:
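
Taken together, the hunks above implement the sharing at both weight-creation time (no self.value projection is built) and call time (the value tensor aliases the key tensor), which roughly halves the parameters spent on KV projections. A minimal JAX sketch of the call-time behavior, using hypothetical names and shapes rather than the Attention class above:

import jax
import jax.numpy as jnp

def project_kv(x, w_key, w_value=None, share_kv_projections=False):
  # x: [batch, seq, embed]; w_key / w_value: [embed, kv_heads, head_dim]
  key = jnp.einsum("bse,ekh->bskh", x, w_key)
  if share_kv_projections:
    value = key  # value reuses the key projection output; no value weight exists
  else:
    value = jnp.einsum("bse,ekh->bskh", x, w_value)
  return key, value

k1, k2 = jax.random.split(jax.random.PRNGKey(0))
x = jax.random.normal(k1, (2, 8, 16))
w_key = jax.random.normal(k2, (16, 4, 32))
key, value = project_kv(x, w_key, share_kv_projections=True)
assert (key == value).all()  # K and V are identical tensors when sharing is enabled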

src/maxtext/layers/decoders.py

Lines changed: 1 addition & 0 deletions
@@ -154,6 +154,7 @@ def __call__(
         reshape_q=cfg.reshape_q,
         use_mrope=cfg.use_mrope,
         mrope_section=cfg.mrope_section,
+        share_kv_projections=cfg.share_kv_projections,
         model_mode=model_mode,
       )
 
tests/unit/attention_test.py

Lines changed: 89 additions & 1 deletion
@@ -473,7 +473,15 @@ def test_tpu_kernel_attention_gqa(self):
   def test_tpu_kernel_attention_mqa(self):
     self.tpu_kernel_attention_helper(1)
 
-  def tpu_kernel_attention_helper(self, num_kv_heads):
+  @pytest.mark.tpu_only
+  def test_tpu_kernel_attention_mha_share_kv(self):
+    self.tpu_kernel_attention_helper(self.num_kv_heads, share_kv_projections=True)
+
+  @pytest.mark.tpu_only
+  def test_tpu_kernel_attention_gqa_share_kv(self):
+    self.tpu_kernel_attention_helper(self.num_kv_heads // 2, share_kv_projections=True)
+
+  def tpu_kernel_attention_helper(self, num_kv_heads, share_kv_projections=False):
     """Test equivalence between dot_product and TPU accelerated"""
 
     lnx, decoder_segment_ids, decoder_positions = self.get_data(self.dtype)
@@ -493,6 +501,7 @@ def tpu_kernel_attention_helper(self, num_kv_heads):
         attention_kernel="dot_product",
         dtype=self.dtype,
         dropout_rate=self.cfg.dropout_rate,
+        share_kv_projections=share_kv_projections,
         rngs=self.nnx_rng,
     )
 
@@ -522,6 +531,7 @@ def tpu_kernel_attention_helper(self, num_kv_heads):
         attention_kernel="flash",
         dtype=self.dtype,
         dropout_rate=self.cfg.dropout_rate,
+        share_kv_projections=share_kv_projections,
         rngs=self.nnx_rng,
     )
     nnx.update(attention_as_mha_flash, generic_state)
@@ -539,6 +549,84 @@ def tpu_kernel_attention_helper(self, num_kv_heads):
         jax.numpy.allclose(mha_generic_output, mha_generic_flash_output, rtol=1e-01, atol=1e-01, equal_nan=False)
     )
 
+  def test_share_kv_projections(self):
+    """Test that kv projections are shared."""
+    dummy_inputs_q = jnp.ones((self.global_batch_size, self.max_target_length, self.embed_dim))
+    dummy_inputs_kv = jnp.ones((self.global_batch_size, self.max_target_length, self.embed_dim))
+    attention_share_kv = Attention(
+        config=self.cfg,
+        num_query_heads=self.num_query_heads,
+        num_kv_heads=self.num_kv_heads,
+        head_dim=self.head_dim,
+        max_target_length=self.max_target_length,
+        max_prefill_predict_length=self.cfg.max_prefill_predict_length,
+        inputs_q_shape=dummy_inputs_q.shape,
+        inputs_kv_shape=dummy_inputs_kv.shape,
+        mesh=self.mesh,
+        attention_kernel="dot_product",
+        dtype=self.dtype,
+        dropout_rate=self.cfg.dropout_rate,
+        share_kv_projections=True,
+        rngs=self.nnx_rng,
+    )
+
+    self.assertFalse(hasattr(attention_share_kv, "value"))
+    self.assertTrue(hasattr(attention_share_kv, "key"))
+
+    # 1. Check NNX state
+    state_shared = nnx.state(attention_share_kv)
+    self.assertNotIn("value", state_shared)
+    self.assertIn("key", state_shared)
+
+    # 2. Forward Pass Verification
+    lnx, decoder_segment_ids, decoder_positions = self.get_data(self.dtype)
+
+    output_shared, _ = attention_share_kv(
+        lnx,
+        lnx,
+        decoder_segment_ids=decoder_segment_ids,
+        inputs_positions=decoder_positions,
+        deterministic=True,
+        model_mode=MODEL_MODE_TRAIN,
+    )
+
+    self.assertEqual(output_shared.shape, (self.global_batch_size, self.max_target_length, self.embed_dim))
+
+    # 3. Equivalence Check with standard unshared Attention
+    attention_no_share = Attention(
+        config=self.cfg,
+        num_query_heads=self.num_query_heads,
+        num_kv_heads=self.num_kv_heads,
+        head_dim=self.head_dim,
+        max_target_length=self.max_target_length,
+        max_prefill_predict_length=self.cfg.max_prefill_predict_length,
+        inputs_q_shape=dummy_inputs_q.shape,
+        inputs_kv_shape=dummy_inputs_kv.shape,
+        mesh=self.mesh,
+        attention_kernel="dot_product",
+        dtype=self.dtype,
+        dropout_rate=self.cfg.dropout_rate,
+        share_kv_projections=False,
+        rngs=self.nnx_rng,
+    )
+
+    # Force unshared layer to copy weights from shared layer, mapping 'key' to 'value'
+    attention_no_share.query.kernel.value = attention_share_kv.query.kernel.value
+    attention_no_share.key.kernel.value = attention_share_kv.key.kernel.value
+    attention_no_share.value.kernel.value = attention_share_kv.key.kernel.value
+    attention_no_share.out.kernel.value = attention_share_kv.out.kernel.value
+
+    output_no_share, _ = attention_no_share(
+        lnx,
+        lnx,
+        decoder_segment_ids=decoder_segment_ids,
+        inputs_positions=decoder_positions,
+        deterministic=True,
+        model_mode=MODEL_MODE_TRAIN,
+    )
+
+    self.assertTrue(jax.numpy.allclose(output_shared, output_no_share, rtol=1e-04, atol=1e-04, equal_nan=False))
+
   @parameterized.named_parameters(
       {
           "testcase_name": "cp_no_load_balance",
