Commit 351eebc

support attention data parallelism
1 parent 05bf24c · commit 351eebc

11 files changed: 88 additions & 52 deletions

src/MaxText/common_types.py

Lines changed: 5 additions & 0 deletions

@@ -32,6 +32,10 @@

 BATCH = "activation_batch"
 BATCH_NO_EXP = "activation_batch_no_exp"
+
+ATTN_LENGTH = "activation_attn_length"
+ATTN_LENGTH_NO_EXP = "activation_attn_length_no_exp"
+
 LENGTH = "activation_length"
 LENGTH_NO_EXP = "activation_length_no_exp"
 PREFILL_LENGTH = "prefill_activation_length"
@@ -40,6 +44,7 @@
 Q_LORA_UP_PROJ = "q_lora_up_proj"
 KV_LENGTH = "activation_kv_length"
 KV_LORA_UP_PROJ = "kv_lora_up_proj"
+ATTN_EMBED = "activation_attn_embed"
 EMBED = "activation_embed"
 HEAD = "activation_heads"
 PREFILL_KV_BATCH = "activation_prefill_kv_batch"
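
The added names follow the existing pattern: a logical axis name only becomes a concrete sharding once logical_axis_rules maps it to mesh axes. A minimal sketch of that resolution using Flax's logical-axis helpers; the rule set below is illustrative, not the exact rules added in this commit.

import flax.linen as nn

# Illustrative rules; the real mappings for these names live in base.yml / vllm.yml.
rules = (
    ("activation_batch", ("data",)),
    ("activation_attn_length_no_exp", ("context",)),
    ("activation_attn_embed", ("tensor",)),
)

spec = nn.logical_to_mesh_axes(
    ("activation_batch", "activation_attn_length_no_exp", "activation_attn_embed"), rules
)
print(spec)  # roughly PartitionSpec(('data',), ('context',), ('tensor',))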

src/MaxText/configs/base.yml

Lines changed: 5 additions & 0 deletions

@@ -393,6 +393,10 @@ logical_axis_rules: [
 ['activation_kv_heads', ['tensor', 'tensor_transpose', 'sequence','tensor_sequence']],
 ['activation_length', ['sequence', 'context', 'expert']],
 ['activation_length', ['context', 'expert']],
+['activation_attn_length', ['sequence', 'context', 'expert']],
+['activation_attn_length', ['context', 'expert']],
+['activation_attn_length_no_exp', ['sequence', 'context']],
+['activation_attn_length_no_exp', ['context']],
 ['activation_length_no_exp', ['sequence', 'context']],
 ['activation_length_no_exp', ['context']],
 ['activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
@@ -401,6 +405,7 @@ logical_axis_rules: [
 ['prefill_activation_length', ['sequence', 'context']],
 ['prefill_activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
 ['activation_kv_length', []],
+['activation_attn_embed', ['tensor', 'tensor_transpose']],
 ['activation_embed', ['tensor', 'tensor_transpose']],
 ['activation_mlp', ['tensor', 'tensor_transpose', 'tensor_sequence']],
 ['activation_kv', ['tensor', 'tensor_transpose', 'tensor_sequence']],

src/MaxText/configs/types.py

Lines changed: 2 additions & 0 deletions

@@ -2220,6 +2220,7 @@
       "model": self.ici_tensor_parallelism,
       "expert": self.ici_expert_parallelism,
       "autoregressive": self.ici_autoregressive_parallelism,
+      "attn_dp": 1,  # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
     }
     self.ici_parallelism = [ici_map[axis] for axis in self.mesh_axes]

@@ -2237,6 +2238,7 @@
       "model": self.dcn_tensor_parallelism,
       "expert": self.dcn_expert_parallelism,
       "autoregressive": self.dcn_autoregressive_parallelism,
+      "attn_dp": 1,  # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
     }
     self.dcn_parallelism = [dcn_map[axis] for axis in self.mesh_axes]
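
For illustration only: the comment above says the real attn_dp degree is derived on the vLLM side from tensor parallelism and num_kv_heads. One plausible reading, as a hypothetical helper (not code from this commit or from vLLM), is that whatever tensor-parallel degree cannot be used to split KV heads is folded into attention data parallelism.

def illustrative_attn_dp(tensor_parallelism: int, num_kv_heads: int) -> int:
  """Hypothetical: excess TP beyond the number of KV heads becomes attention DP."""
  if tensor_parallelism <= num_kv_heads:
    return 1
  assert tensor_parallelism % num_kv_heads == 0
  return tensor_parallelism // num_kv_heads

print(illustrative_attn_dp(tensor_parallelism=16, num_kv_heads=8))  # 2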

src/MaxText/configs/vllm.yml

Lines changed: 17 additions & 11 deletions

@@ -25,43 +25,49 @@ weight_dtype: bfloat16


 # -------------- Logical Axis Rules --------------
-mesh_axes: ['data', 'model', 'expert']
+mesh_axes: ['data', 'attn_dp', 'model', 'expert']
 logical_axis_rules: [
 ['activation_batch', ['expert']],
 ['activation_batch_no_exp', []],
 ['activation_embed_and_logits_batch', ['expert']],
 ['activation_embed_and_logits_batch_sequence', ['expert']],
 ['activation_heads', ['model']],
 ['activation_kv_heads', ['model']],
+['activation_attn_length', ['expert']],
+['activation_attn_length_no_exp', []],
 ['activation_length', ['data', 'expert']],
-['activation_q_length', ['data', 'expert']],
-['activation_embed', ['model']],
-['activation_mlp', ['model']],
+['activation_length_no_exp', 'data'],
+['activation_q_length', ['expert']],
+['activation_attn_embed', 'model'],
+['activation_embed', ['model', 'attn_dp']],
+['activation_mlp', ['model', 'attn_dp']],
 ['activation_kv', ['model']],
 ['activation_prefill_kv_batch', ['expert']],
 ['activation_kv_batch', ['expert']],
 ['activation_kv_batch_no_exp', []],
 ['activation_kv_head_dim', ['model']],
-['activation_vocab', ['model']],
-['activation_embed', ['model']],
+['activation_vocab', ['model', 'attn_dp']],
+['activation_norm_length', []],
 ['activation_exp', ['expert']],
 ['decode_batch', ['expert']],
-['mlp', ['model']],
-['mlp_no_fsdp', ['model']],
-['vocab', ['model']],
+['decode_length', []],
+['mlp', ['model', 'attn_dp']],
+['mlp_no_fsdp', ['model', 'attn_dp']],
+['vocab', ['model', 'attn_dp']],
 ['heads', ['model']],
 ['q_heads', ['model']],
 ['kv_heads', ['model']],
 ['kv_head_dim', []],
 ['kv', []],
 ['embed', ['expert']],
+['embed_tensor_transpose', ['attn_dp', 'model']],
 ['embed_no_exp', []],
 ['q_lora', ['expert']],
 ['kv_lora', ['expert']],
-['norm', ['model']],
+['norm', []],
 ['cache_heads', ['model']],
 ['exp', ['expert']],
 ['paged_kv_heads', ['model']],
 ]
-data_sharding: [['data', 'model', 'expert']]
+data_sharding: [['data', 'attn_dp', 'model', 'expert']]
 input_data_sharding_logical_axes: ['activation_embed_and_logits_batch']
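
A minimal sketch of what the new mesh layout looks like at the JAX level, under assumed axis sizes (data=1, attn_dp=2, model=4, expert=1, i.e. 8 devices); the batch dimension of the input is split over all four axes, matching data_sharding above.

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Illustrative sizes only; the real sizes come from the ici/dcn parallelism config.
devices = np.asarray(jax.devices()).reshape(1, 2, 4, 1)
mesh = Mesh(devices, axis_names=("data", "attn_dp", "model", "expert"))

# data_sharding: [['data', 'attn_dp', 'model', 'expert']] -> one array dim split over all axes.
batch_spec = PartitionSpec(("data", "attn_dp", "model", "expert"))
x = jax.device_put(np.zeros((16, 128), np.float32), NamedSharding(mesh, batch_spec))
print(x.sharding)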

src/MaxText/layers/attentions.py

Lines changed: 27 additions & 27 deletions

@@ -34,8 +34,8 @@
 D_KV,
 AxisNames,
 AxisIdxes,
-LENGTH,
-LENGTH_NO_EXP,
+ATTN_LENGTH,
+ATTN_LENGTH_NO_EXP,
 DType,
 Config,
 Array,
@@ -46,7 +46,7 @@
 KV_HEAD_DIM,
 KV_BATCH,
 KV_BATCH_NO_EXP,
-EMBED,
+ATTN_EMBED,
 MODEL_MODE_AUTOREGRESSIVE,
 MODEL_MODE_TRAIN,
 MODEL_MODE_PREFILL,
@@ -141,18 +141,18 @@ def attention_as_linen(
 prefill_query_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
 prefill_key_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
 prefill_value_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
-query_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-key_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-value_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-ep_query_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-ep_key_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-ep_value_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-input_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, EMBED),
-ep_input_axis_names: AxisNames = (BATCH_NO_EXP, LENGTH, EMBED),
-out_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, HEAD, D_KV),
-ep_out_axis_names: AxisNames = (BATCH_NO_EXP, LENGTH, HEAD, D_KV),
-prefill_input_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, EMBED),
-decode_input_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, EMBED),
+query_axis_names: AxisNames = (KV_BATCH, ATTN_LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
+key_axis_names: AxisNames = (KV_BATCH, ATTN_LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
+value_axis_names: AxisNames = (KV_BATCH, ATTN_LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
+ep_query_axis_names: AxisNames = (KV_BATCH_NO_EXP, ATTN_LENGTH, KV_HEAD, KV_HEAD_DIM),
+ep_key_axis_names: AxisNames = (KV_BATCH_NO_EXP, ATTN_LENGTH, KV_HEAD, KV_HEAD_DIM),
+ep_value_axis_names: AxisNames = (KV_BATCH_NO_EXP, ATTN_LENGTH, KV_HEAD, KV_HEAD_DIM),
+input_axis_names: AxisNames = (BATCH, ATTN_LENGTH_NO_EXP, ATTN_EMBED),
+ep_input_axis_names: AxisNames = (BATCH_NO_EXP, ATTN_LENGTH, ATTN_EMBED),
+out_axis_names: AxisNames = (BATCH, ATTN_LENGTH_NO_EXP, HEAD, D_KV),
+ep_out_axis_names: AxisNames = (BATCH_NO_EXP, ATTN_LENGTH, HEAD, D_KV),
+prefill_input_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, ATTN_EMBED),
+decode_input_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, ATTN_EMBED),
 prefill_out_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, HEAD, D_KV),
 decode_out_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, HEAD, D_KV),
 prefill_cache_axis_order: AxisIdxes = (1, 2, 0, 3),
@@ -300,18 +300,18 @@ def __init__(
 prefill_query_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
 prefill_key_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
 prefill_value_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
-query_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-key_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-value_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-ep_query_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-ep_key_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-ep_value_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-input_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, EMBED),
-ep_input_axis_names: AxisNames = (BATCH_NO_EXP, LENGTH, EMBED),
-out_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, HEAD, D_KV),
-ep_out_axis_names: AxisNames = (BATCH_NO_EXP, LENGTH, HEAD, D_KV),
-prefill_input_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, EMBED),
-decode_input_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, EMBED),
+query_axis_names: AxisNames = (KV_BATCH, ATTN_LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
+key_axis_names: AxisNames = (KV_BATCH, ATTN_LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
+value_axis_names: AxisNames = (KV_BATCH, ATTN_LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
+ep_query_axis_names: AxisNames = (KV_BATCH_NO_EXP, ATTN_LENGTH, KV_HEAD, KV_HEAD_DIM),
+ep_key_axis_names: AxisNames = (KV_BATCH_NO_EXP, ATTN_LENGTH, KV_HEAD, KV_HEAD_DIM),
+ep_value_axis_names: AxisNames = (KV_BATCH_NO_EXP, ATTN_LENGTH, KV_HEAD, KV_HEAD_DIM),
+input_axis_names: AxisNames = (BATCH, ATTN_LENGTH_NO_EXP, ATTN_EMBED),
+ep_input_axis_names: AxisNames = (BATCH_NO_EXP, ATTN_LENGTH, ATTN_EMBED),
+out_axis_names: AxisNames = (BATCH, ATTN_LENGTH_NO_EXP, HEAD, D_KV),
+ep_out_axis_names: AxisNames = (BATCH_NO_EXP, ATTN_LENGTH, HEAD, D_KV),
+prefill_input_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, ATTN_EMBED),
+decode_input_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, ATTN_EMBED),
 prefill_out_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, HEAD, D_KV),
 decode_out_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, HEAD, D_KV),
 prefill_cache_axis_order: AxisIdxes = (1, 2, 0, 3),
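
These defaults only take effect where the activations are actually constrained with those names. A sketch of how such a tuple is typically applied inside a Flax module (the wrapper function and shapes here are illustrative, not code from this commit):

import flax.linen as nn
import jax.numpy as jnp

def constrain_attention_input(x: jnp.ndarray) -> jnp.ndarray:
  # Each entry names the logical axis of the corresponding array dimension;
  # logical_axis_rules decides which mesh axes (if any) it maps to.
  return nn.with_logical_constraint(
      x, ("activation_batch", "activation_attn_length_no_exp", "activation_attn_embed")
  )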

src/MaxText/layers/moe.py

Lines changed: 6 additions & 1 deletion

@@ -355,7 +355,7 @@ def __init__(

     if self.config.attention == "vllm_rpa":
       # vLLM uses 'model' as the tensor parallelism axis name
-      self._tensor_parallelism_name = "model"
+      self._tensor_parallelism_name = ("model", "attn_dp")
     else:
       self._tensor_parallelism_name = "tensor"

@@ -459,6 +459,11 @@ def get_expert_parallelism_size(self):
     return self.mesh.shape.get("expert", 1)

   def get_tensor_parallelism_size(self):
+    if isinstance(self._tensor_parallelism_name, tuple):
+      size = 1
+      for axis in self._tensor_parallelism_name:
+        size *= self.mesh.shape.get(axis, 1)
+      return size
     return self.mesh.shape.get(self._tensor_parallelism_name, 1)

   def get_tensor_transpose_parallelism_size(self):
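
What the new tuple branch computes is just the product of the mesh sizes of the listed axes, so when attention runs under vLLM the reported tensor-parallel degree spans both 'model' and 'attn_dp'. A standalone illustration with assumed axis sizes (the dict stands in for self.mesh.shape):

import math

mesh_shape = {"data": 1, "attn_dp": 2, "model": 4, "expert": 1}  # stand-in for self.mesh.shape
axes = ("model", "attn_dp")
print(math.prod(mesh_shape.get(a, 1) for a in axes))  # 8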

src/MaxText/maxtext_utils.py

Lines changed: 4 additions & 4 deletions

@@ -1175,12 +1175,12 @@ def schedule(step):
   return optax.join_schedules(pieces, boundaries)


-def print_state_mesh_shardings_params(state, state_sharding, mesh):
+def print_shardings_params(params, params_sharding, mesh):
   """Print state shardings."""
-  leaves_params, _ = jax.tree_util.tree_flatten_with_path(state.params)
-  leaves_sharding, _ = jax.tree_util.tree_flatten_with_path(state_sharding.params)
+  leaves_params, _ = jax.tree_util.tree_flatten_with_path(params)
+  leaves_sharding, _ = jax.tree_util.tree_flatten_with_path(params_sharding)
   for (path, leaf_val), (_, leaf_sharding) in zip(leaves_params, leaves_sharding):
-    path_str = "/".join(str(p.key) for p in path)
+    path_str = "/".join(str(p.key if hasattr(p, "key") else p.name) for p in path)
     shape = jax.typeof(leaf_val)
     pspec = sharding.remove_size_one_mesh_axis(leaf_sharding.spec, mesh)
     max_logging.log(f"{path_str:.<80} {shape} {tuple(pspec)}")
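
The hasattr check appears to be there because tree_flatten_with_path can yield path entries other than DictKey (which exposes .key), e.g. GetAttrKey entries (which expose .name) when the flattened tree is not a plain nested dict. A tiny self-contained illustration of the dict case:

import jax

leaves, _ = jax.tree_util.tree_flatten_with_path({"decoder": {"kernel": 1.0}})
for path, _ in leaves:
  print("/".join(str(p.key if hasattr(p, "key") else p.name) for p in path))  # decoder/kernel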

src/MaxText/model_creation_utils.py

Lines changed: 5 additions & 1 deletion

@@ -23,6 +23,7 @@
 import jax
 from jax.sharding import Mesh, AxisType
 from MaxText import maxtext_utils
+from MaxText import max_utils
 from MaxText import pyconfig
 from MaxText.layers import quantizations
 from MaxText.common_types import MODEL_MODE_TRAIN, ShardMode
@@ -153,7 +154,10 @@ def create_sharded_state():
   with nn.logical_axis_rules(config.logical_axis_rules):
     sharded_state = create_sharded_state()
   model = nnx.merge(graphdef, sharded_state)
-
+  # print weights sharding info under debug sharding mode
+  if config.debug_sharding:
+    max_utils.print_non_trivial_mesh_axis(model.mesh)
+    maxtext_utils.print_shardings_params(sharded_state, out_shardings, model.mesh)
   if config.load_parameters_path:
     try:
       ckptr = ocp.Checkpointer(

src/MaxText/train_compile.py

Lines changed: 1 addition & 1 deletion

@@ -228,7 +228,7 @@ def main(argv: Sequence[str]) -> None:
   # print weights sharding info under debug sharding mode
   if config.debug_sharding:
     max_utils.print_non_trivial_mesh_axis(topology_mesh)
-    maxtext_utils.print_state_mesh_shardings_params(shaped_train_args[0], state_mesh_shardings, topology_mesh)
+    maxtext_utils.print_shardings_params(shaped_train_args[0].params, state_mesh_shardings.params, topology_mesh)

   # Compile
   print("Jitting and compiling train step...", flush=True)

src/MaxText/train_utils.py

Lines changed: 1 addition & 1 deletion

@@ -218,7 +218,7 @@ def setup_train_loop(config, recorder, devices=None):
   # print weights sharding info under debug sharding mode
   if config.debug_sharding:
     max_utils.print_non_trivial_mesh_axis(model.mesh)
-    maxtext_utils.print_state_mesh_shardings_params(state, state_mesh_shardings, model.mesh)
+    maxtext_utils.print_shardings_params(state.params, state_mesh_shardings.params, model.mesh)

   if config.use_dpo:
     abstract_state, _, _ = maxtext_utils.get_abstract_state(model, tx, config, init_rng, mesh, is_training=True)
