@@ -345,12 +345,14 @@ def __init__(
         dtype: DType = jnp.float32,
         attention_kernel: str = "flash",
         rope_type: str = "interleaved",
+        enable_jax_named_scopes: bool = False,
     ):
         self.heads = heads
         self.rope_type = rope_type
         self.dim_head = dim_head
         self.inner_dim = dim_head * heads
         self.dropout_rate = dropout
+        self.enable_jax_named_scopes = enable_jax_named_scopes

         # 1. Define Partitioned Initializers (Logical Axes)
         # Q, K, V kernels: [in_features (embed), out_features (heads)]
@@ -433,6 +435,11 @@ def __init__(
             axis_names_kv=("batch", "heads", "length", "kv"),
         )

+    def conditional_named_scope(self, name: str):
+        import jax
+        import contextlib
+        return jax.named_scope(name) if getattr(self, "enable_jax_named_scopes", False) else contextlib.nullcontext()
+
     def __call__(
         self,
         hidden_states: Array,
@@ -445,13 +452,15 @@ def __call__(
         context = encoder_hidden_states if encoder_hidden_states is not None else hidden_states

         # 1. Project
-        query = self.to_q(hidden_states)
-        key = self.to_k(context)
-        value = self.to_v(context)
+        with self.conditional_named_scope("proj_in"):
+            query = self.to_q(hidden_states)
+            key = self.to_k(context)
+            value = self.to_v(context)

         # 2. Norm (Full Inner Dimension)
-        query = self.norm_q(query)
-        key = self.norm_k(key)
+        with self.conditional_named_scope("norm"):
+            query = self.norm_q(query)
+            key = self.norm_k(key)

         # 3. Apply RoPE to tensors of shape [B, S, InnerDim]
         # Frequencies are shape [B, S, InnerDim]
@@ -478,12 +487,14 @@ def __call__(

         # 4. Attention
         # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
-        attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
+        with self.conditional_named_scope("attention_op"):
+            attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)

         # 7. Output Projection
-        hidden_states = self.to_out(attn_output)
-
-        if self.dropout_layer is not None:
-            hidden_states = self.dropout_layer(hidden_states)
+        with self.conditional_named_scope("proj_out"):
+            hidden_states = self.to_out(attn_output)
+
+            if self.dropout_layer is not None:
+                hidden_states = self.dropout_layer(hidden_states)

         return hidden_states
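
The change follows a single pattern: each stage of __call__ (projection, norm, attention, output projection) is wrapped in jax.named_scope when enable_jax_named_scopes=True, and in contextlib.nullcontext() otherwise, so the wrapper costs nothing when disabled and the profiler trace gains readable per-stage labels when enabled. Below is a minimal, self-contained sketch of the same toggle outside this class; the function and variable names here are illustrative, not part of the commit.

import contextlib

import jax
import jax.numpy as jnp

def conditional_named_scope(name: str, enabled: bool):
    # jax.named_scope tags the enclosed ops with a readable name in
    # profiler traces; nullcontext() is the no-op stand-in when disabled.
    return jax.named_scope(name) if enabled else contextlib.nullcontext()

@jax.jit
def attention_like(q, k):
    # "qk_matmul" appears as a labeled span in the trace viewer when enabled.
    with conditional_named_scope("qk_matmul", enabled=True):
        return q @ k.T

q = jnp.ones((4, 8))
k = jnp.ones((4, 8))
print(attention_like(q, k).shape)  # (4, 4)

Run under jax.profiler.trace(log_dir) and viewed in TensorBoard or Perfetto, the named spans group each stage's ops. Since named_scope only attaches metadata, toggling the flag does not change numerics.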