Skip to content

Commit af5b89e

Browse files
committed
comment explaining hardware specific sharding
1 parent 752ce79 commit af5b89e

1 file changed

Lines changed: 3 additions & 0 deletions

File tree

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,9 @@ def __init__(
364 364
tpu_type = get_tpu_type()
365 365
is_ironwood = tpu_type == TpuType.TPU_7X
366 366

367+
# Hardware-aware sharding: Ironwood (v7x) uses 1D sharding along the heads dimension (leaving the embedding dimension replicated)
368+
# to minimize cross-device communication, while other hardware defaults to 2D sharding along both heads and embed dimensions.
369+
# This has currently only been tested on Trillium (v6e) and Ironwood (v7x).
367 370
if qkv_sharding_spec is None:
368 371
qkv_sharding_spec = (None, "heads") if is_ironwood else ("embed", "heads")
369 372
if out_sharding_spec is None:

0 commit comments

Comments (0)