Skip to content

Commit 5b3654d

Browse files
Merge pull request #3319 from AI-Hypercomputer:amandaliang
PiperOrigin-RevId: 879178990
2 parents 8a1b34d + 5f60e5e commit 5b3654d

3 files changed

Lines changed: 19 additions & 12 deletions

File tree

src/maxtext/kernels/megablox/ops.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,14 @@ def gmm(
4444
weight_gather_axes: List[Tuple[str, int]] | None = None,
4545
input_buffer_count: tuple[int, int, int] = (2, 2, 2),
4646
combine_scopes: bool = False,
47+
# TODO(amandaliang): get rid of the qwix_rule in favor of Qwix's interception feature
48+
qwix_rule: qwix.QtRule | None = None,
4749
):
4850
"""Grouped matrix multiplication operation."""
4951
quantization_rule = None
5052
if use_qwix_quantization:
5153
# get_current_rule has to be called outside of the _gmm_fwd function.
52-
quantization_rule = qpl.get_current_rule("gmm")
54+
quantization_rule = qwix_rule if qwix_rule else qpl.get_current_rule("gmm")
5355
if quantization_rule and not isinstance(quantization_rule, qwix.QtRule):
5456
raise ValueError("Expect a QtRule for quantized training.")
5557
else:

src/maxtext/layers/quantizations.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,19 @@ def dot_general(self, *args, **kwargs):
640640
return nn.NANOOFp8DotGeneralOp(name=op_id)(*args, **kwargs)
641641

642642

643+
def get_fp8_full_qwix_rule(config: Config):
644+
return qwix.QtRule(
645+
module_path="decoder/.*layers.*",
646+
weight_qtype=jnp.float8_e4m3fn,
647+
act_qtype=jnp.float8_e4m3fn,
648+
bwd_qtype=jnp.float8_e5m2,
649+
weight_calibration_method=config.weight_quantization_calibration_method,
650+
act_calibration_method=config.act_quantization_calibration_method,
651+
bwd_calibration_method=config.bwd_quantization_calibration_method,
652+
op_names=("dot_general", "gmm", "ragged_dot"),
653+
)
654+
655+
643656
def get_quantization_rule(config: Config):
644657
match config.quantization:
645658
case "int8":
@@ -661,16 +674,7 @@ def get_quantization_rule(config: Config):
661674
op_names=("dot_general",),
662675
)
663676
case "fp8_full":
664-
return qwix.QtRule(
665-
module_path="decoder/.*layers.*",
666-
weight_qtype=jnp.float8_e4m3fn,
667-
act_qtype=jnp.float8_e4m3fn,
668-
bwd_qtype=jnp.float8_e5m2,
669-
weight_calibration_method=config.weight_quantization_calibration_method,
670-
act_calibration_method=config.act_quantization_calibration_method,
671-
bwd_calibration_method=config.bwd_quantization_calibration_method,
672-
op_names=("dot_general", "gmm", "ragged_dot"),
673-
)
677+
return get_fp8_full_qwix_rule(config)
674678
case "fp8_gpu":
675679
return qwix.QtRule(
676680
module_path="decoder/.*layers.*",
@@ -808,7 +812,7 @@ def generate_quantizer_set(self, postfix: str = ""):
808812
postfix=postfix,
809813
variable_collection=OVERWRITE_WITH_GRADIENT,
810814
quantization_checkpoint_name="quantization",
811-
fp8_recipe=fp8_recipe
815+
fp8_recipe=fp8_recipe,
812816
)
813817

814818
@nn.compact

src/maxtext/models/deepseek_batchsplit.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,7 @@ def gmm(
815815
weight_gather_axes=weight_gather_axes,
816816
input_buffer_count=input_buffer_count,
817817
combine_scopes=combine_scopes,
818+
qwix_rule=quantizations.get_fp8_full_qwix_rule(config),
818819
)
819820
else:
820821
output = tokamax.ragged_dot(

0 commit comments

Comments (0)