torchtitan/components/quantization/mx.py (2 additions, 2 deletions)

@@ -132,7 +132,7 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):

        self.recipe_name = job_config.quantize.grouped_mm.mx.recipe_name
        self.enabled = True
-       logger.info("MXFP8 MoE training enabled")
+       logger.info(f"MXFP8 MoE training enabled with recipe: {self.recipe_name}")

    def convert(self, model: nn.Module):
        """

@@ -154,7 +154,7 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
                    return True
            return False

-       config = MoETrainingConfig(scaling_type=MoEScalingType.MXFP8)
+       config = MoETrainingConfig(scaling_type=MoEScalingType(self.recipe_name))
        quantize_(model, config=config, filter_fn=moe_module_filter_fn)
        logger.info(
            f"Converted MoE layers matching FQNS {self.moe_fqns} "
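
The change above builds the torchao scaling type from the configured recipe string instead of hard-coding MoEScalingType.MXFP8. A minimal sketch of the enum-by-value lookup this relies on, assuming MoEScalingType behaves like a string-valued Enum whose values match the config recipe names (the class below is an illustrative stand-in, not torchao's actual definition):

    from enum import Enum

    # Illustrative stand-in for torchao's MoEScalingType; the values are assumed
    # to match the recipe names accepted by quantize.grouped_mm.mx.recipe_name.
    class MoEScalingType(str, Enum):
        MXFP8 = "mxfp8"
        MXFP8_WGRAD_WITH_HP = "mxfp8_wgrad_with_hp"

    recipe_name = "mxfp8_wgrad_with_hp"
    scaling_type = MoEScalingType(recipe_name)  # Enum(value) lookup; raises ValueError on unknown recipes
    assert scaling_type is MoEScalingType.MXFP8_WGRAD_WITH_HP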

torchtitan/config/job_config.py (7 additions, 2 deletions)

@@ -784,10 +784,15 @@ class MXLinear:

@dataclass
class MXGroupedMM:
-    recipe_name: Literal["mxfp8"] = "mxfp8"
+    recipe_name: Literal["mxfp8", "mxfp8_wgrad_with_hp"] = "mxfp8"

Review comments on this line:

Contributor:
In the tutorial:
> The mxfp8_wgrad_with_hp recipe is required for MoE training with expert parallelism.
  • Why is it only required for EP?
  • Why isn't the default here wgrad with hp?
  • Why is mxfp8 an option if hp wgrad is "required"?

Contributor:
I just saw #2250 (comment).
Does it make sense that we don't give the user control, and just do:
  • mxfp8 when DeepEP is used, or when no EP is used
  • mxfp8_wgrad_with_hp when EP > 1 but DeepEP is not used
When EP is not enabled, which one should the user use?

Contributor Author (@danielvegamyhre), Feb 10, 2026:
> When EP is not enabled, which one should the user use?
  • For performance, it depends on the expert shapes, batch size, and sequence length. For directional guidance on which recipe is better in a given context, I'm thinking about working on some tables like we have for float8 (see screenshot below). There has been positive feedback from users on this.
  • For accuracy / improved step quality, wgrad_with_hp computes weight gradients in bf16, so this recipe can be more of a net benefit, not in terms of TPS but in terms of "time to target validation loss" or "time to some eval score threshold." Luca also found a perf benefit for certain smaller shapes with fp8_rowwise_with_gw_hp (Hopper recipe). These have to be evaluated through experimentation, though.
[Screenshot: float8 recipe guidance tables, 2026-02-09]
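
To make the selection rule suggested in the thread above concrete, a minimal sketch; ep_degree and use_deepep are hypothetical inputs here, not actual torchtitan config fields:

    def pick_mx_grouped_mm_recipe(ep_degree: int, use_deepep: bool) -> str:
        # EP > 1 without DeepEP: keep wgrad in high precision, per the suggestion above.
        if ep_degree > 1 and not use_deepep:
            return "mxfp8_wgrad_with_hp"
        # DeepEP in use, or no expert parallelism: plain mxfp8.
        return "mxfp8"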

"""
Quantization recipe name for grouped GEMMs. Options: ["mxfp8"]
Quantization recipe name for grouped GEMMs. Options: ["mxfp8", "mxfp8_wgrad_with_hp"]

Recipes:
- "mxfp8": Use MXFP8 for all 3 grouped GEMMs in the forward and backward pass (output, dgrad, wgrad).
- "mxfp8_wgrad_with_hp": Use MXFP8 for forward output and dgrad, but keep wgrad in high-precision.
This can be used to trade-off some performance for improved accuracy. For some smaller expert shapes,
it is also better for performance.
Example: --quantize.grouped_mm.mx.recipe_name="mxfp8"
"""
