
# this example uses torchtitan llama4 MoE, see
try:
-    from torchtitan.experiments.llama4.model.args import TransformerModelArgs
-    from torchtitan.experiments.llama4.model.moe import MoE
+    from torchtitan.models.moe import MoE, MoEArgs
+    from torchtitan.models.moe.utils import set_token_group_alignment_size_m
except ImportError as e:
    raise ImportError(
        "torchtitan not installed, see installation instructions at https://github.com/pytorch/torchtitan"
@@ -20,12 +20,10 @@

# initialize model
device = torch.device("cuda")
-model_args = TransformerModelArgs(
-    moe_enabled=True,
-    num_experts=8,
-    dim=256,
-)
-model = MoE(model_args).to(torch.bfloat16).to(device)
+model_args = MoEArgs(num_experts=8, top_k=2, use_grouped_mm=True)
+dim = 256
+hidden_dim = dim * 4
+model = MoE(model_args, dim, hidden_dim).to(torch.bfloat16).to(device)
init_std = 0.02
model.init_weights(init_std, device)

@@ -40,14 +38,17 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
    return False


-# quantize the model
+# quantize the model, by default it is rowwise fp8
config = MoETrainingConfig()
quantize_(model, config=config, filter_fn=moe_module_filter_fn)

+alignment_size = 32 if config.scaling_type == MoEScalingType.MXFP8 else 16
+set_token_group_alignment_size_m(alignment_size)
+
# training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for step in range(10):
-    batch, seq, dim = 8, 2048, 256
+    batch, seq, dim = 8, 2048, dim
    x = torch.randn(
        batch, seq, dim, dtype=torch.bfloat16, requires_grad=True, device=device
    )
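
The hunk ends right after the input tensor is built. For orientation only, a minimal sketch of how the rest of the loop body typically continues is shown below (written at loop-body indentation); the sum() placeholder loss and the loop steps are illustrative assumptions, not lines from this diff:

    out = model(x)            # forward pass through the quantized MoE module
    loss = out.float().sum()  # placeholder objective, for illustration only
    loss.backward()           # backward runs through the quantized expert weights
    optimizer.step()
    optimizer.zero_grad()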