Draft

40 commits
3ccd12c  [WIP] Integrate autoparallel into torchtitan (wconstab, Jun 13, 2025)
e6d2caf  Autoparallel support for DP-only, DP+TP, or TP-only (wconstab, Jun 27, 2025)
68476b3  Update CLI inductor configs for bucketing/reordering (wconstab, Jul 25, 2025)
9ee9f75  add back llama3_autoparallel_init_fn (wconstab, Jul 25, 2025)
f6e4099  Track API change from new AOTAutograd interface (ezyang, Jul 28, 2025)
4d7ee8a  Support forcing the model into bf16 for perf debugging (wconstab, Jul 28, 2025)
b801d0b  Integrate MixedPrecision with AutoParallel and fix example_inputs (wconstab, Jul 29, 2025)
b099cf9  Use in-place compile API (ezyang, Jul 29, 2025)
b3587d9  Fix bucketing pass configs (wconstab, Jul 29, 2025)
42c2c07  Support both eager and autoparallel init based on model.name (wconstab, Jul 30, 2025)
d93845e  Remove llama3 init weights hack (wconstab, Aug 6, 2025)
60f5f11  Print profiling manifold url (wconstab, Aug 7, 2025)
6c782eb  Support new compile API from autoparallel PR #77 (wconstab, Aug 8, 2025)
4712163  Fix bucket sizes for AutoParallel 1D (#1545) (fmassa, Aug 8, 2025)
3f04d22  Add support for loss parallel (#1546) (fmassa, Aug 10, 2025)
8e50870  Add config for running simple-fsdp bucketing/reordering passes (wconstab, Aug 18, 2025)
91c5639  Hook up deepseekv3_auto_parallel (wconstab, Aug 19, 2025)
1233902  [dsv3] patch graph break fix, works up until sharding rules (xmfan, Aug 19, 2025)
4f8677b  update simplefsdp pass config (ruisizhang123, Aug 21, 2025)
714cc5b  [dsv3] disable MoE while we fix local_map, works up until optimizer (xmfan, Aug 22, 2025)
45647b3  Merge branch 'main' into whc/merge_autoparallel (wconstab, Aug 28, 2025)
bfa9f7f  tweak ds3 model.py to reflect main branch for DS3 baseline can run (#… (bdhirsh, Sep 5, 2025)
75fb2eb  add simplefsdp's autobucketing pass entry (#1658) (ruisizhang123, Sep 6, 2025)
8769396  [dsv3] 1D AP w/ local_map (xmfan, Sep 11, 2025)
db22479  [dsv3] Turn off Flex for AP (xmfan, Sep 17, 2025)
87ef4e0  Merge branch 'main' into autoparallel (xmfan, Oct 27, 2025)
9dc0bd8  Update to new model registration API (xmfan, Oct 27, 2025)
c6e25bd  Whc/knobs (#1994) (wconstab, Nov 6, 2025)
26410e8  Merge remote-tracking branch 'origin/main' into autoparallel (xmfan, Nov 18, 2025)
e6ea814  lint (xmfan, Nov 18, 2025)
7abede8  undo moe patching (xmfan, Nov 18, 2025)
d2e76b7  move inductor config into experiment folders (xmfan, Nov 18, 2025)
472b4ad  fix local_map moe patch (xmfan, Nov 19, 2025)
ac0def9  move flex disables into experiment folder (xmfan, Nov 19, 2025)
a24ef07  fix newline (xmfan, Nov 19, 2025)
da611e4  no longer necessary train.py changes (xmfan, Nov 19, 2025)
6cc8caa  restore comment (xmfan, Nov 19, 2025)
d54a6d4  temporarily extend hacky optimizer stuff to make dsv3 ap 1d run again (xmfan, Nov 19, 2025)
acd9588  Merge remote-tracking branch 'origin/main' into autoparallel (xmfan, Nov 21, 2025)
2b1fb92  fix moduledict with AP https://github.com/meta-pytorch/autoparallel/p… (xmfan, Nov 21, 2025)
31 changes: 23 additions & 8 deletions torchtitan/components/optimizer.py
@@ -340,10 +340,21 @@ def build_optimizers_with_moe_load_balancing(
        ft_manager=ft_manager,
    )

+    # AP friendly methods
+    def is_moe_block(block):
+        moe_enabled = getattr(block, "moe_enabled", False)
+        has_moe_submod = hasattr(block, "moe")  # AP
+        return moe_enabled or has_moe_submod
+
+    def should_manual_allreduce(tokens_per_expert_by_layer):
+        return not isinstance(
+            tokens_per_expert_by_layer, torch.distributed.tensor.DTensor
+        )
+
    def _should_register_moe_balancing_hook(model_parts: list[nn.Module]) -> bool:
        for model_part in model_parts:
            for transformer_block in model_part.layers.values():
-                if transformer_block.moe_enabled:
+                if is_moe_block(transformer_block):
                    # Assumption: load_balance_coeff is set universally on all moe blocks.
                    return bool(transformer_block.moe.load_balance_coeff)
        return False
@@ -359,12 +370,13 @@ def _update_expert_bias(
        dp_cp_mesh = (
            parallel_dims.world_mesh["dp_cp"] if parallel_dims.dp_cp_enabled else None
        )
+
        # TODO: Currently this sync is blocking (thus exposed) and happens on the
        # default compute stream. Need to assess if this is OK performance-wise.
        tokens_per_expert_list = []
        for model_part in model_parts:
            for transformer_block in model_part.layers.values():
-                if not transformer_block.moe_enabled:
+                if not is_moe_block(transformer_block):
                    continue
                if transformer_block.moe.load_balance_coeff is None:
                    return
@@ -380,17 +392,20 @@ def _update_expert_bias(
        tokens_per_expert_by_layer = torch.vstack(tokens_per_expert_list)

        if dp_cp_mesh is not None:
-            # Perform single all-reduce to get global statistics across all processes
-            pg = dp_cp_mesh.get_group()
-            torch.distributed.all_reduce(
-                tokens_per_expert_by_layer, group=pg, op=torch.distributed.ReduceOp.SUM
-            )
+            if should_manual_allreduce(tokens_per_expert_by_layer):
+                # Perform single all-reduce to get global statistics across all processes
+                pg = dp_cp_mesh.get_group()
+                torch.distributed.all_reduce(
+                    tokens_per_expert_by_layer,
+                    group=pg,
+                    op=torch.distributed.ReduceOp.SUM,
+                )

        moe_layer_idx = 0
        with torch.no_grad():
            for model_part in model_parts:
                for transformer_block in model_part.layers.values():
-                    if not transformer_block.moe_enabled:
+                    if not is_moe_block(transformer_block):
                        continue
                    moe = transformer_block.moe
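A note on the TODO above about the blocking sync: `torch.distributed.all_reduce` also has an async form that returns a work handle, which would let the reduction overlap with other work instead of being exposed on the default stream. The sketch below is illustrative only and not part of this PR; the helper name is made up and an initialized process group is assumed.

```python
# Illustrative sketch (not part of the diff): issuing the expert-count
# all-reduce asynchronously so it can overlap with other work.
import torch
import torch.distributed as dist


def start_tokens_allreduce(tokens_per_expert_by_layer: torch.Tensor, pg):
    # Returns a work handle instead of blocking until the collective completes.
    return dist.all_reduce(
        tokens_per_expert_by_layer, group=pg, op=dist.ReduceOp.SUM, async_op=True
    )


# Kick it off early, do other work, then wait right before the counts are read:
# work = start_tokens_allreduce(tokens_per_expert_by_layer, pg)
# work.wait()
```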
5 changes: 5 additions & 0 deletions torchtitan/config/job_config.py
@@ -865,6 +865,11 @@ class Experimental:
    needs to ensure that the path can be imported.
    """

+    # "aten" (default), "inductor", "none"
+    comms_bucket_reorder_strategy: str = "aten"
Contributor: This one might be worth landing in main. It's purely inductor-specific and used by simple-fsdp as well.
cc @ruisizhang123 @fegin @tianyu-l

Contributor: If it is shared by 2 different models/experiments, I think it is okay to add it to the core job_config. This will be used by full dtensor and the compiler toolkit, IIUC.

Contributor: It's inductor-specific bucketing. Maybe we should upstream the inductor bucketing code to pytorch?

tianyu-l (Nov 18, 2025): What happens if users run core FSDP2 with this option? Is it a no-op?

I think we've hit a tricky case where multiple experiments share config that's not in core. The "right" way for now might be just duplicating this config into their own custom job_config.py, but the deeper reason is that we need to reinvent the config system -- the idea is to let each component have its own config, rather than sharing a central config.

This seems a bit urgent, as we are hitting such issues from different angles recently.

cc @ailzhang

ruisizhang123 (Nov 19, 2025): Nothing would happen, because the graphs the compiler sees are compute-only graphs, so the bucketing pass would not take effect (no comms get bucketed/reordered).

Maybe we should have a config class specific for pt2-frontier lolll


+    autop_force_bf16: bool = False
Contributor: Is there a way for an experiment to add a config knob without polluting the top-level file?

@dataclass
class Validation:
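Regarding the question above about adding knobs without touching the top-level file: one possible shape for the "custom job_config.py" route mentioned in the earlier thread is sketched below. This is only an illustration; the dataclass and field names, and the assumption that an experiment-local JobConfig extension gets merged with the core config, are not established by this PR.

```python
# Hypothetical experiment-local config, e.g.
# torchtitan/experiments/auto_parallel/job_config.py (all names illustrative).
from dataclasses import dataclass, field


@dataclass
class AutoParallel:
    # Mirrors the two knobs this PR adds to Experimental.
    comms_bucket_reorder_strategy: str = "aten"  # "aten", "inductor", or "none"
    force_bf16: bool = False  # force the model into bf16 for perf debugging


@dataclass
class JobConfig:
    auto_parallel: AutoParallel = field(default_factory=AutoParallel)
```

The experiment code would then presumably read `job_config.auto_parallel.*`, leaving the core `Experimental` section untouched.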
2 changes: 2 additions & 0 deletions torchtitan/experiments/__init__.py
@@ -13,5 +13,7 @@
"compiler_toolkit.deepseek_v3",
"compiler_toolkit.llama3",
"transformers_backend",
"auto_parallel.llama3",
"auto_parallel.deepseek_v3",
]
)
11 changes: 11 additions & 0 deletions torchtitan/experiments/auto_parallel/README.md
@@ -0,0 +1,11 @@
## Auto Parallel

Requires installing autoparallel: `[email protected]:pytorch-labs/autoparallel.git`

`CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4`

Use simplefsdp's autobucketing pass:
Contributor: I feel like it's quite confusing to put simplefsdp's inductor autobucketing pass here. Should we consider moving part of the inductor bucketing utilities to pytorch and the algorithm to the simplefsdp folder?
cc @tianyu-l @wconstab


`CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4 --experimental.enable_simplefsdp_passes --compile.enable`

(or llama3-8b.toml)
50 changes: 50 additions & 0 deletions torchtitan/experiments/auto_parallel/deepseek_v3/__init__.py
@@ -0,0 +1,50 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) Meta Platforms, Inc. All Rights Reserved.

import copy

from torchtitan.components.loss import build_cross_entropy_loss
from torchtitan.components.lr_scheduler import build_lr_schedulers
from torchtitan.components.optimizer import build_optimizers_with_moe_load_balancing
from torchtitan.components.tokenizer import build_hf_tokenizer
from torchtitan.distributed.pipeline_parallel import pipeline_llm
from torchtitan.hf_datasets.text_datasets import build_text_dataloader

from torchtitan.models.deepseek_v3 import deepseekv3_args, DeepSeekV3Model
from torchtitan.models.deepseek_v3.model.args import DeepSeekV3ModelArgs
from torchtitan.models.deepseek_v3.model.state_dict_adapter import (
    DeepSeekV3StateDictAdapter,
)
from torchtitan.protocols.train_spec import TrainSpec

from .parallelize_deepseekv3 import parallelize_deepseekv3


def get_train_spec() -> TrainSpec:
    model_args = copy.deepcopy(deepseekv3_args)

    # Reset attention settings on the non-flex configs to the DeepSeekV3ModelArgs defaults.
    default_args = DeepSeekV3ModelArgs()
    for config, args in model_args.items():
        if "flex_attn" in config:
            continue

        args.use_flex_attn = default_args.use_flex_attn
        args.attn_mask_type = default_args.attn_mask_type

    return TrainSpec(
        model_cls=DeepSeekV3Model,
        model_args=model_args,
        parallelize_fn=parallelize_deepseekv3,
        pipelining_fn=pipeline_llm,
        build_optimizers_fn=build_optimizers_with_moe_load_balancing,
        build_lr_schedulers_fn=build_lr_schedulers,
        build_dataloader_fn=build_text_dataloader,
        build_tokenizer_fn=build_hf_tokenizer,
        build_loss_fn=build_cross_entropy_loss,
        state_dict_adapter=DeepSeekV3StateDictAdapter,
    )
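For orientation, a minimal smoke check of the spec above (purely illustrative, not part of the PR; it assumes `TrainSpec` exposes its constructor arguments as attributes, as a dataclass would):

```python
# Hypothetical smoke check: import the new spec and inspect what it registers.
from torchtitan.experiments.auto_parallel.deepseek_v3 import get_train_spec

spec = get_train_spec()
print(spec.model_cls.__name__)  # expected: "DeepSeekV3Model"
for name, args in spec.model_args.items():
    # Non-flex configs should now carry the default attention settings.
    print(name, args.use_flex_attn, args.attn_mask_type)
```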