
Commit 58fa181

3outeille/transformers backend (Dense model only) (#2048)
# Context

Reference PR: huggingface#1

This PR enables:

- Llama-like HF models to work with 4D parallelism: FSDP, CP, TP, PP (and combinations thereof). The following models were tested:
  - `meta-llama/Llama-3.2-1B`
  - `microsoft/phi-2`
  - `Qwen/Qwen2.5-7B`
  - `mistralai/Mistral-7B-v0.1`
  - `ByteDance-Seed/Seed-Coder-8B-Instruct`
  - `Qwen/Qwen3-4B-Instruct-2507`
  - `arcee-ai/AFM-4.5B`
  - `ibm-granite/granite-3b-code-base-2k`
  - `baidu/ERNIE-4.5-0.3B-Base-PT`
  - `kyutai/helium-1-preview-2b`
  - `allenai/OLMo-7B-hf`
  - `mistralai/Ministral-8B-Instruct-2410`
- Patching HF model weight initialisation. Without this, the `loss` and `grad_norm` start very high.

# Usage

- Requirements: `transformers==4.57.1`
- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3.toml`

```diff
...
[model]
- name = "llama3"
+ name = "transformers_backend"
flavor = "debugmodel"
hf_assets_path = "./tests/assets/tokenizer"

+[hf_transformers]
+model = "Qwen/Qwen3-4B-Instruct-2507"
...
```

- Train: `LOG_RANK=7 CONFIG_FILE=<YOUR_PATH>/torchtitan/experiments/transformers_backend/configs/qwen3.toml ./run_train.sh --job.custom_config_module=torchtitan.experiments.transformers_backend.job_config --compile.enable`

<img width="1334" height="453" alt="image" src="https://github.com/user-attachments/assets/da459448-027b-4af9-8176-6a3e433a272c" />

# Testing methodology

<img width="2672" height="2018" alt="image" src="https://github.com/user-attachments/assets/66d8689d-7ede-47e3-b389-d4fc1bdd70f7" />

- Following the [converging.md](https://github.com/pytorch/torchtitan/blob/main/docs/converging.md) guidelines, I am comparing the baseline `FSDP=2` against `FSDP=2 & <other //-ism>`.
- More precisely, `test_hf_integration.py` produces the following layout:

```bash
results/
|_ meta-llama
   |_ Llama-3.2-1B
      |_ debugmodel/
         |_ seed_checkpoint/
            |_ config.toml
            |_ seed.slurm
            |_ step-0/
               |_ ....
         |_ fsdp2_tp1_cp1_pp1/
            |_ config.toml
            |_ nd_parallelism.slurm
            |_ nd_parallelism.log
         |_ fsdp2_tp2_cp1_pp1/
            |_ config.toml
            |_ nd_parallelism.slurm
            |_ nd_parallelism.log
            |_ diff_baseline_vs_nd_parallelism.log
         |_ fsdp2_tp1_cp1_pp2/
            |_ config.toml
            |_ nd_parallelism.slurm
            |_ nd_parallelism.log
            |_ diff_baseline_vs_nd_parallelism.log
         |_ fsdp2_tp1_cp2_pp1/
            |_ config.toml
            |_ nd_parallelism.slurm
            |_ nd_parallelism.log
            |_ diff_baseline_vs_nd_parallelism.log
         |_ fsdp2_tp1_cp2_pp2/
            |_ config.toml
            |_ nd_parallelism.slurm
            |_ nd_parallelism.log
            |_ diff_baseline_vs_nd_parallelism.log
      |_ full/
         ...
```

- Here is the grid search used to test the HF modelling:

```shell
#!/usr/bin/bash

model_names=(
    "meta-llama/Llama-3.2-1B"
    "microsoft/phi-2"
    "Qwen/Qwen2.5-7B"
    "mistralai/Mistral-7B-v0.1"
    "ByteDance-Seed/Seed-Coder-8B-Instruct"
    "Qwen/Qwen3-4B-Instruct-2507"
    "arcee-ai/AFM-4.5B"
    "ibm-granite/granite-3b-code-base-2k"
    "baidu/ERNIE-4.5-0.3B-Base-PT"
    "kyutai/helium-1-preview-2b"
    "allenai/OLMo-7B-hf"
    "mistralai/Ministral-8B-Instruct-2410"
)

for model_name in "${model_names[@]}"; do
    rm -rf slurm_results/${model_name}
    python test_hf_integration.py create_configs --model_name "$model_name" --out_dir slurm_results --flavor debugmodel
    python test_hf_integration.py submit_jobs --inp_dir slurm_results/${model_name}/debugmodel/seed_checkpoint --qos high
    while [ ! -f slurm_results/${model_name}/debugmodel/seed_checkpoint/status.txt ] || [ "$(cat slurm_results/${model_name}/debugmodel/seed_checkpoint/status.txt)" != "completed" ]; do
        echo "Waiting for seed checkpoint from ${model_name} to complete ..."
        sleep 1
    done
    python test_hf_integration.py submit_jobs --inp_dir slurm_results/${model_name}/debugmodel --qos high
    echo "================"
done
```

# Further tasks

- MoE (handled in PR huggingface#3)
  - Missing `build_optimizers_with_moe_load_balancing` support for MoE
  - Missing TP/PP/EP support for MoE
- When using the HF modeling in the `FSDP=2 vs FSDP=2 + PP=2` test, the `loss` and `grad_norm` are not bitwise matching (but still converge), while they do match with the torchtitan modeling (issue tracked in huggingface#4)
- Add convergence tests to CI using a tiny model + gloo backend (once PP is bitwise matching)
- The HF modeling has lower MFU than the torchtitan modeling
- NOTE: set `import torch._dynamo.config; torch._dynamo.config.cache_size_limit = 128` to avoid graph recompilation when using `torch.compile` together with activation checkpointing (see the sketch below)
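For the last NOTE above, a minimal sketch of where the override would go (the surrounding entry point is an assumption, not part of this PR):

```python
# Minimal sketch (assumed placement, not part of the PR): raise torch._dynamo's
# recompile cache limit before compiling the model, so torch.compile combined
# with activation checkpointing does not keep recompiling graphs.
import torch._dynamo.config

torch._dynamo.config.cache_size_limit = 128

# ... then build the model, apply torch.compile, and run the training loop
```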
1 parent d167a20 commit 58fa181

File tree: 17 files changed, +1936 −2 lines

.ci/docker/common/install_conda.sh

Lines changed: 1 addition & 0 deletions

```diff
@@ -43,6 +43,7 @@ install_pip_dependencies() {
   pip_install -r /opt/conda/requirements.txt
   pip_install -r /opt/conda/requirements-flux.txt
   pip_install -r /opt/conda/requirements-vlm.txt
+  pip_install -r /opt/conda/requirements-transformers-backend.txt
   popd
 }
```

Lines changed: 1 addition & 0 deletions (new file)

```diff
@@ -0,0 +1 @@
+transformers==4.57.1
```

.ci/docker/ubuntu/Dockerfile

Lines changed: 1 addition & 0 deletions

```diff
@@ -33,6 +33,7 @@ COPY requirements-dev.txt /opt/conda/
 COPY requirements.txt /opt/conda/
 COPY requirements-flux.txt /opt/conda/
 COPY requirements-vlm.txt /opt/conda/
+COPY requirements-transformers-backend.txt /opt/conda/
 COPY conda-env-ci.txt /opt/conda/
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/utils.sh utils.sh
```
Lines changed: 53 additions & 0 deletions (new file)

```yaml
name: Transformers Backend 8 GPU Integration Tests

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/transformers_backend/**'
  pull_request:
    paths:
      - 'torchtitan/experiments/transformers_backend/**'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'

concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python -m torchtitan.experiments.transformers_backend.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
```

torchtitan/experiments/README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -31,3 +31,4 @@ We provide this `experiments/` folder to host experiments that add significant v
 | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) |
 | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) |
 | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
+| [transformers_backend](./transformers_backend/) | [![Transformers backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml?query=branch%3Amain) | [@3outeille](https://github.com/3outeille) |
```

torchtitan/experiments/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -12,5 +12,6 @@
         "vlm",
         "compiler_toolkit.deepseek_v3",
         "compiler_toolkit.llama3",
+        "transformers_backend",
     ]
 )
```
Lines changed: 52 additions & 0 deletions (new file)

````markdown
# Huggingface Transformers backend

## Quick start

- Requirements: `transformers==4.57.1`

- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3.toml`
```diff
...
[model]
- name = "llama3"
+ name = "transformers_backend"
flavor = "debugmodel"
hf_assets_path = "./tests/assets/tokenizer"

+[hf_transformers]
+model = "Qwen/Qwen3-4B-Instruct-2507"
...
```
- Train: `LOG_RANK=7 CONFIG_FILE=<YOUR_PATH>/torchtitan/experiments/transformers_backend/configs/qwen3.toml ./run_train.sh --job.custom_config_module=torchtitan.experiments.transformers_backend.job_config --compile.enable`
- Make sure you have created the tokenizers beforehand.

<img width="1334" height="453" alt="image" src="https://github.com/user-attachments/assets/da459448-027b-4af9-8176-6a3e433a272c" />

## Supported Features

- The following models were tested:
  - Dense (FSDP/CP/TP/PP/`torch.compile`)
    - `meta-llama/Llama-3.2-1B`
    - `microsoft/phi-2`
    - `Qwen/Qwen2.5-7B`
    - `mistralai/Mistral-7B-v0.1`
    - `ByteDance-Seed/Seed-Coder-8B-Instruct`
    - `Qwen/Qwen3-4B-Instruct-2507`
    - `arcee-ai/AFM-4.5B`
    - `ibm-granite/granite-3b-code-base-2k`
    - `baidu/ERNIE-4.5-0.3B-Base-PT`
    - `kyutai/helium-1-preview-2b`
    - `allenai/OLMo-7B-hf`
    - `mistralai/Ministral-8B-Instruct-2410`
  - MoE (upcoming)

## Known issues to address later

- When using the HF modeling in the `FSDP=2 vs FSDP=2 + PP=2` test, the `loss` and `grad_norm` are not bitwise matching (but still converge), while they do match with the torchtitan modeling. This will be addressed in another PR; the culprit is probably `register_buffer` when loading the `seed_checkpoint`.
- The HF modeling has lower MFU than the torchtitan modeling.

## Further work

- Missing `build_optimizers_with_moe_load_balancing` support for MoE
- Missing TP/PP/EP support for MoE
- Load HF weights
- Add LoRA support
````
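As a quick, hedged sanity check that the `[hf_transformers]` model name above resolves to a dense HF config — this snippet is illustrative and not part of the commit; it assumes `transformers==4.57.1` and network (or a cached snapshot):

```python
# Illustrative only: fetch just the config for the model named under
# [hf_transformers] and print a few fields that the debugmodel flavor overrides.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
print(cfg.model_type)                                                    # e.g. "qwen3"
print(cfg.num_hidden_layers, cfg.hidden_size, cfg.num_attention_heads)   # dense model dimensions
```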
Lines changed: 51 additions & 0 deletions (new file)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from torchtitan.components.loss import build_cross_entropy_loss
from torchtitan.components.lr_scheduler import build_lr_schedulers
from torchtitan.components.optimizer import build_optimizers
from torchtitan.components.tokenizer import build_hf_tokenizer
from torchtitan.hf_datasets.text_datasets import build_text_dataloader
from torchtitan.protocols.train_spec import TrainSpec

from .infra.parallelize import parallelize_hf_transformers

from .infra.pipeline import pipeline_hf_transformers
from .model.args import HFTransformerModelArgs, TitanDenseModelArgs
from .model.model import HFTransformerModel

__all__ = [
    "HFTransformerModelArgs",
    "HFTransformerModel",
]


flavors = {
    "debugmodel": HFTransformerModelArgs(
        titan_dense_args=TitanDenseModelArgs(
            dim=256,
            n_layers=2,
            n_heads=16,
            n_kv_heads=16,
        ),
    ),
    "full": HFTransformerModelArgs(
        titan_dense_args=TitanDenseModelArgs(),
    ),
}


def get_train_spec() -> TrainSpec:
    return TrainSpec(
        model_cls=HFTransformerModel,
        model_args=flavors,
        parallelize_fn=parallelize_hf_transformers,
        pipelining_fn=pipeline_hf_transformers,
        build_optimizers_fn=build_optimizers,
        build_lr_schedulers_fn=build_lr_schedulers,
        build_dataloader_fn=build_text_dataloader,
        build_tokenizer_fn=build_hf_tokenizer,
        build_loss_fn=build_cross_entropy_loss,
    )
```
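A hypothetical usage sketch of the module above (assumes a torchtitan checkout with this PR installed; `flavors` and `get_train_spec` are the module-level names defined in the file):

```python
# Hypothetical usage, not part of the commit: inspect what the experiment registers.
from torchtitan.experiments.transformers_backend import flavors, get_train_spec

print(sorted(flavors))           # ['debugmodel', 'full']
spec = get_train_spec()
print(spec.model_cls.__name__)   # 'HFTransformerModel'
```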
Lines changed: 88 additions & 0 deletions (new file)

```toml
# torchtitan Config.toml

[job]
dump_folder = "./outputs"
description = "Qwen 3 debug training"
print_config = true

[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 5
enable_memory_snapshot = false
save_memory_snapshot_folder = "memory_snapshot"

[metrics]
log_freq = 1
disable_color_printing = false
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = false

[model]
name = "transformers_backend"
flavor = "debugmodel"
# test folder with tokenizer.json, for debug purpose only
hf_assets_path = "./tests/assets/tokenizer"
# converters = ["float8"]

[hf_transformers]
model = "Qwen/Qwen3-4B-Instruct-2507"

[optimizer]
name = "AdamW"
lr = 8e-4
eps = 1e-8

[lr_scheduler]
warmup_steps = 2  # lr scheduler warm up, normally 20% of the train steps
decay_ratio = 0.8  # lr scheduler decay ratio, 80% of the train steps
decay_type = "linear"
min_lr_factor = 0.0

[training]
local_batch_size = 2
seq_len = 2048
max_norm = 1.0  # grad norm clipping
steps = 10
dataset = "c4_test"  # supported datasets: c4_test (2K), c4 (177M)
dataset_path = "./tests/assets/c4_test"

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default"  # default / never / always
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
pipeline_parallel_degree = 1
pipeline_parallel_schedule = "1F1B"
context_parallel_degree = 1
expert_parallel_degree = 1
expert_tensor_parallel_degree = 1

[checkpoint]
enable = false
folder = "checkpoint"
interval = 10
last_save_model_only = false
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[activation_checkpoint]
mode = "selective"  # ["none", "selective", "full"]
selective_ac_option = '2'  # 'int' = ac every positive int layer or 'op', ac based on ops policy

[compile]
enable = false
components = ["model", "loss"]

[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]

[validation]
enable = false
dataset = "c4_validation"
freq = 5
steps = 10
```
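As a small illustrative check on a config like the one above (not part of the commit; the config path and an 8-GPU node are assumptions, and `tomllib` requires Python 3.11+), the explicit parallelism degrees should divide the world size, with `data_parallel_shard_degree = -1` filling whatever is left:

```python
# Illustrative sketch: read the [parallelism] table and verify the explicit
# degrees divide the intended world size.
import tomllib

with open("torchtitan/experiments/transformers_backend/configs/qwen3.toml", "rb") as f:
    cfg = tomllib.load(f)

p = cfg["parallelism"]
explicit = (
    p["data_parallel_replicate_degree"]
    * p["tensor_parallel_degree"]
    * p["context_parallel_degree"]
    * p["pipeline_parallel_degree"]
)
world_size = 8  # assumption: a single 8-GPU node
assert world_size % explicit == 0, "explicit parallel degrees must divide the world size"
print("FSDP would shard over", world_size // explicit, "GPUs")  # -1 means 'use the rest'
```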
Lines changed: 87 additions & 0 deletions (new file)

```toml
# torchtitan Config.toml

[job]
dump_folder = "./outputs"
description = "Qwen 3 full training"
print_config = true

[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 5
enable_memory_snapshot = false
save_memory_snapshot_folder = "memory_snapshot"

[metrics]
log_freq = 1
disable_color_printing = false
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = false

[model]
name = "transformers_backend"
flavor = "full"
# test folder with tokenizer.json, for debug purpose only
hf_assets_path = "./tests/assets/tokenizer"
# converters = ["float8"]

[hf_transformers]
model = "Qwen/Qwen3-4B-Instruct-2507"

[optimizer]
name = "AdamW"
lr = 8e-4
eps = 1e-8

[lr_scheduler]
warmup_steps = 2  # lr scheduler warm up, normally 20% of the train steps
decay_ratio = 0.8  # lr scheduler decay ratio, 80% of the train steps
decay_type = "linear"
min_lr_factor = 0.0

[training]
local_batch_size = 2
seq_len = 2048
max_norm = 1.0  # grad norm clipping
steps = 10
dataset = "c4"  # supported datasets: c4_test (2K), c4 (177M)

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default"  # default / never / always
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
pipeline_parallel_degree = 1
pipeline_parallel_schedule = "1F1B"
context_parallel_degree = 1
expert_parallel_degree = 1
expert_tensor_parallel_degree = 1

[checkpoint]
enable = false
folder = "checkpoint"
interval = 10
last_save_model_only = false
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[activation_checkpoint]
mode = "selective"  # ["none", "selective", "full"]
selective_ac_option = '2'  # 'int' = ac every positive int layer or 'op', ac based on ops policy

[compile]
enable = false
components = ["model", "loss"]

[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]

[validation]
enable = false
dataset = "c4_validation"
freq = 5
steps = 10
```
