Skip to content

Commit b1d3067

Browse files
committed
Add a loss comparison script
ghstack-source-id: a86c4eb Pull-Request: #2029
1 parent 22e959a commit b1d3067

File tree

2 files changed

+896
-0
lines changed

2 files changed

+896
-0
lines changed

.github/workflows/integration_test_8gpu_features.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,5 +76,12 @@ jobs:
7676
export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0)
7777
python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
7878
79+
# Verify accuracy: the loss of the 8-GPU HSDP+TP run must equal that of the 4-GPU FSDP baseline.
80+
echo "Checking FSDP4 v.s. HSDP2FSDP2TP2 accuracy parity"
81+
export baseline_options="--parallelism.data_parallel_replicate_degree=1"
82+
export test_options="--parallelism.data_parallel_replicate_degree=2 --parallelism.tensor_parallel_degree=2"
83+
python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --baseline-ngpus=4 --test-ngpus=8 --steps=1
84+
85+
# Clean up the checkpoints so that we don't waste network bandwidth and time uploading them.
7986
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
8087
rm -rf artifacts-to-be-uploaded/*/checkpoint

0 commit comments

Comments
 (0)