From d7beeb3d74bc463e4608632fbb17dc62c3f79f3c Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 12:48:20 -0800 Subject: [PATCH 1/8] Update [ghstack-poisoned] --- .../integration_test_8gpu_features.yaml | 2 +- scripts/loss_compare.py | 154 +++++++++++++++++- 2 files changed, 149 insertions(+), 7 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index de0672eeef..ad0904dff3 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -96,7 +96,7 @@ jobs: echo "Checking FSDP4 v.s. HSDP2FSDP2TP2 accuracy parity" export baseline_options="--parallelism.data_parallel_replicate_degree=1" export test_options="--parallelism.data_parallel_replicate_degree=2 --parallelism.tensor_parallel_degree=2" - python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --baseline-ngpus=4 --test-ngpus=8 --steps=1 + python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --baseline-ngpus=4 --test-ngpus=8 --steps=1 --export-result "${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs/loss_compare_result.txt" # Cleanup the checkpoints so that we don't waste network bandwidth and time. rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint diff --git a/scripts/loss_compare.py b/scripts/loss_compare.py index 48a211c8f2..39c9c4b79e 100644 --- a/scripts/loss_compare.py +++ b/scripts/loss_compare.py @@ -168,6 +168,9 @@ def validate_arguments( test_train_file: str, test_options: str, steps: int, + assert_equal: bool, + export_result: str | None, + import_result: str | None, ) -> None: """Validate command line arguments.""" # Validate commit arguments - if one is ".", both must be "." @@ -201,6 +204,34 @@ def validate_arguments( log_print(f"Error: --steps must be a positive integer, got: {steps}") sys.exit(1) + # Validate export-result requires assert-equal + if export_result and not assert_equal: + log_print("Error: --export-result requires --assert-equal") + log_print(" Export only happens when losses are verified to match") + sys.exit(1) + + # Validate import-result requires assert-equal + if import_result and not assert_equal: + log_print("Error: --import-result requires --assert-equal") + log_print(" Import is used to verify all losses match") + sys.exit(1) + + # Validate export-result and import-result are mutually exclusive + if export_result and import_result: + log_print( + "Error: --export-result and --import-result cannot be " "used together" + ) + log_print( + " Use export to save results or import to compare " + "against saved results" + ) + sys.exit(1) + + # Validate import file exists + if import_result and not os.path.exists(import_result): + log_print(f"Error: Import file does not exist: {import_result}") + sys.exit(1) + # ============================================================================= # SETUP FUNCTIONS @@ -321,7 +352,10 @@ def check_git_clean_state() -> None: for line in result.stdout.strip().split("\n"): log_print(f" {line}") log_print("") - log_print("Please commit, stash, or discard your changes before running this script") + log_print( + "Please commit, stash, or discard your changes before " + "running this script" + ) log_print(" - To commit: git add -A && git commit -m 'message'") log_print(" - To stash: git stash") log_print(" - To discard: git checkout -- . && git clean -fd") @@ -431,6 +465,34 @@ def read_losses_from_file(loss_file: str) -> dict[int, float]: return losses +def export_losses_to_file(losses: dict[int, float], export_path: str) -> None: + """Export losses to file and stdout. + + Args: + losses: Dictionary mapping step numbers to loss values + export_path: Path to export file + """ + log_print(f"Exporting losses to {export_path}") + + # Write to file and collect output for stdout + with open(export_path, "w") as f: + for step in sorted(losses.keys()): + loss = losses[step] + line = f"{step} {loss}" + f.write(line + "\n") + + log_print(f"Exported {len(losses)} loss values:") + log_print() + + # Output to stdout in same format + for step in sorted(losses.keys()): + loss = losses[step] + print(f"{step} {loss}") + + log_print() + log_print(f"Losses saved to: {export_path}") + + def extract_loss_data(output_folder: str | None) -> None: """Extract loss data from logs.""" if not output_folder: @@ -554,13 +616,18 @@ def perform_loss_analysis( generate_summary_statistics(baseline_losses, test_losses, stats_file) -def assert_losses_equal(baseline_log: str, test_log: str) -> None: - """Assert that losses are equal between baseline and test using - unittest. +def assert_losses_equal( + baseline_log: str, test_log: str, import_result: str | None = None +) -> None: + """Assert that losses are equal between baseline and test using unittest. + + If import_result is provided, also compares baseline with imported losses. """ log_print("Asserting losses are equal...") log_print(f"Baseline log: {baseline_log}") log_print(f"Test log: {test_log}") + if import_result: + log_print(f"Import file: {import_result}") # Extract losses from both logs baseline_losses = extract_losses_from_log(baseline_log) @@ -577,6 +644,15 @@ def assert_losses_equal(baseline_log: str, test_log: str) -> None: log_print("Error: No losses found in test log") sys.exit(1) + # Load imported losses if provided + imported_losses = None + if import_result: + imported_losses = read_losses_from_file(import_result) + log_print(f"Loaded {len(imported_losses)} steps from import file") + if not imported_losses: + log_print("Error: No losses found in import file") + sys.exit(1) + # Create a test case class LossEqualityTest(unittest.TestCase): def test_losses_equal(self): @@ -591,10 +667,22 @@ def test_losses_equal(self): f"test has {len(test_steps)} steps", ) + # If imported losses exist, check steps match + if imported_losses: + imported_steps = set(imported_losses.keys()) + self.assertEqual( + baseline_steps, + imported_steps, + f"Steps mismatch: baseline has {len(baseline_steps)} steps, " + f"imported has {len(imported_steps)} steps", + ) + # Check that losses are equal for each step for step in sorted(baseline_steps): baseline_loss = baseline_losses[step] test_loss = test_losses[step] + + # Compare baseline vs test self.assertEqual( baseline_loss, test_loss, @@ -602,6 +690,18 @@ def test_losses_equal(self): f"baseline={baseline_loss}, test={test_loss}", ) + # Compare baseline vs imported (if provided) + # No need to compare test vs imported since: + # baseline==test and baseline==imported implies test==imported + if imported_losses: + imported_loss = imported_losses[step] + self.assertEqual( + baseline_loss, + imported_loss, + f"Loss mismatch at step {step}: " + f"baseline={baseline_loss}, imported={imported_loss}", + ) + # Run the test suite = unittest.TestLoader().loadTestsFromTestCase(LossEqualityTest) runner = unittest.TextTestRunner(verbosity=2) @@ -611,7 +711,13 @@ def test_losses_equal(self): log_print("Loss assertion failed!") sys.exit(1) else: - log_print("All losses are equal. Assertion passed!") + if import_result: + log_print( + "All losses are equal (baseline, test, and imported). " + "Assertion passed!" + ) + else: + log_print("All losses are equal. Assertion passed!") def cleanup_temp_files(output_folder: str | None) -> None: @@ -754,6 +860,24 @@ def parse_arguments() -> argparse.Namespace: "Script exits with error if losses differ." ), ) + parser.add_argument( + "--export-result", + default="", + help=( + "Export losses to specified file path (requires --assert-equal). " + "Exports only when losses match. Format: '{step} {loss}' per line." + ), + ) + parser.add_argument( + "--import-result", + default="", + help=( + "Import losses from specified file path for comparison " + "(requires --assert-equal). " + "Compares imported losses with both baseline and test " + "(all 3 must match)." + ), + ) parser.add_argument( "--job-dump-folder", default="outputs", @@ -785,6 +909,14 @@ def parse_arguments() -> argparse.Namespace: if not args.output_folder: args.output_folder = None + # Convert empty export_result to None + if not args.export_result: + args.export_result = None + + # Convert empty import_result to None + if not args.import_result: + args.import_result = None + return args @@ -848,6 +980,9 @@ def main() -> None: args.test_train_file, args.test_options, args.steps, + args.assert_equal, + args.export_result, + args.import_result, ) # Setup environment @@ -910,7 +1045,14 @@ def main() -> None: # Assert losses are equal if requested if args.assert_equal: - assert_losses_equal(baseline_log, test_log) + # Pass import_result if provided for 3-way comparison + assert_losses_equal(baseline_log, test_log, args.import_result) + + # Export losses if requested (only after assertion passes) + if args.export_result: + # Extract baseline losses (they equal test losses since assertion passed) + baseline_losses = extract_losses_from_log(baseline_log) + export_losses_to_file(baseline_losses, args.export_result) # Analysis and reporting perform_loss_analysis(baseline_log, test_log, stats_file) From 25d13811975d24ad82a59c952c75a4a83afc48d7 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 12:48:20 -0800 Subject: [PATCH 2/8] Update (base update) [ghstack-poisoned] --- scripts/loss_compare.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/loss_compare.py b/scripts/loss_compare.py index 31573c3bce..48a211c8f2 100644 --- a/scripts/loss_compare.py +++ b/scripts/loss_compare.py @@ -301,6 +301,33 @@ def print_configuration( # ============================================================================= +def check_git_clean_state() -> None: + """Check if git working directory is clean before switching commits. + + Raises SystemExit if there are uncommitted changes or untracked files. + """ + result = subprocess.run( + ["git", "status", "--porcelain"], + capture_output=True, + text=True, + check=True, + ) + + if result.stdout.strip(): + log_print("Error: Git working directory is not clean") + log_print(" Cannot switch commits with uncommitted changes") + log_print("") + log_print("Modified/untracked files:") + for line in result.stdout.strip().split("\n"): + log_print(f" {line}") + log_print("") + log_print("Please commit, stash, or discard your changes before running this script") + log_print(" - To commit: git add -A && git commit -m 'message'") + log_print(" - To stash: git stash") + log_print(" - To discard: git checkout -- . && git clean -fd") + sys.exit(1) + + def checkout_commit(commit: str, commit_name: str) -> None: """Checkout git commit.""" if commit != ".": @@ -840,6 +867,12 @@ def main() -> None: args.job_dump_folder, ) + # Check if git working directory is clean before switching commits + # Skip check if both commits are "." (comparing configs on same commit) + needs_git_checkout = args.baseline_commit != "." or args.test_commit != "." + if needs_git_checkout: + check_git_clean_state() + create_seed_checkpoint( enable_seed_checkpoint, args.baseline_config, From 7274b815fbb352b4978eaffde58bcbb379175d32 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 15:21:28 -0800 Subject: [PATCH 3/8] Update [ghstack-poisoned] --- .github/workflows/integration_test_8gpu_features.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index ad0904dff3..0011cd514b 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -93,10 +93,10 @@ jobs: python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 # Verify the accuracy. - echo "Checking FSDP4 v.s. HSDP2FSDP2TP2 accuracy parity" + echo "Checking FSDP4 v.s. HSDP2FSDP4 accuracy parity" export baseline_options="--parallelism.data_parallel_replicate_degree=1" - export test_options="--parallelism.data_parallel_replicate_degree=2 --parallelism.tensor_parallel_degree=2" - python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --baseline-ngpus=4 --test-ngpus=8 --steps=1 --export-result "${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs/loss_compare_result.txt" + export test_options="--parallelism.data_parallel_replicate_degree=4" + python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10 --export-result "${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs/loss_compare_result.txt" # Cleanup the checkpoints so that we don't waste network bandwidth and time. rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint From a699162088bab5e01b44e7660ef5e5b5c2ea97b5 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 16:03:22 -0800 Subject: [PATCH 4/8] Update (base update) [ghstack-poisoned] --- scripts/loss_compare.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/loss_compare.py b/scripts/loss_compare.py index 48a211c8f2..42ad3a81be 100644 --- a/scripts/loss_compare.py +++ b/scripts/loss_compare.py @@ -321,7 +321,9 @@ def check_git_clean_state() -> None: for line in result.stdout.strip().split("\n"): log_print(f" {line}") log_print("") - log_print("Please commit, stash, or discard your changes before running this script") + log_print( + "Please commit, stash, or discard your changes before running this script" + ) log_print(" - To commit: git add -A && git commit -m 'message'") log_print(" - To stash: git stash") log_print(" - To discard: git checkout -- . && git clean -fd") From 30bc03c2e7268d72c8745755d7fa4ac5025a8b0d Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 17:37:40 -0800 Subject: [PATCH 5/8] Update [ghstack-poisoned] --- assets/ci_llama3_losses.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/ci_llama3_losses.txt b/assets/ci_llama3_losses.txt index 5ccea64b17..56ed044beb 100644 --- a/assets/ci_llama3_losses.txt +++ b/assets/ci_llama3_losses.txt @@ -7,4 +7,4 @@ 7 4.5606 8 4.3724 9 4.347 -10 4.2004 +10 4.2001 From 40b07a550732843b67e3d6743442888c6ababbf8 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 17:39:40 -0800 Subject: [PATCH 6/8] Update [ghstack-poisoned] --- .github/workflows/integration_test_8gpu_features.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 39409e8d82..9af3ce9292 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -90,14 +90,14 @@ jobs: sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" - python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 - # Verify the accuracy. echo "Checking FSDP4 v.s. HSDP2FSDP4 accuracy parity" export baseline_options="--parallelism.data_parallel_replicate_degree=1" export test_options="--parallelism.data_parallel_replicate_degree=4" python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10 --import-result assets/ci_llama3_losses.txt + python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 + # Cleanup the checkpoints so that we don't waste network bandwidth and time. rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint rm -rf artifacts-to-be-uploaded/*/checkpoint From 0a94bfc64380dc41ea57d145a8541073b9de888f Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 17:55:22 -0800 Subject: [PATCH 7/8] Update [ghstack-poisoned] --- assets/ci_llama3_losses.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/ci_llama3_losses.txt b/assets/ci_llama3_losses.txt index 56ed044beb..5ccea64b17 100644 --- a/assets/ci_llama3_losses.txt +++ b/assets/ci_llama3_losses.txt @@ -7,4 +7,4 @@ 7 4.5606 8 4.3724 9 4.347 -10 4.2001 +10 4.2004 From 49ce58c97b187dbf8ce58e83e190a361181f0950 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 19 Nov 2025 22:49:19 -0800 Subject: [PATCH 8/8] Update [ghstack-poisoned] --- .github/workflows/integration_test_8gpu_features.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 9af3ce9292..bf17194efd 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -90,11 +90,12 @@ jobs: sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" - # Verify the accuracy. - echo "Checking FSDP4 v.s. HSDP2FSDP4 accuracy parity" + # Verify the accuracy first. + echo "Checking FSDP8 v.s. HSDP (4, 2) accuracy parity" export baseline_options="--parallelism.data_parallel_replicate_degree=1" export test_options="--parallelism.data_parallel_replicate_degree=4" python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10 --import-result assets/ci_llama3_losses.txt + rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/* python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8