
Commit 9ef86f8

support eval of float8_a1x128_w128x128
Summary:

Adds support for the new float8 scaling recipe in the official eval scripts used to generate accuracy numbers in the README. For now, I am using this as a smoke test that the scaling is working on a real model - it is. We can add official benchmark results after we hook up slayton's cuBLAS binding on H100, which should make the UX of running evals a lot better.

Test Plan:

Smoke test on Llama-3.1-8B, accuracy looks good:

```
// download checkpoint
with-proxy python scripts/download.py --hf_token {token} --repo_id meta-llama/Meta-Llama-3.1-8B

// prepare checkpoint
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3.1-8B

// run bf16 eval on a single task
with-proxy time python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --tasks 'winogrande'
...
winogrande: {'alias': 'winogrande', 'acc,none': 0.7426992896606156, 'acc_stderr,none': 0.012285989618865697}

// run float8 eval on the same task
with-proxy time python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --tasks 'winogrande' --quantization float8_a1x128_w128x128 --compile
...
winogrande: {'alias': 'winogrande', 'acc,none': 0.7419100236779794, 'acc_stderr,none': 0.012298278833972477}
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: e87609a
ghstack-comment-id: 3474380821
Pull-Request: #3269
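Not part of this commit, but for context: a minimal standalone sketch of applying the same blockwise float8 recipe with torchao's `quantize_` API, mirroring the config added in eval.py below. The toy model, device, dtype, and layer sizes here are illustrative assumptions, not from the diff.

```python
# Minimal sketch (assumptions: CUDA device, bf16 model, dims divisible by 128
# to match the 1x128 / 128x128 scaling blocks).
import torch

from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerBlock,
    quantize_,
)

# toy model standing in for a real transformer's linear layers
model = torch.nn.Sequential(torch.nn.Linear(256, 256)).cuda().to(torch.bfloat16)

# activations scaled per (1, 128) block, weights per (128, 128) block
config = Float8DynamicActivationFloat8WeightConfig(
    granularity=(PerBlock((1, 128)), PerBlock((128, 128))),
)
quantize_(model, config)
```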

2 files changed: +24, −2 lines
scripts/download.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -38,7 +38,7 @@ def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) -
     parser.add_argument(
         "--repo_id",
         type=str,
-        default="checkpoints/meta-llama/llama-2-7b-chat-hf",
+        default="meta-llama/llama-2-7b-chat-hf",
         help="Repository ID to download from.",
     )
     parser.add_argument(
```
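With this fix, running the script without `--repo_id` now uses a valid Hugging Face repository ID instead of a local checkpoint path. A hypothetical invocation relying on the default:

```
// downloads the default repo, meta-llama/llama-2-7b-chat-hf
python scripts/download.py --hf_token {token}
```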

torchao/_models/llama/eval.py

Lines changed: 23 additions & 1 deletion

```diff
@@ -23,6 +23,7 @@
     Int4WeightOnlyConfig,
     Int8DynamicActivationInt8WeightConfig,
     Int8WeightOnlyConfig,
+    PerBlock,
     PerRow,
     PerTensor,
     UIntXWeightOnlyConfig,
@@ -44,6 +45,7 @@ def run_evaluation(
     calibration_limit: Optional[int] = None,
     calibration_seq_length: Optional[int] = None,
     pad_calibration_inputs: bool = False,
+    print_model: bool = False,
 ):
     """Runs the evaluation of a model using LM Eval."""
     print(
@@ -169,6 +171,13 @@ def run_evaluation(
                 model,
                 Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
             )
+        if quantization == "float8_a1x128_w128x128":
+            config = Float8DynamicActivationFloat8WeightConfig(
+                granularity=(PerBlock((1, 128)), PerBlock((128, 128))),
+            )
+            # TODO(future): all workflows in this file should be skipping quantization
+            # of `lm_head`
+            quantize_(model, config)
         if "autoround" in quantization:
             from transformers import AutoTokenizer

@@ -273,7 +282,16 @@ def run_evaluation(
         )

     if compile:
-        model = torch.compile(model, mode="max-autotune", fullgraph=True)
+        # TODO(future PR): clean this up
+        if quantization == "float8_a1x128_w128x128":
+            # we don't need max-autotune for float8 blockwise quant
+            model = torch.compile(model)
+        else:
+            model = torch.compile(model, mode="max-autotune", fullgraph=True)
+
+    if print_model:
+        print(model)
+
     with torch.no_grad():
         print("Running evaluation ...")
         # avoid circular imports
@@ -371,6 +389,9 @@ def run_evaluation(
         default=False,
         help="pads sequences shorter than calibration_seq_length to that length, yielding more calibration inputs but running much slower",
     )
+    parser.add_argument(
+        "--print_model", action="store_true", help="Whether to print the model."
+    )

     args = parser.parse_args()
     run_evaluation(
@@ -387,4 +408,5 @@ def run_evaluation(
         args.calibration_limit,
         args.calibration_seq_length,
         args.pad_calibration_inputs,
+        args.print_model,
     )
```
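Combining the two new options from this diff, a hypothetical invocation exercising both the blockwise float8 recipe and the new `--print_model` flag (same checkpoint layout as the test plan above):

```
python torchao/_models/llama/eval.py \
  --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth \
  --tasks 'winogrande' \
  --quantization float8_a1x128_w128x128 \
  --compile --print_model
```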
