
Commit d256ce2 (1 parent: e1d89e7)

Improve QAT nvfp4 numerics

**Summary:** Similar to #2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and from 12 to inf without. This is achieved by mimicking the PTQ flow more closely, specifically (in descending order of significance):

1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`, but in `torch.int32` instead of `torch.uint8`
2. Do not cast intermediate fake quantized values to the original dtype (e.g. bf16), which loses some fidelity relative to fp32
3. Fake round blockwise scales to float8

**Test Plan:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```
End-to-end tests TBD.

ghstack-source-id: d8f7eff
Pull Request resolved: #3050
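For context on the SQNR numbers above: the tests compare outputs with `compute_error`, which reports SQNR in dB (higher is better; inf means the QAT-prepared and PTQ outputs match exactly). A minimal standalone equivalent, with a hypothetical helper name `sqnr_db`, is sketched below; it is not the torchao implementation.

```python
import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> float:
    """Signal-to-quantization-noise ratio in dB (hypothetical stand-in
    for torchao's compute_error). Returns inf on an exact match."""
    noise = reference - candidate
    if noise.abs().max() == 0:
        return float("inf")
    return (20 * torch.log10(reference.norm() / noise.norm())).item()
```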

File tree: 4 files changed, +57 −13 lines


test/quantization/test_qat.py

Lines changed: 16 additions & 3 deletions
```diff
@@ -1910,7 +1910,6 @@ def _test_quantize_api_against_ptq(
         quantize_(m, QATConfig(base_config, step="prepare"), filter_fn)
         out_prepared = m(*example_inputs)
         prepare_sqnr = compute_error(out_prepared, out_baseline)
-
         self.assertGreaterEqual(prepare_sqnr, target_prepare_sqnr)
 
         # compare convert
@@ -2086,9 +2085,14 @@ def test_quantize_api_nvfp4(self, use_per_tensor_scale: bool):
         """
         from torchao.prototype.mx_formats import NVFP4InferenceConfig
 
+        if use_per_tensor_scale:
+            target_prepare_sqnr = 36
+        else:
+            target_prepare_sqnr = float("inf")
+
         self._test_quantize_api_against_ptq(
             NVFP4InferenceConfig(use_dynamic_per_tensor_scale=use_per_tensor_scale),
-            target_prepare_sqnr=12,
+            target_prepare_sqnr=target_prepare_sqnr,
             target_convert_sqnr=float("inf"),
         )
 
@@ -2098,11 +2102,16 @@ def test_qat_nvfp4(self, use_per_tensor_scale: bool):
         """
         Test QAT with `NVFP4FakeQuantizeConfig`.
         """
+        from torchao.prototype.mx_formats import NVFP4InferenceConfig
         from torchao.prototype.qat import NVFP4FakeQuantizeConfig
 
         torch.manual_seed(self.SEED)
         m = M().cuda()
         baseline_model = copy.deepcopy(m)
+        quantize_(
+            baseline_model,
+            NVFP4InferenceConfig(use_dynamic_per_tensor_scale=use_per_tensor_scale),
+        )
         qat_config = QATConfig(
             activation_config=NVFP4FakeQuantizeConfig(use_per_tensor_scale),
             weight_config=NVFP4FakeQuantizeConfig(use_per_tensor_scale),
@@ -2116,7 +2125,11 @@ def test_qat_nvfp4(self, use_per_tensor_scale: bool):
         out = m(*x)
         baseline_out = baseline_model(*x)
         sqnr = compute_error(out, baseline_out).item()
-        self.assertGreater(sqnr, 24)
+        if use_per_tensor_scale:
+            target_sqnr = 130
+        else:
+            target_sqnr = float("inf")
+        self.assertGreaterEqual(sqnr, target_sqnr)
 
     @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
     @unittest.skipIf(
```
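Outside the test harness, the prepare-vs-PTQ comparison exercised above can be reproduced roughly as in the sketch below. The import paths for `quantize_` and `QATConfig`, and the `step="prepare"` keyword, are assumptions based on the helper shown in the first hunk; the NVFP4 config imports match the diff.

```python
import copy
import torch
from torchao.quantization import quantize_          # import path assumed
from torchao.quantization.qat import QATConfig      # import path assumed
from torchao.prototype.mx_formats import NVFP4InferenceConfig
from torchao.prototype.qat import NVFP4FakeQuantizeConfig

model = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().bfloat16()
baseline = copy.deepcopy(model)

# PTQ baseline: real NVFP4 quantization.
quantize_(baseline, NVFP4InferenceConfig(use_dynamic_per_tensor_scale=True))

# QAT "prepare": fake quantization that should now track the PTQ numerics.
quantize_(
    model,
    QATConfig(
        activation_config=NVFP4FakeQuantizeConfig(True),
        weight_config=NVFP4FakeQuantizeConfig(True),
        step="prepare",  # assumed, mirroring QATConfig(base_config, step="prepare") above
    ),
)

x = torch.randn(32, 128, device="cuda", dtype=torch.bfloat16)
print(sqnr_db(baseline(x).float(), model(x).float()))  # sqnr_db from the sketch near the top
```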

torchao/prototype/custom_fp_utils.py

Lines changed: 13 additions & 7 deletions
```diff
@@ -24,7 +24,12 @@ def _n_ones(n: int) -> int:
 F32_EXP_BIAS = _n_ones(EBITS_F32 - 1)
 
 
-def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor:
+def _f32_to_floatx_unpacked(
+    x: Tensor,
+    ebits: int,
+    mbits: int,
+    compute_dtype: torch.dtype = torch.uint8,
+) -> Tensor:
     """Convert FP32 numbers to sub-byte floating point numbers with the given
     number of exponent and mantissa bits.
 
@@ -44,6 +49,7 @@ def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor:
     Background 2: Computer Organization and Design, RISC-V edition, Chapter 3.5
     """
     assert x.dtype == torch.float
+    assert compute_dtype in [torch.uint8, torch.int32]
     assert 1 + ebits + mbits <= 8
 
     # calculate constants
@@ -105,7 +111,7 @@ def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor:
     denormal_x = x + denorm_mask_float
     denormal_x = denormal_x.view(torch.int32)
     denormal_x -= denorm_mask_int
-    denormal_x = denormal_x.to(torch.uint8)
+    denormal_x = denormal_x.to(compute_dtype)
 
     #
     # branch 3: stay in normal range, adjust the exponent and round
@@ -120,26 +126,26 @@ def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor:
     normal_x += mant_odd
     # take the bits!
     normal_x = normal_x >> (MBITS_F32 - mbits)
-    normal_x = normal_x.to(torch.uint8)
+    normal_x = normal_x.to(compute_dtype)
 
     #
     # combine the branches
     #
-    x = torch.full_like(x, max_int, dtype=torch.uint8)
+    x = torch.full_like(x, max_int, dtype=compute_dtype)
     x = torch.where(denormal_mask, denormal_x, x)
     x = torch.where(normal_mask, normal_x, x)
 
     # add sign back
     sign_lp = sign >> (MBITS_F32 + EBITS_F32 - mbits - ebits)
-    sign_lp = sign_lp.to(torch.uint8)
+    sign_lp = sign_lp.to(compute_dtype)
     # Right shift of a negative signed integer can fill the least significant
     # bits with either 1s or 0s, depending on the implementation. Since PyTorch
     # doesn't have an uint32 dtype, we mask out these bits to get just the
     # f4 sign bit
     sign_lp = sign_lp & sign_mask
     x = x | sign_lp
 
-    return x.to(torch.uint8)
+    return x.to(compute_dtype)
 
 
 # TODO(future): check if LUT for everything is faster than bit shifting,
@@ -154,7 +160,7 @@ def _floatx_unpacked_to_f32(x: Tensor, ebits: int, mbits: int) -> Tensor:
       fp6: bits 0-1 empty and bits 2-7 in fp6_e2m3 or fp6_e3m2 encoding
     Output: torch.Tensor of dtype fp32 with the dequantized value
     """
-    assert x.dtype == torch.uint8
+    assert x.dtype in [torch.uint8, torch.int32]
     assert 1 + ebits + mbits <= 8
 
     sign_mask = 1 << (ebits + mbits)
```
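The new `compute_dtype` argument only changes the integer container that holds the unpacked sub-byte bits; the encoded values are identical for `torch.uint8` and `torch.int32`. A small hedged round-trip check using the fp4 e2m1 constants imported in the QAT diff further down (all sample values are exactly representable in e2m1):

```python
import torch
from torchao.prototype.custom_fp_utils import (
    _f32_to_floatx_unpacked,
    _floatx_unpacked_to_f32,
)
from torchao.prototype.mx_formats.kernels import EBITS_F4_E2M1, MBITS_F4_E2M1

x = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, -6.0])
q_u8 = _f32_to_floatx_unpacked(x, EBITS_F4_E2M1, MBITS_F4_E2M1)  # default uint8 path
q_i32 = _f32_to_floatx_unpacked(
    x, EBITS_F4_E2M1, MBITS_F4_E2M1, compute_dtype=torch.int32
)
assert torch.equal(q_u8.to(torch.int32), q_i32)  # same bits, different container
dq = _floatx_unpacked_to_f32(q_i32, EBITS_F4_E2M1, MBITS_F4_E2M1)
assert torch.equal(dq, x)  # exact round-trip for e2m1-representable values
```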

torchao/prototype/mx_formats/nvfp4_tensor.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -798,7 +798,6 @@ def _nvfp4_quantize(
     assert data_hp.is_contiguous(), "Only support contiguous data for now"
     assert block_size == 16, "NVFP4 requires block_size=16"
 
-    orig_dtype = data_hp.dtype
     orig_shape = data_hp.shape
     # Convert to float32 early for consistent precision with Triton implementation
     data_hp = data_hp.float().reshape(orig_shape[0], -1, block_size)
@@ -834,7 +833,7 @@ def _nvfp4_quantize(
     data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX)
     data_scaled = data_scaled.view(orig_shape)
     if skip_dtype_cast_and_packing:
-        return out_scales.to(torch.float32), data_scaled.to(orig_dtype)
+        return _Float8Round.apply(out_scales), data_scaled
     else:
         data_lp = f32_to_f4_unpacked(data_scaled)
         # TODO: NotImplementedError: "copy_kernel" not implemented for 'Float4_e2m1fn_x2'
```
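`_Float8Round` itself is defined elsewhere in `nvfp4_tensor.py` and is not part of this diff. Conceptually it fake-rounds the fp32 blockwise scales through float8 e4m3 with a straight-through gradient; a hedged sketch of that shape (not the actual implementation) looks like:

```python
import torch

class _Float8RoundSketch(torch.autograd.Function):
    """Hypothetical stand-in for _Float8Round: round fp32 scales through
    float8_e4m3fn and back to fp32, passing gradients straight through."""

    @staticmethod
    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
        return x.to(torch.float8_e4m3fn).to(torch.float32)

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
        return grad_output
```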

torchao/prototype/qat/nvfp4.py

Lines changed: 27 additions & 1 deletion
```diff
@@ -2,6 +2,14 @@
 
 import torch
 
+from torchao.prototype.custom_fp_utils import (
+    _f32_to_floatx_unpacked,
+    _floatx_unpacked_to_f32,
+)
+from torchao.prototype.mx_formats.kernels import (
+    EBITS_F4_E2M1,
+    MBITS_F4_E2M1,
+)
 from torchao.prototype.mx_formats.nvfp4_tensor import (
     _nvfp4_quantize,
     per_tensor_amax_to_scale,
@@ -12,6 +20,24 @@
 )
 
 
+class _FP4Round(torch.autograd.Function):
+    """
+    Cast an fp32 tensor to fp4 and back with backward STE.
+    """
+
+    @staticmethod
+    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
+        q = _f32_to_floatx_unpacked(
+            x, EBITS_F4_E2M1, MBITS_F4_E2M1, compute_dtype=torch.int32
+        )
+        dq = _floatx_unpacked_to_f32(q, EBITS_F4_E2M1, MBITS_F4_E2M1)
+        return dq
+
+    @staticmethod
+    def backward(ctx, gy: torch.Tensor) -> torch.Tensor:
+        return gy
+
+
 @dataclass
 class NVFP4FakeQuantizeConfig(FakeQuantizeConfigBase):
     """
@@ -56,9 +82,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             per_tensor_scale=per_tensor_scale,
             skip_dtype_cast_and_packing=True,
         )
+        q = _FP4Round.apply(q)
         if self.config.use_per_tensor_scale:
             scale = scale * per_tensor_scale
-        assert q.dtype == x.dtype
         assert scale.dtype == torch.float32
 
         # dequantize
```
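A quick hedged sanity check of the `_FP4Round` straight-through behavior added above, assuming the class and imports from this diff are in scope (the sample values are all exactly representable in fp4 e2m1):

```python
import torch

x = torch.tensor([0.0, 0.5, 1.5, 3.0, 6.0, -4.0], requires_grad=True)
y = _FP4Round.apply(x)
assert y.dtype == torch.float32 and torch.equal(y, x.detach())  # fp32 in, fp32 out
y.sum().backward()
assert torch.equal(x.grad, torch.ones_like(x))  # STE: gradient passes through unchanged
```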
