
Commit 3f904ed

add bias handling for a_1_128_w_128_128 float8 scaling
Summary: As titled, adds support for bias and a unit test.

Test Plan:
```
pytest test/quantization/quantize_/workflows/float8/test_float8_tensor.py -s -k fp8_linear_variants
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 50f7a26
ghstack-comment-id: 3463298384
Pull-Request: #3259
1 parent 839d480 commit 3f904ed

File tree

2 files changed: +20 −6 lines


test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 18 additions & 5 deletions
@@ -39,10 +39,10 @@


 class ToyLinearModel(torch.nn.Module):
-    def __init__(self, in_features, out_features):
+    def __init__(self, in_features, out_features, bias):
         super().__init__()
-        self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
-        self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)
+        self.linear1 = torch.nn.Linear(in_features, out_features, bias=bias)
+        self.linear2 = torch.nn.Linear(out_features, in_features, bias=bias)

     def forward(self, x):
         x = self.linear1(x)
@@ -81,6 +81,8 @@ def setUp(self):
             ((32, 128), 256, 512),
         ],
     )
+    @common_utils.parametrize("bias", [False, True])
+    @torch.no_grad()
     def test_fp8_linear_variants(
         self,
         dtype: torch.dtype,
@@ -89,6 +91,7 @@ def test_fp8_linear_variants(
         granularity,
         kernel_preference: KernelPreference,
         sizes: Tuple,
+        bias: bool,
     ):
         if isinstance(granularity, PerTensor):
             if kernel_preference is KernelPreference.FBGEMM:
@@ -106,6 +109,16 @@ def test_fp8_linear_variants(
             elif kernel_preference is KernelPreference.FBGEMM:
                 return unittest.skip("unimplemented")

+        if bias is True:
+            sizes_to_keep = ((128,), 256, 128)
+            if (
+                sizes != sizes_to_keep
+                or kernel_preference is not KernelPreference.TORCH
+            ):
+                return unittest.skip(
+                    "cut down on number of options to save test time"
+                )
+
         error_message = None
         if isinstance(granularity, PerRow):
             if mode == "dynamic" and dtype != torch.bfloat16:
@@ -134,7 +147,7 @@ def test_fp8_linear_variants(
         input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")

         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyLinearModel(K, N, bias).eval().to(dtype).to("cuda")

         quantized_model = copy.deepcopy(model)

@@ -257,7 +270,7 @@ def test_kernel_preference_numerical_equivalence(self, granularity, sizes):
         dtype = torch.bfloat16
         input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyLinearModel(K, N, bias=False).eval().to(dtype).to("cuda")

         # reference kernel preference and results
         # we are using KerenelPreference.TORCH as the reference
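With the new parametrization, the toy model is built with real bias parameters for one size/kernel combination. The snippet below is a minimal sketch of the scenario that case covers, written outside the test harness; `Float8DynamicActivationFloat8WeightConfig` and `PerRow` are stand-ins for whatever the test's parametrization selects (the blockwise 1x128/128x128 path this commit targets is configured by the workflow itself), and the tolerance is purely illustrative.

```python
# Minimal sketch (not code from this commit) of the bias=True scenario:
# quantize a small model whose Linear layers have bias, then compare
# against the bf16 baseline. Requires a CUDA GPU with fp8 support.
import copy

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

K, N = 256, 128
model = (
    torch.nn.Sequential(
        torch.nn.Linear(K, N, bias=True),
        torch.nn.Linear(N, K, bias=True),
    )
    .eval()
    .to(torch.bfloat16)
    .to("cuda")
)
x = torch.randn(128, K, dtype=torch.bfloat16, device="cuda")
ref = model(x)

quantized_model = copy.deepcopy(model)
# PerRow granularity is an assumption here; the test also covers other
# granularities and kernel preferences.
quantize_(quantized_model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
out = quantized_model(x)

# Loose, illustrative tolerance: fp8 quantization error dominates.
torch.testing.assert_close(out, ref, atol=0.3, rtol=0.3)
```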

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 2 additions & 1 deletion
@@ -362,14 +362,15 @@ def _(func, types, args, kwargs):
         # TODO(future PR): add fbgemm_gpu_genai path if available
         # TODO(before land): proper out_dtype handling
         assert _is_1_128_scaled(input_tensor), "unsupported"
-        # breakpoint()
         res = blockwise_fp8_gemm(
             inpt_data,
             input_scale,
             w_data.t(),
             w_scale.t(),
             block_size=128,
         )
+        if bias is not None:
+            res = res + bias
     else:
         res = addmm_float8_unwrapped_inference(
             inpt_data,
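The kernel-side change is just a bias add on the high-precision result of the blockwise GEMM, so the quantized path now matches `F.linear(x, w, bias)` up to quantization error. A small self-contained sketch of that equivalence, with a plain matmul standing in for `blockwise_fp8_gemm`:

```python
# Sketch of the semantics this commit adds: after the blockwise fp8 GEMM
# produces `res`, the layer bias is added, matching F.linear(x, w, bias).
# The matmul below is only a stand-in for blockwise_fp8_gemm(...), so there
# is no quantization error in this toy check.
import torch
import torch.nn.functional as F

x = torch.randn(32, 256)
w = torch.randn(128, 256)
bias = torch.randn(128)

res = x @ w.t()       # stands in for blockwise_fp8_gemm(inpt_data, ...)
if bias is not None:  # the new branch added in float8_tensor.py
    res = res + bias

torch.testing.assert_close(res, F.linear(x, w, bias))
```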
