add bias handling for a_1_128_w_128_128 float8 scaling

vkuzo · vkuzo · commit 194adfac59f9 · 2025-10-31T09:59:23.000-07:00
Summary: As titled, adds support for bias and a unit test Test Plan: ``` pytest test/quantization/quantize_/workflows/float8/test_float8_tensor.py -s -k fp8_linear_variants ``` Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 48d5806 ghstack-comment-id: 3463298384 Pull-Request: #3259
diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -39,10 +39,10 @@
 
 
 class ToyLinearModel(torch.nn.Module):
-    def __init__(self, in_features, out_features):
+    def __init__(self, in_features, out_features, bias):
         super().__init__()
-        self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
-        self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)
+        self.linear1 = torch.nn.Linear(in_features, out_features, bias=bias)
+        self.linear2 = torch.nn.Linear(out_features, in_features, bias=bias)
 
     def forward(self, x):
         x = self.linear1(x)
@@ -81,6 +81,8 @@ def setUp(self):
             ((32, 128), 256, 512),
         ],
     )
+    @common_utils.parametrize("bias", [False, True])
+    @torch.no_grad()
     def test_fp8_linear_variants(
         self,
         dtype: torch.dtype,
@@ -89,6 +91,7 @@ def test_fp8_linear_variants(
         granularity,
         kernel_preference: KernelPreference,
         sizes: Tuple,
+        bias: bool,
     ):
         if isinstance(granularity, PerTensor):
             if kernel_preference is KernelPreference.FBGEMM:
@@ -109,6 +112,16 @@ def test_fp8_linear_variants(
             ):
                 return unittest.skip("unimplemented")
 
+            if bias is True:
+                sizes_to_keep = ((128,), 256, 128)
+                if (
+                    sizes != sizes_to_keep
+                    or kernel_preference is not KernelPreference.TORCH
+                ):
+                    return unittest.skip(
+                        "cut down on number of options to save test time"
+                    )
+
         error_message = None
         if isinstance(granularity, PerRow):
             if mode == "dynamic" and dtype != torch.bfloat16:
@@ -137,7 +150,7 @@ def test_fp8_linear_variants(
             input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
 
             # Create a linear layer with bfloat16 dtype
-            model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+            model = ToyLinearModel(K, N, bias).eval().to(dtype).to("cuda")
 
             quantized_model = copy.deepcopy(model)
 
@@ -260,7 +273,7 @@ def test_kernel_preference_numerical_equivalence(self, granularity, sizes):
         dtype = torch.bfloat16
         input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyLinearModel(K, N, bias=False).eval().to(dtype).to("cuda")
 
         # reference kernel preference and results
         # we are using KerenelPreference.TORCH as the reference
diff --git a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py
@@ -369,6 +369,8 @@ def _(func, types, args, kwargs):
                     w_scale.t(),
                     block_size=128,
                 )
+                if bias is not None:
+                    res = res + bias
             else:
                 res = addmm_float8_unwrapped_inference(
                     inpt_data,

Original file line number	Diff line number	Diff line change
`@@ -369,6 +369,8 @@ def _(func, types, args, kwargs):`
`369`	`369`	`w_scale.t(),`
`370`	`370`	`block_size=128,`
`371`	`371`	`)`
	`372`	`+ if bias is not None:`
	`373`	`+ res = res + bias`
`372`	`374`	`else:`
`373`	`375`	`res = addmm_float8_unwrapped_inference(`
`374`	`376`	`inpt_data,`