Commit 401c2b7
Makes the fallback float8 1x128 by 128x128 GEMM output bfloat16
Summary:
For now, we only care about bf16 output. We can add fp32 output and a flag to
control it later, if needed.
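
As a rough illustration, here is a minimal sketch of what a fallback
blockwise-scaled float8 GEMM with bf16 output could look like. The function
name, argument shapes, and dequantization layout below are assumptions for
illustration, not the actual torchao kernel code; the relevant part is the
final cast of the float32 accumulator to `torch.bfloat16`.

```python
import torch

def fp8_blockwise_gemm_fallback(
    a_fp8: torch.Tensor,    # (M, K) float8 activations, one scale per 1x128 block
    a_scale: torch.Tensor,  # (M, K // 128) float32 scales
    b_fp8: torch.Tensor,    # (N, K) float8 weights, one scale per 128x128 block
    b_scale: torch.Tensor,  # (N // 128, K // 128) float32 scales
) -> torch.Tensor:
    # Hypothetical reference path: dequantize both operands and do the
    # matmul in float32.
    a = a_fp8.to(torch.float32) * a_scale.repeat_interleave(128, dim=1)
    b = b_fp8.to(torch.float32) * b_scale.repeat_interleave(
        128, dim=0
    ).repeat_interleave(128, dim=1)
    out = a @ b.t()
    # The change this commit describes: return bf16 instead of fp32.
    return out.to(torch.bfloat16)
```

A later fp32 option would simply skip the final cast, or take the output
dtype as a parameter gated by the flag mentioned above.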
Test Plan:
```
pytest test/quantization/quantize_/workflows/float8/test_float8_tensor.py -s -k fp8_linear_variants -x
```
Reviewers:
Subscribers:
Tasks:
Tags:
ghstack-source-id: f00cd47
ghstack-comment-id: 3469836810
Pull-Request: #32651
File tree: 2 files changed (+3, -2)
- test/kernel
- torchao/kernel