Update Float8Tensor for GRPO training in unsloth

andrewor14 · andrewor14 · commit a323bbeb9f86 · 2025-10-31T14:27:46.000-07:00
**Summary:** Support a few extra ops called during GRPO loop in unsloth/vllm for Float8Tensor.

**Test Plan:**

```
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_fp8_matmul_lora
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_to_dtype_layout
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_has_compatible_shallow_copy_type
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_transpose
```
diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -7,7 +7,7 @@
 import copy
 import unittest
 from contextlib import nullcontext
-from typing import Tuple
+from typing import Tuple, Type
 
 import torch
 from torch._inductor.utils import run_and_get_code
@@ -18,6 +18,7 @@
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Float8WeightOnlyConfig,
+    Granularity,
     PerRow,
     PerTensor,
     quantize_,
@@ -72,6 +73,24 @@ def forward(self, x):
         return self.conv(x)
 
 
+class ToyLoRAModel(torch.nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        lora_rank: int = 8,
+    ):
+        super().__init__()
+        self.linear = torch.nn.Linear(in_features, out_features, bias=False)
+        self.lora_A = torch.nn.Parameter(torch.randn(in_features, lora_rank))
+        self.lora_B = torch.nn.Parameter(torch.randn(lora_rank, out_features))
+
+    def forward(self, x):
+        matmul_out = torch.matmul(x, self.linear.weight.t())
+        lora_out = x @ self.lora_A @ self.lora_B
+        return matmul_out + lora_out
+
+
 # TODO: move tests in test_affine_quantized_float.py here after we migrated all implementations
 @unittest.skipIf(not torch_version_at_least("2.8.0"), "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -105,16 +124,75 @@ def test_fp8_linear_variants(
         dtype: torch.dtype,
         mode: str,
         compile: bool,
-        granularity,
+        granularity: Granularity,
         kernel_preference: KernelPreference,
         sizes: Tuple,
+    ):
+        self._test_fp8_matmul_variants(
+            dtype,
+            mode,
+            compile,
+            granularity,
+            kernel_preference,
+            sizes,
+            ToyLinearModel,
+        )
+
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
+    )
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
+    @common_utils.parametrize("mode", ["dynamic", "weight-only"])
+    @common_utils.parametrize("compile", [True, False])
+    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    @common_utils.parametrize(
+        "kernel_preference",
+        [KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM],
+    )
+    # Inputs are (M,..), K, N
+    @common_utils.parametrize(
+        "sizes",
+        [
+            ((128,), 256, 128),
+            ((32, 128), 64, 256),
+        ],
+    )
+    def test_fp8_matmul_lora(
+        self,
+        dtype: torch.dtype,
+        mode: str,
+        compile: bool,
+        granularity: Granularity,
+        kernel_preference: KernelPreference,
+        sizes: Tuple,
+    ):
+        self._test_fp8_matmul_variants(
+            dtype,
+            mode,
+            compile,
+            granularity,
+            kernel_preference,
+            sizes,
+            ToyLoRAModel,
+        )
+
+    def _test_fp8_matmul_variants(
+        self,
+        dtype: torch.dtype,
+        mode: str,
+        compile: bool,
+        granularity: Granularity,
+        kernel_preference: KernelPreference,
+        sizes: Tuple,
+        model_class: Type[torch.nn.Module],
     ):
         if (
             isinstance(granularity, PerTensor)
             and kernel_preference == KernelPreference.FBGEMM
         ):
             return unittest.skip(
-                "per tensor with fbgemm kernel preferece does not work yet"
+                "per tensor with fbgemm kernel preference does not work yet"
             )
 
         error_message = None
@@ -145,7 +223,7 @@ def test_fp8_linear_variants(
             input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
 
             # Create a linear layer with bfloat16 dtype
-            model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+            model = model_class(K, N).eval().to(dtype).to("cuda")
 
             quantized_model = copy.deepcopy(model)
 
@@ -758,6 +836,38 @@ def test_slice_3d_operation(self, granularity, slice_dim, tensor_shape):
 
         self.assertEqual(sliced_dequantized, sliced_original)
 
+    def test_to_dtype_layout(self):
+        x = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16)
+        x_fp8 = Float8Tensor.from_hp(x)
+        y_fp8 = torch.ops.aten.to.dtype_layout(
+            x_fp8, dtype=x_fp8.dtype, layout=x_fp8.layout, device="cpu"
+        )
+        self.assertEqual(y_fp8.dtype, x_fp8.dtype)
+        self.assertEqual(y_fp8.layout, x_fp8.layout)
+        self.assertEqual(y_fp8.device, torch.device("cpu"))
+
+    def test_has_compatible_shallow_copy_type(self):
+        x1 = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16)
+        x2 = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16)
+        x3 = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
+        x1_fp8 = Float8Tensor.from_hp(x1)
+        x2_fp8 = Float8Tensor.from_hp(x2)
+        x3_fp8 = Float8Tensor.from_hp(x3)
+        self.assertFalse(torch._has_compatible_shallow_copy_type(x1, x2_fp8))
+        self.assertFalse(torch._has_compatible_shallow_copy_type(x1_fp8, x2))
+        self.assertTrue(torch._has_compatible_shallow_copy_type(x1_fp8, x2_fp8))
+        # Wrong shape
+        self.assertFalse(torch._has_compatible_shallow_copy_type(x1_fp8, x3_fp8))
+
+    def test_transpose(self):
+        x = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16)
+        x_fp8 = Float8Tensor.from_hp(x)
+        x_fp8_t = x_fp8.t()
+        torch.testing.assert_close(x_fp8_t.qdata, x_fp8.qdata.t(), atol=0, rtol=0)
+        torch.testing.assert_close(x_fp8_t.scale, x_fp8.scale.t(), atol=0, rtol=0)
+        self.assertEqual(x_fp8.block_size, (1, 512), atol=0, rtol=0)
+        self.assertEqual(x_fp8_t.block_size, (512, 1), atol=0, rtol=0)
+
 
 common_utils.instantiate_parametrized_tests(TestFloat8Tensor)
 
diff --git a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py
@@ -249,21 +249,59 @@ def from_hp(
 implements_torch_function = Float8Tensor.implements_torch_function
 
 
-@implements([aten.linear.default])
-@implements_torch_function([torch.nn.functional.linear])
+@implements(aten.linear.default)
+@implements_torch_function(torch.nn.functional.linear)
 def _(func, types, args, kwargs):
     input_tensor, weight_tensor, bias = (
         args[0],
         args[1],
         args[2] if len(args) > 2 else None,
     )
+    return _float8_mm_impl(input_tensor, weight_tensor.t(), bias)
+
+
+@implements(aten.matmul.default)
+@implements_torch_function(torch.matmul)
+def _(func, types, args, kwargs):
+    input_tensor, weight_tensor = args[0], args[1]
+    return _float8_mm_impl(input_tensor, weight_tensor)
+
+
+@implements(aten.mm.default)
+@implements_torch_function(torch.mm)
+def _(func, types, args, kwargs):
+    input_tensor, weight_tensor = args[0], args[1]
+    return _float8_mm_impl(input_tensor, weight_tensor)
+
+
+@implements(aten.addmm_.default)
+def _(func, types, args, kwargs):
+    bias_tensor, input_tensor, weight_tensor = (
+        args[0],
+        args[1],
+        args[2],
+    )
+    assert kwargs.get("alpha", 1) == 1, "only alpha=1 is supported"
+    assert kwargs.get("beta", 1) == 1, "only beta=1 is supported"
+    out = _float8_mm_impl(input_tensor, weight_tensor)
+    return bias_tensor.add_(out)
+
+
+def _float8_mm_impl(
+    input_tensor: Float8Tensor,
+    weight_tensor: Float8Tensor,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
     assert isinstance(weight_tensor, Float8Tensor), (
         f"Don't expect to reach here with an override other than weight currently, {type(input_tensor)} {type(weight_tensor)}"
     )
 
     act_quant_kwargs = weight_tensor.act_quant_kwargs
     # quantize activation, if `act_quant_kwargs` is specified
     if act_quant_kwargs is not None:
+        assert not isinstance(input_tensor, TorchAOBaseTensor), (
+            "input tensor was already quantized"
+        )
         input_tensor = _choose_quant_func_and_quantize_tensor(
             input_tensor, act_quant_kwargs
         )
@@ -290,6 +328,7 @@ def _(func, types, args, kwargs):
             assert is_sm_at_least_90(), "Expected SM90+ for fbgemm_gpu_genai"
             mm_config = weight_tensor.mm_config
             assert mm_config is not None
+            weight_tensor = weight_tensor.t()
 
             out_shape = get_out_shape(input_tensor.shape, weight_tensor.shape)
             xq = input_tensor.qdata.reshape(-1, input_tensor.qdata.shape[-1])
@@ -324,23 +363,16 @@ def _(func, types, args, kwargs):
             assert kernel_choice == "torch"
             scaled_mm_config = weight_tensor.mm_config
             assert scaled_mm_config is not None
-            out_shape = get_out_shape(input_tensor.shape, weight_tensor.shape)
+            out_shape = (*input_tensor.shape[:-1], weight_tensor.shape[1])
 
             # Extract tensor data and scales
             inpt_data = input_tensor.qdata.reshape(-1, input_tensor.qdata.shape[-1])
             w_data = weight_tensor.qdata
             input_scale = input_tensor.scale
             w_scale = weight_tensor.scale
 
-            # Handle rowwise scaling
-            if _is_rowwise_scaled(weight_tensor):
-                assert _is_rowwise_scaled(input_tensor), (
-                    "Input tensor must be rowwise block size"
-                )
-                w_scale = w_scale.transpose(-1, -2)
-
             input_scale = preprocess_scale(input_scale, input_tensor.shape)
-            inpt_data, w_data = preprocess_data(inpt_data, w_data.T, scaled_mm_config)
+            inpt_data, w_data = preprocess_data(inpt_data, w_data, scaled_mm_config)
 
             return addmm_float8_unwrapped_inference(
                 inpt_data,
@@ -357,9 +389,11 @@ def _(func, types, args, kwargs):
         )
         # when input is not `Float8Tensor`, we expect that it is not quantized
         # so this is float8 weight only quantization
-        return torch.nn.functional.linear(
-            input_tensor, weight_tensor.dequantize(), bias
-        )
+        out = torch.matmul(input_tensor, weight_tensor.dequantize())
+        if bias is not None:
+            return out + bias
+        else:
+            return out
 
 
 @implements_torch_function(torch.bmm)
@@ -677,6 +711,7 @@ def _(func, types, args, kwargs):
         assert original_shape[-1] == size[-1], (
             f"Only support reshaping when last dimension matches, requested: reshaping from {original_shape} to {size}"
         )
+        # TODO: this seems wrong, we should merge the first two dimensions instead
         qdata = self.qdata.reshape(*size)
         scale = self.scale.reshape(*size)
         block_size = self.block_size.copy()
@@ -785,6 +820,23 @@ def _(func, types, args, kwargs):
     return return_and_correct_aliasing(func, args, kwargs, new)
 
 
+@implements(aten.t.default)
+def _(func, types, args, kwargs):
+    assert len(args) == 1
+    self = args[0]
+    assert len(self.block_size) == 2
+    new_tensor = self.__class__(
+        self.qdata.t(),
+        self.scale.t(),
+        (self.block_size[1], self.block_size[0]),
+        self.mm_config,
+        self.act_quant_kwargs,
+        self.kernel_preference,
+        self.dtype,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new_tensor)
+
+
 Float8Tensor.__module__ = "torchao.quantization"
 
 # Allow a model with Float8Tensor weights to be loaded with `weights_only=True`
diff --git a/torchao/utils.py b/torchao/utils.py
@@ -508,6 +508,36 @@ def _implements_common_tensor_ops(cls):
     implements_torch_function = cls.implements_torch_function
     aten = torch.ops.aten
 
+    @implements(torch.ops.aten.to.dtype_layout)
+    def _(func, types, args, kwargs):
+        # only support kwargs for now
+        assert len(args) == 1
+        self = args[0]
+        # only support dtype, layout, and device for now
+        for k in kwargs.keys():
+            assert k in ["dtype", "layout", "device"]
+        # only support same dtype and layout
+        # different dtype and layout has undefined behavior
+        if "dtype" in kwargs:
+            assert kwargs["dtype"] == self.dtype
+        if "layout" in kwargs:
+            assert kwargs["layout"] == self.layout
+        # if device is the same, treat this like a no-op
+        device = kwargs.get("device")
+        if device == self.device:
+            return self
+        new_tensor = args[0]._apply_fn_to_data(lambda x: func(x, device=device))
+        return return_and_correct_aliasing(func, args, kwargs, new_tensor)
+
+    # This is called during _apply() to see if we can shallow
+    # copy the content of one tensor into another. For now,
+    # we only allow shallow copy if both tensors are of the
+    # same type and have the same shape.
+    @implements_torch_function(torch._has_compatible_shallow_copy_type)
+    def _(func, types, args, kwargs):
+        assert len(args) == 2
+        return type(args[0]) == type(args[1]) and args[0].shape == args[1].shape
+
     @implements_torch_function(
         [
             torch.Tensor.contiguous,