Commit bcbec9e

Added op support for float8

1 parent 9229df9 commit bcbec9e

File tree

3 files changed: +62 −19 lines
Lines changed: 9 additions & 4 deletions
@@ -1,12 +1,17 @@
 from torchao.testing.utils import copy_tests, TorchAOTensorParallelTestCase
 from torch.testing._internal.common_utils import run_tests
-from torchao.quantization import int8_weight_only
+from torchao.quantization import int8_weight_only, float8_weight_only

-class TestAffineQuantizedTensorParallel(TorchAOTensorParallelTestCase):
-    pass
+# class TestAffineQuantizedTensorParallel(TorchAOTensorParallelTestCase):
+#     pass

+class TestAffineQuantizedTensorParallel(TorchAOTensorParallelTestCase):
+    QUANT_METHOD_FN = staticmethod(float8_weight_only)

-copy_tests(TorchAOTensorParallelTestCase, TestAffineQuantizedTensorParallel, "aqt_tp")
+print('Copy test started...')
+copy_tests(TorchAOTensorParallelTestCase, TestAffineQuantizedTensorParallel, "fp8wo_tp")
+print('Copy test finished')

 if __name__ == "__main__":
+    print("Running TestAffineQuantizedTensorParallel")
     run_tests()
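For reference, `QUANT_METHOD_FN` points the copied tests at the quantization entry point under test. Assuming the usual torchao flow (the standard `quantize_`/`float8_weight_only` entry points; exact signatures can vary across versions), applying the same quantization outside the test harness looks roughly like:

```python
import torch
from torchao.quantization import quantize_, float8_weight_only

# Rough sketch, assuming the standard torchao quantize_ flow; needs a CUDA
# device and a torchao build that ships float8_weight_only.
linear = torch.nn.Linear(1024, 2048, bias=False, device="cuda", dtype=torch.bfloat16)
quantize_(linear, float8_weight_only())  # weight becomes a float8-backed AffineQuantizedTensor
y = linear(torch.randn(8, 1024, device="cuda", dtype=torch.bfloat16))
```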

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 40 additions & 6 deletions
@@ -1094,20 +1094,31 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
             return return_and_correct_aliasing(
                 func, args, kwargs, args[0]._apply_fn_to_data(torch.detach)
             )
-        if func is aten.clone.default:
+        elif func is aten.clone.default:
             return return_and_correct_aliasing(
                 func, args, kwargs, args[0]._apply_fn_to_data(torch.clone)
             )
-        if func is aten.t.default:
+        elif func is aten.t.default:
             """we don't need to repack the weight and just rely on external
             shape being changed and record the status of transpose/no-transpose
             """
             args[0].transposed = not args[0].transposed
             return return_and_correct_aliasing(func, args, kwargs, args[0])
-
-        raise NotImplementedError(
-            f"Float8AQTLayout dispatch: attempting to run {func}, this is not supported"
-        )
+        elif func is aten.slice.Tensor:
+            self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
+            if dim == 0:
+                return return_and_correct_aliasing(
+                    func, args, kwargs, args[0]._apply_fn_to_data(lambda x: aten.slice.Tensor(x, dim, start, end, step))
+                )
+            elif dim == 1:
+                assert len(self.scale.shape) == 1, f"slice dim==1 only works when len(scale.shape) == 1 currently, got: {self.scale.shape}"
+                return Float8AQTLayout(aten.slice.Tensor(self.float8_data, dim, start, end, step), self.scale, None, self.layout_type)
+            else:
+                raise NotImplementedError(f"Float8AQTLayout dispatch: attempting to run {func}, with dim={dim}, that is not supported")
+        else:
+            raise NotImplementedError(
+                f"Float8AQTLayout dispatch: attempting to run {func}, this is not supported"
+            )

     __torch_function__ = torch._C._disabled_torch_function_impl
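The new `aten.slice.Tensor` branch treats the two dims differently: dim 0 applies the slice to every tensor attribute via `_apply_fn_to_data` (data and scale alike), while dim 1 slices only `float8_data` and reuses `self.scale`, which is why the assert restricts it to 1-D (per-row) scales. A standalone illustration of those semantics, using plain tensors rather than the layout class:

```python
import torch

# Hypothetical per-row-scaled float8 weight: one scale per output row.
float8_data = torch.randn(4, 8).to(torch.float8_e4m3fn)
scale = torch.rand(4) + 0.5            # len(scale.shape) == 1

rows = float8_data[0:2]                # dim 0: slice the data...
rows_scale = scale[0:2]                # ...and the matching per-row scales
cols = float8_data[:, 0:4]             # dim 1: per-row scales are unaffected
assert rows.shape == (2, 8) and rows_scale.shape == (2,)
assert cols.shape == (4, 4) and scale.shape == (4,)
```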

@@ -1644,6 +1655,28 @@ def _linear_fp8_act_fp8_weight_impl(
         use_fast_accum=scaled_mm_config.use_fast_accum,
     ).reshape(out_shape)

+def _linear_fp_act_fp8_weight_check(
+    input_tensor: Union[torch.Tensor, AffineQuantizedTensor],
+    weight_tensor: Union[torch.Tensor, AffineQuantizedTensor],
+    bias: Optional[torch.Tensor],
+) -> bool:
+    return (
+        # input is native float tensor
+        not is_traceable_wrapper_subclass(input_tensor) and
+        input_tensor.is_floating_point() and
+        # weight is float8 quantized affine quantized tensor
+        isinstance(weight_tensor, AffineQuantizedTensor) and
+        isinstance(weight_tensor.layout_type, Float8LayoutType)
+        and weight_tensor.layout_tensor.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]
+        and (weight_tensor.shape == weight_tensor.block_size or _is_rowwise_scaled(weight_tensor))
+    )
+
+def _linear_fp_act_fp8_weight_impl(
+    input_tensor: torch.Tensor,
+    weight_tensor: AffineQuantizedTensor,
+    bias: Optional[torch.Tensor],
+):
+    return torch.nn.functional.linear(input_tensor, weight_tensor.dequantize(), bias)

 def _linear_fp_act_int4_weight_sparse_marlin_check(input_tensor, weight_tensor, bias):
     return (
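`_linear_fp_act_fp8_weight_impl` is a dequantize-then-compute fallback: no fused float8 kernel, just `F.linear` on the dequantized weight. A minimal standalone sketch of those numerics with hypothetical per-row scales (the real path goes through `weight_tensor.dequantize()`):

```python
import torch

# Quantize a weight to float8_e4m3fn with per-row scales, dequantize, and run
# a plain linear -- the same shape of computation as the fallback impl above.
w = torch.randn(32, 64)
scale = w.abs().amax(dim=1, keepdim=True) / torch.finfo(torch.float8_e4m3fn).max
w_fp8 = (w / scale).to(torch.float8_e4m3fn)   # stored float8 data
w_deq = w_fp8.to(torch.float32) * scale       # dequantized weight
x = torch.randn(8, 64)
y = torch.nn.functional.linear(x, w_deq)
err = (y - x @ w.t()).abs().max()             # small; only fp8 round-off
```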
@@ -1694,6 +1727,7 @@ def _register_aqt_quantized_linear_dispatches():
         (_linear_int8_act_int8_weight_semi_structured_sparse_check, _linear_int8_act_int8_weight_semi_structured_sparse_impl),
         (_linear_int8_act_int8_weight_block_sparse_check, _linear_int8_act_int8_weight_block_sparse_impl),
         (_linear_fp8_act_fp8_weight_check, _linear_fp8_act_fp8_weight_impl),
+        (_linear_fp_act_fp8_weight_check, _linear_fp_act_fp8_weight_impl),
         (_linear_bf16_act_uint4_weight_check, _linear_bf16_act_uint4_weight_impl),
         (_linear_fp_act_int8_weight_check, _linear_fp_act_int8_weight_impl),
         (_linear_f16_act_floatx_weight_check, _linear_f16_act_floatx_weight_impl),
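Ordering in this list matters: as I read the dispatch mechanism, the registered (check, impl) pairs are tried in sequence and the first predicate that matches wins, so the new generic fp-activation/fp8-weight pair sits after the fused fp8-act/fp8-weight kernel and only catches inputs that path rejects. A hedged sketch of the pattern (`dispatch_linear` and the table are illustrative names, not torchao internals):

```python
from typing import Callable, List, Optional, Tuple
import torch

Check = Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], bool]
Impl = Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor]
_TABLE: List[Tuple[Check, Impl]] = []

def register(check: Check, impl: Impl) -> None:
    _TABLE.append((check, impl))

def dispatch_linear(x, w, bias=None):
    # First match wins: specific/fused paths must be registered before fallbacks.
    for check, impl in _TABLE:
        if check(x, w, bias):
            return impl(x, w, bias)
    raise NotImplementedError("no quantized linear dispatch matched")

# e.g. a catch-all float fallback registered last:
register(lambda x, w, b: x.is_floating_point(),
         lambda x, w, b: torch.nn.functional.linear(x, w, b))
```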

torchao/testing/utils.py

Lines changed: 13 additions & 9 deletions
@@ -285,49 +285,53 @@ def test_tp(self, dtype):
         device = "cuda"
         # To make sure different ranks create the same module
         torch.manual_seed(5)
-
+        print('Step 1')
         class M(torch.nn.Module):
             def __init__(self, in_features, out_features, **kwargs) -> None:
                 super().__init__(**kwargs)
                 self.linear = torch.nn.Linear(in_features, out_features, bias=False, device="cuda")

             def forward(self, x: torch.Tensor) -> torch.Tensor:
                 return self.linear(x)
-
+        print('Step 2')
         # Get rank and device
         device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}")
-
+        print('Step 3')
         # Original model
         proj_up = M(1024, 2048).to(device).to(dtype)
         proj_dn = M(2048, 1024).to(device).to(dtype)
         example_input = 100 * torch.randn(128, 1024, device=device, dtype=dtype)
         y = proj_dn(proj_up(example_input))
-
+        print('Step 4')
         # Quantize the model
         up_quant = self.quantize(proj_up)
         dn_quant = self.quantize(proj_dn)
         y_q = dn_quant(up_quant(example_input))
-
+        print('Step 5')
         mesh = self.build_device_mesh()
         # Shard the models
         up_dist = self.colwise_shard(up_quant, mesh)
         dn_dist = self.rowwise_shard(dn_quant, mesh)
-
+        print('Step 6')
         # We need to turn inputs into DTensor form as well -- just a format change
         input_dtensor = DTensor.from_local(
             example_input, mesh, [Replicate()]
         )
-
+        print('Step 7')
         y_d = dn_dist(up_dist(input_dtensor))
-
+        print('Step 8')
         if not TORCH_VERSION_AT_LEAST_2_5:
             # Need torch 2.5 to support compiled tensor parallelism
             return
-
+        print('Step 9')
         up_compiled = torch.compile(up_dist)
+        print('Step 10')
         y_up = up_compiled(input_dtensor)
+        print('Step 11')
         dn_compiled = torch.compile(dn_dist)
+        print('Step 12')
         y_dn = dn_compiled(y_up)
+        print('Step 13')

 common_utils.instantiate_parametrized_tests(TorchAOBasicTestCase)
 common_utils.instantiate_parametrized_tests(TorchAOCompileTestCase)
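For orientation, `colwise_shard`/`rowwise_shard` implement the standard two-linear tensor-parallel split: the up-projection's weight is sharded along its output dim, the down-projection's along its input dim, and activations travel as DTensors. A hedged single-rank sketch of that layout (not the test's helpers; run under gloo so it works without torchrun):

```python
import os
import torch
import torch.distributed as dist
from torch.distributed._tensor import DTensor, Replicate, Shard
from torch.distributed.device_mesh import init_device_mesh

# Single-rank stand-in for the test's mesh; Shard(0) on a linear weight is the
# column-wise split, Shard(1) would be the row-wise one.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)
mesh = init_device_mesh("cpu", (1,))

w_up = DTensor.from_local(torch.randn(2048, 1024), mesh, [Shard(0)])
x = DTensor.from_local(torch.randn(128, 1024), mesh, [Replicate()])
y_up = torch.nn.functional.linear(x, w_up)  # output sharded on the feature dim
print(y_up.shape)  # global shape (128, 2048)
dist.destroy_process_group()
```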
