@@ -479,12 +479,19 @@ def from_float(cls, weight):

 class AQFloat8WeightOnlyQuantizedLinearWeight(AffineQuantizedTensor, AQMixin):
     """
-    AutoQuantizable version of Float8WeightOnlyQuantizedLinearWeight
+    AutoQuantizable version of Float8WeightOnlyQuantizedLinearWeight for target_dtype=torch.float8_e4m3fn
     """
+    target_dtype: torch.dtype = torch.float8_e4m3fn
+
+    @staticmethod
+    def _quantized_linear_op(act_mat, w_qtensor, bias):
+        return torch.nn.functional.linear(act_mat, w_qtensor.dequantize(), bias)
+
     @classmethod
     def from_float(cls, weight):
         block_size = (1, weight.shape[1])
-        return super(AQFloat8WeightOnlyQuantizedLinearWeight, cls).from_hp_to_floatx(weight, block_size, target_dtype=torch.float8_e4m3fn, layout_type=Float8LayoutType())
+        return super(AQFloat8WeightOnlyQuantizedLinearWeight, cls).from_hp_to_floatx(weight, block_size, target_dtype=cls.target_dtype, layout_type=Float8LayoutType())
+

 # here we don't include int4 quantization in since int8 tends to be a better apples to apples comparison
 DEFAULT_AUTOQUANT_CLASS_LIST = [
@@ -500,7 +507,7 @@ def from_float(cls, weight):
 DEFAULT_INT4_AUTOQUANT_CLASS_LIST = [
     AQFloatLinearWeight,
     AQInt8DynamicallyQuantizedLinearWeight,
-    AQInt4G64WeightOnlyQuantizedLinearWeight,
+    AQInt4G64WeightOnlyQuantizedLinearWeight
 ]

 def _change_linears_to_autoquantizable(model, **kwargs):
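For context, here is a minimal sketch of how the new float8 weight-only class could be exercised on its own, using the from_float and _quantized_linear_op entry points added in the diff above. The torchao.quantization.autoquant import path, the tensor shapes, and the CUDA/bfloat16 setup are illustrative assumptions, not part of this change:

import torch
# Assumed import path for the class added in this diff.
from torchao.quantization.autoquant import AQFloat8WeightOnlyQuantizedLinearWeight

# Hypothetical shapes/device for illustration; float8 support on the device is assumed.
weight = torch.randn(256, 512, dtype=torch.bfloat16, device="cuda")  # (out_features, in_features)
act = torch.randn(8, 512, dtype=torch.bfloat16, device="cuda")
bias = None

# Per-row float8_e4m3fn quantization: block_size = (1, in_features), as in from_float above.
qweight = AQFloat8WeightOnlyQuantizedLinearWeight.from_float(weight)

# Weight-only path: dequantize the weight and run a regular linear, as in _quantized_linear_op above.
out = AQFloat8WeightOnlyQuantizedLinearWeight._quantized_linear_op(act, qweight, bias)
print(out.shape)  # torch.Size([8, 256])

In the full autoquant flow, this class would typically be included in a qtensor class list passed to torchao.autoquant (e.g. via its qtensor_class_list argument), which benchmarks each candidate per linear layer and keeps the fastest one.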