
Commit a529cf4

Asaf Karnieli authored and ulivne committed
[ALGO-801] Add Fake Quant option in linear and matmul layers
Change-Id: I9888c92ffc33035f75d434044f4ef41b58f51e62
1 parent 09c6312 commit a529cf4

8 files changed: 165 additions & 24 deletions

neural_compressor/torch/algorithms/fp8_quant/_core/common.py

Lines changed: 0 additions & 5 deletions
@@ -68,11 +68,6 @@ def __init__(self, num_inputs, param_names, num_outputs, required_output):
     "softmax": ModuleType(1, [], 1, True),
     "fused_sdpa": ModuleType(3, [], 2, True),
 }
-descale_fcn = lambda x, scale: torch.mul(x, scale)
-scale_fcn = lambda x, scale: torch.div(x, scale)
-cast_fcn = lambda x, dtype: x.to(dtype=dtype)
-cast_to_fp8_fcn = lambda x, dtype, scale_inv=None: torch.ops.hpu.cast_to_fp8_v2(x, scale_inv, False, False, dtype)[0]
-cast_from_fp8_fcn = lambda x, dtype, scale=None: torch.ops.hpu.cast_from_fp8(x, scale, dtype)


 class ShapeList:

neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py

Lines changed: 3 additions & 3 deletions
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import torch
 import habana_frameworks.torch.core as htcore
 import habana_frameworks.torch.utils.experimental as htexp
-import torch
-
-from .common import *
+from .common import ModuleConfig
+from .quant_dequant import cast_to_fp8_fcn, cast_fcn, descale_fcn, scale_fcn

 GAUDI2 = htexp.synDeviceType.synDeviceGaudi2
 GAUDI3 = htexp.synDeviceType.synDeviceGaudi3

neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py

Lines changed: 27 additions & 2 deletions
@@ -12,11 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import torch.nn as nn
+import torch
 from abc import abstractmethod
+import habana_frameworks.torch.core as htcore

-import torch.nn as nn

-from .common import *
+descale_fcn = lambda x, scale: torch.mul(x, scale)
+scale_fcn = lambda x, scale: torch.div(x, scale)
+cast_fcn = lambda x, dtype: x.to(dtype=dtype)
+cast_to_fp8_fcn = lambda x, dtype, scale_inv=None: torch.ops.hpu.cast_to_fp8_v2(x, scale_inv, False, False, dtype)[0]
+cast_from_fp8_fcn = lambda x, dtype, scale=None: torch.ops.hpu.cast_from_fp8(x, scale, dtype)


 class QuantDequantBase(nn.Module):
@@ -69,3 +75,22 @@ def forward(self, x):
     def extra_repr(self) -> str:
         repr = super(DequantOutput, self).extra_repr()
         return f"{repr}, scale dtype={self.scale.dtype}"
+
+
+class QuantDequant(QuantDequantBase):
+    def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
+        super(QuantDequant, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
+        self.scale_inv = nn.Parameter(scale_inv)
+        self.scale = nn.Parameter(1 / scale_inv)
+
+    def forward(self, x, *args, **kwargs):
+        y = cast_to_fp8_fcn(x, self.lp_dtype, self.scale_inv)
+        # mark_step is needed so fuser won't remove 2 consecutive casts.
+        # will be removed once SW-196431 is implemented
+        htcore.mark_step()
+        z = cast_from_fp8_fcn(y, self.hp_dtype, self.scale)
+        return z
+
+    def extra_repr(self) -> str:
+        repr = super(QuantDequant, self).extra_repr()
+        return f"{repr}, Quantize, and then dequantize"

neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py

Lines changed: 4 additions & 1 deletion
@@ -97,7 +97,10 @@ def prepare_model(model, qconfig, mod_list, hp_dtype=torch.float):
         apply_hf_hook(mod)
         if name in mod_list:
             mod_extra_config = qconfig[name]
-            quantize_params(mod, mod_extra_config)
+
+            if config.cfg["fake_quant"] == False:
+                quantize_params(mod, mod_extra_config)
+
             patch_module(mod, mod_extra_config, mod_default_dict)
             patched_modules.append(name)
             patched_module_types.add(type(mod))

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 48 additions & 10 deletions
@@ -16,6 +16,7 @@
 import torch.nn as nn

 from .quant_config import QuantMode, get_hqt_config
+from .._core.quant_dequant import QuantDequant as qdq

 try:  # backwards compatibility for 1.16
     from habana_frameworks.torch.hpex.kernels import fp8_fused_sdpa
@@ -122,6 +123,7 @@ def set_attrs_from_orig_model(cls_instance, mod, mod_extra_config, *func_names):
     cls_instance.class_name_org = mod.__class__.__name__
     cls_instance._mod_extra_config = mod_extra_config
     cls_instance.quantization_mode = config.cfg["mode"]
+    cls_instance.fake_quant = config.cfg["fake_quant"]
     # store original module in order to invoke its functions during measurements.
     # this may be omitted of torch remove the related validation from dynamo. see SW-187731.
     cls_instance.__dict__["orig_mod"] = mod
@@ -160,14 +162,25 @@ def __init__(self, mod, mod_extra_config, *args, **kwargs):
         super().__init__()
         set_attrs_from_orig_model(self, mod, mod_extra_config)
         if self.quantization_mode == QuantMode.QUANTIZE:
-            self.quant_input_0 = self._mod_extra_config.inputs[0]
-            self.quant_input_1 = self._mod_extra_config.inputs[1]
-            self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0])
-            self.scale_other = nn.Parameter(mod_extra_config.scale.inputs[1])
+            if self.fake_quant == False:
+                self.forward = self.forward_quant
+                self.quant_input_0 = self._mod_extra_config.inputs[0]
+                self.quant_input_1 = self._mod_extra_config.inputs[1]
+                self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0])
+                self.scale_other = nn.Parameter(mod_extra_config.scale.inputs[1])
+            else:
+                self.forward = self.forward_fakequant
+
+                # override quantization to quant-dequant
+                mec = self._mod_extra_config.inputs[0]
+                self.quant_input_0 = qdq(mec.scale_inv, mec.lp_dtype, mec.hp_dtype)
+                mec = self._mod_extra_config.inputs[1]
+                self.quant_input_1 = qdq(mec.scale_inv, mec.lp_dtype, mec.hp_dtype)
+
         elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE):
             self.forward = self.forward_measure

-    def forward(self, input, other):
+    def forward_quant(self, input, other):
         qinput = self.quant_input_0(input)
         qother = self.quant_input_1(other)
         output = matmul_fp8(
@@ -179,6 +192,12 @@ def forward(self, input, other):
         )
         return output

+    def forward_fakequant(self, input, other):
+        qinput = self.quant_input_0(input)
+        qother = self.quant_input_1(other)
+        output = torch.matmul(qinput, qother)
+        return output
+
     def forward_measure(self, input, other):
         measure_input((input, other), observer=self._mod_extra_config.inputs)
         output = self.orig_mod(input, other)
@@ -198,21 +217,40 @@ def __init__(self, mod, mod_extra_config, *args, **kwargs):
         super().__init__()
         set_attrs_from_orig_model(self, mod, mod_extra_config)
         if self.quantization_mode == QuantMode.QUANTIZE:
-            # When offloading weights to disk using device_map, the module forward is overridden.
-            # __dict__.update call again overrides the PatchedLinear forward with the forward that device_map planted.
-            # So need to set PatchedLinear forawrd to be the right forward.
-            self.forward = self.forward_quant
-            self.quant_input = self._mod_extra_config.inputs[0]
             self.weight = nn.Parameter(self.weight.t().contiguous())
             self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0])
             if isinstance(mod_extra_config.scale.params["weight"], (torch.Tensor, float)):
                 self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"])
             elif isinstance(mod_extra_config.scale.params["weight"], dict):
                 # PCQ weight is calculated with actual weight [0] and ones [1]
                 self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"][0])
+
+            if self.fake_quant == False:
+                # When offloading weights to disk using device_map, the module forward is overridden.
+                # __dict__.update call again overrides the PatchedLinear forward with the forward that device_map planted.
+                # So need to set PatchedLinear forawrd to be the right forward.
+                self.forward = self.forward_quant
+                self.quant_input = self._mod_extra_config.inputs[0]
+
+            else:
+                self.forward = self.forward_fakequant
+                # override quantization to quant-dequant
+                mec = self._mod_extra_config.inputs[0]
+                self.quant_input = qdq(mec.scale_inv, mec.lp_dtype, mec.hp_dtype)
+                mec = self._mod_extra_config.params['weight']
+                self.quant_weights = qdq(mec.scale_inv, mec.lp_dtype, mec.hp_dtype)
+
+
         elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE):
             self.forward = self.forward_measure

+    def forward_fakequant(self, input):
+        qweight = self.quant_weights(self.weight, )
+        qinput = self.quant_input(input)
+        y = torch.matmul(qinput, qweight)
+        output = y + self.bias if (self.bias is not None) else y
+        return output
+
     def forward_quant(self, input):
         qinput = self.quant_input(input)
         y = matmul_fp8(
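
Both patched paths apply the same per-tensor scales and the same FP8 rounding to their operands; the difference is that forward_quant keeps the operands in FP8 and calls matmul_fp8, while forward_fakequant dequantizes them first and runs a plain torch.matmul in the high-precision dtype. That is why the new unit test below can expect the real and fake outputs to agree within roughly 1% (rtol=0.01). A hypothetical way to check which path a converted layer took, relying only on the QuantDequant submodules registered in the fake-quant branch:

# QuantDequant wrappers are registered as child modules only in fake-quant mode.
for name, mod in model.named_modules():
    if type(mod).__name__ == "QuantDequant":
        print(name, mod.extra_repr())  # ends with "Quantize, and then dequantize"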

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 3 additions & 3 deletions
@@ -36,7 +36,6 @@ class QuantMode(Enum):
     MEASURE = 2
     SHAPE = 3

-
 class MeasureExclude(Flag):
     NONE = auto()
     INPUT = auto()
@@ -68,7 +67,6 @@ class ScaleMethod(Enum):
     MAXABS_HW_OPT_WEIGHT = 12
     MAXABS_POW2_OPT_WEIGHT = 13

-
 class TrueFalse(Enum):
     TRUE = True
     FALSE = False
@@ -82,10 +80,11 @@ class TrueFalse(Enum):
     "scale_method": ScaleMethod,
     "recalc_scales": TrueFalse,
     "ignore_modules_wo_measures": TrueFalse,
+    "fake_quant": TrueFalse
 }


-_configs_that_use_enum_value = ["fp8_config", "hp_dtype", "ignore_modules_wo_measures", "recalc_scales"]
+_configs_that_use_enum_value = ["fp8_config", "hp_dtype", "ignore_modules_wo_measures", "recalc_scales", "fake_quant"]


 def get_hqt_config(mod) -> Fp8cfg:
@@ -121,6 +120,7 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
             "types": (),
         },  # types and names to be quantized. Allowlist by names is not yet implemented
         "mode": QuantMode.QUANTIZE,  # Quantize or Measure
+        "fake_quant": False,  # Fake or Real Quant
        "scale_method": ScaleMethod.UNIT_SCALE,  # Method to quantize with
        "scale_params": {},  # scaling parameters that are different then the default ones
        "observer": "maxabs",  # Supported ['shape', 'maxabs', 'maxabs_per_channel', 'save']

neural_compressor/torch/quantization/config.py

Lines changed: 2 additions & 0 deletions
@@ -1256,6 +1256,7 @@ def __init__(
         observer: str = "maxabs",
         mod_dict: dict = {},
         measure_exclude: str = "OUTPUT",
+        fake_quant: bool = False,
         **kwargs,
     ):
         """Init FP8 config."""
@@ -1271,6 +1272,7 @@ def __init__(
         self.observer = observer
         self.mod_dict = mod_dict
         self._json_file = None
+        self.fake_quant = fake_quant

     @property
     def measure(self):
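
With the new fake_quant field, fake quantization is switched on from the user-facing FP8Config, either as a keyword argument (FP8Config(fake_quant=True, ...)) or through from_dict as in the new unit test below. A condensed usage sketch derived from that test (the model and calibration data are placeholders; the measurement step is abbreviated):

from neural_compressor.torch.quantization import FP8Config, prepare, convert

config = FP8Config.from_dict({
    "mode": "AUTO",
    "observer": "maxabs",
    "scale_method": "maxabs_hw",
    "dump_stats_path": "./inc_output/measure",
    "fake_quant": "True",          # quant-dequant instead of real FP8 compute
})
model = prepare(model, config)     # run calibration batches through the model here
model = convert(model)             # patched Linear/Matmul now use forward_fakequant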
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+import typing
+import pytest
+import copy
+import torch
+
+import habana_frameworks.torch.core as htcore
+
+htcore.hpu_set_env()
+
+from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.helper_modules import Matmul
+
+torch.manual_seed(1)
+
+class M(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.fc1 = torch.nn.Linear(10, 200, bias=False)
+        self.fc2 = torch.nn.Linear(10, 200, bias=True)
+        self.matmul = Matmul()
+
+    def forward(self, inp):
+        x1 = self.fc1(inp)
+        x2 = self.fc2(inp)
+        x3 = self.matmul(x1, x2.t())
+        return x3
+
+
+def test_fakequant():
+    # Run both real and fake quantization, and compare
+
+    model = M().eval().to("hpu").to(torch.bfloat16)
+    model_fake = copy.deepcopy(model)
+    htcore.hpu_initialize()
+
+    config_dict_fake = {
+        "mode": "AUTO",
+        "observer": "maxabs",
+        "scale_method": "maxabs_hw",
+        "allowlist": {"types": [], "names": []},
+        "blocklist": {"types": [], "names": []},
+        "dump_stats_path": "./inc_output/measure_fake",
+        "fake_quant": "True",
+    }
+
+    config_dict = {
+        "mode": "AUTO",
+        "observer": "maxabs",
+        "scale_method": "maxabs_hw",
+        "allowlist": {"types": [], "names": []},
+        "blocklist": {"types": [], "names": []},
+        "dump_stats_path": "./inc_output/measure",
+        "fake_quant": "False",
+    }
+
+    config = FP8Config.from_dict(config_dict)
+    config_fake = FP8Config.from_dict(config_dict_fake)
+
+    model = prepare(model, config)
+    model_fake = prepare(model_fake, config_fake)
+    inp_calib = torch.arange(0, 100, 0.1, dtype=torch.bfloat16).to("hpu").reshape(-1, 10)
+    inp_test = torch.rand(10000, dtype=torch.bfloat16).reshape(-1, 10).to("hpu") * 100
+
+    # for calibration
+    with torch.no_grad():
+        a = model(inp_calib)
+        b = model_fake(inp_calib)
+
+    model = convert(model)
+    model_fake = convert(model_fake)
+
+    # for benchmark
+    with torch.no_grad():
+        output = model(inp_test).cpu()
+        output_fake = model_fake(inp_test).cpu()
+    assert torch.allclose(output, output_fake, rtol=0.01), f"FakeQuant failed"
+
