
Commit dc4b5f5

zyuwen-habana authored and xinhe3 committed
[SW-192809] fix json_file bug when instantiating FP8Config class
Change-Id: I4a715d0a706efe20ccdb49033755cabbc729ccdc
Signed-off-by: Zhou Yuwen <[email protected]>
1 parent cfe135f commit dc4b5f5

File tree: 5 files changed, +137 -12 lines changed

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 29 additions & 4 deletions
@@ -1,13 +1,28 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import annotations
 
 import json
 import os
-import torch
-from enum import Enum, Flag, auto
 from dataclasses import dataclass
+from enum import Enum, Flag, auto
 from json.decoder import JSONDecodeError
 from typing import Any, Mapping
+
 import habana_frameworks.torch.utils.experimental as htexp
+import torch
 
 from ..utils.logger import logger
 
@@ -121,6 +136,16 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
                 else:
                     raise ValueError("invalid fp8_config in custom config. Enter E4M3 or E5M2")
 
+            if keys == "hp_dtype":
+                if custom_config[keys].lower() == "bf16":
+                    custom_config[keys] = torch.bfloat16
+                elif custom_config[keys].lower() == "fp16":
+                    custom_config[keys] = torch.float16
+                elif custom_config[keys].lower() == "fp32":
+                    custom_config[keys] = torch.float32
+                else:
+                    raise ValueError("invalid hp_dtype in custom config. Enter bf16, fp16 or fp32")
+
             if keys == "scale_method":
                 if custom_config[keys].lower() == "unit_scale":
                     custom_config[keys] = ScaleMethod.UNIT_SCALE
@@ -176,7 +201,7 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
     # If seperate_measure_files is True (default value), then it is assumed that there are multiple distinct measure and scale files
     # and they are stored in / loaded from paths with the correct index as a suffix. Else, only one is searched for.
     measured_global_config["local_rank"] = (
-        local_rank if local_rank >= 0 and (custom_config.get("seperate_measure_files", True) == True) else None
+        local_rank if local_rank >= 0 and custom_config.get("seperate_measure_files", True) else None
     )
 
     base_name = measured_global_config["dump_stats_path"].split("/")[-1]
@@ -185,7 +210,7 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
     os.makedirs(folder_name, exist_ok=True)
     worker_st = (
         ""
-        if measured_global_config["local_rank"] == None
+        if measured_global_config["local_rank"] is None
        else "_" + str(measured_global_config["local_rank"]) + "_" + str(measured_global_config["world_size"])
     )
     measured_global_config["shape_file"] = measured_global_config["dump_stats_path"] + "_hooks_shape" + worker_st
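For context, the new hp_dtype branch converts the string values accepted in a JSON/dict config ("bf16", "fp16", "fp32") into the corresponding torch dtypes before the rest of the FP8 pipeline sees them. A minimal standalone sketch of the same mapping; parse_hp_dtype is a hypothetical helper used only for illustration, not part of quant_config.py:

import torch

# Hypothetical helper mirroring the hp_dtype branch added to parse():
# the JSON config carries strings, the quantizer wants torch dtypes.
_HP_DTYPE_MAP = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}


def parse_hp_dtype(value: str) -> torch.dtype:
    try:
        return _HP_DTYPE_MAP[value.lower()]
    except KeyError:
        raise ValueError("invalid hp_dtype in custom config. Enter bf16, fp16 or fp32") from None


print(parse_hp_dtype("BF16"))  # torch.bfloat16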

neural_compressor/torch/quantization/config.py

Lines changed: 11 additions & 8 deletions
@@ -1259,7 +1259,7 @@ def __init__(
         self,
         dump_stats_path: str = "./hqt_output/measure",
         fp8_config: str = "E4M3",
-        hp_dtype: torch.dtype = torch.bfloat16,
+        hp_dtype: str = "bf16",
         blocklist: dict = {'names': [], 'types': ()},
         allowlist: dict = {'names': [], 'types': FP8_WHITE_LIST},
         mode: str = "AUTO",
@@ -1294,13 +1294,6 @@ def quantize(self):
 
     @property
     def json_file(self):
-        if self._json_file is None:
-            import tempfile
-            from pathlib import Path
-
-            json_file_tmp = tempfile.NamedTemporaryFile(suffix=".json")
-            self.to_json_file(json_file_tmp.name)
-            self.json_file(json_file_tmp.name)
         return self._json_file
 
     @json_file.setter
@@ -1315,6 +1308,14 @@ def from_json_file(cls, filename):
         config.json_file = filename
         return config
 
+    def save_temp_json_file(self):
+        import tempfile
+        from pathlib import Path
+
+        json_file_tmp = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
+        self.to_json_file(json_file_tmp.name)
+        self._json_file = json_file_tmp.name
+
     @classmethod
     def get_config_set_for_tuning(cls) -> Union[None, "FP8Config", List["FP8Config"]]:
         # just a simple example here
@@ -1361,6 +1362,8 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]:
     def to_config_mapping(
         self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None
     ):
+        if self.json_file is None:
+            self.save_temp_json_file()
         config_mapping = OrderedDict()
         if config_list is None:
            config_list = [self]
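Taken together, the config.py changes move the temporary-file side effect out of the json_file getter: the getter is now a plain accessor, from_json_file still records the path it was given, and to_config_mapping writes a temporary JSON via save_temp_json_file only when no file was supplied. A rough usage sketch of the resulting behavior (assumes neural_compressor with the Intel Gaudi FP8 dependencies is installed; the exact temporary path will vary):

from neural_compressor.torch.quantization import FP8Config

# Instantiating without a JSON file no longer triggers the buggy getter,
# which previously tried to assign the temp file by calling the json_file
# property itself instead of its setter.
config = FP8Config(fp8_config="E4M3")
assert config.json_file is None  # plain getter, no side effects

# The temporary dump happens lazily (inside to_config_mapping), or it can
# be forced explicitly:
config.save_temp_json_file()
print(config.json_file)  # something like /tmp/tmpXXXXXXXX.json

# Loading from an existing JSON still sets json_file directly:
config2 = FP8Config.from_json_file("test_fp8_jsons/test_measure.json")
assert config2.json_file == "test_fp8_jsons/test_measure.json"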
New file (loaded by the test below as test_fp8_jsons/test_hw_quant.json)

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+{
+    "mode": "QUANTIZE",
+    "observer": "maxabs",
+    "scale_method": "maxabs_hw",
+    "allowlist": {
+        "types": [],
+        "names": []
+    },
+    "blocklist": {
+        "types": [],
+        "names": [
+            "lm_head"
+        ]
+    },
+    "dump_stats_path": "./test_outputs/unit_test"
+}
New file (loaded by the test below as test_fp8_jsons/test_measure.json)

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+{
+    "mode": "MEASURE",
+    "observer": "maxabs",
+    "allowlist": {
+        "types": [],
+        "names": []
+    },
+    "blocklist": {
+        "types": [],
+        "names": [
+            "lm_head"
+        ]
+    },
+    "dump_stats_path": "./test_outputs/unit_test"
+}
New test file (defines TestFP8StaticQuant; its path is not shown in this view)

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+import copy
+import shutil
+
+import pytest
+import torch
+import transformers
+
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.helper_modules import PatchedLinear
+from neural_compressor.torch.quantization import (
+    FP8Config,
+    convert,
+    finalize_calibration,
+    get_default_fp8_config,
+    prepare,
+    quantize,
+)
+from neural_compressor.torch.utils import is_hpex_available
+
+
+@torch.no_grad()
+def calib_func(model):
+    example_inputs = torch.tensor([[10, 20, 30, 40, 50, 60]], dtype=torch.long).to("hpu")
+    for i in range(2):
+        model(example_inputs)
+
+
+@pytest.mark.skipif(not is_hpex_available(), reason="HPU environment is required!")
+class TestFP8StaticQuant:
+    def setup_class(self):
+        self.tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained(
+            "hf-internal-testing/tiny-random-GPTJForCausalLM",
+            device_map="cpu",
+        )
+        self.example_inputs = torch.tensor([[10, 20, 30, 40, 50, 60]], dtype=torch.long)
+
+    def teardown_class(self):
+        shutil.rmtree("test_outputs", ignore_errors=True)
+
+    def test_one_step_quant(self):
+        model = copy.deepcopy(self.tiny_gptj)
+        qconfig = FP8Config(fp8_config="E4M3")
+        model = prepare(model, qconfig)
+        assert isinstance(model.transformer.h[0].attn.k_proj, PatchedLinear), "k_proj is not prepared."
+        calib_func(model)
+        model = convert(model)
+        assert isinstance(model.transformer.h[0].attn.k_proj, PatchedLinear), "k_proj is not quantized."
+        assert (
+            model.transformer.h[0].attn.k_proj.quant_input.lp_dtype == torch.float8_e4m3fn
+        ), "k_proj input dtype is not torch.float8_e4m3fn."
+
+    def test_two_step_quant(self):
+        # step 1: measurement
+        model = copy.deepcopy(self.tiny_gptj)
+        config = FP8Config.from_json_file("test_fp8_jsons/test_measure.json")
+        model = prepare(model, config)
+        calib_func(model)
+        finalize_calibration(model)
+        assert isinstance(model.transformer.h[0].attn.k_proj, PatchedLinear), "k_proj is not observed."
+        # step 2: quantize based on measurement
+        model = copy.deepcopy(self.tiny_gptj)
+        config = FP8Config.from_json_file("test_fp8_jsons/test_hw_quant.json")
+        model = convert(model, config)
+        assert isinstance(model.transformer.h[0].attn.k_proj, PatchedLinear), "k_proj is not quantized."
+        assert (
+            model.transformer.h[0].attn.k_proj.quant_input.lp_dtype == torch.float8_e4m3fn
+        ), "k_proj input dtype is not torch.float8_e4m3fn."
