From 4e33f9675718afa6dd0da2609e59a7c3ad6d2b13 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Tue, 9 Jan 2024 18:33:43 +0800 Subject: [PATCH 01/12] update onnxrt calibration Signed-off-by: yuwenzho --- neural_compressor/adaptor/onnxrt.py | 2 +- neural_compressor/adaptor/ox_utils/calibration.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 738aa7833d0..990051e7808 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -765,7 +765,7 @@ def _get_quantize_params(self, model, data_loader, quantize_config, iterations, black_nodes=black_nodes, white_nodes=white_nodes, iterations=list(range(0, iterations)), - backend=self.backend if self.backend != "DmlExecutionProvider" else "CPUExecutionProvider", + backend=self.backend, reduce_range=self.reduce_range, **kwargs, ) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 34c899f9090..90992c0cc7c 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -242,6 +242,7 @@ def get_intermediate_outputs(self, q_config=None): """Gather intermediate model outputs after running inference.""" # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() + so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover from onnxruntime_extensions import get_library_path From 53945b5524aab816851d1f831a3369c50b644cff Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Tue, 9 Jan 2024 17:41:24 -0800 Subject: [PATCH 02/12] fix onnxrt calibration for dml ep Signed-off-by: yuwenzho --- neural_compressor/adaptor/onnxrt.py | 2 +- .../adaptor/ox_utils/calibration.py | 50 ++++++++++++++++--- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 738aa7833d0..990051e7808 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -765,7 +765,7 @@ def _get_quantize_params(self, model, data_loader, quantize_config, iterations, black_nodes=black_nodes, white_nodes=white_nodes, iterations=list(range(0, iterations)), - backend=self.backend if self.backend != "DmlExecutionProvider" else "CPUExecutionProvider", + backend=self.backend, reduce_range=self.reduce_range, **kwargs, ) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 34c899f9090..a3af241ba6c 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -104,7 +104,7 @@ def __init__( def dataloder_for_next_split_model(self): """Return dataloader for next split model for layer-wise quantization.""" return self._dataloder_for_next_split_model - + def augment_graph(self, activation_only=False, weight_only=False): """Augment_graph. 
@@ -133,7 +133,10 @@ def augment_graph(self, activation_only=False, weight_only=False): added_nodes = [] added_outputs = [] + + # calibrate initializer tensors (like weight & bias) and output tensors seperatly tensors_to_dump = set() + initializer_tensors_to_dump = set() for augment_node_type in self.augment_nodes: if augment_node_type not in ["DequantizeLinear"]: # pragma: no cover @@ -159,9 +162,12 @@ def augment_graph(self, activation_only=False, weight_only=False): ) if should_be_dump: if not weight_only and not activation_only: - tensors_to_dump.update([input for input in node.input if len(input) != 0]) + # update input tensors which should be dump + self._update_input_tensor_to_dump([input for input in node.input if len(input) != 0], + initializer_tensors_to_dump, + tensors_to_dump) + # update output tensors which should be dump tensors_to_dump.update([output for output in node.output if len(output) != 0]) - tensors_to_dump.update(node.output) elif weight_only: for input in node.input: if ( @@ -169,16 +175,22 @@ def augment_graph(self, activation_only=False, weight_only=False): and input.replace("_dequantized", "_quantized") in initializers and len(input) != 0 ): - tensors_to_dump.add(input) + self._update_input_tensor_to_dump(input, + initializer_tensors_to_dump, + tensors_to_dump) elif not self.already_quantized and input in initializers and len(input) != 0: - tensors_to_dump.add(input) + self._update_input_tensor_to_dump(input, + initializer_tensors_to_dump, + tensors_to_dump) elif activation_only: if len(node.input[0]) != 0: tensors_to_dump.update([node.input[0]]) + self.initializer_tensors_to_dump = initializer_tensors_to_dump model_inputs = [i.name for i in model.graph.input] + for tensor in tensors_to_dump: - if tensor not in node_outputs and tensor not in initializers and tensor not in model_inputs: + if tensor not in node_outputs and tensor not in model_inputs: continue if self.augment_nodes: for augment_node_type in self.augment_nodes: @@ -242,6 +254,7 @@ def get_intermediate_outputs(self, q_config=None): """Gather intermediate model outputs after running inference.""" # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() + so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover from onnxruntime_extensions import get_library_path @@ -280,6 +293,7 @@ def get_intermediate_outputs(self, q_config=None): assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) name_to_node[data_name] = node.name + # step 1: calibrate output tensors output_dicts = {} intermediate_tensor = {} name_to_calibrator = {} @@ -373,10 +387,34 @@ def _collect_data(ort_inputs): calibrator.clear() del calibrator + # step 2: calibrate initializer tensors (like weight & bias) using minmax method + for initializer_tensor_name in self.initializer_tensors_to_dump: + initializer_tensor = augment_model_wrapper.get_initializer(initializer_tensor_name) + if initializer_tensor is None: # pragma: no cover + continue + initializer_tensor = numpy_helper.to_array(initializer_tensor) + calibrator = CALIBRATOR["minmax"]() + calibrator.collect(initializer_tensor) + output_dicts[initializer_tensor_name] = [list(calibrator.calib_range)] + calibrator.clear() + del calibrator + self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return list(output_dicts.keys()), output_dicts + def _update_input_tensor_to_dump(self, 
tensor_names, initializer_tensors_to_dump, tensors_to_dump): + """Update input tensor to dump accroding to whether it is in initializer.""" + if isinstance(tensor_names, str): + tensor_names = [tensor_names] + tensor_in_initializer, tensor_not_in_initializer = [], [] + for tensor_name in tensor_names: + initializer_tensor = self.model_wrapper.get_initializer(tensor_name) + if initializer_tensor is None: + tensors_to_dump.update([tensor_name]) + else: + initializer_tensors_to_dump.update([tensor_name]) + def _dequantize(self, tensor, scale_tensor, zo_tensor): """Helper function to dequantize tensor.""" int_tensor = self.model_wrapper.get_initializer(tensor) From 3bad612101a3598fcc4a12a4792564ab2a82e73f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Jan 2024 05:20:30 +0000 Subject: [PATCH 03/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/ox_utils/calibration.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index a3af241ba6c..b6abc6c7dfe 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -104,7 +104,7 @@ def __init__( def dataloder_for_next_split_model(self): """Return dataloader for next split model for layer-wise quantization.""" return self._dataloder_for_next_split_model - + def augment_graph(self, activation_only=False, weight_only=False): """Augment_graph. @@ -163,9 +163,9 @@ def augment_graph(self, activation_only=False, weight_only=False): if should_be_dump: if not weight_only and not activation_only: # update input tensors which should be dump - self._update_input_tensor_to_dump([input for input in node.input if len(input) != 0], - initializer_tensors_to_dump, - tensors_to_dump) + self._update_input_tensor_to_dump( + [input for input in node.input if len(input) != 0], initializer_tensors_to_dump, tensors_to_dump + ) # update output tensors which should be dump tensors_to_dump.update([output for output in node.output if len(output) != 0]) elif weight_only: @@ -175,13 +175,9 @@ def augment_graph(self, activation_only=False, weight_only=False): and input.replace("_dequantized", "_quantized") in initializers and len(input) != 0 ): - self._update_input_tensor_to_dump(input, - initializer_tensors_to_dump, - tensors_to_dump) + self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) elif not self.already_quantized and input in initializers and len(input) != 0: - self._update_input_tensor_to_dump(input, - initializer_tensors_to_dump, - tensors_to_dump) + self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) elif activation_only: if len(node.input[0]) != 0: tensors_to_dump.update([node.input[0]]) @@ -390,7 +386,7 @@ def _collect_data(ort_inputs): # step 2: calibrate initializer tensors (like weight & bias) using minmax method for initializer_tensor_name in self.initializer_tensors_to_dump: initializer_tensor = augment_model_wrapper.get_initializer(initializer_tensor_name) - if initializer_tensor is None: # pragma: no cover + if initializer_tensor is None: # pragma: no cover continue initializer_tensor = numpy_helper.to_array(initializer_tensor) calibrator = CALIBRATOR["minmax"]() From 81ed91346310474d5465dccf4f341d7b68a064c6 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Wed, 
10 Jan 2024 13:47:02 +0800 Subject: [PATCH 04/12] fix typo Signed-off-by: yuwenzho --- neural_compressor/adaptor/ox_utils/calibration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index b6abc6c7dfe..813db137342 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -134,7 +134,7 @@ def augment_graph(self, activation_only=False, weight_only=False): added_nodes = [] added_outputs = [] - # calibrate initializer tensors (like weight & bias) and output tensors seperatly + # calibrate initializer tensors (like weight & bias) and output tensors separately tensors_to_dump = set() initializer_tensors_to_dump = set() @@ -400,7 +400,7 @@ def _collect_data(ort_inputs): return list(output_dicts.keys()), output_dicts def _update_input_tensor_to_dump(self, tensor_names, initializer_tensors_to_dump, tensors_to_dump): - """Update input tensor to dump accroding to whether it is in initializer.""" + """Update input tensor to dump according to whether it is in initializer.""" if isinstance(tensor_names, str): tensor_names = [tensor_names] tensor_in_initializer, tensor_not_in_initializer = [], [] From 4a2281d00dff5df0c1f126b847c989bea40f19c9 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Wed, 10 Jan 2024 18:15:33 +0800 Subject: [PATCH 05/12] update calibration.py Signed-off-by: yuwenzho --- .../adaptor/ox_utils/calibration.py | 157 +++++++++++------- 1 file changed, 97 insertions(+), 60 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 813db137342..178ba361cbd 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -105,7 +105,7 @@ def dataloder_for_next_split_model(self): """Return dataloader for next split model for layer-wise quantization.""" return self._dataloder_for_next_split_model - def augment_graph(self, activation_only=False, weight_only=False): + def augment_graph(self): """Augment_graph. 
Adds nodes to all quantization_candidates op type nodes in model and @@ -118,7 +118,7 @@ def augment_graph(self, activation_only=False, weight_only=False): self.dequantized_output.clear() onnx_version = Version(onnx.__version__) if onnx_version < ONNX18_VERSION: - logger.warning("Static quantization for NLP model is supported " "at onnx 1.8.0 and newer.") + logger.warning("Static quantization for NLP model is supported at onnx 1.8.0 and newer.") if self.already_quantized and any( [i.dims in [1, 2] for i in self.model_wrapper.initializer() if i.name.endswith("_scale")] ): @@ -133,15 +133,12 @@ def augment_graph(self, activation_only=False, weight_only=False): added_nodes = [] added_outputs = [] - - # calibrate initializer tensors (like weight & bias) and output tensors separately tensors_to_dump = set() - initializer_tensors_to_dump = set() for augment_node_type in self.augment_nodes: if augment_node_type not in ["DequantizeLinear"]: # pragma: no cover raise ValueError( - "Unexpected augment_node {} only DequantizeLinear is " "supported".format(augment_node_type) + "Unexpected augment_node {} only DequantizeLinear is supported".format(augment_node_type) ) if self.already_quantized: @@ -149,11 +146,11 @@ def augment_graph(self, activation_only=False, weight_only=False): new_white_nodes = [] for white_node in self.white_nodes: new_white_node = white_node + "_quant" - assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node) + assert new_white_node in model_nodes_names, "no quantized {} in the graph".format(white_node) new_white_nodes.append(new_white_node) self.white_nodes = new_white_nodes - initializers = {i.name: i.data_type for i in model.graph.initializer} + # initializers = {i.name: i.data_type for i in model.graph.initializer} node_outputs = [] for node in model.graph.node: # pylint: disable=no-member node_outputs.extend(node.output) @@ -161,30 +158,42 @@ def augment_graph(self, activation_only=False, weight_only=False): node.name in self.white_nodes ) if should_be_dump: - if not weight_only and not activation_only: - # update input tensors which should be dump - self._update_input_tensor_to_dump( - [input for input in node.input if len(input) != 0], initializer_tensors_to_dump, tensors_to_dump - ) - # update output tensors which should be dump - tensors_to_dump.update([output for output in node.output if len(output) != 0]) - elif weight_only: - for input in node.input: - if ( - self.already_quantized - and input.replace("_dequantized", "_quantized") in initializers - and len(input) != 0 - ): - self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) - elif not self.already_quantized and input in initializers and len(input) != 0: - self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) - elif activation_only: - if len(node.input[0]) != 0: - tensors_to_dump.update([node.input[0]]) - - self.initializer_tensors_to_dump = initializer_tensors_to_dump - model_inputs = [i.name for i in model.graph.input] + # add input tensors which should be dump + for input in node.input: + if len(input) != 0: # to prevent input is "" + initializer_tensor = self.model_wrapper.get_initializer(input) + if initializer_tensor is None: + tensors_to_dump.add(input) + # add output tensors which should be dump + tensors_to_dump.update([output for output in node.output if len(output) != 0]) + + # # calibrate output tensors + # if not weight_only and not activation_only: + # # update input tensors which should be dump 
+ # for input in node.input: + # if len(input) != 0: + # initializer_tensor = self.model_wrapper.get_initializer(input) + # if initializer_tensor is None: + # tensors_to_dump.add(input) + # # update output tensors which should be dump + # tensors_to_dump.update([output for output in node.output if len(output) != 0]) + # elif weight_only: + # for input in node.input: + # if ( + # self.already_quantized + # and input.replace("_dequantized", "_quantized") in initializers + # and len(input) != 0 + # ): + # initializer_tensor = self.model_wrapper.get_initializer(input) + # if initializer_tensor is None: + # tensors_to_dump.add(input) + # elif activation_only: + # if len(node.input[0]) != 0: + # tensors_to_dump.update([node.input[0]]) + model_inputs = [i.name for i in model.graph.input] + logger.debug("tensors to dump:") + logger.debug(tensors_to_dump) for tensor in tensors_to_dump: if tensor not in node_outputs and tensor not in model_inputs: continue @@ -246,8 +255,7 @@ def augment_graph(self, activation_only=False, weight_only=False): convert_attribute=False, ) - def get_intermediate_outputs(self, q_config=None): - """Gather intermediate model outputs after running inference.""" + def get_activation_tensors_calib_range(self, q_config=None): # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL @@ -289,8 +297,7 @@ def get_intermediate_outputs(self, q_config=None): assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) name_to_node[data_name] = node.name - # step 1: calibrate output tensors - output_dicts = {} + activation_tensors_calib_range = {} intermediate_tensor = {} name_to_calibrator = {} ort_inputs_for_next_split_model = [] @@ -304,8 +311,8 @@ def get_intermediate_outputs(self, q_config=None): else: ort_inputs.update({inputs_names[0]: to_numpy(inputs)}) else: + # skip check input length for layer-wise calibration if not self.layer_wise: - # for layer-wise calibration assert len_inputs == len(inputs), "number of input tensors must align with graph inputs" if isinstance(inputs, dict): @@ -345,14 +352,15 @@ def _collect_data(ort_inputs): # per iteration in the future. 
if calibrator.method_name == "minmax": calibrator.collect(output) - output_dicts[node_output_names[output_idx]] = [list(calibrator.calib_range)] + activation_tensors_calib_range[node_output_names[output_idx]] = \ + [list(calibrator.calib_range)] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( output ) elif q_config is None: - output_dicts.setdefault(node_output_names[output_idx], []).append(output) + activation_tensors_calib_range.setdefault(node_output_names[output_idx], []).append(output) if self.layer_wise: # for layer-wise calibration @@ -379,38 +387,67 @@ def _collect_data(ort_inputs): ) calibrator = CALIBRATOR[calib_method]() calibrator.collect(datas) - output_dicts.setdefault(output_name, []).append(list(calibrator.calib_range)) + activation_tensors_calib_range.setdefault(output_name, []).append(list(calibrator.calib_range)) calibrator.clear() del calibrator - # step 2: calibrate initializer tensors (like weight & bias) using minmax method - for initializer_tensor_name in self.initializer_tensors_to_dump: - initializer_tensor = augment_model_wrapper.get_initializer(initializer_tensor_name) + # set for layer-wise quant + self._dataloder_for_next_split_model = ort_inputs_for_next_split_model + + return activation_tensors_calib_range + + def get_weight_tensors_calib_range(self): + initializer_tensors_to_dump = set() + initializers = [init.name for init in self.model.graph.initializer] + for node in self.model.graph.node: # pylint: disable=no-member + should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( + node.name in self.white_nodes + ) + if should_be_dump: + for input in node.input: + if ( + self.already_quantized + and input.replace("_dequantized", "_quantized") in initializers + and len(input) != 0 + ) or ( + not self.already_quantized + and input in initializers + and len(input) != 0 + ): + initializer_tensors_to_dump.add(input) + + logger.debug("initializer tensors to dump:") + logger.debug(initializer_tensors_to_dump) + weight_tensors_calib_range = {} + for initializer_tensor_name in initializer_tensors_to_dump: + initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) + + # double check initializer tensor is not None if initializer_tensor is None: # pragma: no cover continue + initializer_tensor = numpy_helper.to_array(initializer_tensor) - calibrator = CALIBRATOR["minmax"]() + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) - output_dicts[initializer_tensor_name] = [list(calibrator.calib_range)] + weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator - - self._dataloder_for_next_split_model = ort_inputs_for_next_split_model + return weight_tensors_calib_range + + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): + """Gather intermediate model outputs after running inference.""" + + output_dicts = {} + if not activation_only and not weight_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) + output_dicts.update(self.get_weight_tensors_calib_range()) + elif weight_only: + output_dicts = self.get_weight_tensors_calib_range() + elif activation_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) return list(output_dicts.keys()), output_dicts - def 
_update_input_tensor_to_dump(self, tensor_names, initializer_tensors_to_dump, tensors_to_dump): - """Update input tensor to dump according to whether it is in initializer.""" - if isinstance(tensor_names, str): - tensor_names = [tensor_names] - tensor_in_initializer, tensor_not_in_initializer = [], [] - for tensor_name in tensor_names: - initializer_tensor = self.model_wrapper.get_initializer(tensor_name) - if initializer_tensor is None: - tensors_to_dump.update([tensor_name]) - else: - initializer_tensors_to_dump.update([tensor_name]) - def _dequantize(self, tensor, scale_tensor, zo_tensor): """Helper function to dequantize tensor.""" int_tensor = self.model_wrapper.get_initializer(tensor) @@ -587,8 +624,8 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.already_quantized = True self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" - self.augment_graph(activation_only=not weight, weight_only=not activation) - _, output_dicts = self.get_intermediate_outputs() + self.augment_graph() + _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] map_node_weight = {} From 019c6ba482a3302f623ae3ac93f9ca82a60824b9 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 09:54:23 +0800 Subject: [PATCH 06/12] update calibration.py Signed-off-by: yuwenzho --- .../adaptor/ox_utils/calibration.py | 107 ++++++++++-------- 1 file changed, 59 insertions(+), 48 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 178ba361cbd..3866b9df196 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -150,7 +150,6 @@ def augment_graph(self): new_white_nodes.append(new_white_node) self.white_nodes = new_white_nodes - # initializers = {i.name: i.data_type for i in model.graph.initializer} node_outputs = [] for node in model.graph.node: # pylint: disable=no-member node_outputs.extend(node.output) @@ -166,34 +165,8 @@ def augment_graph(self): tensors_to_dump.add(input) # add output tensors which should be dump tensors_to_dump.update([output for output in node.output if len(output) != 0]) - - # # calibrate output tensors - # if not weight_only and not activation_only: - # # update input tensors which should be dump - # for input in node.input: - # if len(input) != 0: - # initializer_tensor = self.model_wrapper.get_initializer(input) - # if initializer_tensor is None: - # tensors_to_dump.add(input) - # # update output tensors which should be dump - # tensors_to_dump.update([output for output in node.output if len(output) != 0]) - # elif weight_only: - # for input in node.input: - # if ( - # self.already_quantized - # and input.replace("_dequantized", "_quantized") in initializers - # and len(input) != 0 - # ): - # initializer_tensor = self.model_wrapper.get_initializer(input) - # if initializer_tensor is None: - # tensors_to_dump.add(input) - # elif activation_only: - # if len(node.input[0]) != 0: - # tensors_to_dump.update([node.input[0]]) model_inputs = [i.name for i in model.graph.input] - logger.debug("tensors to dump:") - logger.debug(tensors_to_dump) for tensor in tensors_to_dump: if tensor not in node_outputs and tensor not in model_inputs: continue @@ -201,7 +174,7 @@ def augment_graph(self): for augment_node_type in 
self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -256,6 +229,14 @@ def augment_graph(self): ) def get_activation_tensors_calib_range(self, q_config=None): + """Get calib ranges of activation tensors. + + Args: + q_config (dict, optional): quantization config. Defaults to None. + + Returns: + dict: calib ranges + """ # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL @@ -397,7 +378,26 @@ def _collect_data(ort_inputs): return activation_tensors_calib_range def get_weight_tensors_calib_range(self): - initializer_tensors_to_dump = set() + """Get calib ranges of weight tensors. + + Returns: + dict: calib ranges + """ + model_nodes_names = [node.name for node in self.model.graph.node] + + # if augmented_model is not None, it means self.white_nodes is already updated in augment_graph func + # then skip update here + if self.already_quantized and self.augmented_model is None: + # mapping between fp32 node and int8 node + new_white_nodes = [] + for white_node in self.white_nodes: + new_white_node = white_node + "_quant" + assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node) + new_white_nodes.append(new_white_node) + self.white_nodes = new_white_nodes + + added_outputs = set() + initializer_tensors_to_dump = [] initializers = [init.name for init in self.model.graph.initializer] for node in self.model.graph.node: # pylint: disable=no-member should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( @@ -405,19 +405,21 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ( - self.already_quantized - and input.replace("_dequantized", "_quantized") in initializers - and len(input) != 0 - ) or ( - not self.already_quantized - and input in initializers - and len(input) != 0 - ): - initializer_tensors_to_dump.add(input) - - logger.debug("initializer tensors to dump:") - logger.debug(initializer_tensors_to_dump) + if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or + (not self.already_quantized and input in initializers)) and len(input) != 0: + added_outputs.add(input) + + for tensor in added_outputs: + if tensor not in initializers: + continue + if self.augment_nodes: + for augment_node_type in self.augment_nodes: + if augment_node_type in ["DequantizeLinear"]: + if not (tensor.endswith("_scale") or tensor.endswith("_zero_point")): + initializer_tensors_to_dump.append(tensor) + else: + initializer_tensors_to_dump.append(tensor) + weight_tensors_calib_range = {} for initializer_tensor_name in initializer_tensors_to_dump: initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) @@ -436,7 +438,6 @@ def get_weight_tensors_calib_range(self): def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" - output_dicts = {} if not activation_only and not weight_only: output_dicts = self.get_activation_tensors_calib_range(q_config) @@ -543,7 +544,12 @@ def _map_calibration(self, node_output_names, output_dicts): return final_dict def dump_minmax(self, 
q_config): - """Get min/max values of tensors.""" + """Get calib ranges of tensors.""" + # pipeline of getting calib ranges of tensors during calibration: + # 1. augment_graph(): insert activation tensors to model output + # 2. get_intermediate_outputs(): + # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augmnet graph + # 2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors self.augment_graph() node_output_names, output_dicts = self.get_intermediate_outputs(q_config) return self._map_calibration(node_output_names, output_dicts) @@ -624,15 +630,20 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.already_quantized = True self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" - self.augment_graph() + if activation: + self.augment_graph(inspect_tensor=True) # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] map_node_weight = {} self.white_nodes = [node.replace("_quant", "") for node in self.white_nodes] - augmengted_wrapper = ONNXModel(self.augmented_model) - map_output = augmengted_wrapper.output_name_to_node - map_input = augmengted_wrapper.input_name_to_nodes + + if activation and self.augmented_model is None: + raise ValueError("augmented model should not be None when dump activation tensors.") + # if activation tensors are not dumped, then use origin model wrapper + model_wrapper = ONNXModel(self.augmented_model) if activation else self.model_wrapper + map_output = model_wrapper.output_name_to_node + map_input = model_wrapper.input_name_to_nodes model_output_names = [t.name for t in self.model.graph.output] model_input_names = [t.name for t in self.model.graph.input] model_initializer_names = [t.name for t in self.model.graph.initializer] From b0f980377692d7ce7a0f8aa46805ebaa854a2b5a Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 10:26:11 +0800 Subject: [PATCH 07/12] update calibration.py Signed-off-by: yuwenzho --- neural_compressor/adaptor/ox_utils/calibration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 3866b9df196..a57138089e9 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -548,7 +548,7 @@ def dump_minmax(self, q_config): # pipeline of getting calib ranges of tensors during calibration: # 1. augment_graph(): insert activation tensors to model output # 2. 
get_intermediate_outputs(): - # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augmnet graph + # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augment graph # 2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors self.augment_graph() node_output_names, output_dicts = self.get_intermediate_outputs(q_config) @@ -631,7 +631,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph(inspect_tensor=True) # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] From 862fbbd153c5ea372042b0dedd2819942a4c27a8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 02:27:34 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/ox_utils/calibration.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index a57138089e9..7c14f2b65f0 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -159,7 +159,7 @@ def augment_graph(self): if should_be_dump: # add input tensors which should be dump for input in node.input: - if len(input) != 0: # to prevent input is "" + if len(input) != 0: # to prevent input is "" initializer_tensor = self.model_wrapper.get_initializer(input) if initializer_tensor is None: tensors_to_dump.add(input) @@ -174,7 +174,7 @@ def augment_graph(self): for augment_node_type in self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -333,8 +333,9 @@ def _collect_data(ort_inputs): # per iteration in the future. if calibrator.method_name == "minmax": calibrator.collect(output) - activation_tensors_calib_range[node_output_names[output_idx]] = \ - [list(calibrator.calib_range)] + activation_tensors_calib_range[node_output_names[output_idx]] = [ + list(calibrator.calib_range) + ] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( @@ -376,7 +377,7 @@ def _collect_data(ort_inputs): self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return activation_tensors_calib_range - + def get_weight_tensors_calib_range(self): """Get calib ranges of weight tensors. 
@@ -405,8 +406,10 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or - (not self.already_quantized and input in initializers)) and len(input) != 0: + if ( + (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) + or (not self.already_quantized and input in initializers) + ) and len(input) != 0: added_outputs.add(input) for tensor in added_outputs: @@ -429,13 +432,13 @@ def get_weight_tensors_calib_range(self): continue initializer_tensor = numpy_helper.to_array(initializer_tensor) - calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator return weight_tensors_calib_range - + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" output_dicts = {} @@ -631,7 +634,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph() # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] From ae681700acdc0a74a5c7d521b53e624d3bc2ab8b Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 17:30:18 +0800 Subject: [PATCH 09/12] fix bug Signed-off-by: yuwenzho --- .../adaptor/ox_utils/calibration.py | 28 +++++++++---------- .../onnxrt_adaptor/test_onnxrt_augment.py | 23 ++++++++------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 7c14f2b65f0..fa2773da3e1 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -159,7 +159,7 @@ def augment_graph(self): if should_be_dump: # add input tensors which should be dump for input in node.input: - if len(input) != 0: # to prevent input is "" + if len(input) != 0: # to prevent input is "" initializer_tensor = self.model_wrapper.get_initializer(input) if initializer_tensor is None: tensors_to_dump.add(input) @@ -174,7 +174,7 @@ def augment_graph(self): for augment_node_type in self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -333,9 +333,8 @@ def _collect_data(ort_inputs): # per iteration in the future. 
if calibrator.method_name == "minmax": calibrator.collect(output) - activation_tensors_calib_range[node_output_names[output_idx]] = [ - list(calibrator.calib_range) - ] + activation_tensors_calib_range[node_output_names[output_idx]] = \ + [list(calibrator.calib_range)] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( @@ -377,7 +376,7 @@ def _collect_data(ort_inputs): self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return activation_tensors_calib_range - + def get_weight_tensors_calib_range(self): """Get calib ranges of weight tensors. @@ -406,10 +405,8 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ( - (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) - or (not self.already_quantized and input in initializers) - ) and len(input) != 0: + if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or + (not self.already_quantized and input in initializers)) and len(input) != 0: added_outputs.add(input) for tensor in added_outputs: @@ -425,20 +422,23 @@ def get_weight_tensors_calib_range(self): weight_tensors_calib_range = {} for initializer_tensor_name in initializer_tensors_to_dump: + if self.layer_wise: + self.model_wrapper.load_model_initializer_by_tensor() initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) # double check initializer tensor is not None if initializer_tensor is None: # pragma: no cover continue - initializer_tensor = numpy_helper.to_array(initializer_tensor) - calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors + initializer_tensor = numpy_helper.to_array(initializer_tensor, + base_dir=os.path.dirname(self.model_wrapper.model_path)) + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator return weight_tensors_calib_range - + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" output_dicts = {} @@ -634,7 +634,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph() # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] diff --git a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py index 41b668f182a..9c342f05c15 100644 --- a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py +++ b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py @@ -330,7 +330,7 @@ def test_augment_graph(self): attn_output_scale = generate_input_initializer([1], np.float32, "attn_output_scale") Q_zo = helper.make_tensor_value_info("attn_output_zero_point", TensorProto.INT8, [1]) attn_output_zero_point = generate_input_initializer([1], np.int8, "attn_output_zero_point") - Output = helper.make_tensor_value_info("output", 
TensorProto.INT8, [13, 7]) + Output = helper.make_tensor_value_info("attn_output_quantized", TensorProto.INT8, [13, 7]) attention_node = onnx.helper.make_node( "QAttention", [ @@ -386,15 +386,17 @@ def test_augment_graph(self): augment.augment_nodes = ["DequantizeLinear"] augment.already_quantized = True - augment.augment_graph(activation_only=True, weight_only=False) + augment.augment_graph() augmented_model = augment.augmented_model augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output in augmented_model.graph.output] - added_node_names = ["attention_quant", "attn_output_QuantizeLinear"] - added_outputs = ["input_quantized_output", "output"] + added_node_names = ['attention_quant', + 'attn_output_QuantizeLinear', + 'input_quantized_DequantizeLinear'] + added_outputs = ['attn_output_quantized', 'input_quantized_output', 'attn_output'] self.assertEqual(len(augmented_model_node_names), 3) - self.assertEqual(len(augmented_model_outputs), 2) + self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: self.assertTrue(name in augmented_model_node_names) for output in added_outputs: @@ -470,15 +472,16 @@ def test_augment_graph(self): augment = ONNXRTAugment(ONNXModel(model), data_reader, [], white_nodes=["conv"]) augment.augment_nodes = ["DequantizeLinear"] augment.already_quantized = True - augment.augment_graph(activation_only=True, weight_only=False) + augment.augment_graph() augmented_model = augment.augmented_model augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output in augmented_model.graph.output] - added_node_names = ["A_QuantizeLinear", "conv_quant", "D_DequantizeLinear", "A_quantized_DequantizeLinear"] - added_outputs = ["D", "A_quantized_output"] - self.assertEqual(len(augmented_model_node_names), 4) - self.assertEqual(len(augmented_model_outputs), 2) + added_node_names = ['A_QuantizeLinear', 'conv_quant', 'D_DequantizeLinear', + 'D_quantized_DequantizeLinear', 'A_quantized_DequantizeLinear'] + added_outputs = ['D', 'D_quantized_output', 'A_quantized_output'] + self.assertEqual(len(augmented_model_node_names), 5) + self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: self.assertTrue(name in augmented_model_node_names) for output in added_outputs: From 98e4daab7f057a978eca4ff2f17085ce5ce0c785 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 09:31:42 +0000 Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/ox_utils/calibration.py | 28 +++++++++++-------- .../onnxrt_adaptor/test_onnxrt_augment.py | 17 ++++++----- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index fa2773da3e1..f3ae82c3cfe 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -159,7 +159,7 @@ def augment_graph(self): if should_be_dump: # add input tensors which should be dump for input in node.input: - if len(input) != 0: # to prevent input is "" + if len(input) != 0: # to prevent input is "" initializer_tensor = self.model_wrapper.get_initializer(input) if initializer_tensor is None: tensors_to_dump.add(input) @@ -174,7 +174,7 @@ def augment_graph(self): for 
augment_node_type in self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -333,8 +333,9 @@ def _collect_data(ort_inputs): # per iteration in the future. if calibrator.method_name == "minmax": calibrator.collect(output) - activation_tensors_calib_range[node_output_names[output_idx]] = \ - [list(calibrator.calib_range)] + activation_tensors_calib_range[node_output_names[output_idx]] = [ + list(calibrator.calib_range) + ] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( @@ -376,7 +377,7 @@ def _collect_data(ort_inputs): self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return activation_tensors_calib_range - + def get_weight_tensors_calib_range(self): """Get calib ranges of weight tensors. @@ -405,8 +406,10 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or - (not self.already_quantized and input in initializers)) and len(input) != 0: + if ( + (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) + or (not self.already_quantized and input in initializers) + ) and len(input) != 0: added_outputs.add(input) for tensor in added_outputs: @@ -430,15 +433,16 @@ def get_weight_tensors_calib_range(self): if initializer_tensor is None: # pragma: no cover continue - initializer_tensor = numpy_helper.to_array(initializer_tensor, - base_dir=os.path.dirname(self.model_wrapper.model_path)) - calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors + initializer_tensor = numpy_helper.to_array( + initializer_tensor, base_dir=os.path.dirname(self.model_wrapper.model_path) + ) + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator return weight_tensors_calib_range - + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" output_dicts = {} @@ -634,7 +638,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph() # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] diff --git a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py index 9c342f05c15..382efeaab6c 100644 --- a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py +++ b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py @@ -391,10 +391,8 @@ def test_augment_graph(self): augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output 
in augmented_model.graph.output] - added_node_names = ['attention_quant', - 'attn_output_QuantizeLinear', - 'input_quantized_DequantizeLinear'] - added_outputs = ['attn_output_quantized', 'input_quantized_output', 'attn_output'] + added_node_names = ["attention_quant", "attn_output_QuantizeLinear", "input_quantized_DequantizeLinear"] + added_outputs = ["attn_output_quantized", "input_quantized_output", "attn_output"] self.assertEqual(len(augmented_model_node_names), 3) self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: @@ -477,9 +475,14 @@ def test_augment_graph(self): augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output in augmented_model.graph.output] - added_node_names = ['A_QuantizeLinear', 'conv_quant', 'D_DequantizeLinear', - 'D_quantized_DequantizeLinear', 'A_quantized_DequantizeLinear'] - added_outputs = ['D', 'D_quantized_output', 'A_quantized_output'] + added_node_names = [ + "A_QuantizeLinear", + "conv_quant", + "D_DequantizeLinear", + "D_quantized_DequantizeLinear", + "A_quantized_DequantizeLinear", + ] + added_outputs = ["D", "D_quantized_output", "A_quantized_output"] self.assertEqual(len(augmented_model_node_names), 5) self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: From 602c8f0d36dbc2a5a898cf766c3dcd4445025830 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 18:08:14 +0800 Subject: [PATCH 11/12] update calibration.py Signed-off-by: yuwenzho --- neural_compressor/adaptor/ox_utils/calibration.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index f3ae82c3cfe..0b419a3b864 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -434,8 +434,9 @@ def get_weight_tensors_calib_range(self): continue initializer_tensor = numpy_helper.to_array( - initializer_tensor, base_dir=os.path.dirname(self.model_wrapper.model_path) - ) + initializer_tensor, + base_dir=os.path.dirname(self.model_wrapper.model_path) \ + if self.model_wrapper.model_path is not None else "") calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] From 0ed29588c366093e7254a18f83755171b6163280 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 10:09:37 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/adaptor/ox_utils/calibration.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 0b419a3b864..f56128e79ff 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -434,9 +434,11 @@ def get_weight_tensors_calib_range(self): continue initializer_tensor = numpy_helper.to_array( - initializer_tensor, - base_dir=os.path.dirname(self.model_wrapper.model_path) \ - if self.model_wrapper.model_path is not None else "") + initializer_tensor, + base_dir=os.path.dirname(self.model_wrapper.model_path) + if self.model_wrapper.model_path is not None + else "", + ) 
calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)]
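Note on the combined effect of this series: calibration no longer silently falls back from DmlExecutionProvider to CPUExecutionProvider (the backend=self.backend change in onnxrt.py), and range collection is split into two steps. Activation ranges are still gathered by running the augmented model, with graph optimizations disabled (ORT_DISABLE_ALL) so the dumped intermediate tensors are not folded away. Weight and bias ranges are instead computed directly from the graph initializers with the minmax calibrator, so they never pass through model outputs, which is the part that previously failed on the DML execution provider. Below is a minimal standalone sketch of the weight-side step, assuming a plain onnx model; the function name and the dump_op_types default are illustrative, and the real logic lives in ONNXRTAugment.get_weight_tensors_calib_range in neural_compressor/adaptor/ox_utils/calibration.py:

    import onnx
    from onnx import numpy_helper

    def weight_calib_ranges(model_path, dump_op_types=("Conv", "MatMul", "Gemm")):
        """Collect a [min, max] range per initializer without running inference."""
        model = onnx.load(model_path)
        initializers = {init.name: init for init in model.graph.initializer}
        ranges = {}
        for node in model.graph.node:
            if node.op_type not in dump_op_types:
                continue
            for name in node.input:
                if name in initializers and name not in ranges:
                    # Models stored with external data need a base_dir argument here,
                    # which patches 11/12 derive from the model_path directory.
                    arr = numpy_helper.to_array(initializers[name])
                    if arr.size == 0:
                        continue
                    # "minmax" calibration of a constant tensor is just its extrema.
                    ranges[name] = [float(arr.min()), float(arr.max())]
        return ranges

Since initializer ranges are data-independent, this step runs once per model and needs no dataloader; only the activation step consumes calibration batches.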