From 4e33f9675718afa6dd0da2609e59a7c3ad6d2b13 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Tue, 9 Jan 2024 18:33:43 +0800 Subject: [PATCH 01/12] update onnxrt calibration Signed-off-by: yuwenzho --- neural_compressor/adaptor/onnxrt.py | 2 +- neural_compressor/adaptor/ox_utils/calibration.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 738aa7833d0..990051e7808 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -765,7 +765,7 @@ def _get_quantize_params(self, model, data_loader, quantize_config, iterations, black_nodes=black_nodes, white_nodes=white_nodes, iterations=list(range(0, iterations)), - backend=self.backend if self.backend != "DmlExecutionProvider" else "CPUExecutionProvider", + backend=self.backend, reduce_range=self.reduce_range, **kwargs, ) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 34c899f9090..90992c0cc7c 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -242,6 +242,7 @@ def get_intermediate_outputs(self, q_config=None): """Gather intermediate model outputs after running inference.""" # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() + so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover from onnxruntime_extensions import get_library_path From 53945b5524aab816851d1f831a3369c50b644cff Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Tue, 9 Jan 2024 17:41:24 -0800 Subject: [PATCH 02/12] fix onnxrt calibration for dml ep Signed-off-by: yuwenzho --- neural_compressor/adaptor/onnxrt.py | 2 +- .../adaptor/ox_utils/calibration.py | 50 ++++++++++++++++--- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 738aa7833d0..990051e7808 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -765,7 +765,7 @@ def _get_quantize_params(self, model, data_loader, quantize_config, iterations, black_nodes=black_nodes, white_nodes=white_nodes, iterations=list(range(0, iterations)), - backend=self.backend if self.backend != "DmlExecutionProvider" else "CPUExecutionProvider", + backend=self.backend, reduce_range=self.reduce_range, **kwargs, ) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 34c899f9090..a3af241ba6c 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -104,7 +104,7 @@ def __init__( def dataloder_for_next_split_model(self): """Return dataloader for next split model for layer-wise quantization.""" return self._dataloder_for_next_split_model - + def augment_graph(self, activation_only=False, weight_only=False): """Augment_graph. 
@@ -133,7 +133,10 @@ def augment_graph(self, activation_only=False, weight_only=False): added_nodes = [] added_outputs = [] + + # calibrate initializer tensors (like weight & bias) and output tensors seperatly tensors_to_dump = set() + initializer_tensors_to_dump = set() for augment_node_type in self.augment_nodes: if augment_node_type not in ["DequantizeLinear"]: # pragma: no cover @@ -159,9 +162,12 @@ def augment_graph(self, activation_only=False, weight_only=False): ) if should_be_dump: if not weight_only and not activation_only: - tensors_to_dump.update([input for input in node.input if len(input) != 0]) + # update input tensors which should be dump + self._update_input_tensor_to_dump([input for input in node.input if len(input) != 0], + initializer_tensors_to_dump, + tensors_to_dump) + # update output tensors which should be dump tensors_to_dump.update([output for output in node.output if len(output) != 0]) - tensors_to_dump.update(node.output) elif weight_only: for input in node.input: if ( @@ -169,16 +175,22 @@ def augment_graph(self, activation_only=False, weight_only=False): and input.replace("_dequantized", "_quantized") in initializers and len(input) != 0 ): - tensors_to_dump.add(input) + self._update_input_tensor_to_dump(input, + initializer_tensors_to_dump, + tensors_to_dump) elif not self.already_quantized and input in initializers and len(input) != 0: - tensors_to_dump.add(input) + self._update_input_tensor_to_dump(input, + initializer_tensors_to_dump, + tensors_to_dump) elif activation_only: if len(node.input[0]) != 0: tensors_to_dump.update([node.input[0]]) + self.initializer_tensors_to_dump = initializer_tensors_to_dump model_inputs = [i.name for i in model.graph.input] + for tensor in tensors_to_dump: - if tensor not in node_outputs and tensor not in initializers and tensor not in model_inputs: + if tensor not in node_outputs and tensor not in model_inputs: continue if self.augment_nodes: for augment_node_type in self.augment_nodes: @@ -242,6 +254,7 @@ def get_intermediate_outputs(self, q_config=None): """Gather intermediate model outputs after running inference.""" # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() + so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover from onnxruntime_extensions import get_library_path @@ -280,6 +293,7 @@ def get_intermediate_outputs(self, q_config=None): assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) name_to_node[data_name] = node.name + # step 1: calibrate output tensors output_dicts = {} intermediate_tensor = {} name_to_calibrator = {} @@ -373,10 +387,34 @@ def _collect_data(ort_inputs): calibrator.clear() del calibrator + # step 2: calibrate initializer tensors (like weight & bias) using minmax method + for initializer_tensor_name in self.initializer_tensors_to_dump: + initializer_tensor = augment_model_wrapper.get_initializer(initializer_tensor_name) + if initializer_tensor is None: # pragma: no cover + continue + initializer_tensor = numpy_helper.to_array(initializer_tensor) + calibrator = CALIBRATOR["minmax"]() + calibrator.collect(initializer_tensor) + output_dicts[initializer_tensor_name] = [list(calibrator.calib_range)] + calibrator.clear() + del calibrator + self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return list(output_dicts.keys()), output_dicts + def _update_input_tensor_to_dump(self, 
tensor_names, initializer_tensors_to_dump, tensors_to_dump): + """Update input tensor to dump accroding to whether it is in initializer.""" + if isinstance(tensor_names, str): + tensor_names = [tensor_names] + tensor_in_initializer, tensor_not_in_initializer = [], [] + for tensor_name in tensor_names: + initializer_tensor = self.model_wrapper.get_initializer(tensor_name) + if initializer_tensor is None: + tensors_to_dump.update([tensor_name]) + else: + initializer_tensors_to_dump.update([tensor_name]) + def _dequantize(self, tensor, scale_tensor, zo_tensor): """Helper function to dequantize tensor.""" int_tensor = self.model_wrapper.get_initializer(tensor) From 3bad612101a3598fcc4a12a4792564ab2a82e73f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Jan 2024 05:20:30 +0000 Subject: [PATCH 03/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/ox_utils/calibration.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index a3af241ba6c..b6abc6c7dfe 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -104,7 +104,7 @@ def __init__( def dataloder_for_next_split_model(self): """Return dataloader for next split model for layer-wise quantization.""" return self._dataloder_for_next_split_model - + def augment_graph(self, activation_only=False, weight_only=False): """Augment_graph. @@ -163,9 +163,9 @@ def augment_graph(self, activation_only=False, weight_only=False): if should_be_dump: if not weight_only and not activation_only: # update input tensors which should be dump - self._update_input_tensor_to_dump([input for input in node.input if len(input) != 0], - initializer_tensors_to_dump, - tensors_to_dump) + self._update_input_tensor_to_dump( + [input for input in node.input if len(input) != 0], initializer_tensors_to_dump, tensors_to_dump + ) # update output tensors which should be dump tensors_to_dump.update([output for output in node.output if len(output) != 0]) elif weight_only: @@ -175,13 +175,9 @@ def augment_graph(self, activation_only=False, weight_only=False): and input.replace("_dequantized", "_quantized") in initializers and len(input) != 0 ): - self._update_input_tensor_to_dump(input, - initializer_tensors_to_dump, - tensors_to_dump) + self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) elif not self.already_quantized and input in initializers and len(input) != 0: - self._update_input_tensor_to_dump(input, - initializer_tensors_to_dump, - tensors_to_dump) + self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) elif activation_only: if len(node.input[0]) != 0: tensors_to_dump.update([node.input[0]]) @@ -390,7 +386,7 @@ def _collect_data(ort_inputs): # step 2: calibrate initializer tensors (like weight & bias) using minmax method for initializer_tensor_name in self.initializer_tensors_to_dump: initializer_tensor = augment_model_wrapper.get_initializer(initializer_tensor_name) - if initializer_tensor is None: # pragma: no cover + if initializer_tensor is None: # pragma: no cover continue initializer_tensor = numpy_helper.to_array(initializer_tensor) calibrator = CALIBRATOR["minmax"]() From 81ed91346310474d5465dccf4f341d7b68a064c6 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Wed, 
10 Jan 2024 13:47:02 +0800 Subject: [PATCH 04/12] fix typo Signed-off-by: yuwenzho --- neural_compressor/adaptor/ox_utils/calibration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index b6abc6c7dfe..813db137342 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -134,7 +134,7 @@ def augment_graph(self, activation_only=False, weight_only=False): added_nodes = [] added_outputs = [] - # calibrate initializer tensors (like weight & bias) and output tensors seperatly + # calibrate initializer tensors (like weight & bias) and output tensors separately tensors_to_dump = set() initializer_tensors_to_dump = set() @@ -400,7 +400,7 @@ def _collect_data(ort_inputs): return list(output_dicts.keys()), output_dicts def _update_input_tensor_to_dump(self, tensor_names, initializer_tensors_to_dump, tensors_to_dump): - """Update input tensor to dump accroding to whether it is in initializer.""" + """Update input tensor to dump according to whether it is in initializer.""" if isinstance(tensor_names, str): tensor_names = [tensor_names] tensor_in_initializer, tensor_not_in_initializer = [], [] From 4a2281d00dff5df0c1f126b847c989bea40f19c9 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Wed, 10 Jan 2024 18:15:33 +0800 Subject: [PATCH 05/12] update calibration.py Signed-off-by: yuwenzho --- .../adaptor/ox_utils/calibration.py | 157 +++++++++++------- 1 file changed, 97 insertions(+), 60 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 813db137342..178ba361cbd 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -105,7 +105,7 @@ def dataloder_for_next_split_model(self): """Return dataloader for next split model for layer-wise quantization.""" return self._dataloder_for_next_split_model - def augment_graph(self, activation_only=False, weight_only=False): + def augment_graph(self): """Augment_graph. 
Adds nodes to all quantization_candidates op type nodes in model and @@ -118,7 +118,7 @@ def augment_graph(self, activation_only=False, weight_only=False): self.dequantized_output.clear() onnx_version = Version(onnx.__version__) if onnx_version < ONNX18_VERSION: - logger.warning("Static quantization for NLP model is supported " "at onnx 1.8.0 and newer.") + logger.warning("Static quantization for NLP model is supported at onnx 1.8.0 and newer.") if self.already_quantized and any( [i.dims in [1, 2] for i in self.model_wrapper.initializer() if i.name.endswith("_scale")] ): @@ -133,15 +133,12 @@ def augment_graph(self, activation_only=False, weight_only=False): added_nodes = [] added_outputs = [] - - # calibrate initializer tensors (like weight & bias) and output tensors separately tensors_to_dump = set() - initializer_tensors_to_dump = set() for augment_node_type in self.augment_nodes: if augment_node_type not in ["DequantizeLinear"]: # pragma: no cover raise ValueError( - "Unexpected augment_node {} only DequantizeLinear is " "supported".format(augment_node_type) + "Unexpected augment_node {} only DequantizeLinear is supported".format(augment_node_type) ) if self.already_quantized: @@ -149,11 +146,11 @@ def augment_graph(self, activation_only=False, weight_only=False): new_white_nodes = [] for white_node in self.white_nodes: new_white_node = white_node + "_quant" - assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node) + assert new_white_node in model_nodes_names, "no quantized {} in the graph".format(white_node) new_white_nodes.append(new_white_node) self.white_nodes = new_white_nodes - initializers = {i.name: i.data_type for i in model.graph.initializer} + # initializers = {i.name: i.data_type for i in model.graph.initializer} node_outputs = [] for node in model.graph.node: # pylint: disable=no-member node_outputs.extend(node.output) @@ -161,30 +158,42 @@ def augment_graph(self, activation_only=False, weight_only=False): node.name in self.white_nodes ) if should_be_dump: - if not weight_only and not activation_only: - # update input tensors which should be dump - self._update_input_tensor_to_dump( - [input for input in node.input if len(input) != 0], initializer_tensors_to_dump, tensors_to_dump - ) - # update output tensors which should be dump - tensors_to_dump.update([output for output in node.output if len(output) != 0]) - elif weight_only: - for input in node.input: - if ( - self.already_quantized - and input.replace("_dequantized", "_quantized") in initializers - and len(input) != 0 - ): - self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) - elif not self.already_quantized and input in initializers and len(input) != 0: - self._update_input_tensor_to_dump(input, initializer_tensors_to_dump, tensors_to_dump) - elif activation_only: - if len(node.input[0]) != 0: - tensors_to_dump.update([node.input[0]]) - - self.initializer_tensors_to_dump = initializer_tensors_to_dump - model_inputs = [i.name for i in model.graph.input] + # add input tensors which should be dump + for input in node.input: + if len(input) != 0: # to prevent input is "" + initializer_tensor = self.model_wrapper.get_initializer(input) + if initializer_tensor is None: + tensors_to_dump.add(input) + # add output tensors which should be dump + tensors_to_dump.update([output for output in node.output if len(output) != 0]) + + # # calibrate output tensors + # if not weight_only and not activation_only: + # # update input tensors which should be dump 
+ # for input in node.input: + # if len(input) != 0: + # initializer_tensor = self.model_wrapper.get_initializer(input) + # if initializer_tensor is None: + # tensors_to_dump.add(input) + # # update output tensors which should be dump + # tensors_to_dump.update([output for output in node.output if len(output) != 0]) + # elif weight_only: + # for input in node.input: + # if ( + # self.already_quantized + # and input.replace("_dequantized", "_quantized") in initializers + # and len(input) != 0 + # ): + # initializer_tensor = self.model_wrapper.get_initializer(input) + # if initializer_tensor is None: + # tensors_to_dump.add(input) + # elif activation_only: + # if len(node.input[0]) != 0: + # tensors_to_dump.update([node.input[0]]) + model_inputs = [i.name for i in model.graph.input] + logger.debug("tensors to dump:") + logger.debug(tensors_to_dump) for tensor in tensors_to_dump: if tensor not in node_outputs and tensor not in model_inputs: continue @@ -246,8 +255,7 @@ def augment_graph(self, activation_only=False, weight_only=False): convert_attribute=False, ) - def get_intermediate_outputs(self, q_config=None): - """Gather intermediate model outputs after running inference.""" + def get_activation_tensors_calib_range(self, q_config=None): # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL @@ -289,8 +297,7 @@ def get_intermediate_outputs(self, q_config=None): assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) name_to_node[data_name] = node.name - # step 1: calibrate output tensors - output_dicts = {} + activation_tensors_calib_range = {} intermediate_tensor = {} name_to_calibrator = {} ort_inputs_for_next_split_model = [] @@ -304,8 +311,8 @@ def get_intermediate_outputs(self, q_config=None): else: ort_inputs.update({inputs_names[0]: to_numpy(inputs)}) else: + # skip check input length for layer-wise calibration if not self.layer_wise: - # for layer-wise calibration assert len_inputs == len(inputs), "number of input tensors must align with graph inputs" if isinstance(inputs, dict): @@ -345,14 +352,15 @@ def _collect_data(ort_inputs): # per iteration in the future. 
if calibrator.method_name == "minmax": calibrator.collect(output) - output_dicts[node_output_names[output_idx]] = [list(calibrator.calib_range)] + activation_tensors_calib_range[node_output_names[output_idx]] = \ + [list(calibrator.calib_range)] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( output ) elif q_config is None: - output_dicts.setdefault(node_output_names[output_idx], []).append(output) + activation_tensors_calib_range.setdefault(node_output_names[output_idx], []).append(output) if self.layer_wise: # for layer-wise calibration @@ -379,38 +387,67 @@ def _collect_data(ort_inputs): ) calibrator = CALIBRATOR[calib_method]() calibrator.collect(datas) - output_dicts.setdefault(output_name, []).append(list(calibrator.calib_range)) + activation_tensors_calib_range.setdefault(output_name, []).append(list(calibrator.calib_range)) calibrator.clear() del calibrator - # step 2: calibrate initializer tensors (like weight & bias) using minmax method - for initializer_tensor_name in self.initializer_tensors_to_dump: - initializer_tensor = augment_model_wrapper.get_initializer(initializer_tensor_name) + # set for layer-wise quant + self._dataloder_for_next_split_model = ort_inputs_for_next_split_model + + return activation_tensors_calib_range + + def get_weight_tensors_calib_range(self): + initializer_tensors_to_dump = set() + initializers = [init.name for init in self.model.graph.initializer] + for node in self.model.graph.node: # pylint: disable=no-member + should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( + node.name in self.white_nodes + ) + if should_be_dump: + for input in node.input: + if ( + self.already_quantized + and input.replace("_dequantized", "_quantized") in initializers + and len(input) != 0 + ) or ( + not self.already_quantized + and input in initializers + and len(input) != 0 + ): + initializer_tensors_to_dump.add(input) + + logger.debug("initializer tensors to dump:") + logger.debug(initializer_tensors_to_dump) + weight_tensors_calib_range = {} + for initializer_tensor_name in initializer_tensors_to_dump: + initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) + + # double check initializer tensor is not None if initializer_tensor is None: # pragma: no cover continue + initializer_tensor = numpy_helper.to_array(initializer_tensor) - calibrator = CALIBRATOR["minmax"]() + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) - output_dicts[initializer_tensor_name] = [list(calibrator.calib_range)] + weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator - - self._dataloder_for_next_split_model = ort_inputs_for_next_split_model + return weight_tensors_calib_range + + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): + """Gather intermediate model outputs after running inference.""" + + output_dicts = {} + if not activation_only and not weight_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) + output_dicts.update(self.get_weight_tensors_calib_range()) + elif weight_only: + output_dicts = self.get_weight_tensors_calib_range() + elif activation_only: + output_dicts = self.get_activation_tensors_calib_range(q_config) return list(output_dicts.keys()), output_dicts - def 
_update_input_tensor_to_dump(self, tensor_names, initializer_tensors_to_dump, tensors_to_dump): - """Update input tensor to dump according to whether it is in initializer.""" - if isinstance(tensor_names, str): - tensor_names = [tensor_names] - tensor_in_initializer, tensor_not_in_initializer = [], [] - for tensor_name in tensor_names: - initializer_tensor = self.model_wrapper.get_initializer(tensor_name) - if initializer_tensor is None: - tensors_to_dump.update([tensor_name]) - else: - initializer_tensors_to_dump.update([tensor_name]) - def _dequantize(self, tensor, scale_tensor, zo_tensor): """Helper function to dequantize tensor.""" int_tensor = self.model_wrapper.get_initializer(tensor) @@ -587,8 +624,8 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.already_quantized = True self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" - self.augment_graph(activation_only=not weight, weight_only=not activation) - _, output_dicts = self.get_intermediate_outputs() + self.augment_graph() + _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] map_node_weight = {} From 019c6ba482a3302f623ae3ac93f9ca82a60824b9 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 09:54:23 +0800 Subject: [PATCH 06/12] update calibration.py Signed-off-by: yuwenzho --- .../adaptor/ox_utils/calibration.py | 107 ++++++++++-------- 1 file changed, 59 insertions(+), 48 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 178ba361cbd..3866b9df196 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -150,7 +150,6 @@ def augment_graph(self): new_white_nodes.append(new_white_node) self.white_nodes = new_white_nodes - # initializers = {i.name: i.data_type for i in model.graph.initializer} node_outputs = [] for node in model.graph.node: # pylint: disable=no-member node_outputs.extend(node.output) @@ -166,34 +165,8 @@ def augment_graph(self): tensors_to_dump.add(input) # add output tensors which should be dump tensors_to_dump.update([output for output in node.output if len(output) != 0]) - - # # calibrate output tensors - # if not weight_only and not activation_only: - # # update input tensors which should be dump - # for input in node.input: - # if len(input) != 0: - # initializer_tensor = self.model_wrapper.get_initializer(input) - # if initializer_tensor is None: - # tensors_to_dump.add(input) - # # update output tensors which should be dump - # tensors_to_dump.update([output for output in node.output if len(output) != 0]) - # elif weight_only: - # for input in node.input: - # if ( - # self.already_quantized - # and input.replace("_dequantized", "_quantized") in initializers - # and len(input) != 0 - # ): - # initializer_tensor = self.model_wrapper.get_initializer(input) - # if initializer_tensor is None: - # tensors_to_dump.add(input) - # elif activation_only: - # if len(node.input[0]) != 0: - # tensors_to_dump.update([node.input[0]]) model_inputs = [i.name for i in model.graph.input] - logger.debug("tensors to dump:") - logger.debug(tensors_to_dump) for tensor in tensors_to_dump: if tensor not in node_outputs and tensor not in model_inputs: continue @@ -201,7 +174,7 @@ def augment_graph(self): for augment_node_type in 
self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -256,6 +229,14 @@ def augment_graph(self): ) def get_activation_tensors_calib_range(self, q_config=None): + """Get calib ranges of activation tensors. + + Args: + q_config (dict, optional): quantization config. Defaults to None. + + Returns: + dict: calib ranges + """ # conduct inference session and get intermediate outputs so = onnxruntime.SessionOptions() so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL @@ -397,7 +378,26 @@ def _collect_data(ort_inputs): return activation_tensors_calib_range def get_weight_tensors_calib_range(self): - initializer_tensors_to_dump = set() + """Get calib ranges of weight tensors. + + Returns: + dict: calib ranges + """ + model_nodes_names = [node.name for node in self.model.graph.node] + + # if augmented_model is not None, it means self.white_nodes is already updated in augment_graph func + # then skip update here + if self.already_quantized and self.augmented_model is None: + # mapping between fp32 node and int8 node + new_white_nodes = [] + for white_node in self.white_nodes: + new_white_node = white_node + "_quant" + assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node) + new_white_nodes.append(new_white_node) + self.white_nodes = new_white_nodes + + added_outputs = set() + initializer_tensors_to_dump = [] initializers = [init.name for init in self.model.graph.initializer] for node in self.model.graph.node: # pylint: disable=no-member should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or ( @@ -405,19 +405,21 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ( - self.already_quantized - and input.replace("_dequantized", "_quantized") in initializers - and len(input) != 0 - ) or ( - not self.already_quantized - and input in initializers - and len(input) != 0 - ): - initializer_tensors_to_dump.add(input) - - logger.debug("initializer tensors to dump:") - logger.debug(initializer_tensors_to_dump) + if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or + (not self.already_quantized and input in initializers)) and len(input) != 0: + added_outputs.add(input) + + for tensor in added_outputs: + if tensor not in initializers: + continue + if self.augment_nodes: + for augment_node_type in self.augment_nodes: + if augment_node_type in ["DequantizeLinear"]: + if not (tensor.endswith("_scale") or tensor.endswith("_zero_point")): + initializer_tensors_to_dump.append(tensor) + else: + initializer_tensors_to_dump.append(tensor) + weight_tensors_calib_range = {} for initializer_tensor_name in initializer_tensors_to_dump: initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) @@ -436,7 +438,6 @@ def get_weight_tensors_calib_range(self): def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" - output_dicts = {} if not activation_only and not weight_only: output_dicts = self.get_activation_tensors_calib_range(q_config) @@ -543,7 +544,12 @@ def _map_calibration(self, node_output_names, output_dicts): return final_dict def dump_minmax(self, 
q_config): - """Get min/max values of tensors.""" + """Get calib ranges of tensors.""" + # pipeline of getting calib ranges of tensors during calibration: + # 1. augment_graph(): insert activation tensors to model output + # 2. get_intermediate_outputs(): + # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augmnet graph + # 2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors self.augment_graph() node_output_names, output_dicts = self.get_intermediate_outputs(q_config) return self._map_calibration(node_output_names, output_dicts) @@ -624,15 +630,20 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.already_quantized = True self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" - self.augment_graph() + if activation: + self.augment_graph(inspect_tensor=True) # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] map_node_weight = {} self.white_nodes = [node.replace("_quant", "") for node in self.white_nodes] - augmengted_wrapper = ONNXModel(self.augmented_model) - map_output = augmengted_wrapper.output_name_to_node - map_input = augmengted_wrapper.input_name_to_nodes + + if activation and self.augmented_model is None: + raise ValueError("augmented model should not be None when dump activation tensors.") + # if activation tensors are not dumped, then use origin model wrapper + model_wrapper = ONNXModel(self.augmented_model) if activation else self.model_wrapper + map_output = model_wrapper.output_name_to_node + map_input = model_wrapper.input_name_to_nodes model_output_names = [t.name for t in self.model.graph.output] model_input_names = [t.name for t in self.model.graph.input] model_initializer_names = [t.name for t in self.model.graph.initializer] From b0f980377692d7ce7a0f8aa46805ebaa854a2b5a Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 10:26:11 +0800 Subject: [PATCH 07/12] update calibration.py Signed-off-by: yuwenzho --- neural_compressor/adaptor/ox_utils/calibration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 3866b9df196..a57138089e9 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -548,7 +548,7 @@ def dump_minmax(self, q_config): # pipeline of getting calib ranges of tensors during calibration: # 1. augment_graph(): insert activation tensors to model output # 2. 
get_intermediate_outputs(): - # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augmnet graph + # 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augment graph # 2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors self.augment_graph() node_output_names, output_dicts = self.get_intermediate_outputs(q_config) @@ -631,7 +631,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph(inspect_tensor=True) # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] From 862fbbd153c5ea372042b0dedd2819942a4c27a8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 02:27:34 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/ox_utils/calibration.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index a57138089e9..7c14f2b65f0 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -159,7 +159,7 @@ def augment_graph(self): if should_be_dump: # add input tensors which should be dump for input in node.input: - if len(input) != 0: # to prevent input is "" + if len(input) != 0: # to prevent input is "" initializer_tensor = self.model_wrapper.get_initializer(input) if initializer_tensor is None: tensors_to_dump.add(input) @@ -174,7 +174,7 @@ def augment_graph(self): for augment_node_type in self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -333,8 +333,9 @@ def _collect_data(ort_inputs): # per iteration in the future. if calibrator.method_name == "minmax": calibrator.collect(output) - activation_tensors_calib_range[node_output_names[output_idx]] = \ - [list(calibrator.calib_range)] + activation_tensors_calib_range[node_output_names[output_idx]] = [ + list(calibrator.calib_range) + ] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( @@ -376,7 +377,7 @@ def _collect_data(ort_inputs): self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return activation_tensors_calib_range - + def get_weight_tensors_calib_range(self): """Get calib ranges of weight tensors. 
@@ -405,8 +406,10 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or - (not self.already_quantized and input in initializers)) and len(input) != 0: + if ( + (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) + or (not self.already_quantized and input in initializers) + ) and len(input) != 0: added_outputs.add(input) for tensor in added_outputs: @@ -429,13 +432,13 @@ def get_weight_tensors_calib_range(self): continue initializer_tensor = numpy_helper.to_array(initializer_tensor) - calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator return weight_tensors_calib_range - + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" output_dicts = {} @@ -631,7 +634,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph() # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] From ae681700acdc0a74a5c7d521b53e624d3bc2ab8b Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 17:30:18 +0800 Subject: [PATCH 09/12] fix bug Signed-off-by: yuwenzho --- .../adaptor/ox_utils/calibration.py | 28 +++++++++---------- .../onnxrt_adaptor/test_onnxrt_augment.py | 23 ++++++++------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 7c14f2b65f0..fa2773da3e1 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -159,7 +159,7 @@ def augment_graph(self): if should_be_dump: # add input tensors which should be dump for input in node.input: - if len(input) != 0: # to prevent input is "" + if len(input) != 0: # to prevent input is "" initializer_tensor = self.model_wrapper.get_initializer(input) if initializer_tensor is None: tensors_to_dump.add(input) @@ -174,7 +174,7 @@ def augment_graph(self): for augment_node_type in self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -333,9 +333,8 @@ def _collect_data(ort_inputs): # per iteration in the future. 
if calibrator.method_name == "minmax": calibrator.collect(output) - activation_tensors_calib_range[node_output_names[output_idx]] = [ - list(calibrator.calib_range) - ] + activation_tensors_calib_range[node_output_names[output_idx]] = \ + [list(calibrator.calib_range)] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( @@ -377,7 +376,7 @@ def _collect_data(ort_inputs): self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return activation_tensors_calib_range - + def get_weight_tensors_calib_range(self): """Get calib ranges of weight tensors. @@ -406,10 +405,8 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ( - (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) - or (not self.already_quantized and input in initializers) - ) and len(input) != 0: + if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or + (not self.already_quantized and input in initializers)) and len(input) != 0: added_outputs.add(input) for tensor in added_outputs: @@ -425,20 +422,23 @@ def get_weight_tensors_calib_range(self): weight_tensors_calib_range = {} for initializer_tensor_name in initializer_tensors_to_dump: + if self.layer_wise: + self.model_wrapper.load_model_initializer_by_tensor() initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name) # double check initializer tensor is not None if initializer_tensor is None: # pragma: no cover continue - initializer_tensor = numpy_helper.to_array(initializer_tensor) - calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors + initializer_tensor = numpy_helper.to_array(initializer_tensor, + base_dir=os.path.dirname(self.model_wrapper.model_path)) + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator return weight_tensors_calib_range - + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" output_dicts = {} @@ -634,7 +634,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph() # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] diff --git a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py index 41b668f182a..9c342f05c15 100644 --- a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py +++ b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py @@ -330,7 +330,7 @@ def test_augment_graph(self): attn_output_scale = generate_input_initializer([1], np.float32, "attn_output_scale") Q_zo = helper.make_tensor_value_info("attn_output_zero_point", TensorProto.INT8, [1]) attn_output_zero_point = generate_input_initializer([1], np.int8, "attn_output_zero_point") - Output = helper.make_tensor_value_info("output", 
TensorProto.INT8, [13, 7]) + Output = helper.make_tensor_value_info("attn_output_quantized", TensorProto.INT8, [13, 7]) attention_node = onnx.helper.make_node( "QAttention", [ @@ -386,15 +386,17 @@ def test_augment_graph(self): augment.augment_nodes = ["DequantizeLinear"] augment.already_quantized = True - augment.augment_graph(activation_only=True, weight_only=False) + augment.augment_graph() augmented_model = augment.augmented_model augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output in augmented_model.graph.output] - added_node_names = ["attention_quant", "attn_output_QuantizeLinear"] - added_outputs = ["input_quantized_output", "output"] + added_node_names = ['attention_quant', + 'attn_output_QuantizeLinear', + 'input_quantized_DequantizeLinear'] + added_outputs = ['attn_output_quantized', 'input_quantized_output', 'attn_output'] self.assertEqual(len(augmented_model_node_names), 3) - self.assertEqual(len(augmented_model_outputs), 2) + self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: self.assertTrue(name in augmented_model_node_names) for output in added_outputs: @@ -470,15 +472,16 @@ def test_augment_graph(self): augment = ONNXRTAugment(ONNXModel(model), data_reader, [], white_nodes=["conv"]) augment.augment_nodes = ["DequantizeLinear"] augment.already_quantized = True - augment.augment_graph(activation_only=True, weight_only=False) + augment.augment_graph() augmented_model = augment.augmented_model augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output in augmented_model.graph.output] - added_node_names = ["A_QuantizeLinear", "conv_quant", "D_DequantizeLinear", "A_quantized_DequantizeLinear"] - added_outputs = ["D", "A_quantized_output"] - self.assertEqual(len(augmented_model_node_names), 4) - self.assertEqual(len(augmented_model_outputs), 2) + added_node_names = ['A_QuantizeLinear', 'conv_quant', 'D_DequantizeLinear', + 'D_quantized_DequantizeLinear', 'A_quantized_DequantizeLinear'] + added_outputs = ['D', 'D_quantized_output', 'A_quantized_output'] + self.assertEqual(len(augmented_model_node_names), 5) + self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: self.assertTrue(name in augmented_model_node_names) for output in added_outputs: From 98e4daab7f057a978eca4ff2f17085ce5ce0c785 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 09:31:42 +0000 Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/ox_utils/calibration.py | 28 +++++++++++-------- .../onnxrt_adaptor/test_onnxrt_augment.py | 17 ++++++----- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index fa2773da3e1..f3ae82c3cfe 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -159,7 +159,7 @@ def augment_graph(self): if should_be_dump: # add input tensors which should be dump for input in node.input: - if len(input) != 0: # to prevent input is "" + if len(input) != 0: # to prevent input is "" initializer_tensor = self.model_wrapper.get_initializer(input) if initializer_tensor is None: tensors_to_dump.add(input) @@ -174,7 +174,7 @@ def augment_graph(self): for 
augment_node_type in self.augment_nodes: if augment_node_type in ["DequantizeLinear"]: # insert DequantizeLinear node as output - if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover + if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover continue if not self.dynamically_quantized: @@ -333,8 +333,9 @@ def _collect_data(ort_inputs): # per iteration in the future. if calibrator.method_name == "minmax": calibrator.collect(output) - activation_tensors_calib_range[node_output_names[output_idx]] = \ - [list(calibrator.calib_range)] + activation_tensors_calib_range[node_output_names[output_idx]] = [ + list(calibrator.calib_range) + ] name_to_calibrator[node_output_names[output_idx]] = calibrator else: intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append( @@ -376,7 +377,7 @@ def _collect_data(ort_inputs): self._dataloder_for_next_split_model = ort_inputs_for_next_split_model return activation_tensors_calib_range - + def get_weight_tensors_calib_range(self): """Get calib ranges of weight tensors. @@ -405,8 +406,10 @@ def get_weight_tensors_calib_range(self): ) if should_be_dump: for input in node.input: - if ((self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) or - (not self.already_quantized and input in initializers)) and len(input) != 0: + if ( + (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers) + or (not self.already_quantized and input in initializers) + ) and len(input) != 0: added_outputs.add(input) for tensor in added_outputs: @@ -430,15 +433,16 @@ def get_weight_tensors_calib_range(self): if initializer_tensor is None: # pragma: no cover continue - initializer_tensor = numpy_helper.to_array(initializer_tensor, - base_dir=os.path.dirname(self.model_wrapper.model_path)) - calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors + initializer_tensor = numpy_helper.to_array( + initializer_tensor, base_dir=os.path.dirname(self.model_wrapper.model_path) + ) + calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] calibrator.clear() del calibrator return weight_tensors_calib_range - + def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False): """Gather intermediate model outputs after running inference.""" output_dicts = {} @@ -634,7 +638,7 @@ def dump_tensor(self, activation=True, weight=False, format=None): self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node] is_qdq = format == "qdq" if activation: - self.augment_graph() # add activation tensors to model output + self.augment_graph() # add activation tensors to model output _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation) iters = len(list(output_dicts.values())[-1]) map_node_activation = [{} for _ in range(iters)] diff --git a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py index 9c342f05c15..382efeaab6c 100644 --- a/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py +++ b/test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py @@ -391,10 +391,8 @@ def test_augment_graph(self): augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output 
in augmented_model.graph.output] - added_node_names = ['attention_quant', - 'attn_output_QuantizeLinear', - 'input_quantized_DequantizeLinear'] - added_outputs = ['attn_output_quantized', 'input_quantized_output', 'attn_output'] + added_node_names = ["attention_quant", "attn_output_QuantizeLinear", "input_quantized_DequantizeLinear"] + added_outputs = ["attn_output_quantized", "input_quantized_output", "attn_output"] self.assertEqual(len(augmented_model_node_names), 3) self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: @@ -477,9 +475,14 @@ def test_augment_graph(self): augmented_model_node_names = [node.name for node in augmented_model.graph.node] augmented_model_outputs = [output.name for output in augmented_model.graph.output] - added_node_names = ['A_QuantizeLinear', 'conv_quant', 'D_DequantizeLinear', - 'D_quantized_DequantizeLinear', 'A_quantized_DequantizeLinear'] - added_outputs = ['D', 'D_quantized_output', 'A_quantized_output'] + added_node_names = [ + "A_QuantizeLinear", + "conv_quant", + "D_DequantizeLinear", + "D_quantized_DequantizeLinear", + "A_quantized_DequantizeLinear", + ] + added_outputs = ["D", "D_quantized_output", "A_quantized_output"] self.assertEqual(len(augmented_model_node_names), 5) self.assertEqual(len(augmented_model_outputs), 3) for name in added_node_names: From 602c8f0d36dbc2a5a898cf766c3dcd4445025830 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 12 Jan 2024 18:08:14 +0800 Subject: [PATCH 11/12] update calibration.py Signed-off-by: yuwenzho --- neural_compressor/adaptor/ox_utils/calibration.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index f3ae82c3cfe..0b419a3b864 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -434,8 +434,9 @@ def get_weight_tensors_calib_range(self): continue initializer_tensor = numpy_helper.to_array( - initializer_tensor, base_dir=os.path.dirname(self.model_wrapper.model_path) - ) + initializer_tensor, + base_dir=os.path.dirname(self.model_wrapper.model_path) \ + if self.model_wrapper.model_path is not None else "") calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)] From 0ed29588c366093e7254a18f83755171b6163280 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 10:09:37 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/adaptor/ox_utils/calibration.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 0b419a3b864..f56128e79ff 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -434,9 +434,11 @@ def get_weight_tensors_calib_range(self): continue initializer_tensor = numpy_helper.to_array( - initializer_tensor, - base_dir=os.path.dirname(self.model_wrapper.model_path) \ - if self.model_wrapper.model_path is not None else "") + initializer_tensor, + base_dir=os.path.dirname(self.model_wrapper.model_path) + if self.model_wrapper.model_path is not None + else "", + ) 
calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors calibrator.collect(initializer_tensor) weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)]
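Note on the combined effect of this series: calibration no longer silently falls back from DmlExecutionProvider to CPUExecutionProvider (the backend=self.backend change in onnxrt.py), and range collection is split into two steps. Activation ranges are still gathered by running the augmented model, with graph optimizations disabled (ORT_DISABLE_ALL) so the dumped intermediate tensors are not folded away. Weight and bias ranges are instead computed directly from the graph initializers with the minmax calibrator, so they never pass through model outputs, which is the part that previously failed on the DML execution provider. Below is a minimal standalone sketch of the weight-side step, assuming a plain onnx model; the function name and the dump_op_types default are illustrative, and the real logic lives in ONNXRTAugment.get_weight_tensors_calib_range in neural_compressor/adaptor/ox_utils/calibration.py:

    import onnx
    from onnx import numpy_helper

    def weight_calib_ranges(model_path, dump_op_types=("Conv", "MatMul", "Gemm")):
        """Collect a [min, max] range per initializer without running inference."""
        model = onnx.load(model_path)
        initializers = {init.name: init for init in model.graph.initializer}
        ranges = {}
        for node in model.graph.node:
            if node.op_type not in dump_op_types:
                continue
            for name in node.input:
                if name in initializers and name not in ranges:
                    # Models stored with external data need a base_dir argument here,
                    # which patches 11/12 derive from the model_path directory.
                    arr = numpy_helper.to_array(initializers[name])
                    if arr.size == 0:
                        continue
                    # "minmax" calibration of a constant tensor is just its extrema.
                    ranges[name] = [float(arr.min()), float(arr.max())]
        return ranges

Since initializer ranges are data-independent, this step runs once per model and needs no dataloader; only the activation step consumes calibration batches.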