@@ -105,7 +105,7 @@ def dataloder_for_next_split_model(self):
         """Return dataloader for next split model for layer-wise quantization."""
         return self._dataloder_for_next_split_model
 
-    def augment_graph(self, activation_only=False, weight_only=False):
+    def augment_graph(self):
         """Augment_graph.
 
         Adds nodes to all quantization_candidates op type nodes in model and
@@ -118,7 +118,7 @@ def augment_graph(self, activation_only=False, weight_only=False):
         self.dequantized_output.clear()
         onnx_version = Version(onnx.__version__)
         if onnx_version < ONNX18_VERSION:
-            logger.warning("Static quantization for NLP model is supported " "at onnx 1.8.0 and newer.")
+            logger.warning("Static quantization for NLP model is supported at onnx 1.8.0 and newer.")
         if self.already_quantized and any(
             [i.dims in [1, 2] for i in self.model_wrapper.initializer() if i.name.endswith("_scale")]
         ):
@@ -138,53 +138,43 @@ def augment_graph(self, activation_only=False, weight_only=False):
         for augment_node_type in self.augment_nodes:
             if augment_node_type not in ["DequantizeLinear"]:  # pragma: no cover
                 raise ValueError(
-                    "Unexpected augment_node {} only DequantizeLinear is " "supported".format(augment_node_type)
+                    "Unexpected augment_node {} only DequantizeLinear is supported".format(augment_node_type)
                 )
 
         if self.already_quantized:
             # mapping between fp32 node and int8 node
             new_white_nodes = []
             for white_node in self.white_nodes:
                 new_white_node = white_node + "_quant"
-                assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node)
+                assert new_white_node in model_nodes_names, "no quantized {} in the graph".format(white_node)
                 new_white_nodes.append(new_white_node)
             self.white_nodes = new_white_nodes
 
-        initializers = {i.name: i.data_type for i in model.graph.initializer}
         node_outputs = []
         for node in model.graph.node:  # pylint: disable=no-member
             node_outputs.extend(node.output)
             should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or (
                 node.name in self.white_nodes
             )
             if should_be_dump:
-                if not weight_only and not activation_only:
-                    tensors_to_dump.update([input for input in node.input if len(input) != 0])
-                    tensors_to_dump.update([output for output in node.output if len(output) != 0])
-                    tensors_to_dump.update(node.output)
-                elif weight_only:
-                    for input in node.input:
-                        if (
-                            self.already_quantized
-                            and input.replace("_dequantized", "_quantized") in initializers
-                            and len(input) != 0
-                        ):
-                            tensors_to_dump.add(input)
-                        elif not self.already_quantized and input in initializers and len(input) != 0:
+                # add input tensors which should be dumped
+                for input in node.input:
+                    if len(input) != 0:  # guard against empty ("") input names
+                        initializer_tensor = self.model_wrapper.get_initializer(input)
+                        if initializer_tensor is None:
                             tensors_to_dump.add(input)
-                elif activation_only:
-                    if len(node.input[0]) != 0:
-                        tensors_to_dump.update([node.input[0]])
+                # add output tensors which should be dumped
+                tensors_to_dump.update([output for output in node.output if len(output) != 0])
 
         model_inputs = [i.name for i in model.graph.input]
         for tensor in tensors_to_dump:
-            if tensor not in node_outputs and tensor not in initializers and tensor not in model_inputs:
+            if tensor not in node_outputs and tensor not in model_inputs:
                 continue
             if self.augment_nodes:
                 for augment_node_type in self.augment_nodes:
                     if augment_node_type in ["DequantizeLinear"]:
                         # insert DequantizeLinear node as output
-                        if tensor.endswith("_scale") or tensor.endswith("_zero_point"):
+                        if tensor.endswith("_scale") or tensor.endswith("_zero_point"):  # pragma: no cover
                             continue
 
                         if not self.dynamically_quantized:
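
The three activation_only/weight_only branches removed above collapse into a single rule: dump every non-empty node input that is not an initializer, plus every non-empty node output. A minimal standalone sketch of that rule, assuming a plain onnx.ModelProto and substituting a name set for the project's model_wrapper.get_initializer() lookup:

import onnx

def select_tensors_to_dump(model: onnx.ModelProto, dump_op_types, black_nodes, white_nodes):
    # the name set stands in for self.model_wrapper.get_initializer(...) is None checks
    initializer_names = {init.name for init in model.graph.initializer}
    tensors_to_dump = set()
    for node in model.graph.node:
        should_be_dump = (node.op_type in dump_op_types and node.name not in black_nodes) or (
            node.name in white_nodes
        )
        if not should_be_dump:
            continue
        # inputs: keep only activation tensors (skip "" placeholders and initializers)
        for inp in node.input:
            if len(inp) != 0 and inp not in initializer_names:
                tensors_to_dump.add(inp)
        # outputs: keep every non-empty output
        tensors_to_dump.update(out for out in node.output if len(out) != 0)
    return tensors_to_dump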
@@ -238,10 +228,18 @@ def augment_graph(self, activation_only=False, weight_only=False):
             convert_attribute=False,
         )
 
-    def get_intermediate_outputs(self, q_config=None):
-        """Gather intermediate model outputs after running inference."""
+    def get_activation_tensors_calib_range(self, q_config=None):
+        """Get calib ranges of activation tensors.
+
+        Args:
+            q_config (dict, optional): quantization config. Defaults to None.
+
+        Returns:
+            dict: calib ranges
+        """
         # conduct inference session and get intermediate outputs
         so = onnxruntime.SessionOptions()
+        so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
         if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"):  # pragma: no cover
             from onnxruntime_extensions import get_library_path
 
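
The newly added graph_optimization_level line is likely load-bearing: with optimizations enabled, ONNX Runtime may fuse or fold nodes, so tensors the augmented graph exposes as extra outputs might no longer exist at run time. A minimal sketch of building a session this way ("augmented_model.onnx" is a placeholder path, not a file from the project):

import onnxruntime

so = onnxruntime.SessionOptions()
# keep the graph exactly as augmented so every dumped tensor stays observable
so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
session = onnxruntime.InferenceSession("augmented_model.onnx", so, providers=["CPUExecutionProvider"])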
@@ -280,7 +278,7 @@ def get_intermediate_outputs(self, q_config=None):
             assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name)
             name_to_node[data_name] = node.name
 
-        output_dicts = {}
+        activation_tensors_calib_range = {}
         intermediate_tensor = {}
         name_to_calibrator = {}
         ort_inputs_for_next_split_model = []
@@ -294,8 +292,8 @@ def get_intermediate_outputs(self, q_config=None):
                 else:
                     ort_inputs.update({inputs_names[0]: to_numpy(inputs)})
             else:
+                # skip the input-length check for layer-wise calibration
                 if not self.layer_wise:
-                    # for layer-wise calibration
                     assert len_inputs == len(inputs), "number of input tensors must align with graph inputs"
 
                 if isinstance(inputs, dict):
@@ -335,14 +333,16 @@ def _collect_data(ort_inputs):
                     # per iteration in the future.
                     if calibrator.method_name == "minmax":
                         calibrator.collect(output)
-                        output_dicts[node_output_names[output_idx]] = [list(calibrator.calib_range)]
+                        activation_tensors_calib_range[node_output_names[output_idx]] = [
+                            list(calibrator.calib_range)
+                        ]
                         name_to_calibrator[node_output_names[output_idx]] = calibrator
                     else:
                         intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append(
                             output
                         )
                 elif q_config is None:
-                    output_dicts.setdefault(node_output_names[output_idx], []).append(output)
+                    activation_tensors_calib_range.setdefault(node_output_names[output_idx], []).append(output)
 
             if self.layer_wise:
                 # for layer-wise calibration
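
The loop above only assumes a calibrator object exposing collect(), calib_range, and clear(); for the minmax method the range folds in batch by batch, which is why just [list(calibrator.calib_range)] is stored instead of the raw outputs. An illustrative stand-in for that contract (the real CALIBRATOR registry lives elsewhere in the project):

import numpy as np

class MinMaxCalibrator:
    # toy calibrator keeping a running (min, max) over collected batches
    def __init__(self):
        self._min = None
        self._max = None

    def collect(self, data):
        batch_min, batch_max = float(np.min(data)), float(np.max(data))
        self._min = batch_min if self._min is None else min(self._min, batch_min)
        self._max = batch_max if self._max is None else max(self._max, batch_max)

    @property
    def calib_range(self):
        return (self._min, self._max)

    def clear(self):
        self._min = None
        self._max = None

c = MinMaxCalibrator()
c.collect(np.array([0.125, 0.875]))  # first batch
c.collect(np.array([-0.5, 0.25]))    # a later batch can only widen the range
assert c.calib_range == (-0.5, 0.875)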
@@ -369,12 +369,94 @@ def _collect_data(ort_inputs):
                 )
                 calibrator = CALIBRATOR[calib_method]()
                 calibrator.collect(datas)
-                output_dicts.setdefault(output_name, []).append(list(calibrator.calib_range))
+                activation_tensors_calib_range.setdefault(output_name, []).append(list(calibrator.calib_range))
                 calibrator.clear()
                 del calibrator
 
+        # set for layer-wise quant
         self._dataloder_for_next_split_model = ort_inputs_for_next_split_model
 
+        return activation_tensors_calib_range
+
+    def get_weight_tensors_calib_range(self):
+        """Get calib ranges of weight tensors.
+
+        Returns:
+            dict: calib ranges
+        """
+        model_nodes_names = [node.name for node in self.model.graph.node]
+
+        # if augmented_model is not None, self.white_nodes was already updated in augment_graph,
+        # so skip the update here
+        if self.already_quantized and self.augmented_model is None:
+            # mapping between fp32 node and int8 node
+            new_white_nodes = []
+            for white_node in self.white_nodes:
+                new_white_node = white_node + "_quant"
+                assert new_white_node in model_nodes_names, "no quantized {} in the graph".format(white_node)
+                new_white_nodes.append(new_white_node)
+            self.white_nodes = new_white_nodes
+
+        added_outputs = set()
+        initializer_tensors_to_dump = []
+        initializers = [init.name for init in self.model.graph.initializer]
+        for node in self.model.graph.node:  # pylint: disable=no-member
+            should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or (
+                node.name in self.white_nodes
+            )
+            if should_be_dump:
+                for input in node.input:
+                    if (
+                        (self.already_quantized and input.replace("_dequantized", "_quantized") in initializers)
+                        or (not self.already_quantized and input in initializers)
+                    ) and len(input) != 0:
+                        added_outputs.add(input)
+
+        for tensor in added_outputs:
+            if tensor not in initializers:
+                continue
+            if self.augment_nodes:
+                for augment_node_type in self.augment_nodes:
+                    if augment_node_type in ["DequantizeLinear"]:
+                        if not (tensor.endswith("_scale") or tensor.endswith("_zero_point")):
+                            initializer_tensors_to_dump.append(tensor)
+            else:
+                initializer_tensors_to_dump.append(tensor)
+
+        weight_tensors_calib_range = {}
+        for initializer_tensor_name in initializer_tensors_to_dump:
+            if self.layer_wise:
+                self.model_wrapper.load_model_initializer_by_tensor()
+            initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name)
+
+            # double check that the initializer tensor is not None
+            if initializer_tensor is None:  # pragma: no cover
+                continue
+
+            initializer_tensor = numpy_helper.to_array(
+                initializer_tensor,
+                base_dir=os.path.dirname(self.model_wrapper.model_path)
+                if self.model_wrapper.model_path is not None
+                else "",
+            )
+            calibrator = CALIBRATOR["minmax"]()  # use minmax method to calibrate initializer tensors
+            calibrator.collect(initializer_tensor)
+            weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)]
+            calibrator.clear()
+            del calibrator
+        return weight_tensors_calib_range
+
+    def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False):
+        """Gather intermediate model outputs after running inference."""
+        output_dicts = {}
+        if not activation_only and not weight_only:
+            output_dicts = self.get_activation_tensors_calib_range(q_config)
+            output_dicts.update(self.get_weight_tensors_calib_range())
+        elif weight_only:
+            output_dicts = self.get_weight_tensors_calib_range()
+        elif activation_only:
+            output_dicts = self.get_activation_tensors_calib_range(q_config)
+
         return list(output_dicts.keys()), output_dicts
 
     def _dequantize(self, tensor, scale_tensor, zo_tensor):
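
The new get_weight_tensors_calib_range needs no inference at all: weights are graph initializers, so their ranges come straight from the stored tensor data. A standalone sketch of that idea for a single weight (weight_range() is a hypothetical helper, not the project's API):

import numpy as np
from onnx import numpy_helper

def weight_range(initializer):
    # decode the initializer and take its min/max, as the minmax calibrator would
    arr = numpy_helper.to_array(initializer)
    return [float(np.min(arr)), float(np.max(arr))]

w = numpy_helper.from_array(np.array([[-0.25, 0.125], [0.5, 0.0]], dtype=np.float32), name="conv.weight")
assert weight_range(w) == [-0.25, 0.5]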
@@ -472,7 +554,12 @@ def _map_calibration(self, node_output_names, output_dicts):
         return final_dict
 
     def dump_minmax(self, q_config):
-        """Get min/max values of tensors."""
+        """Get calib ranges of tensors."""
+        # pipeline for getting calib ranges of tensors during calibration:
+        # 1. augment_graph(): insert activation tensors into the model output
+        # 2. get_intermediate_outputs():
+        #    2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augmented graph
+        #    2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors
         self.augment_graph()
         node_output_names, output_dicts = self.get_intermediate_outputs(q_config)
         return self._map_calibration(node_output_names, output_dicts)
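
Since both getters return plain dicts keyed by tensor name, the default mode of the new get_intermediate_outputs is effectively a dict merge with weight entries applied last. A hedged sketch of the dump_minmax orchestration with the class methods stubbed as callables (all names here are illustrative):

def dump_minmax_sketch(augment_graph, get_activation_ranges, get_weight_ranges, q_config=None):
    augment_graph()                           # 1. expose activations as extra model outputs
    ranges = get_activation_ranges(q_config)  # 2.1 run the augmented model over calibration data
    ranges.update(get_weight_ranges())        # 2.2 read ranges straight off the initializers
    return ranges

ranges = dump_minmax_sketch(
    augment_graph=lambda: None,
    get_activation_ranges=lambda q: {"relu_out": [[0.0, 6.0]]},
    get_weight_ranges=lambda: {"conv.weight": [[-0.25, 0.5]]},
)
assert set(ranges) == {"relu_out", "conv.weight"}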
@@ -553,15 +640,20 @@ def dump_tensor(self, activation=True, weight=False, format=None):
         self.already_quantized = True
         self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node]
         is_qdq = format == "qdq"
-        self.augment_graph(activation_only=not weight, weight_only=not activation)
-        _, output_dicts = self.get_intermediate_outputs()
+        if activation:
+            self.augment_graph()  # add activation tensors to model output
+        _, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation)
         iters = len(list(output_dicts.values())[-1])
         map_node_activation = [{} for _ in range(iters)]
         map_node_weight = {}
         self.white_nodes = [node.replace("_quant", "") for node in self.white_nodes]
-        augmengted_wrapper = ONNXModel(self.augmented_model)
-        map_output = augmengted_wrapper.output_name_to_node
-        map_input = augmengted_wrapper.input_name_to_nodes
+
+        if activation and self.augmented_model is None:
+            raise ValueError("augmented model should not be None when dumping activation tensors.")
+        # if activation tensors are not dumped, use the original model wrapper
+        model_wrapper = ONNXModel(self.augmented_model) if activation else self.model_wrapper
+        map_output = model_wrapper.output_name_to_node
+        map_input = model_wrapper.input_name_to_nodes
         model_output_names = [t.name for t in self.model.graph.output]
         model_input_names = [t.name for t in self.model.graph.input]
         model_initializer_names = [t.name for t in self.model.graph.initializer]
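
Note how dump_tensor forwards its flags inverted: activation_only=not weight and weight_only=not activation, so requesting both kinds of tensors makes both *_only flags False and dumps everything. A tiny sketch of that mapping (dump_mode() is a hypothetical illustration, not project code):

def dump_mode(activation: bool, weight: bool) -> str:
    # mirrors the get_intermediate_outputs call in dump_tensor above
    activation_only, weight_only = not weight, not activation
    if not activation_only and not weight_only:
        return "activations + weights"
    return "weights only" if weight_only else "activations only"

assert dump_mode(activation=True, weight=True) == "activations + weights"
assert dump_mode(activation=True, weight=False) == "activations only"
assert dump_mode(activation=False, weight=True) == "weights only"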