 from neural_compressor.adaptor.ox_utils.util import ONNXRT_BACKENDS, PROVIDERS, to_numpy
 from neural_compressor.adaptor.query import QueryBackendCapability
 from neural_compressor.data.dataloaders.base_dataloader import BaseDataLoader
+from neural_compressor.model.onnx_model import ONNXModel
 from neural_compressor.utils.utility import GLOBAL_STATE, MODE, CpuInfo, LazyImport, Statistics, dump_elapsed_time

 onnx = LazyImport("onnx")
@@ -267,8 +268,6 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
         ):  # pragma: no cover
             from onnx import version_converter

-            from neural_compressor.model.onnx_model import ONNXModel
-
             try:
                 model = self._rename_node(ONNXModel(version_converter.convert_version(model.model, 15)))
             except:
@@ -308,18 +307,146 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):

         iterations = tune_cfg.get("calib_iteration", 1)
         calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
-        if not self.dynamic:
-            calib_iterations = self._reset_calib_iter(data_loader, calib_sampling_size, iterations)
-            quantize_params = self._get_quantize_params(tmp_model, data_loader, quantize_config, calib_iterations)
+
+        if self.recipes.get("layer_wise_quant", False) and not self.dynamic:
+            # layer-wise quantization
+            # details refer to docs/source/quantization_weight_only.md#layer-wise-quantization
+            _model_to_split = copy.deepcopy(tmp_model)
+
+            split_nodes = _model_to_split.find_split_nodes()
+            logger.info(
+                "Will split model into {} parts to do layer-wise quantization".format(
+                    len([node.name for node in split_nodes]) + 1
+                )
+            )
+            logger.debug(
+                "Will split model with these nodes for layer-wise quantization: {}".format(
+                    [node.name for node in split_nodes]
+                )
+            )
+
+            split_idx = 1
+            model_to_split = [_model_to_split]
+            dataloader_for_split_model = [data_loader]
+            quantize_params = {}
+            quantized_model_merged = None
+
+            while len(model_to_split) != 0:
+                split_model = model_to_split.pop(0)
+                split_node = split_nodes.pop(0)
+                save_both_split_models = True if len(split_nodes) == 0 else False
+                shape_infer = True if split_idx == 1 else False
+
+                # split model with given split_node
+                split_model_part_1, split_model_part_2 = split_model.split_model_with_node(
+                    split_node.name, tmp_model.model_path, shape_infer, save_both_split_models
+                )
+                if not save_both_split_models:
+                    # append split_model_part_2 to do next split
+                    model_to_split.append(split_model_part_2)
+
+                logger.info("Quantize split model {}".format(split_idx))
+                # get quantize params of split model
+                split_quantize_params, dataloder_for_next_split_model = self._get_split_model_quantize_params(
+                    split_model_part_1, dataloader_for_split_model, quantize_config, calib_sampling_size, iterations
+                )
+                dataloader_for_split_model.append(dataloder_for_next_split_model)
+                quantize_params.update(split_quantize_params)
+
+                # quantize split model
+                quantized_model_merged = self._quantize_split_model(
+                    split_model_part_1, quantize_config, split_quantize_params, quantized_model_merged
+                )
+
+                split_idx += 1
+
+                # if this is the last split, then quantize the last split model
+                if save_both_split_models:
+                    logger.info("Quantize split model {}".format(split_idx))
+                    # get quantize params of split model
+                    split_quantize_params, dataloder_for_next_split_model = self._get_split_model_quantize_params(
+                        split_model_part_2, dataloader_for_split_model, quantize_config, calib_sampling_size, iterations
+                    )
+                    quantize_params.update(split_quantize_params)
+
+                    # quantize split model
+                    quantized_model_merged = self._quantize_split_model(
+                        split_model_part_2, quantize_config, split_quantize_params, quantized_model_merged
+                    )
+                    quantized_model_merged.re_org_output(tmp_model.output())  # re-org output as the origin output
+
+            self.quantize_params = quantize_params
+            tmp_model.q_config = self._generate_qconfig(model.model, tune_cfg, quantize_params)
+            tmp_model.model = quantized_model_merged.model
+            self.quantize_config = quantize_config  # update so other methods can know current configs
+            self._dump_model_op_stats(tmp_model)
+            tmp_model.topological_sort()
+            tmp_model.check_is_large_model()
+            return tmp_model
+
         else:
-            quantize_params = None
-        self.quantize_params = quantize_params
+            if not self.dynamic:
+                calib_iterations = self._reset_calib_iter(data_loader, calib_sampling_size, iterations)
+                quantize_params, _ = self._get_quantize_params(
+                    tmp_model, data_loader, quantize_config, calib_iterations
+                )
+            else:
+                quantize_params = None
+            self.quantize_params = quantize_params
+
+            from neural_compressor import options
+            from neural_compressor.adaptor.ox_utils.quantizer import Quantizer

+            quantizer = Quantizer(
+                tmp_model,
+                quantize_config,
+                format,
+                self.static,
+                quantize_params,
+                self.quantizable_op_types,
+                self.query_handler.get_fallback_list(),
+                self.reduce_range,
+                options.onnxrt.qdq_setting.AddQDQPairToWeight
+                if "add_qdq_pair_to_weight" not in self.recipes
+                else self.recipes.get("add_qdq_pair_to_weight", False),
+                options.onnxrt.qdq_setting.OpTypesToExcludeOutputQuantizatioin
+                if "optypes_to_exclude_output_quant" not in self.recipes
+                else self.recipes.get("optypes_to_exclude_output_quant", []),
+                options.onnxrt.qdq_setting.DedicatedQDQPair
+                if "dedicated_qdq_pair" not in self.recipes
+                else self.recipes.get("dedicated_qdq_pair", False),
+                self.backend,
+            )
+            quantizer.quantize_model()
+            tmp_model.q_config = self._generate_qconfig(model.model, tune_cfg, quantize_params)
+            tmp_model.model = quantizer.model.model
+            self.quantize_config = quantize_config  # update so other methods can know current configs
+            self._dump_model_op_stats(tmp_model)
+            tmp_model.topological_sort()
+            return tmp_model
+
+    def _get_split_model_quantize_params(
+        self, split_model, split_dataloader, quantize_config, calib_sampling_size, iterations
+    ):
+        """Get quantize params for current split model and get dataloader for next split model."""
+        dataloader = split_dataloader.pop(0)
+        calib_iterations = self._reset_calib_iter(dataloader, calib_sampling_size, iterations)
+        split_quantize_params, dataloder_for_next_split_model = self._get_quantize_params(
+            split_model,
+            dataloader,
+            quantize_config,
+            calib_iterations,
+            split_model_input_names=split_model.input(),
+        )
+        return split_quantize_params, dataloder_for_next_split_model
+
+    def _quantize_split_model(self, split_model, quantize_config, quantize_params, quantized_model_merged):
+        """Quantize split model, and merge the quantized models to generate final model."""
         from neural_compressor import options
         from neural_compressor.adaptor.ox_utils.quantizer import Quantizer

         quantizer = Quantizer(
-            tmp_model,
+            split_model,
             quantize_config,
             format,
             self.static,
@@ -339,12 +466,16 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             self.backend,
         )
         quantizer.quantize_model()
-        tmp_model.q_config = self._generate_qconfig(model.model, tune_cfg, quantize_params)
-        tmp_model.model = quantizer.model.model
-        self.quantize_config = quantize_config  # update so other methods can know current configs
-        self._dump_model_op_stats(tmp_model)
-        tmp_model.topological_sort()
-        return tmp_model
+        split_model.model = quantizer.model.model
+        split_model.topological_sort()
+
+        if quantized_model_merged is None:
+            quantized_model_merged = quantizer.model
+            quantized_model_merged.write_external_data_to_new_location(overwrite=True)
+        else:
+            quantized_model_merged.merge_split_models(quantizer.model)
+
+        return quantized_model_merged

     def _check_backend_available(self, backend):
         """Check backend is available or not."""
@@ -570,7 +701,7 @@ def _dump_model_op_stats(self, model):
         Statistics(output_data, header="Mixed Precision Statistics", field_names=field_names).print_stat()
         self.optype_statistics = field_names, output_data

-    def _get_quantize_params(self, model, data_loader, quantize_config, iterations):
+    def _get_quantize_params(self, model, data_loader, quantize_config, iterations, **kwargs):
         from neural_compressor.adaptor.ox_utils.calibration import ONNXRTAugment
         from neural_compressor.model.onnx_model import ONNXModel

@@ -588,10 +719,12 @@ def _get_quantize_params(self, model, data_loader, quantize_config, iterations):
             iterations=list(range(0, iterations)),
             backend=self.backend,
             reduce_range=self.reduce_range,
+            **kwargs,
         )
         self.min_max = augment.dump_minmax(quantize_config)
         quantize_params = augment.dump_calibration(quantize_config, min_max=self.min_max)
-        return quantize_params
+        dataloder_for_next_split_model = augment.dataloder_for_next_split_model
+        return quantize_params, dataloder_for_next_split_model

     def inspect_tensor(
         self,
@@ -606,7 +739,6 @@ def inspect_tensor(
     ):
         """The function is used by tune strategy class for dumping tensor info."""
         from neural_compressor.adaptor.ox_utils.calibration import ONNXRTAugment
-        from neural_compressor.model.onnx_model import ONNXModel
         from neural_compressor.utils.utility import dump_data_to_local

         if not isinstance(model, ONNXModel):
@@ -763,6 +895,9 @@ def _pre_optimize(self, model, level=1):
         }
         if not isinstance(self.query_handler.get_graph_optimization(), list):
             level = self.query_handler.get_graph_optimization()
+        elif self.recipes.get("layer_wise_quant"):
+            level = "ENABLE_BASIC"
+            logger.info("Force set graph optimization level to 'ENABLE_BASIC' for layer-wise quantization")
         elif options.onnxrt.graph_optimization.level is not None:
             level = options.onnxrt.graph_optimization.level
         elif self.recipes.get("graph_optimization_level", None) is not None:
@@ -778,10 +913,23 @@ def _pre_optimize(self, model, level=1):
             )
         sess_options.graph_optimization_level = optimization_levels[level]
         sess_options.optimized_model_filepath = os.path.join(self.work_space, "Optimized_model.onnx")
+        if model.is_large_model and self.recipes.get("layer_wise_quant", False):
+            # save the model and external data for layer-wise quantization
+            external_data_filename = os.path.basename(sess_options.optimized_model_filepath) + "_data"
+            external_data_file_threshold = 1024
+            sess_options.add_session_config_entry(
+                "session.optimized_model_external_initializers_file_name", external_data_filename
+            )
+            sess_options.add_session_config_entry(
+                "session.optimized_model_external_initializers_min_size_in_bytes", str(external_data_file_threshold)
+            )
+            logger.info("Saving optimized model for layer-wise quantization. This may take a while...")
+
         if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"):  # pragma: no cover
             from onnxruntime_extensions import get_library_path

             sess_options.register_custom_ops_library(get_library_path())
+
         if not model.is_large_model:
             sess = ort.InferenceSession(
                 model.model.SerializeToString(), sess_options, providers=["CPUExecutionProvider"]
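For context, a standalone sketch of the onnxruntime session-option mechanism the hunk above relies on: creating a session with these options runs graph optimization and writes the optimized model with its large initializers externalized to a side file. The two session config keys are the ones set in the diff; the model paths and the 1024-byte threshold are placeholders, and the min-size key assumes a reasonably recent onnxruntime release.

# Sketch only; "big_model.onnx" and the output paths are placeholders.
import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
sess_options.optimized_model_filepath = "Optimized_model.onnx"
# Write initializers larger than 1024 bytes to a separate file instead of embedding
# them in the optimized model, keeping the serialized graph small.
sess_options.add_session_config_entry(
    "session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data"
)
sess_options.add_session_config_entry(
    "session.optimized_model_external_initializers_min_size_in_bytes", "1024"
)
# Creating the session triggers optimization and writes both the model and its external data.
sess = ort.InferenceSession("big_model.onnx", sess_options, providers=["CPUExecutionProvider"])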
@@ -792,13 +940,14 @@ def _pre_optimize(self, model, level=1):
         else:  # pragma: no cover
             logger.warning("Please use model path instead of onnx model object to quantize")
         del sess
-
         tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False)

-        if model.is_large_model:  # pragma: no cover
+        # load external data if model is large and not layer wise quantization
+        if model.is_large_model and not self.recipes.get("layer_wise_quant", False):  # pragma: no cover
             from onnx.external_data_helper import load_external_data_for_model

             load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0])
+
         model.model_path = sess_options.optimized_model_filepath
         model.model = (
             self._replace_gemm_with_matmul(tmp_model).model
@@ -903,8 +1052,6 @@ def _replace_gemm_with_matmul(model):
         new_nodes = []
         from onnx import numpy_helper

-        from neural_compressor.model.onnx_model import ONNXModel
-
         if not isinstance(model, ONNXModel):
             model = ONNXModel(model)
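For context, a minimal usage sketch of how a caller would be expected to reach the new layer-wise path through the `recipes` interface described in docs/source/quantization_weight_only.md#layer-wise-quantization. The import locations reflect the 2.x Python API, and the model path and `calib_dataloader` are placeholders supplied by the user.

# Hedged usage sketch; paths and calib_dataloader are placeholders, not part of the diff.
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

config = PostTrainingQuantConfig(
    approach="static",                   # the layer-wise path requires static (not dynamic) quantization
    recipes={"layer_wise_quant": True},  # recipe key checked in quantize() and _pre_optimize() above
)
q_model = quantization.fit(
    "/path/to/large_model.onnx",         # placeholder: large ONNX model saved with external data
    config,
    calib_dataloader=calib_dataloader,   # placeholder: user-provided calibration dataloader
)
q_model.save("/path/to/quantized_model.onnx")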