@@ -274,6 +274,9 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
         if ort_version < ONNXRT152_VERSION:  # pragma: no cover
             logger.warning("Quantize input needs onnxruntime 1.5.2 or newer.")
             return model
+        if ort_version < ONNXRT170_VERSION and self.format == "qdq":
+            logger.error("QDQ mode needs onnxruntime1.7.0 or newer.")
+            exit(0)
         if model.model.opset_import[0].version < 11:  # pragma: no cover
             logger.warning("Quantize input needs model opset 11 or newer.")
         if self.backend == "DnnlExecutionProvider" and any(
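Note on the new guard above: ONNXRT170_VERSION is a packaging.version.Version constant, so the check is a plain version comparison against the running onnxruntime, performed up front instead of asserting later. A minimal standalone sketch of the same idea (the constant and function name here are illustrative, not part of the adaptor):

import onnxruntime as ort
from packaging.version import Version

ONNXRT170_VERSION = Version("1.7.0")

def check_qdq_support(quant_format):
    # The QDQ format relies on QuantizeLinear/DequantizeLinear handling that this
    # adaptor only supports on onnxruntime 1.7.0 or newer.
    if quant_format == "qdq" and Version(ort.__version__) < ONNXRT170_VERSION:
        raise RuntimeError("QDQ mode needs onnxruntime 1.7.0 or newer.")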
@@ -289,17 +292,6 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                     "please upgrade it manually to run with bf16 data type"
                 )
                 exit(0)
-
-        from neural_compressor.adaptor.ox_utils.util import QuantizationMode
-
-        if self.format == "qlinearops":
-            format = QuantizationMode.QLinearOps
-        elif self.format == "qdq":
-            assert ort_version >= ONNXRT170_VERSION, "QDQ mode needs onnxruntime1.7.0 or newer"
-            format = "qdq"
-        else:
-            format = QuantizationMode.IntegerOps
-
         self.quantizable_ops = self._query_quantizable_ops(model.model)
         quantize_config = self._cfg_to_quantize_config(tune_cfg)
 
@@ -405,43 +397,11 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             )
         else:
             quantize_params = None
+        q_config = self._generate_qconfig(model.model, tune_cfg, quantize_params)
         self.quantize_params = quantize_params
-
-        from neural_compressor import options
-        from neural_compressor.adaptor.ox_utils.quantizer import Quantizer
-
-        quantizer = Quantizer(
-            tmp_model,
-            quantize_config,
-            format,
-            self.static,
-            quantize_params,
-            self.quantizable_op_types,
-            self.query_handler.get_fallback_list(),
-            self.reduce_range,
-            (
-                options.onnxrt.qdq_setting.AddQDQPairToWeight
-                if "add_qdq_pair_to_weight" not in self.recipes
-                else self.recipes.get("add_qdq_pair_to_weight", False)
-            ),
-            (
-                options.onnxrt.qdq_setting.OpTypesToExcludeOutputQuantizatioin
-                if "optypes_to_exclude_output_quant" not in self.recipes
-                else self.recipes.get("optypes_to_exclude_output_quant", [])
-            ),
-            (
-                options.onnxrt.qdq_setting.DedicatedQDQPair
-                if "dedicated_qdq_pair" not in self.recipes
-                else self.recipes.get("dedicated_qdq_pair", False)
-            ),
-            self.backend,
-        )
-        quantizer.quantize_model()
-        tmp_model.q_config = self._generate_qconfig(model.model, tune_cfg, quantize_params)
-        tmp_model.model = quantizer.model.model
-        self.quantize_config = quantize_config  # update so other methods can know current configs
+        tmp_model = self._quantize_model(tmp_model, quantize_config, quantize_params)
+        tmp_model.q_config = q_config
         self._dump_model_op_stats(tmp_model)
-        tmp_model.topological_sort()
 
         # if the model is large and acc tuning is required, save it to workspace
         if not self.performance_only and tmp_model.is_large_model:  # pragma: no cover
@@ -496,13 +456,21 @@ def _get_split_model_quantize_params(
             )
         return split_quantize_params, dataloder_for_next_split_model
 
-    def _quantize_split_model(self, split_model, quantize_config, quantize_params, quantized_model_merged):
-        """Quantize split model, and merge the quantized models to generate final model."""
+    def _quantize_model(self, model, quantize_config, quantize_params):
+        """Quantize model."""
         from neural_compressor import options
         from neural_compressor.adaptor.ox_utils.quantizer import Quantizer
+        from neural_compressor.adaptor.ox_utils.util import QuantizationMode
+
+        if self.format == "qlinearops":
+            format = QuantizationMode.QLinearOps
+        elif self.format == "qdq":
+            format = "qdq"
+        else:
+            format = QuantizationMode.IntegerOps
 
         quantizer = Quantizer(
-            split_model,
+            model,
             quantize_config,
             format,
             self.static,
@@ -528,14 +496,19 @@ def _quantize_split_model(self, split_model, quantize_config, quantize_params, q
             self.backend,
         )
         quantizer.quantize_model()
-        split_model.model = quantizer.model.model
-        split_model.topological_sort()
+        model.model = quantizer.model.model
+        self.quantize_config = quantize_config  # update so other methods can know current configs
+        model.topological_sort()
+        return model
 
+    def _quantize_split_model(self, split_model, quantize_config, quantize_params, quantized_model_merged):
+        """Quantize split model, and merge the quantized models to generate final model."""
+        split_model = self._quantize_model(split_model, quantize_config, quantize_params)
         if quantized_model_merged is None:
-            quantized_model_merged = quantizer.model
+            quantized_model_merged = split_model
             quantized_model_merged.write_external_data_to_new_location(overwrite=True)
         else:
-            quantized_model_merged.merge_split_models(quantizer.model)
+            quantized_model_merged.merge_split_models(split_model)
 
         return quantized_model_merged
 
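For readers skimming the refactor: the format handling that both quantization paths now share lives in _quantize_model and reduces to the mapping below. This is a standalone restatement of the lines above, assuming neural_compressor is importable:

from neural_compressor.adaptor.ox_utils.util import QuantizationMode

def resolve_format(fmt):
    # Mirrors the branch moved into _quantize_model: QOperator ops, QDQ pairs, or dynamic integer ops.
    if fmt == "qlinearops":
        return QuantizationMode.QLinearOps
    if fmt == "qdq":
        return "qdq"
    return QuantizationMode.IntegerOps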
@@ -640,57 +613,109 @@ def recover(self, model, q_config):
         """
         self._pre_optimize(model)
         model = self.pre_optimized_model
+
         ort_version = Version(ort.__version__)
         if ort_version < ONNXRT152_VERSION:  # pragma: no cover
             logger.warning("Quantize input needs onnxruntime 1.5.2 or newer.")
             return model
         if model.model.opset_import[0].version < 11:  # pragma: no cover
             logger.warning("Quantize input needs model opset 11 or newer.")
+        if ort_version < ONNXRT170_VERSION and self.format == "qdq":
+            logger.error("QDQ mode needs onnxruntime1.7.0 or newer.")
+            exit(0)
+        if self.backend == "DnnlExecutionProvider" and any(
+            [i.domain in ["", "ai.onnx"] and i.version < 15 for i in model.model.opset_import]
+        ):  # pragma: no cover
+            from onnx import version_converter
+
+            try:
+                model = self._rename_node(ONNXModel(version_converter.convert_version(model.model, 15)))
+            except:
+                logging.warning(
+                    "Fail to upgrade model opset_import to >= 15, "
+                    "please upgrade it manually to run with bf16 data type"
+                )
+                exit(0)
 
         from neural_compressor.adaptor.ox_utils.util import QuantizationMode
 
-        if self.format in ["qlinearops"]:
+        if self.format == "qlinearops":
             format = QuantizationMode.QLinearOps
         elif self.format == "qdq":
-            assert ort_version >= ONNXRT170_VERSION, "QDQ mode needs onnxruntime1.7.0 or newer"
-            format = self.format
+            format = "qdq"
         else:
             format = QuantizationMode.IntegerOps
-        from neural_compressor import options
-        from neural_compressor.adaptor.ox_utils.quantizer import Quantizer
 
         self.quantizable_ops = self._query_quantizable_ops(model.model)
         quantize_params, tune_cfg = self._parse_qconfig(q_config)
         quantize_config = self._cfg_to_quantize_config(tune_cfg)
-        quantizer = Quantizer(
-            model.model,
-            quantize_config,
-            format,
-            self.static,
-            quantize_params,
-            self.quantizable_op_types,
-            self.query_handler.get_fallback_list(),
-            self.reduce_range,
-            (
-                options.onnxrt.qdq_setting.AddQDQPairToWeight
-                if not options.onnxrt.qdq_setting.AddQDQPairToWeight
-                else self.recipes.get("add_qdq_pair_to_weight", False)
-            ),
-            (
-                options.onnxrt.qdq_setting.OpTypesToExcludeOutputQuantizatioin
-                if options.onnxrt.qdq_setting.OpTypesToExcludeOutputQuantizatioin is not None
-                else self.recipes.get("optypes_to_exclude_output_quant", [])
-            ),
-            (
-                options.onnxrt.qdq_setting.DedicatedQDQPair
-                if not options.onnxrt.qdq_setting.DedicatedQDQPair
-                else self.recipes.get("dedicated_qdq_pair", False)
-            ),
-        )
 
-        quantizer.quantize_model()
-        model.model = quantizer.model.model
-        model.topological_sort()
+        if self._need_smooth_quant(tune_cfg):
+            logger.error("Don't support to recover quantized model with smooth quant from original fp32 model.")
+            exit(0)
+
+        if self.recipes.get("layer_wise_quant", False) and not self.dynamic:
+            # layer-wise quantization
+            # details refer to docs/source/quantization_weight_only.md#layer-wise-quantization
+            _model_to_split = copy.deepcopy(model)
+
+            split_nodes = _model_to_split.find_split_nodes()
+            logger.info(
+                "Will split model into {} parts to do layer-wise quantization".format(
+                    len([node.name for node in split_nodes]) + 1
+                )
+            )
+            logger.debug(
+                "Will split model with these nodes for layer-wise quantization: {}".format(
+                    [node.name for node in split_nodes]
+                )
+            )
+
+            split_idx = 1
+            model_to_split = [_model_to_split]
+            quantized_model_merged = None
+
+            while len(model_to_split) != 0:
+                split_model = model_to_split.pop(0)
+                split_node = split_nodes.pop(0)
+                save_both_split_models = True if len(split_nodes) == 0 else False
+                shape_infer = True if split_idx == 1 else False
+
+                # split model with given split_node
+                split_model_part_1, split_model_part_2 = split_model.split_model_with_node(
+                    split_node.name, model.model_path, shape_infer, save_both_split_models
+                )
+                if not save_both_split_models:
+                    # append split_model_part_2 to do next split
+                    model_to_split.append(split_model_part_2)
+
+                logger.info("Quantize split model {}".format(split_idx))
+
+                # quantize split model
+                quantized_model_merged = self._quantize_split_model(
+                    split_model_part_1, quantize_config, quantize_params, quantized_model_merged
+                )
+
+                split_idx += 1
+
+                # if this is the last split, then quantize the last split model
+                if save_both_split_models:
+                    logger.info("Quantize split model {}".format(split_idx))
+
+                    # quantize split model
+                    quantized_model_merged = self._quantize_split_model(
+                        split_model_part_2, quantize_config, quantize_params, quantized_model_merged
+                    )
+                    quantized_model_merged.re_org_output(model.output())  # re-org output as the origin output
+
+            model.model = quantized_model_merged.model
+            self._dump_model_op_stats(model)
+            model.check_is_large_model()
+
+        else:
+            model = self._quantize_model(model, quantize_config, quantize_params)
+
+        self._dump_model_op_stats(model)
         return model
 
     def _parse_qconfig(self, q_config):
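The layer-wise branch added to recover() mirrors the split/quantize/merge worklist already used in quantize(): split at each split node in turn, quantize the first part, keep the remainder for the next split, and quantize the final remainder on the last iteration. A toy sketch of that control flow (the callables and names here are stand-ins, not the neural_compressor API) may help when reading the loop:

def layer_wise_quantize(model, split_points, split_fn, quantize_fn, merge_fn):
    # split_fn(part, point) -> (head, tail); quantize_fn(part) -> quantized part;
    # merge_fn(merged, part) -> merged. All three are caller-supplied stand-ins.
    worklist, points = [model], list(split_points)
    merged = None
    while worklist:
        current, point = worklist.pop(0), points.pop(0)
        last_split = len(points) == 0
        head, tail = split_fn(current, point)
        if not last_split:
            worklist.append(tail)  # the tail is split again on the next iteration
        q_head = quantize_fn(head)
        merged = q_head if merged is None else merge_fn(merged, q_head)
        if last_split:
            merged = merge_fn(merged, quantize_fn(tail))  # quantize the final remainder
    return merged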