@@ -4615,6 +4615,9 @@ def quantize(self, tune_cfg, model, dataloader, calib_func=None):
4615
4615
q_model ._model = self .awq_quantize (q_model ._model , tune_cfg , dataloader , calib_func )
4616
4616
if "RTN" in all_algo :
4617
4617
q_model ._model = self .rtn_quantize (q_model ._model , tune_cfg )
4618
+ if "AUTOROUND" in all_algo :
4619
+ q_model ._model , autoround_config = self .autoround_quantize (q_model ._model , tune_cfg , dataloader )
4620
+ q_model .autoround_config = autoround_config
4618
4621
4619
4622
q_model .q_config = copy .deepcopy (self .tune_cfg )
4620
4623
q_model .is_quantized = True
@@ -4911,6 +4914,93 @@ def awq_quantize(self, model, tune_cfg, dataloader, calib_func):
4911
4914
)
4912
4915
return model
4913
4916
4917
def autoround_quantize(self, model, tune_cfg, dataloader):
    """Quantize ``model`` with the AutoRound weight-only algorithm.

    A per-op weight configuration is built from ``tune_cfg["op"]`` and the
    AutoRound options are read from ``self.recipes["autoround_args"]``; any
    option that is not supplied falls back to the default listed in this
    method. Everything is then forwarded to
    ``torch_utils.weight_only.autoround_quantize``.

    The per-op configuration has the following layout::

        weight_config = {
            "layer1": {            # op (layer) name
                "data_type": "int",
                "bits": 4,
                "group_size": 32,
                "scheme": "asym",  # or "sym"
            },
            ...
        }

    Args:
        model: Model object handed to the AutoRound implementation.
        tune_cfg: Tuning configuration. ``tune_cfg["op"]`` maps
            ``(op_name, op_type)`` keys to dicts whose ``"weight"`` entry
            provides ``dtype``, ``bits``, ``group_size`` and ``scheme``.
        dataloader: Dataloader passed through unchanged to AutoRound.

    Returns:
        tuple: ``(model, autoround_config)`` exactly as returned by the
        underlying ``autoround_quantize`` call.
    """
    logger.info("quantizing with the AutoRound algorithm")
    from .torch_utils.weight_only import autoround_quantize

    # Build the per-op weight configuration; ops whose weights stay in fp32
    # are not quantized and therefore get no entry.
    weight_config = {}
    for key, config in tune_cfg["op"].items():
        weight = config["weight"]
        if weight["dtype"] == "fp32":
            continue
        op_name, _op_type = key
        weight_config[op_name] = {
            "data_type": weight["dtype"],
            "bits": weight["bits"],
            "group_size": weight["group_size"],
            "scheme": weight["scheme"],
        }

    # AutoRound recipe options and their defaults. Every key is also the
    # keyword name expected by ``autoround_quantize``.
    option_defaults = {
        "enable_full_range": False,
        "bs": 8,
        "amp": True,
        "device": "cpu",
        "lr_scheduler": None,
        "dataset_name": "NeelNanda/pile-10k",
        "dataset_split": "train",
        "use_quant_input": True,
        "enable_minmax_tuning": True,
        "lr": None,
        "minmax_lr": None,
        "low_gpu_mem_usage": True,
        "iters": 200,
        "seqlen": 2048,
        "n_samples": 512,
        "sampler": "rand",
        "seed": 42,
        "n_blocks": 1,
        "gradient_accumulate_steps": 1,
        "not_use_best_mse": False,
        "dynamic_max_gap": -1,
        # NOTE(review): the original comment read "only support data_type";
        # presumably "int" is the only supported value -- confirm.
        "data_type": "int",
        "scale_dtype": "fp16",
    }

    # Look the recipe section up once. A missing (or None) section means
    # "use all defaults" instead of raising KeyError, since every option
    # already has a default.
    autoround_args = self.recipes.get("autoround_args") or {}
    options = {name: autoround_args.get(name, default) for name, default in option_defaults.items()}

    # NOTE(review): bits/group_size/scheme are fixed here; presumably they are
    # only global fallbacks for ops not listed in ``weight_config`` -- confirm
    # against the callee.
    model, autoround_config = autoround_quantize(
        model=model,
        tokenizer=None,
        bits=4,
        group_size=128,
        scheme="asym",
        weight_config=weight_config,
        dataloader=dataloader,
        **options,
    )
    return model, autoround_config
5003
+
4914
5004
def _dump_model_op_stats (self , model , tune_cfg ):
4915
5005
"""This is a function to dump quantizable ops of model to user.
4916
5006
0 commit comments