@@ -31,69 +31,95 @@ class AutoRoundQuantizer(Quantizer):
     def __init__(
         self,
         quant_config: dict = {},
-        enable_full_range: bool = False,
+        enable_full_range: bool = False,  ##for symmetric, TODO support later
         batch_size: int = 8,
         amp: bool = True,
-        device=None,
+        device: str = None,
         lr_scheduler=None,
+        dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         enable_quanted_input: bool = True,
         enable_minmax_tuning: bool = True,
         lr: float = None,
         minmax_lr: float = None,
-        low_gpu_mem_usage: bool = True,
+        low_gpu_mem_usage: bool = False,
         iters: int = 200,
         seqlen: int = 2048,
-        n_samples: int = 512,
+        nsamples: int = 128,
         sampler: str = "rand",
         seed: int = 42,
-        n_blocks: int = 1,
+        nblocks: int = 1,
         gradient_accumulate_steps: int = 1,
         not_use_best_mse: bool = False,
         dynamic_max_gap: int = -1,
         data_type: str = "int",
         scale_dtype: str = "fp16",
+        multimodal: bool = False,
+        act_bits: int = 32,
+        act_group_size: int = None,
+        act_sym: bool = None,
+        act_dynamic: bool = True,
+        low_cpu_mem_usage: bool = False,
         **kwargs,
     ):
         """Init an AutoRoundQuantizer object.

         Args:
-            quant_config (dict): Configuration for weight quantization (default is None).
-            quant_config={
-                'layer1':##layer_name
-                {
-                    'data_type': 'int',
-                    'bits': 4,
-                    'group_size': 32,
-                    'sym': False,
+            quant_config (dict): Configuration for weight quantization (default is None).
+            quant_config={
+                'layer1':  ##layer_name
+                {
+                    'data_type': 'int',
+                    'bits': 4,
+                    'group_size': 32,
+                    'sym': False,
+                    'act_data_type': None,
+                    'act_bits': 32,
+                    'act_sym': None,
+                    'act_dynamic': True,
+                }
+                ...,
             }
-            ...
-            }
-            keys:
-                data_type (str): The data type to be used (default is "int").
-                bits (int): Number of bits for quantization (default is 4).
-                group_size (int): Size of the quantization group (default is 128).
-                sym (bool): Whether to use symmetric quantization. (default is None).
-            enable_full_range (bool): Whether to enable full range quantization (default is False).
-            batch_size (int): Batch size for training (default is 8).
-            amp (bool): Whether to use automatic mixed precision (default is True). Automatically detect and set.
-            device: The device to be used for tuning (default is None). Automatically detect and set.
-            lr_scheduler: The learning rate scheduler to be used.
-            use_quant_input (bool): Whether to use quantized input data (default is True).
-            enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True).
-            lr (float): The learning rate (default is 0.005).
-            minmax_lr (float): The learning rate for min-max tuning (default is None).
-            low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True).
-            iters (int): Number of iterations (default is 200).
-            seqlen (int): Length of the sequence.
-            n_samples (int): Number of samples (default is 512).
-            sampler (str): The sampling method (default is "rand").
-            seed (int): The random seed (default is 42).
-            n_blocks (int): Number of blocks (default is 1).
-            gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
-            not_use_best_mse (bool): Whether to use mean squared error (default is False).
-            dynamic_max_gap (int): The dynamic maximum gap (default is -1).
-            scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
-                have different choices.
+            keys:
+                data_type (str): The data type to be used (default is "int").
+                bits (int): Number of bits for quantization (default is 4).
+                group_size (int): Size of the quantization group (default is 128).
+                sym (bool): Whether to use symmetric quantization (default is None).
+            bits (int): Number of bits for quantization (default is 4).
+            group_size (int): Size of the quantization group (default is 128).
+            sym (bool): Whether symmetric quantization is to be used (default is False).
+            enable_full_range (bool): Whether to enable full range quantization (default is False).
+            batch_size (int): Batch size for training (default is 8).
+            amp (bool): Whether to use automatic mixed precision (default is True).
+            device: The device to be used for tuning (default is "auto").
+            lr_scheduler: The learning rate scheduler to be used.
+            dataset (str): The default calibration dataset name (default is "NeelNanda/pile-10k").
+            enable_quanted_input (bool): Whether to use the output of the previous quantized block as
+                the input for the current block (default is True).
+            enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True).
+            lr (float): The learning rate (default is None, will be set to 1.0/iters).
+            minmax_lr (float): The learning rate for min-max tuning
+                (default is None, it will be set to lr automatically).
+            low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False).
+            iters (int): Number of iterations (default is 200).
+            seqlen (int): Sequence length of the calibration data (default is 2048).
+            nsamples (int): Number of samples (default is 128).
+            sampler (str): The sampling method (default is "rand").
+            seed (int): The random seed (default is 42).
+            nblocks (int): Number of blocks (default is 1).
+            gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
+            not_use_best_mse (bool): Whether to skip using the parameters from the iteration with the best mse loss (default is False).
+            dynamic_max_gap (int): The dynamic maximum gap (default is -1).
+            data_type (str): The data type to be used (default is "int").
+            scale_dtype (str): The data type of the quantization scale (default is "fp16"); different kernels
+                have different choices.
+            multimodal (bool): Whether to enable multimodal model quantization (default is False).
+            act_bits (int): Number of bits for activation quantization (default is 32).
+            act_group_size (int): Group size for activation quantization (default is None).
+            act_sym (bool): Whether to use symmetric activation quantization (default is None).
+            act_dynamic (bool): Whether to use dynamic activation quantization (default is True).
+
+        Returns:
+            The quantized model.
         """
         super().__init__(quant_config)
         self.tokenizer = None
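For orientation, here is a minimal sketch of how the updated constructor might be driven with a per-layer quant_config. The parameter and key names mirror the signature and docstring above; the import path and the layer name "layer1" are assumptions for illustration only, not the canonical API.

    # Hypothetical import path; the class defined in this diff is assumed importable from here.
    from neural_compressor.torch.algorithms.weight_only.autoround import AutoRoundQuantizer

    # Per-layer weight/activation settings, following the docstring example above.
    quant_config = {
        "layer1": {  # layer_name (placeholder)
            "data_type": "int",
            "bits": 4,
            "group_size": 32,
            "sym": False,
            "act_data_type": None,
            "act_bits": 32,
            "act_sym": None,
            "act_dynamic": True,
        },
    }

    # Keyword names introduced or renamed by this change: dataset, nsamples, nblocks,
    # multimodal, act_*, and low_cpu_mem_usage.
    quantizer = AutoRoundQuantizer(
        quant_config=quant_config,
        dataset="NeelNanda/pile-10k",
        nsamples=128,
        seqlen=2048,
        iters=200,
        low_gpu_mem_usage=False,
    )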
@@ -109,15 +135,21 @@ def __init__(
         self.low_gpu_mem_usage = low_gpu_mem_usage
         self.iters = iters
         self.seqlen = seqlen
-        self.n_samples = n_samples
+        self.nsamples = nsamples
         self.sampler = sampler
         self.seed = seed
-        self.n_blocks = n_blocks
+        self.nblocks = nblocks
         self.gradient_accumulate_steps = gradient_accumulate_steps
         self.not_use_best_mse = not_use_best_mse
         self.dynamic_max_gap = dynamic_max_gap
         self.data_type = data_type
         self.scale_dtype = scale_dtype
+        self.multimodal = multimodal
+        self.act_bits = act_bits
+        self.act_group_size = act_group_size
+        self.act_sym = act_sym
+        self.act_dynamic = act_dynamic
+        self.low_cpu_mem_usage = low_cpu_mem_usage

     def prepare(self, model: torch.nn.Module, *args, **kwargs):
         """Prepares a given model for quantization.
@@ -137,7 +169,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             model=model,
             tokenizer=None,
             dataset=dataloader,
-            weight_config=self.quant_config or {},
+            layer_config=self.quant_config or {},
             enable_full_range=self.enable_full_range,
             batch_size=self.batch_size,
             amp=self.amp,
@@ -150,23 +182,29 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             low_gpu_mem_usage=self.low_gpu_mem_usage,
             iters=self.iters,
             seqlen=self.seqlen,
-            n_samples=self.n_samples,
+            nsamples=self.nsamples,
             sampler=self.sampler,
             seed=self.seed,
-            n_blocks=self.n_blocks,
+            nblocks=self.nblocks,
             gradient_accumulate_steps=self.gradient_accumulate_steps,
             not_use_best_mse=self.not_use_best_mse,
             dynamic_max_gap=self.dynamic_max_gap,
             data_type=self.data_type,
             scale_dtype=self.scale_dtype,
+            multimodal=self.multimodal,
+            act_bits=self.act_bits,
+            act_group_size=self.act_group_size,
+            act_sym=self.act_sym,
+            act_dynamic=self.act_dynamic,
+            low_cpu_mem_usage=self.low_cpu_mem_usage,
         )
         model, weight_config = rounder.quantize()
         model.autoround_config = weight_config
         model = pack_model(model, weight_config, device=self.device, inplace=True)
         return model


-def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=512):
+def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=128):
     """Generate a DataLoader for calibration using specified parameters.

     Args:
@@ -186,6 +224,6 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
     from auto_round.calib_dataset import get_dataloader  # pylint: disable=E0401

     dataloader = get_dataloader(
-        tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, n_samples=n_samples
+        tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, nsamples=nsamples
     )
     return dataloader
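As a usage sketch of the renamed helper: the argument names and defaults come from the signature above, while the Hugging Face tokenizer and the checkpoint name are placeholders assumed for illustration.

    from transformers import AutoTokenizer

    # Placeholder checkpoint; any tokenizer compatible with the calibration text works here.
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    # get_dataloader is the module-level wrapper shown above (assumed in scope or imported
    # from its module); `nsamples` replaces the old `n_samples` keyword.
    dataloader = get_dataloader(tokenizer, 2048, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=128)

    for batch in dataloader:
        # Each batch is tokenized calibration text drawn from the pile-10k dataset.
        print(type(batch))
        break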