ort = LazyImport("onnxruntime")
logger = logging.getLogger("neural_compressor")
ONNXRT116_VERSION = Version("1.16.0")
+ONNXRT1161_VERSION = Version("1.16.1")
+
+
+def get_blob_size(group_size, has_zp):  # pragma: no cover
+    """Get blob_size.
+
+    Args:
+        group_size (int): how many elements share one scale/zp
+        has_zp (bool): whether zero_point is not None
+    """
+    if Version(ort.__version__) > ONNXRT1161_VERSION:
+        blob_size = group_size // 2
+    elif has_zp:
+        blob_size = group_size // 2 + 4 + 1
+    else:
+        blob_size = group_size // 2 + 4
+    return blob_size


def make_matmul_weight_only_node(
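For orientation, here is a quick standalone sketch (illustrative values only, not part of the patch) of the blob sizes this helper yields for `group_size=32`: MatMulNBits packs just the two 4-bit weights per byte, while MatMulFpQ4 prepends a 4-byte float32 scale and, when a zero point is present, one extra uint8:

```python
# Illustrative arithmetic only; mirrors get_blob_size for group_size = 32.
group_size = 32

nbits_blob = group_size // 2            # 16 bytes: ort > 1.16.1 (MatMulNBits)
fpq4_blob_zp = group_size // 2 + 4 + 1  # 21 bytes: float32 scale + uint8 zero point
fpq4_blob = group_size // 2 + 4         # 20 bytes: float32 scale only

print(nbits_blob, fpq4_blob_zp, fpq4_blob)  # 16 21 20
```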
@@ -54,49 +71,102 @@ def make_matmul_weight_only_node(
        zero_point (array): zero point

    Returns:
-        matmul_weight_only_node: MatMulFpQ4 node
-        new_inits: initializers of the MatMulFpQ4 node
+        matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
+        new_inits: initializers of the new node
    """
-    if zero_point is not None:
-        blob_size = group_size // 2 + 4 + 1
-        offset = 5
-    else:
-        blob_size = group_size // 2 + 4
-        offset = 4
-
+    blob_size = get_blob_size(group_size, zero_point is not None)
    packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8")
-    for i in range(q_weight.shape[0]):
-        bf = struct.pack("f", scale[i])
-        packed[i][0] = bf[0]
-        packed[i][1] = bf[1]
-        packed[i][2] = bf[2]
-        packed[i][3] = bf[3]
+    q_weight_name = node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size))
+    input_names = [node.input[0], q_weight_name]
+    new_inits = []
+    kwargs = {}
+
+    if Version(ort.__version__) > ONNXRT1161_VERSION:
+        op_type = "MatMulNBits"
+
+        # pack quantized weight
+        for i in range(q_weight.shape[0]):
+            for k in range(0, group_size, 2):
+                packed[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4
+        packed = np.reshape(packed, (-1, k_blocks, blob_size))

+        # build scale tensor
+        scale = np.reshape(scale, (-1, k_blocks)).astype("float32")
+        scale_tensor = onnx.helper.make_tensor(
+            name=node.input[1] + "_scale", data_type=1, dims=scale.shape, vals=scale.tobytes(), raw=True
+        )
+        input_names.append(scale_tensor.name)
+        new_inits.append(scale_tensor)
+
+        # build zero_point tensor
        if zero_point is not None:
-            packed[i][4] = zero_point[i]
+            if num_bits > 4:
+                packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8")
+            else:
+                packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
+                for i in range(zero_point.shape[0] // k_blocks):
+                    for j in range(k_blocks):
+                        idx = i * k_blocks + j
+                        zp = zero_point[idx]
+                        packed_zp[idx // 2] = (
+                            ((packed_zp[idx // 2] & 0x0F) | (zp << 4))
+                            if (idx & 1)
+                            else ((packed_zp[idx // 2] & 0xF0) | zp)
+                        )
+
+            zp_tensor = onnx.helper.make_tensor(
+                name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
+            )
+            input_names.append(zp_tensor.name)
+            new_inits.append(zp_tensor)
+
+        # set kwargs
+        kwargs["K"] = weight_shape[0]
+        kwargs["N"] = weight_shape[1]
+        kwargs["bits"] = num_bits
+        kwargs["block_size"] = group_size
+
+    else:
+        offset = 5 if zero_point is not None else 4
+        op_type = "MatMulFpQ4"
+
+        # pack quantized weight
+        for i in range(q_weight.shape[0]):
+            bf = struct.pack("f", scale[i])
+            packed[i][0] = bf[0]
+            packed[i][1] = bf[1]
+            packed[i][2] = bf[2]
+            packed[i][3] = bf[3]
+
+            if zero_point is not None:
+                packed[i][4] = zero_point[i]
+
+            packed[i][offset:] = np.bitwise_or(
+                q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits)
+            )
+        packed = packed.reshape(-1)

-        packed[i][offset:] = np.bitwise_or(
-            q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits)
+        # build shape tensor
+        shape_tensor = onnx.helper.make_tensor(
+            name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64")
        )
+        new_inits.append(shape_tensor)
+        input_names.append(shape_tensor.name)
+
+        # set kwargs
+        kwargs["blk_quant_type"] = 1 if zero_point is not None else 0

-    packed = packed.reshape(-1)
    q_weight_tensor = onnx.helper.make_tensor(
-        name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)),
+        name=q_weight_name,
        data_type=2,
        dims=packed.shape,
        vals=packed.tobytes(),
        raw=True,
    )
-    shape_tensor = onnx.helper.make_tensor(
-        name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64")
-    )
-    input_names = [node.input[0], q_weight_tensor.name, shape_tensor.name]
-    new_inits = [q_weight_tensor, shape_tensor]
+    new_inits.append(q_weight_tensor)

-    kwargs = {}
-    kwargs["blk_quant_type"] = 1 if zero_point is not None else 0
    matmul_weight_only_node = onnx.helper.make_node(
-        "MatMulFpQ4",
+        op_type,
        inputs=input_names,
        outputs=node.output,
        name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits),
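A minimal NumPy sketch of the two packing layouts built above, using made-up 4-bit values (the array names and the tiny group_size are illustrative, not from the patch):

```python
import struct

import numpy as np

group_size, num_bits = 8, 4                        # illustrative; real groups are e.g. 32
q_row = np.arange(group_size, dtype="uint8") % 16  # fake 4-bit quantized weights

# MatMulNBits blob (ort > 1.16.1): consecutive pairs share a byte, low nibble first,
# so byte k // 2 holds q[k] | q[k + 1] << 4. Zero points are nibble-packed the same
# way, starting from the fill 136 == 0x88, i.e. two copies of the unsigned 4-bit
# default zero point 8.
nbits_blob = np.zeros(group_size // 2, dtype="uint8")
for k in range(0, group_size, 2):
    nbits_blob[k // 2] = q_row[k] | q_row[k + 1] << 4

# MatMulFpQ4 blob (ort 1.16.0/1.16.1): 4 bytes of float32 scale, one optional
# zero-point byte, then the first half of the group OR'd with the second half
# shifted left by num_bits.
scale, zero_point = 0.05, 8
fpq4_blob = np.zeros(group_size // 2 + 4 + 1, dtype="uint8")
fpq4_blob[:4] = np.frombuffer(struct.pack("f", scale), dtype="uint8")
fpq4_blob[4] = zero_point
fpq4_blob[5:] = q_row[: group_size // 2] | q_row[group_size // 2 :] << num_bits
```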
@@ -257,8 +327,11 @@ def rtn_quantize(
        weight = pad_tensor(weight, group_size, k_blocks)

-        if Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32:  # pragma: no cover
-            # currently MatMulFpQ4 only support 4 bits and 32 group_size
+        if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or (
+            Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32
+        ):  # pragma: no cover
+            # MatMulFpQ4 supports 4 bits and group_size 32 on ort 1.16.0 and 1.16.1
+            # MatMulNBits supports 4 bits and any power-of-two group_size on ort > 1.16.1
            q_weight, scale, zp = quant_tensor(
                weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)
            )
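The same version gate now recurs in rtn_quantize, apply_awq_scale, apply_awq_clip, and gptq_quantize; a hypothetical helper (not part of the PR) expressing the combined condition:

```python
from packaging.version import Version

def supports_weight_only_matmul(ort_version: str, num_bits: int, group_size: int) -> bool:
    """Hypothetical predicate mirroring the gate above (not part of the patch).

    MatMulNBits: ort > 1.16.1, 4 bits, any power-of-two group_size.
    MatMulFpQ4:  ort >= 1.16.0, 4 bits, group_size == 32 only.
    """
    v = Version(ort_version)
    if v > Version("1.16.1") and num_bits == 4:
        return True  # MatMulNBits path
    return v >= Version("1.16.0") and num_bits == 4 and group_size == 32  # MatMulFpQ4 path

assert supports_weight_only_matmul("1.16.3", 4, 128)      # MatMulNBits
assert supports_weight_only_matmul("1.16.0", 4, 32)       # MatMulFpQ4
assert not supports_weight_only_matmul("1.16.0", 4, 128)  # falls back to Q/DQ
```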
@@ -361,9 +434,11 @@ def apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits,
        weight = weight.T * scales
        weight = pad_tensor(weight, group_size, (org_w_shape[0] + group_size - 1) // group_size).T

-        if (
+        if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or (
            Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32
        ):  # pragma: no cover
+            # MatMulFpQ4 supports 4 bits and group_size 32 on ort 1.16.0 and 1.16.1
+            # MatMulNBits supports 4 bits and any power-of-two group_size on ort > 1.16.1
            q_weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint") / np.expand_dims(
                scales, axis=-1
            )
@@ -504,10 +579,11 @@ def apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, g
            for i_s in range(10):
                ratio = 1 - i_s / 100
                weight = copy.deepcopy(org_weight)
-                if (
+                if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or (
                    Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32
                ):  # pragma: no cover
-                    # currently MatMulFpQ4 only support 4 bits and 32 group_size
+                    # MatMulFpQ4 supports 4 bits and group_size 32 on ort 1.16.0 and 1.16.1
+                    # MatMulNBits supports 4 bits and any power-of-two group_size on ort > 1.16.1
                    weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1))
                else:
                    weight = qdq_tensor(weight, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1))
@@ -571,9 +647,9 @@ def prepare_inputs(model, n_samples, dataloader):
        if isinstance(data[0], dict):
            inputs.append(dict([(name, to_numpy(inp_data)) for name, inp_data in data[0].items()]))
-        elif isinstance(data[0], np.ndarray):
+        elif isinstance(data[0], np.ndarray):  # pragma: no cover
            inputs.append(dict([(name, inp) for name, inp in zip(inputs_names, [data[0]])]))
-        else:
+        else:  # pragma: no cover
            inputs.append(dict([(name, to_numpy(inp)) for name, inp in zip(inputs_names, data[0])]))
    return inputs, so
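For reference, a toy illustration (hypothetical data, not from the patch) of the three sample shapes this branch handles: a dict keyed by input name, a single ndarray zipped against the first input name, and a positional list zipped against all input names:

```python
import numpy as np

inputs_names = ["input_ids", "attention_mask"]  # hypothetical model input names

sample_dict = {"input_ids": np.ones((1, 8)), "attention_mask": np.ones((1, 8))}
sample_array = np.ones((1, 8))                    # single array -> first input only
sample_list = [np.ones((1, 8)), np.ones((1, 8))]  # positional match

print(dict(zip(inputs_names, [sample_array])).keys())  # dict_keys(['input_ids'])
print(dict(zip(inputs_names, sample_list)).keys())     # both names
```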
@@ -982,8 +1058,11 @@ def gptq_quantize(
        weight_tensor = model.get_initializer(node.input[1])
        init_share_num = model.get_initializer_share_num(node.input[1])
-        if Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32:  # pragma: no cover
-            # currently MatMulFpQ4 only support 4 bits and 32 group_size
+        if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or (
+            Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32
+        ):  # pragma: no cover
+            # MatMulFpQ4 supports 4 bits and group_size 32 on ort 1.16.0 and 1.16.1
+            # MatMulNBits supports 4 bits and any power-of-two group_size on ort > 1.16.1
            org_shape = weight.shape
            k_blocks = (org_shape[0] + group_size - 1) // group_size
            q_weight = pad_tensor(q_weight, group_size, k_blocks)
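The padding arithmetic used here, sketched standalone (pad_tensor is assumed to zero-pad along dim 0 up to k_blocks * group_size; the helper below is illustrative, not the library's implementation):

```python
import numpy as np

def pad_tensor_sketch(weight, group_size, k_blocks):
    # Assumed behavior of the library's pad_tensor: zero-pad rows
    # up to k_blocks * group_size.
    pad_len = k_blocks * group_size - weight.shape[0]
    return np.pad(weight, ((0, pad_len), (0, 0)))

org_shape = (70, 16)                                      # illustrative
group_size = 32
k_blocks = (org_shape[0] + group_size - 1) // group_size  # ceil(70 / 32) = 3
padded = pad_tensor_sketch(np.ones(org_shape), group_size, k_blocks)
print(k_blocks, padded.shape)                             # 3 (96, 16)
```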