@@ -1188,13 +1188,17 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
1188
1188
1189
1189
for (int i = 0 ; i < nb ; i ++ ) {
1190
1190
float amax = 0.0f ; // absolute max
1191
+ float max = 0.0f ;
1191
1192
1192
1193
for (int l = 0 ; l < QK4_2 ; l ++ ) {
1193
1194
const float v = x [i * QK4_2 + l ];
1194
- amax = MAX (amax , fabsf (v ));
1195
+ if (amax < fabsf (v )) {
1196
+ amax = fabsf (v );
1197
+ max = v ;
1198
+ }
1195
1199
}
1196
1200
1197
- const float d = amax / (( 1 << 3 ) - 1 ) ;
1201
+ const float d = max / -8 ;
1198
1202
1199
1203
const float id = d ? 1.0f /d : 0.0f ;
1200
1204
@@ -1204,8 +1208,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
1204
1208
const float v0 = x [i * QK4_2 + l + 0 ]* id ;
1205
1209
const float v1 = x [i * QK4_2 + l + 1 ]* id ;
1206
1210
1207
- const uint8_t vi0 = ( uint8_t ) (v0 + 8.5f );
1208
- const uint8_t vi1 = ( uint8_t ) (v1 + 8.5f );
1211
+ const uint8_t vi0 = MIN ( 15 , ( int8_t ) roundf (v0 ) + 8 );
1212
+ const uint8_t vi1 = MIN ( 15 , ( int8_t ) roundf (v1 ) + 8 );
1209
1213
1210
1214
assert (vi0 < 16 );
1211
1215
assert (vi1 < 16 );
@@ -1299,9 +1303,9 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
1299
1303
1300
1304
block_q4_2 * restrict y = vy ;
1301
1305
1302
- // quantize_row_q4_2_reference(x, y, k);
1306
+ quantize_row_q4_2_reference (x , y , k );
1303
1307
// This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
1304
- quantize_row_q4_2_rmse (x , y , k );
1308
+ // quantize_row_q4_2_rmse(x, y, k);
1305
1309
}
1306
1310
1307
1311
static void quantize_row_q4_3_reference (const float * restrict x , block_q4_3 * restrict y , int k ) {
@@ -1852,7 +1856,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1852
1856
[GGML_TYPE_Q4_2 ] = {
1853
1857
.dequantize_row_q = dequantize_row_q4_2 ,
1854
1858
.quantize_row_q = quantize_row_q4_2 ,
1855
- .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_rmse , // quantize_row_q4_2_reference,
1859
+ .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_reference ,
1856
1860
.quantize_row_q_dot = quantize_row_q8_0 ,
1857
1861
.vec_dot_q = ggml_vec_dot_q4_2_q8_0 ,
1858
1862
},
@@ -12184,8 +12188,8 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
12184
12188
for (int j = 0 ; j < n ; j += k ) {
12185
12189
block_q4_2 * restrict y = (block_q4_2 * )dst + j /QK4_2 ;
12186
12190
12187
- // quantize_row_q4_2_reference(src + j, y, k);
12188
- quantize_row_q4_2_rmse (src + j , y , k );
12191
+ quantize_row_q4_2_reference (src + j , y , k );
12192
+ // quantize_row_q4_2_rmse(src + j, y, k);
12189
12193
12190
12194
for (int i = 0 ; i < nb ; i ++ ) {
12191
12195
for (int l = 0 ; l < QK4_2 ; l += 2 ) {
0 commit comments