@@ -1188,13 +1188,17 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
11881188
11891189 for (int i = 0 ; i < nb ; i ++ ) {
11901190 float amax = 0.0f ; // absolute max
1191+ float max = 0.0f ;
11911192
11921193 for (int l = 0 ; l < QK4_2 ; l ++ ) {
11931194 const float v = x [i * QK4_2 + l ];
1194- amax = MAX (amax , fabsf (v ));
1195+ if (amax < fabsf (v )) {
1196+ amax = fabsf (v );
1197+ max = v ;
1198+ }
11951199 }
11961200
1197- const float d = amax / (( 1 << 3 ) - 1 ) ;
1201+ const float d = max / -8 ;
11981202
11991203 const float id = d ? 1.0f /d : 0.0f ;
12001204
@@ -1204,8 +1208,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
12041208 const float v0 = x [i * QK4_2 + l + 0 ]* id ;
12051209 const float v1 = x [i * QK4_2 + l + 1 ]* id ;
12061210
1207- const uint8_t vi0 = ( uint8_t ) (v0 + 8.5f );
1208- const uint8_t vi1 = ( uint8_t ) (v1 + 8.5f );
1211+ const uint8_t vi0 = MIN ( 15 , ( int8_t ) roundf (v0 ) + 8 );
1212+ const uint8_t vi1 = MIN ( 15 , ( int8_t ) roundf (v1 ) + 8 );
12091213
12101214 assert (vi0 < 16 );
12111215 assert (vi1 < 16 );
@@ -1299,9 +1303,9 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
12991303
13001304 block_q4_2 * restrict y = vy ;
13011305
1302- // quantize_row_q4_2_reference(x, y, k);
1306+ quantize_row_q4_2_reference (x , y , k );
13031307 // This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
1304- quantize_row_q4_2_rmse (x , y , k );
1308+ // quantize_row_q4_2_rmse(x, y, k);
13051309}
13061310
13071311static void quantize_row_q4_3_reference (const float * restrict x , block_q4_3 * restrict y , int k ) {
@@ -1852,7 +1856,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
18521856 [GGML_TYPE_Q4_2 ] = {
18531857 .dequantize_row_q = dequantize_row_q4_2 ,
18541858 .quantize_row_q = quantize_row_q4_2 ,
1855- .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_rmse , // quantize_row_q4_2_reference,
1859+ .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_reference ,
18561860 .quantize_row_q_dot = quantize_row_q8_0 ,
18571861 .vec_dot_q = ggml_vec_dot_q4_2_q8_0 ,
18581862 },
@@ -12184,8 +12188,8 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
1218412188 for (int j = 0 ; j < n ; j += k ) {
1218512189 block_q4_2 * restrict y = (block_q4_2 * )dst + j /QK4_2 ;
1218612190
12187- // quantize_row_q4_2_reference(src + j, y, k);
12188- quantize_row_q4_2_rmse (src + j , y , k );
12191+ quantize_row_q4_2_reference (src + j , y , k );
12192+ // quantize_row_q4_2_rmse(src + j, y, k);
1218912193
1219012194 for (int i = 0 ; i < nb ; i ++ ) {
1219112195 for (int l = 0 ; l < QK4_2 ; l += 2 ) {
0 commit comments