@@ -1200,13 +1200,17 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
12001200
12011201 for (int i = 0 ; i < nb ; i ++ ) {
12021202 float amax = 0.0f ; // absolute max
1203+ float max = 0.0f ;
12031204
12041205 for (int l = 0 ; l < QK4_2 ; l ++ ) {
12051206 const float v = x [i * QK4_2 + l ];
1206- amax = MAX (amax , fabsf (v ));
1207+ if (amax < fabsf (v )) {
1208+ amax = fabsf (v );
1209+ max = v ;
1210+ }
12071211 }
12081212
1209- const float d = amax / (( 1 << 3 ) - 1 ) ;
1213+ const float d = max / -8 ;
12101214
12111215 const float id = d ? 1.0f /d : 0.0f ;
12121216
@@ -1216,8 +1220,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
12161220 const float v0 = x [i * QK4_2 + l + 0 ]* id ;
12171221 const float v1 = x [i * QK4_2 + l + 1 ]* id ;
12181222
1219- const uint8_t vi0 = ( uint8_t )(v0 + 8.5f );
1220- const uint8_t vi1 = ( uint8_t )(v1 + 8.5f );
1223+ const uint8_t vi0 = MIN ( 15 , ( uint8_t )(v0 + 8.5f ) );
1224+ const uint8_t vi1 = MIN ( 15 , ( uint8_t )(v1 + 8.5f ) );
12211225
12221226 assert (vi0 < 16 );
12231227 assert (vi1 < 16 );
@@ -1311,9 +1315,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
13111315
13121316 block_q4_2 * restrict y = vy ;
13131317
1314- //quantize_row_q4_2_reference(x, y, k);
1315- // This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
1316- quantize_row_q4_2_rmse (x , y , k );
1318+ quantize_row_q4_2_reference (x , y , k );
13171319}
13181320
13191321static void quantize_row_q4_3_reference (const float * restrict x , block_q4_3 * restrict y , int k ) {
@@ -1864,7 +1866,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
18641866 [GGML_TYPE_Q4_2 ] = {
18651867 .dequantize_row_q = dequantize_row_q4_2 ,
18661868 .quantize_row_q = quantize_row_q4_2 ,
1867- .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_rmse , // quantize_row_q4_2_reference,
1869+ .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_reference ,
18681870 .quantize_row_q_dot = quantize_row_q8_0 ,
18691871 .vec_dot_q = ggml_vec_dot_q4_2_q8_0 ,
18701872 },
@@ -12196,8 +12198,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
1219612198 for (int j = 0 ; j < n ; j += k ) {
1219712199 block_q4_2 * restrict y = (block_q4_2 * )dst + j /QK4_2 ;
1219812200
12199- //quantize_row_q4_2_reference(src + j, y, k);
12200- quantize_row_q4_2_rmse (src + j , y , k );
12201+ quantize_row_q4_2_reference (src + j , y , k );
1220112202
1220212203 for (int i = 0 ; i < nb ; i ++ ) {
1220312204 for (int l = 0 ; l < QK4_2 ; l += 2 ) {
0 commit comments