@@ -844,8 +844,7 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s
844
844
static void quantize_row_q4_0_reference (const float * restrict x , block_q4_0 * restrict y , int k ) {
845
845
static const int qk = QK4_0 ;
846
846
847
- assert (qk / 16 == 0 );
848
- assert ( k % qk == 0 );
847
+ assert (k % qk == 0 );
849
848
850
849
const int nb = k / qk ;
851
850
@@ -866,20 +865,16 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
866
865
867
866
y [i ].d = d ;
868
867
869
- uint64_t qs [QK4_0 / 16 ] = {0 };
870
-
871
868
for (int l = 0 ; l < qk /2 ; ++ l ) {
872
869
const float x0 = x [i * qk + 0 + l ]* id ;
873
870
const float x1 = x [i * qk + qk /2 + l ]* id ;
874
871
875
- const uint64_t xi0 = MIN (15 , (int8_t )(x0 + 8.5f ));
876
- const uint64_t xi1 = MIN (15 , (int8_t )(x1 + 8.5f ));
872
+ const uint8_t xi0 = MIN (15 , (int8_t )(x0 + 8.5f ));
873
+ const uint8_t xi1 = MIN (15 , (int8_t )(x1 + 8.5f ));
877
874
878
- qs [l / 8 ] | = xi0 << ( 8 * ( l & 7 )) ;
879
- qs [l / 8 ] |= xi1 << ( 8 * ( l & 7 ) + 4 ) ;
875
+ y [ i ]. qs [l ] = xi0 ;
876
+ y [ i ]. qs [l ] |= xi1 << 4 ;
880
877
}
881
-
882
- memcpy (y [i ].qs , qs , qk /2 );
883
878
}
884
879
}
885
880
@@ -890,8 +885,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k
890
885
static void quantize_row_q4_1_reference (const float * restrict x , block_q4_1 * restrict y , int k ) {
891
886
const int qk = QK4_1 ;
892
887
893
- assert (qk / 16 == 0 );
894
- assert ( k % qk == 0 );
888
+ assert (k % qk == 0 );
895
889
896
890
const int nb = k / qk ;
897
891
@@ -912,20 +906,16 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
912
906
y [i ].d = d ;
913
907
y [i ].m = min ;
914
908
915
- uint64_t qs [QK4_1 / 16 ] = {0 };
916
-
917
909
for (int l = 0 ; l < qk /2 ; ++ l ) {
918
910
const float x0 = (x [0 + l ] - min )* id ;
919
911
const float x1 = (x [qk /2 + l ] - min )* id ;
920
912
921
- const uint64_t xi0 = MIN (15 , (int8_t )(x0 + 0.5f ));
922
- const uint64_t xi1 = MIN (15 , (int8_t )(x1 + 0.5f ));
913
+ const uint8_t xi0 = MIN (15 , (int8_t )(x0 + 0.5f ));
914
+ const uint8_t xi1 = MIN (15 , (int8_t )(x1 + 0.5f ));
923
915
924
- qs [l / 8 ] | = xi0 << ( 8 * ( l & 7 )) ;
925
- qs [l / 8 ] |= xi1 << ( 8 * ( l & 7 ) + 4 ) ;
916
+ y [ i ]. qs [l ] = xi0 ;
917
+ y [ i ]. qs [l ] |= xi1 << 4 ;
926
918
}
927
-
928
- memcpy (y [i ].qs , qs , qk /2 );
929
919
}
930
920
}
931
921
@@ -937,8 +927,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k
937
927
static void quantize_row_q4_2_reference (const float * restrict x , block_q4_2 * restrict y , int k ) {
938
928
static const int qk = QK4_2 ;
939
929
940
- assert (qk / 16 == 0 );
941
- assert ( k % qk == 0 );
930
+ assert (k % qk == 0 );
942
931
943
932
const int nb = k / qk ;
944
933
@@ -983,8 +972,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k
983
972
static void quantize_row_q5_0_reference (const float * restrict x , block_q5_0 * restrict y , int k ) {
984
973
static const int qk = QK5_0 ;
985
974
986
- assert (qk / 16 == 0 );
987
- assert ( k % qk == 0 );
975
+ assert (k % qk == 0 );
988
976
989
977
const int nb = k / qk ;
990
978
@@ -1006,24 +994,21 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
1006
994
y [i ].d = d ;
1007
995
1008
996
uint32_t qh = 0 ;
1009
- uint64_t qs [QK5_0 / 16 ] = {0 };
1010
997
1011
998
for (int l = 0 ; l < qk /2 ; ++ l ) {
1012
999
const float x0 = x [i * qk + 0 + l ]* id ;
1013
1000
const float x1 = x [i * qk + qk /2 + l ]* id ;
1014
1001
1015
- const uint64_t xi0 = MIN (31 , (int8_t )(x0 + 16.5f ));
1016
- const uint64_t xi1 = MIN (31 , (int8_t )(x1 + 16.5f ));
1002
+ const uint8_t xi0 = MIN (31 , (int8_t )(x0 + 16.5f ));
1003
+ const uint8_t xi1 = MIN (31 , (int8_t )(x1 + 16.5f ));
1017
1004
1018
- qs [l /8 ] |= xi0 << (8 * (l & 7 ));
1019
- qs [l /8 ] |= xi1 << (8 * (l & 7 ) + 4 );
1005
+ y [i ].qs [l ] = (xi0 & 0x0F ) | ((xi1 & 0x0F ) << 4 );
1020
1006
1021
1007
// get the 5-th bit and store it in qh at the right position
1022
1008
qh |= ((xi0 & 0x10 ) >> 4 ) << (l + 0 );
1023
1009
qh |= ((xi1 & 0x10 ) >> 4 ) << (l + qk /2 );
1024
1010
}
1025
1011
1026
- memcpy ( y [i ].qs , qs , qk /2 );
1027
1012
memcpy (& y [i ].qh , & qh , sizeof (qh ));
1028
1013
}
1029
1014
}
@@ -1033,50 +1018,50 @@ static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k
1033
1018
}
1034
1019
1035
1020
static void quantize_row_q5_1_reference (const float * restrict x , block_q5_1 * restrict y , int k ) {
1036
- assert (k % QK5_1 == 0 );
1037
- const int nb = k / QK5_1 ;
1021
+ const int qk = QK5_1 ;
1022
+
1023
+ assert (k % qk == 0 );
1024
+
1025
+ const int nb = k / qk ;
1038
1026
1039
1027
for (int i = 0 ; i < nb ; i ++ ) {
1040
1028
float min = FLT_MAX ;
1041
1029
float max = - FLT_MAX ;
1042
1030
1043
- for (int l = 0 ; l < QK5_1 ; l ++ ) {
1044
- const float v = x [i * QK5_1 + l ];
1031
+ for (int l = 0 ; l < qk ; l ++ ) {
1032
+ const float v = x [i * qk + l ];
1033
+
1045
1034
if (v < min ) min = v ;
1046
1035
if (v > max ) max = v ;
1047
1036
}
1048
1037
1049
- const float d = (max - min ) / ((1 << 5 ) - 1 );
1038
+ const float d = (max - min ) / ((1 << 5 ) - 1 );
1050
1039
const float id = d ? 1.0f /d : 0.0f ;
1051
1040
1052
1041
y [i ].d = GGML_FP32_TO_FP16 (d );
1053
1042
y [i ].m = GGML_FP32_TO_FP16 (min );
1054
1043
1055
1044
uint32_t qh = 0 ;
1056
1045
1057
- for (int l = 0 ; l < QK5_1 ; l += 2 ) {
1058
- const float v0 = (x [i * QK5_1 + l + 0 ] - min )* id ;
1059
- const float v1 = (x [i * QK5_1 + l + 1 ] - min )* id ;
1046
+ for (int l = 0 ; l < qk / 2 ; ++ l ) {
1047
+ const float x0 = (x [i * qk + 0 + l ] - min )* id ;
1048
+ const float x1 = (x [i * qk + qk / 2 + l ] - min )* id ;
1060
1049
1061
- const uint32_t vi0 = (int ) ( v0 + 0.5f );
1062
- const uint32_t vi1 = (int ) ( v1 + 0.5f );
1050
+ const uint8_t xi0 = (uint8_t )( x0 + 0.5f );
1051
+ const uint8_t xi1 = (uint8_t )( x1 + 0.5f );
1063
1052
1064
- y [i ].qs [l / 2 ] = (vi0 & 0x0F ) | ((vi1 & 0x0F ) << 4 );
1053
+ y [i ].qs [l ] = (xi0 & 0x0F ) | ((xi1 & 0x0F ) << 4 );
1065
1054
1066
1055
// get the 5-th bit and store it in qh at the right position
1067
- qh |= ((vi0 & 0x10 ) >> 4 ) << (l + 0 );
1068
- qh |= ((vi1 & 0x10 ) >> 4 ) << (l + 1 );
1056
+ qh |= ((xi0 & 0x10 ) >> 4 ) << (l + 0 );
1057
+ qh |= ((xi1 & 0x10 ) >> 4 ) << (l + qk / 2 );
1069
1058
}
1070
1059
1071
1060
memcpy (& y [i ].qh , & qh , sizeof (y [i ].qh ));
1072
1061
}
1073
1062
}
1074
1063
1075
- static void quantize_row_q5_1 (const float * restrict x , void * restrict vy , int k ) {
1076
- assert (k % QK5_1 == 0 );
1077
-
1078
- block_q5_1 * restrict y = vy ;
1079
-
1064
+ static void quantize_row_q5_1 (const float * restrict x , void * restrict y , int k ) {
1080
1065
quantize_row_q5_1_reference (x , y , k );
1081
1066
}
1082
1067
@@ -1316,8 +1301,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1316
1301
static void dequantize_row_q4_0 (const block_q4_0 * restrict x , float * restrict y , int k ) {
1317
1302
static const int qk = QK4_0 ;
1318
1303
1319
- assert (qk / 16 == 0 );
1320
- assert ( k % qk == 0 );
1304
+ assert (k % qk == 0 );
1321
1305
1322
1306
const int nb = k / qk ;
1323
1307
@@ -1337,8 +1321,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
1337
1321
static void dequantize_row_q4_1 (const block_q4_1 * restrict x , float * restrict y , int k ) {
1338
1322
static const int qk = QK4_1 ;
1339
1323
1340
- assert (qk / 16 == 0 );
1341
- assert ( k % qk == 0 );
1324
+ assert (k % qk == 0 );
1342
1325
1343
1326
const int nb = k / qk ;
1344
1327
@@ -1360,8 +1343,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict
1360
1343
// BORKEN !!!
1361
1344
static const int qk = QK4_2 ;
1362
1345
1363
- assert (qk / 16 == 0 );
1364
- assert ( k % qk == 0 );
1346
+ assert (k % qk == 0 );
1365
1347
1366
1348
const int nb = k / qk ;
1367
1349
@@ -1381,8 +1363,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict
1381
1363
static void dequantize_row_q5_0 (const block_q5_0 * restrict x , float * restrict y , int k ) {
1382
1364
static const int qk = QK4_0 ;
1383
1365
1384
- assert (qk / 16 == 0 );
1385
- assert ( k % qk == 0 );
1366
+ assert (k % qk == 0 );
1386
1367
1387
1368
const int nb = k / qk ;
1388
1369
@@ -1405,39 +1386,29 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict
1405
1386
}
1406
1387
}
1407
1388
1408
- static void dequantize_row_q5_1 (const void * restrict vx , float * restrict y , int k ) {
1409
- assert (k % QK5_1 == 0 );
1410
- const int nb = k / QK5_1 ;
1389
+ static void dequantize_row_q5_1 (const block_q5_1 * restrict x , float * restrict y , int k ) {
1390
+ static const int qk = QK5_1 ;
1411
1391
1412
- const block_q5_1 * restrict x = vx ;
1392
+ assert (k % qk == 0 );
1393
+
1394
+ const int nb = k / qk ;
1413
1395
1414
1396
for (int i = 0 ; i < nb ; i ++ ) {
1415
1397
const float d = GGML_FP16_TO_FP32 (x [i ].d );
1416
1398
const float m = GGML_FP16_TO_FP32 (x [i ].m );
1417
1399
1418
- const uint8_t * restrict pp = x [i ].qs ;
1419
-
1420
1400
uint32_t qh ;
1421
1401
memcpy (& qh , x [i ].qh , sizeof (qh ));
1422
1402
1423
- for (int l = 0 ; l < QK5_1 ; l += 2 ) {
1424
- const uint8_t vi = pp [l /2 ];
1425
-
1426
- // extract the 5-th bit from qh
1427
- const uint8_t vh0 = ((qh & (1u << (l + 0 ))) >> (l + 0 )) << 4 ;
1428
- const uint8_t vh1 = ((qh & (1u << (l + 1 ))) >> (l + 1 )) << 4 ;
1429
-
1430
- const uint8_t vi0 = (vi & 0x0F ) | vh0 ;
1431
- const uint8_t vi1 = (vi >> 4 ) | vh1 ;
1432
-
1433
- const float v0 = vi0 * d + m ;
1434
- const float v1 = vi1 * d + m ;
1403
+ for (int j = 0 ; j < qk /2 ; ++ j ) {
1404
+ const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4 ;
1405
+ const uint8_t xh_1 = ((qh & (1u << (j + 16 ))) >> (j + 12 ));
1435
1406
1436
- y [ i * QK5_1 + l + 0 ] = v0 ;
1437
- y [ i * QK5_1 + l + 1 ] = v1 ;
1407
+ const int x0 = ( x [ i ]. qs [ j ] & 0xf ) | xh_0 ;
1408
+ const int x1 = ( x [ i ]. qs [ j ] >> 4 ) | xh_1 ;
1438
1409
1439
- assert (! isnan ( y [i * QK5_1 + l + 0 ])) ;
1440
- assert (! isnan ( y [i * QK5_1 + l + 1 ])) ;
1410
+ y [i * qk + j + 0 ] = x0 * d + m ;
1411
+ y [i * qk + j + qk / 2 ] = x1 * d + m ;
1441
1412
}
1442
1413
}
1443
1414
}
@@ -1500,7 +1471,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1500
1471
.vec_dot_type = GGML_TYPE_Q8_0 ,
1501
1472
},
1502
1473
[GGML_TYPE_Q5_1 ] = {
1503
- .dequantize_row_q = dequantize_row_q5_1 ,
1474
+ .dequantize_row_q = ( dequantize_row_q_t ) dequantize_row_q5_1 ,
1504
1475
.quantize_row_q = quantize_row_q5_1 ,
1505
1476
.quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q5_1_reference ,
1506
1477
.quantize_row_q_dot = quantize_row_q8_1 ,
@@ -2748,11 +2719,12 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2748
2719
}
2749
2720
2750
2721
static void ggml_vec_dot_q5_1_q8_1 (const int n , float * restrict s , const void * restrict vx , const void * restrict vy ) {
2751
- const int nb = n / QK8_1 ;
2722
+ const int qk = QK8_1 ;
2723
+ const int nb = n / qk ;
2752
2724
2753
- assert (n % QK8_1 == 0 );
2725
+ assert (n % qk == 0 );
2754
2726
assert (nb % 2 == 0 );
2755
- assert (QK8_1 == QK5_1 );
2727
+ assert (qk == QK5_1 );
2756
2728
2757
2729
const block_q5_1 * restrict x = vx ;
2758
2730
const block_q8_1 * restrict y = vy ;
@@ -2788,13 +2760,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2788
2760
const int8x16_t v0l = vreinterpretq_s8_u8 (vandq_u8 (v0 , vdupq_n_u8 (0x0F )));
2789
2761
const int8x16_t v0h = vreinterpretq_s8_u8 (vshrq_n_u8 (v0 , 4 ));
2790
2762
2791
- // interleave
2792
- const int8x16_t v0lz = vzip1q_s8 (v0l , v0h );
2793
- const int8x16_t v0hz = vzip2q_s8 (v0l , v0h );
2794
-
2795
2763
// add
2796
- const int8x16_t v0lf = vorrq_s8 (v0lz , qhl );
2797
- const int8x16_t v0hf = vorrq_s8 (v0hz , qhh );
2764
+ const int8x16_t v0lf = vorrq_s8 (v0l , qhl );
2765
+ const int8x16_t v0hf = vorrq_s8 (v0h , qhh );
2798
2766
2799
2767
// load y
2800
2768
const int8x16_t v1l = vld1q_s8 (y0 -> qs );
@@ -2917,36 +2885,28 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2917
2885
2918
2886
* s = hsum_float_8 (acc ) + summs ;
2919
2887
#else
2888
+ // scalar
2920
2889
float sumf = 0.0 ;
2921
2890
2922
2891
for (int i = 0 ; i < nb ; i ++ ) {
2923
- const uint8_t * restrict x0 = x [i ].qs ;
2924
- const int8_t * restrict y0 = y [i ].qs ;
2892
+ const int8_t * py = y [i ].qs ;
2925
2893
2926
2894
uint32_t qh ;
2927
2895
memcpy (& qh , x [i ].qh , sizeof (qh ));
2928
2896
2929
- const float d = GGML_FP16_TO_FP32 (x [i ].d );
2930
- const float m = GGML_FP16_TO_FP32 (x [i ].m );
2931
-
2932
- int sxy = 0 ;
2933
-
2934
- for (int j = 0 ; j < QK8_1 /2 ; j ++ ) {
2935
- const uint8_t v0 = x0 [j ];
2936
-
2937
- const int x0_0h = ((qh & (1u << (2 * j + 0 ))) >> (2 * j + 0 )) << 4 ;
2938
- const int x1_0h = ((qh & (1u << (2 * j + 1 ))) >> (2 * j + 1 )) << 4 ;
2897
+ int sumi = 0 ;
2939
2898
2940
- const int x0_0 = (v0 & 0x0F ) | x0_0h ;
2941
- const int x1_0 = (v0 >> 4 ) | x1_0h ;
2899
+ for (int j = 0 ; j < qk /2 ; ++ j ) {
2900
+ const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4 ;
2901
+ const uint8_t xh_1 = ((qh & (1u << (j + 16 ))) >> (j + 12 ));
2942
2902
2943
- const int y0_0 = y0 [ 2 * j + 0 ] ;
2944
- const int y1_0 = y0 [ 2 * j + 1 ] ;
2903
+ const int32_t x0 = ( x [ i ]. qs [ j ] & 0xF ) | xh_0 ;
2904
+ const int32_t x1 = ( x [ i ]. qs [ j ] >> 4 ) | xh_1 ;
2945
2905
2946
- sxy += x0_0 * y0_0 + x1_0 * y1_0 ;
2906
+ sumi += ( x0 * py [ j ]) + ( x1 * py [ j + qk / 2 ]) ;
2947
2907
}
2948
2908
2949
- sumf += (d * sxy )* y [i ].d + m * (y [i ].s0 + y [i ].s1 );
2909
+ sumf += (GGML_FP16_TO_FP32 ( x [ i ]. d )* y [i ].d ) * sumi + GGML_FP16_TO_FP32 ( x [ i ]. m ) * (y [i ].s0 + y [i ].s1 );
2950
2910
}
2951
2911
2952
2912
* s = sumf ;
0 commit comments