@@ -328,8 +328,18 @@ static ggml_fp16_t table_exp_f16[1 << 16];
328
328
// precomputed f32 table for f16 (256 KB)
329
329
static float table_f32_f16 [1 << 16 ];
330
330
331
- // precomputed table for expanding 8bits to 8 bytes (shl 4)
332
- static uint64_t table_b2b [1 << 8 ];
331
// Compile-time generators for the 256-entry bit-expansion tables below.
//
// B8(c, s) expands to 256 comma-separated 64-bit hex literals, one per 8-bit
// index i, in increasing order of i. Byte b of entry i (b = 0 is the least
// significant byte) is the hex byte `s` when bit b of i is set and the hex
// byte `c` when it is clear.
//
// Each level Bk fixes one more bit of the index: `n` accumulates the hex digits
// chosen so far (most significant byte first) and is extended first with `c`
// (bit clear) and then with `s` (bit set). B1 finally prepends 0x to form the
// two literals for the lowest bit.
//
// `c` and `s` must each be exactly two hex digits written without a 0x prefix,
// because they are glued together with the ## token-pasting operator.
//
// NOTE: there must be no whitespace between a macro name and its '(' in the
// #define, otherwise the macro becomes object-like and the tables do not expand.
#define B1(c,s,n)  0x ## n ## c,  0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)

// precomputed tables for expanding 8 bits to 8 bytes, one byte per input bit (2 KB each)
static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) }; // byte = bit ? 0x10 : 0x00, i.e. (bit << 4)
static const uint64_t table_b2b_i[1 << 8] = { B8(F0, 00) }; // byte = bit ? 0x00 : 0xF0
333
343
334
344
// On ARM NEON, it's quicker to convert directly between fp16 and fp32 than to call into ggml_lookup_fp16_to_fp32,
335
345
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
@@ -688,7 +698,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
688
698
typedef struct {
689
699
ggml_fp16_t d ; // delta
690
700
ggml_fp16_t m ; // min
691
- uint32_t qh ; // 5-th bit of quants
701
+ uint8_t qh [ 4 ]; // 5-th bit of quants
692
702
uint8_t qs [QK5_1 / 2 ]; // nibbles / quants
693
703
} block_q5_1 ;
694
704
static_assert (sizeof (block_q5_1 ) == 2 * sizeof (ggml_fp16_t ) + sizeof (uint32_t ) + QK5_1 / 2 , "wrong q5_1 block size/padding" );
@@ -1376,7 +1386,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
1376
1386
1377
1387
y [i ].d = GGML_FP32_TO_FP16 (d );
1378
1388
y [i ].m = GGML_FP32_TO_FP16 (min );
1379
- y [i ].qh = 0 ;
1389
+
1390
+ uint32_t qh = 0 ;
1380
1391
1381
1392
for (int l = 0 ; l < QK5_1 ; l += 2 ) {
1382
1393
const float v0 = (x [i * QK5_1 + l + 0 ] - min )* id ;
@@ -1388,9 +1399,11 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
1388
1399
y [i ].qs [l /2 ] = (vi0 & 0x0F ) | ((vi1 & 0x0F ) << 4 );
1389
1400
1390
1401
// get the 5-th bit and store it in qh at the right position
1391
- y [ i ]. qh |= ((vi0 & 0x10 ) >> 4 ) << (l + 0 );
1392
- y [ i ]. qh |= ((vi1 & 0x10 ) >> 4 ) << (l + 1 );
1402
+ qh |= ((vi0 & 0x10 ) >> 4 ) << (l + 0 );
1403
+ qh |= ((vi1 & 0x10 ) >> 4 ) << (l + 1 );
1393
1404
}
1405
+
1406
+ memcpy (& y [i ].qh , & qh , sizeof (y [i ].qh ));
1394
1407
}
1395
1408
}
1396
1409
@@ -1966,7 +1979,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
1966
1979
1967
1980
const uint8_t * restrict pp = x [i ].qs ;
1968
1981
1969
- const uint32_t qh = x [i ].qh ;
1982
+ uint32_t qh ;
1983
+ memcpy (& qh , x [i ].qh , sizeof (qh ));
1970
1984
1971
1985
for (int l = 0 ; l < QK5_1 ; l += 2 ) {
1972
1986
const uint8_t vi = pp [l /2 ];
@@ -3297,10 +3311,10 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3297
3311
uint32_t qh ;
3298
3312
memcpy (& qh , x0 -> qh , sizeof (qh ));
3299
3313
3300
- tmp [0 ] = table_b2b [(qh >> 0 ) & 0xFF ];
3301
- tmp [1 ] = table_b2b [(qh >> 8 ) & 0xFF ];
3302
- tmp [2 ] = table_b2b [(qh >> 16 ) & 0xFF ];
3303
- tmp [3 ] = table_b2b [(qh >> 24 ) ];
3314
+ tmp [0 ] = table_b2b_u [(qh >> 0 ) & 0xFF ];
3315
+ tmp [1 ] = table_b2b_u [(qh >> 8 ) & 0xFF ];
3316
+ tmp [2 ] = table_b2b_u [(qh >> 16 ) & 0xFF ];
3317
+ tmp [3 ] = table_b2b_u [(qh >> 24 ) ];
3304
3318
3305
3319
const int8x16_t qhl = vld1q_s8 ((const int8_t * )(tmp + 0 ));
3306
3320
const int8x16_t qhh = vld1q_s8 ((const int8_t * )(tmp + 2 ));
@@ -3350,17 +3364,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3350
3364
// Main loop
3351
3365
for (int i = 0 ; i < nb ; i ++ ) {
3352
3366
/* Compute combined scale for the block */
3353
- const __m128 d0 = _mm_set1_ps (GGML_FP16_TO_FP32 (x [2 * i + 0 ].d ));
3354
- const __m128 d1 = _mm_set1_ps (GGML_FP16_TO_FP32 (x [2 * i + 1 ].d ));
3355
- const __m256 d = _mm256_mul_ps (_mm256_set_m128 (d1 , d0 ), _mm256_broadcast_ss (& y [i ].d ));
3356
-
3357
- __m128i bx0 = bytes_from_nibbles_16 (x [2 * i + 0 ].qs );
3358
- __m128i bx1 = bytes_from_nibbles_16 (x [2 * i + 1 ].qs );
3359
- __m256i bx = _mm256_set_m128i (bx1 , bx0 );
3367
+ const __m256 d = _mm256_mul_ps (_mm256_set1_ps (GGML_FP16_TO_FP32 (x [i ].d )), _mm256_broadcast_ss (& y [i ].d ));
3360
3368
3361
- // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
3362
- const __m256i off = _mm256_set1_epi8 (8 );
3363
- bx = _mm256_sub_epi8 (bx , off );
3369
+ __m256i bx = bytes_from_nibbles_32 (x [i ].qs );
3370
+ const __m256i bxhi = _mm256_set_epi64x (
3371
+ table_b2b_i [x [i ].qh [3 ]], table_b2b_i [x [i ].qh [2 ]],
3372
+ table_b2b_i [x [i ].qh [1 ]], table_b2b_i [x [i ].qh [0 ]]);
3373
+ bx = _mm256_or_si256 (bx , bxhi );
3364
3374
3365
3375
__m256i by = _mm256_loadu_si256 ((const __m256i * )y [i ].qs );
3366
3376
@@ -3379,7 +3389,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3379
3389
const int8_t * restrict y0 = y [i ].qs ;
3380
3390
3381
3391
uint32_t qh ;
3382
- memcpy (& qh , x0 -> qh , sizeof (qh ));
3392
+ memcpy (& qh , x [ i ]. qh , sizeof (qh ));
3383
3393
3384
3394
const float d = GGML_FP16_TO_FP32 (x [i ].d );
3385
3395
@@ -3430,12 +3440,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3430
3440
summs += GGML_FP16_TO_FP32 (x0 -> m ) * (y0 -> s0 + y0 -> s1 );
3431
3441
3432
3442
// extract the 5th bit
3433
- const uint32_t qh = x0 -> qh ;
3443
+ uint32_t qh ;
3444
+ memcpy (& qh , x0 -> qh , sizeof (qh ));
3434
3445
3435
- tmp [0 ] = table_b2b [(qh >> 0 ) & 0xFF ];
3436
- tmp [1 ] = table_b2b [(qh >> 8 ) & 0xFF ];
3437
- tmp [2 ] = table_b2b [(qh >> 16 ) & 0xFF ];
3438
- tmp [3 ] = table_b2b [(qh >> 24 ) ];
3446
+ tmp [0 ] = table_b2b_u [(qh >> 0 ) & 0xFF ];
3447
+ tmp [1 ] = table_b2b_u [(qh >> 8 ) & 0xFF ];
3448
+ tmp [2 ] = table_b2b_u [(qh >> 16 ) & 0xFF ];
3449
+ tmp [3 ] = table_b2b_u [(qh >> 24 ) ];
3439
3450
3440
3451
const int8x16_t qhl = vld1q_s8 ((const int8_t * )(tmp + 0 ));
3441
3452
const int8x16_t qhh = vld1q_s8 ((const int8_t * )(tmp + 2 ));
@@ -3485,16 +3496,15 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3485
3496
3486
3497
// Main loop
3487
3498
for (int i = 0 ; i < nb ; i ++ ) {
3488
- const __m128 d0 = _mm_set1_ps (GGML_FP16_TO_FP32 (x [2 * i + 0 ].d ));
3489
- const __m128 d1 = _mm_set1_ps (GGML_FP16_TO_FP32 (x [2 * i + 1 ].d ));
3490
- const __m256 dx = _mm256_set_m128 (d1 , d0 );
3499
+ const __m256 dx = _mm256_set1_ps (GGML_FP16_TO_FP32 (x [i ].d ));
3491
3500
3492
- summs += GGML_FP16_TO_FP32 (x [2 * i + 0 ].m ) * y [i ].s0
3493
- + GGML_FP16_TO_FP32 (x [2 * i + 1 ].m ) * y [i ].s1 ;
3501
+ summs += GGML_FP16_TO_FP32 (x [i ].m ) * (y [i ].s0 + y [i ].s1 );
3494
3502
3495
- const __m128i bx0 = bytes_from_nibbles_16 (x [2 * i + 0 ].qs );
3496
- const __m128i bx1 = bytes_from_nibbles_16 (x [2 * i + 1 ].qs );
3497
- const __m256i bx = _mm256_set_m128i (bx1 , bx0 );
3503
+ __m256i bx = bytes_from_nibbles_32 (x [i ].qs );
3504
+ const __m256i bxhi = _mm256_set_epi64x (
3505
+ table_b2b_u [x [i ].qh [3 ]], table_b2b_u [x [i ].qh [2 ]],
3506
+ table_b2b_u [x [i ].qh [1 ]], table_b2b_u [x [i ].qh [0 ]]);
3507
+ bx = _mm256_or_si256 (bx , bxhi );
3498
3508
3499
3509
const __m256 dy = _mm256_broadcast_ss (& y [i ].d );
3500
3510
const __m256i by = _mm256_loadu_si256 ((const __m256i * )y [i ].qs );
@@ -3512,7 +3522,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3512
3522
const uint8_t * restrict x0 = x [i ].qs ;
3513
3523
const int8_t * restrict y0 = y [i ].qs ;
3514
3524
3515
- const uint32_t qh = x [i ].qh ;
3525
+ uint32_t qh ;
3526
+ memcpy (& qh , x [i ].qh , sizeof (qh ));
3516
3527
3517
3528
const float d = GGML_FP16_TO_FP32 (x [i ].d );
3518
3529
const float m = GGML_FP16_TO_FP32 (x [i ].m );
@@ -4297,15 +4308,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4297
4308
table_exp_f16 [i ] = GGML_FP32_TO_FP16 (expf (f ));
4298
4309
}
4299
4310
4300
- for (int i = 0 ; i < 256 ; ++ i ) {
4301
- table_b2b [i ] = 0 ;
4302
- for (int b = 0 ; b < 8 ; ++ b ) {
4303
- table_b2b [i ] |= ((uint64_t )(((i >> b ) & 0x01 ) << 4 )) << (8 * b );
4304
- }
4305
-
4306
- //printf("%3d %016llx\n", i, table_b2b[i]);
4307
- }
4308
-
4309
4311
const uint64_t t_end = ggml_time_us (); UNUSED (t_end );
4310
4312
4311
4313
GGML_PRINT_DEBUG ("%s: GELU, SILU and EXP tables initialized in %f ms\n" , __func__ , (t_end - t_start )/1000.0f );
@@ -12855,10 +12857,10 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
12855
12857
quantize_row_q5_0_reference (src + j , y , k );
12856
12858
12857
12859
for (int i = 0 ; i < nb ; i ++ ) {
12858
- for (int l = 0 ; l < QK5_0 ; l += 2 ) {
12859
- uint32_t qh ;
12860
- memcpy (& qh , & y [i ].qh , sizeof (qh ));
12860
+ uint32_t qh ;
12861
+ memcpy (& qh , & y [i ].qh , sizeof (qh ));
12861
12862
12863
+ for (int l = 0 ; l < QK5_0 ; l += 2 ) {
12862
12864
const uint8_t vh0 = ((qh & (1 << (l + 0 ))) >> (l + 0 )) << 4 ;
12863
12865
const uint8_t vh1 = ((qh & (1 << (l + 1 ))) >> (l + 1 )) << 4 ;
12864
12866
@@ -12885,9 +12887,12 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
12885
12887
quantize_row_q5_1_reference (src + j , y , k );
12886
12888
12887
12889
for (int i = 0 ; i < nb ; i ++ ) {
12890
+ uint32_t qh ;
12891
+ memcpy (& qh , & y [i ].qh , sizeof (qh ));
12892
+
12888
12893
for (int l = 0 ; l < QK5_1 ; l += 2 ) {
12889
- const uint8_t vh0 = ((y [ i ]. qh & (1 << (l + 0 ))) >> (l + 0 )) << 4 ;
12890
- const uint8_t vh1 = ((y [ i ]. qh & (1 << (l + 1 ))) >> (l + 1 )) << 4 ;
12894
+ const uint8_t vh0 = ((qh & (1 << (l + 0 ))) >> (l + 0 )) << 4 ;
12895
+ const uint8_t vh1 = ((qh & (1 << (l + 1 ))) >> (l + 1 )) << 4 ;
12891
12896
12892
12897
// cast to 16 bins
12893
12898
const uint8_t vi0 = ((y [i ].qs [l /2 ] & 0x0F ) | vh0 ) / 2 ;
0 commit comments