@@ -3456,11 +3456,12 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
3456
3456
const uint16_t * qh = x [i ].qh ;
3457
3457
3458
3458
for (int ib = 0 ; ib < QK_K /32 ; ++ ib ) {
3459
- const float dl = d * (2 * (qh [ib ] >> 12 ) + 1 );
3459
+ const float dl = d * (2 * ((qh [ib ] >> 12 ) & 7 ) + 1 );
3460
+ const float delta = qh [ib ] & 0x8000 ? - IQ1S_DELTA : IQ1S_DELTA ;
3460
3461
for (int l = 0 ; l < 4 ; ++ l ) {
3461
3462
const int8_t * grid = (const int8_t * )(iq1s_grid + (qs [l ] | (((qh [ib ] >> 3 * l ) & 7 ) << 8 )));
3462
3463
for (int j = 0 ; j < 8 ; ++ j ) {
3463
- y [j ] = dl * grid [j ];
3464
+ y [j ] = dl * ( grid [j ] + delta ) ;
3464
3465
}
3465
3466
y += 8 ;
3466
3467
}
@@ -9582,7 +9583,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9582
9583
const uint8_t * qs = x [i ].qs ;
9583
9584
const uint16_t * qh = x [i ].qh ;
9584
9585
9585
- int sumi1 = 0 , sumi2 = 0 ;
9586
+ int sumi1 = 0 , sumi2 = 0 , sumi3 = 0 ;
9586
9587
9587
9588
for (int ib = 0 ; ib < QK_K /32 ; ib += 2 ) {
9588
9589
@@ -9601,26 +9602,32 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9601
9602
const int32x4_t p1 = ggml_vdotq_s32 (ggml_vdotq_s32 (vdupq_n_s32 (0 ), q1b .val [0 ], q8b .val [0 ]), q1b .val [1 ], q8b .val [1 ]);
9602
9603
const int32x4_t p2 = ggml_vdotq_s32 (ggml_vdotq_s32 (vdupq_n_s32 (0 ), q1b .val [2 ], q8b .val [2 ]), q1b .val [3 ], q8b .val [3 ]);
9603
9604
9604
- sumi1 += vaddvq_s32 (p1 ) * (2 * (qh [ib + 0 ] >> 12 ) + 1 );
9605
- sumi2 += vaddvq_s32 (p2 ) * (2 * (qh [ib + 1 ] >> 12 ) + 1 );
9605
+ const int ls1 = 2 * ((qh [ib + 0 ] >> 12 ) & 7 ) + 1 ;
9606
+ const int ls2 = 2 * ((qh [ib + 1 ] >> 12 ) & 7 ) + 1 ;
9607
+ sumi1 += vaddvq_s32 (p1 ) * ls1 ;
9608
+ sumi2 += vaddvq_s32 (p2 ) * ls2 ;
9609
+ sumi3 += (y [i ].bsums [2 * ib + 0 ] + y [i ].bsums [2 * ib + 1 ]) * ls1 * (qh [ib + 0 ] & 0x8000 ? -1 : 1 )
9610
+ + (y [i ].bsums [2 * ib + 2 ] + y [i ].bsums [2 * ib + 3 ]) * ls2 * (qh [ib + 1 ] & 0x8000 ? -1 : 1 );
9606
9611
9607
9612
}
9608
9613
9609
- sumf += y [i ].d * GGML_FP16_TO_FP32 (x [i ].d ) * (sumi1 + sumi2 );
9614
+ sumf += y [i ].d * GGML_FP16_TO_FP32 (x [i ].d ) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3 );
9610
9615
}
9611
9616
9612
9617
* s = sumf ;
9613
9618
9614
9619
#elif defined __AVX2__
9615
9620
9616
9621
__m256 accum = _mm256_setzero_ps ();
9622
+ float accum1 = 0 ;
9617
9623
for (int i = 0 ; i < nb ; ++ i ) {
9618
9624
9619
9625
const int8_t * q8 = y [i ].qs ;
9620
9626
const uint8_t * qs = x [i ].qs ;
9621
9627
const uint16_t * qh = x [i ].qh ;
9622
9628
9623
9629
__m256i sumi = _mm256_setzero_si256 ();
9630
+ int sumi1 = 0 ;
9624
9631
for (int ib = 0 ; ib < QK_K /32 ; ib += 2 ) {
9625
9632
const __m256i q1b_1 = _mm256_set_epi64x (iq1s_grid [qs [3 ] | ((qh [ib + 0 ] >> 1 ) & 0x700 )], iq1s_grid [qs [2 ] | ((qh [ib + 0 ] << 2 ) & 0x700 )],
9626
9633
iq1s_grid [qs [1 ] | ((qh [ib + 0 ] << 5 ) & 0x700 )], iq1s_grid [qs [0 ] | ((qh [ib + 0 ] << 8 ) & 0x700 )]);
@@ -9632,17 +9639,23 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9632
9639
9633
9640
const __m256i dot1 = mul_add_epi8 (q1b_1 , q8b_1 );
9634
9641
const __m256i dot2 = mul_add_epi8 (q1b_2 , q8b_2 );
9635
- const __m256i p1 = _mm256_madd_epi16 (dot1 , _mm256_set1_epi16 (2 * (qh [ib + 0 ] >> 12 ) + 1 ));
9636
- const __m256i p2 = _mm256_madd_epi16 (dot2 , _mm256_set1_epi16 (2 * (qh [ib + 1 ] >> 12 ) + 1 ));
9642
+ const int16_t ls1 = 2 * ((qh [ib + 0 ] >> 12 ) & 7 ) + 1 ;
9643
+ const int16_t ls2 = 2 * ((qh [ib + 1 ] >> 12 ) & 7 ) + 1 ;
9644
+ const __m256i p1 = _mm256_madd_epi16 (dot1 , _mm256_set1_epi16 (ls1 ));
9645
+ const __m256i p2 = _mm256_madd_epi16 (dot2 , _mm256_set1_epi16 (ls2 ));
9637
9646
9638
9647
sumi = _mm256_add_epi32 (sumi , _mm256_add_epi32 (p1 , p2 ));
9648
+ sumi1 += (y [i ].bsums [2 * ib + 0 ] + y [i ].bsums [2 * ib + 1 ]) * (qh [ib + 0 ] & 0x8000 ? -1 : 1 ) * ls1
9649
+ + (y [i ].bsums [2 * ib + 2 ] + y [i ].bsums [2 * ib + 3 ]) * (qh [ib + 1 ] & 0x8000 ? -1 : 1 ) * ls2 ;
9639
9650
}
9640
9651
9641
- accum = _mm256_fmadd_ps (_mm256_set1_ps (y [i ].d * GGML_FP16_TO_FP32 (x [i ].d )), _mm256_cvtepi32_ps (sumi ), accum );
9652
+ const float d = y [i ].d * GGML_FP16_TO_FP32 (x [i ].d );
9653
+ accum = _mm256_fmadd_ps (_mm256_set1_ps (d ), _mm256_cvtepi32_ps (sumi ), accum );
9654
+ accum1 += d * sumi1 ;
9642
9655
9643
9656
}
9644
9657
9645
- * s = hsum_float_8 (accum );
9658
+ * s = hsum_float_8 (accum ) + IQ1S_DELTA * accum1 ;
9646
9659
9647
9660
#else
9648
9661
@@ -9653,9 +9666,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9653
9666
const uint8_t * qs = x [i ].qs ;
9654
9667
const uint16_t * qh = x [i ].qh ;
9655
9668
9656
- int sumi = 0 ;
9669
+ int sumi = 0 , sumi1 = 0 ;
9657
9670
for (int ib = 0 ; ib < QK_K /32 ; ++ ib ) {
9658
- const int ls = 2 * (qh [ib ] >> 12 ) + 1 ;
9671
+ const int ls = 2 * ((qh [ib ] >> 12 ) & 7 ) + 1 ;
9672
+ const int delta = qh [ib ] & 0x8000 ? -1 : 1 ;
9659
9673
int lsum = 0 ;
9660
9674
for (int l = 0 ; l < 4 ; ++ l ) {
9661
9675
const int8_t * grid = (const int8_t * )(iq1s_grid + (qs [l ] | (((qh [ib ] >> 3 * l ) & 7 ) << 8 )));
@@ -9664,11 +9678,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9664
9678
}
9665
9679
q8 += 8 ;
9666
9680
}
9667
- sumi += ls * lsum ;
9681
+ sumi += ls * lsum ;
9682
+ sumi1 += ls * delta * (y [i ].bsums [2 * ib + 0 ] + y [i ].bsums [2 * ib + 1 ]);
9668
9683
qs += 4 ;
9669
9684
}
9670
9685
9671
- sumf += GGML_FP16_TO_FP32 (x [i ].d ) * y [i ].d * sumi ;
9686
+ sumf += GGML_FP16_TO_FP32 (x [i ].d ) * y [i ].d * ( sumi + IQ1S_DELTA * sumi1 ) ;
9672
9687
}
9673
9688
9674
9689
* s = sumf ;
@@ -11438,7 +11453,7 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
11438
11453
}
11439
11454
11440
11455
static int iq1_find_best_neighbour2 (const uint16_t * restrict neighbours , const uint64_t * restrict grid ,
11441
- const float * restrict xval , const float * restrict weight , float scale , int8_t * restrict L , int ngrid ) {
11456
+ const float * restrict xval , const float * restrict weight , float scale , const float * restrict xg , int8_t * restrict L , int ngrid ) {
11442
11457
int num_neighbors = neighbours [0 ];
11443
11458
GGML_ASSERT (num_neighbors > 0 );
11444
11459
float best_score = FLT_MAX ;
@@ -11447,7 +11462,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
11447
11462
const int8_t * pg = (const int8_t * )(grid + neighbours [j ]);
11448
11463
float d2 = 0 ;
11449
11464
for (int i = 0 ; i < 8 ; ++ i ) {
11450
- float q = (pg [i ] - 3 )/2 ;
11465
+ float q = xg [ (pg [i ] - 1 )/2 ] ;
11451
11466
float w = weight [i ];
11452
11467
float diff = scale * q - xval [i ];
11453
11468
d2 += w * diff * diff ;
@@ -11463,7 +11478,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
11463
11478
float d2 = 0 ;
11464
11479
for (int j = 0 ; j < 8 ; ++ j ) {
11465
11480
float w = weight [j ];
11466
- float q = (grid_i [j ] - 3 )/2 ;
11481
+ float q = xg [ (grid_i [j ] - 1 )/2 ] ;
11467
11482
float diff = scale * q - xval [i ];
11468
11483
d2 += w * diff * diff ;
11469
11484
}
@@ -11480,7 +11495,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
11480
11495
const int8_t * pg = (const int8_t * )(grid + neighbours [j ]);
11481
11496
float sumqx = 0 , sumq2 = 0 ;
11482
11497
for (int i = 0 ; i < 8 ; ++ i ) {
11483
- float q = (pg [i ] - 3 )/2 ;
11498
+ float q = xg [ (pg [i ] - 1 )/2 ] ;
11484
11499
float w = weight [i ];
11485
11500
sumqx += w * q * xval [i ];
11486
11501
sumq2 += w * q * q ;
@@ -11519,6 +11534,9 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11519
11534
11520
11535
block_iq1_s * y = vy ;
11521
11536
11537
+ const float x_p [3 ] = {-1 + IQ1S_DELTA , IQ1S_DELTA , 1 + IQ1S_DELTA };
11538
+ const float x_m [3 ] = {-1 - IQ1S_DELTA , - IQ1S_DELTA , 1 - IQ1S_DELTA };
11539
+
11522
11540
float scales [QK_K /IQ1S_BLOCK_SIZE ];
11523
11541
float weight [IQ1S_BLOCK_SIZE ];
11524
11542
int8_t L [IQ1S_BLOCK_SIZE ];
@@ -11527,6 +11545,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11527
11545
float pairs [2 * IQ1S_BLOCK_SIZE ];
11528
11546
int * idx = (int * )(pairs + 1 );
11529
11547
uint16_t index [IQ1S_BLOCK_SIZE /8 ];
11548
+ int8_t shifts [QK_K /IQ1S_BLOCK_SIZE ];
11530
11549
11531
11550
for (int ibl = 0 ; ibl < nbl ; ++ ibl ) {
11532
11551
@@ -11572,33 +11591,41 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11572
11591
}
11573
11592
}
11574
11593
float best_score = 0 , scale = max ;
11575
- int besti1 = 0 , besti2 = 0 ;
11594
+ int besti1 = -1 , besti2 = -1 , best_shift = 0 ;
11576
11595
for (int i1 = 0 ; i1 <= IQ1S_BLOCK_SIZE ; ++ i1 ) {
11577
11596
for (int i2 = i1 ; i2 <= IQ1S_BLOCK_SIZE ; ++ i2 ) {
11578
- float sumqx = - (sumx [i1 ] - sumx [0 ]) + (sumx [IQ1S_BLOCK_SIZE ] - sumx [i2 ]);
11579
- float sumq2 = (sumw [i1 ] - sumw [0 ]) + (sumw [IQ1S_BLOCK_SIZE ] - sumw [i2 ]);
11597
+ float sumqx = (sumx [i1 ] - sumx [0 ])* x_p [0 ] + (sumx [i2 ] - sumx [i1 ])* x_p [1 ] + (sumx [IQ1S_BLOCK_SIZE ] - sumx [i2 ])* x_p [2 ];
11598
+ float sumq2 = (sumw [i1 ] - sumw [0 ])* x_p [0 ]* x_p [0 ] + (sumw [i2 ] - sumw [i1 ])* x_p [1 ]* x_p [1 ] + (sumw [IQ1S_BLOCK_SIZE ] - sumw [i2 ])* x_p [2 ]* x_p [2 ];
11599
+ if (sumq2 > 0 && sumqx * sumqx > best_score * sumq2 ) {
11600
+ scale = sumqx /sumq2 ; best_score = scale * sumqx ;
11601
+ besti1 = i1 ; besti2 = i2 ; best_shift = 1 ;
11602
+ }
11603
+ sumqx = (sumx [i1 ] - sumx [0 ])* x_m [0 ] + (sumx [i2 ] - sumx [i1 ])* x_m [1 ] + (sumx [IQ1S_BLOCK_SIZE ] - sumx [i2 ])* x_m [2 ];
11604
+ sumq2 = (sumw [i1 ] - sumw [0 ])* x_m [0 ]* x_m [0 ] + (sumw [i2 ] - sumw [i1 ])* x_m [1 ]* x_m [1 ] + (sumw [IQ1S_BLOCK_SIZE ] - sumw [i2 ])* x_m [2 ]* x_m [2 ];
11580
11605
if (sumq2 > 0 && sumqx * sumqx > best_score * sumq2 ) {
11581
11606
scale = sumqx /sumq2 ; best_score = scale * sumqx ;
11582
- besti1 = i1 ; besti2 = i2 ;
11607
+ besti1 = i1 ; besti2 = i2 ; best_shift = -1 ;
11583
11608
}
11584
11609
}
11585
11610
}
11611
+ GGML_ASSERT (besti1 >= 0 && besti2 >= 0 && best_shift != 0 );
11586
11612
for (int j = 0 ; j < besti1 ; ++ j ) L [idx [2 * j ]] = 0 ;
11587
11613
for (int j = besti1 ; j < besti2 ; ++ j ) L [idx [2 * j ]] = 1 ;
11588
11614
for (int j = besti2 ; j < IQ1S_BLOCK_SIZE ; ++ j ) L [idx [2 * j ]] = 2 ;
11589
11615
if (scale < 0 ) {
11590
11616
for (int j = 0 ; j < IQ1S_BLOCK_SIZE ; ++ j ) L [j ] = 2 - L [j ];
11591
- scale = - scale ;
11617
+ scale = - scale ; best_shift = - best_shift ;
11592
11618
}
11593
11619
bool all_on_grid = true;
11620
+ const float * xx = best_shift == 1 ? x_p : x_m ;
11594
11621
for (int k = 0 ; k < IQ1S_BLOCK_SIZE /8 ; ++ k ) {
11595
11622
uint16_t u = 0 ;
11596
11623
for (int j = 0 ; j < 8 ; ++ j ) u |= (L [8 * k + j ] << 2 * j );
11597
11624
int grid_index = kmap_q2xs [u ];
11598
11625
if (grid_index < 0 ) {
11599
11626
all_on_grid = false;
11600
11627
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs [u ] - 1 ;
11601
- grid_index = iq1_find_best_neighbour2 (neighbours , kgrid_q2xs , xb + 8 * k , weight + 8 * k , scale , L + 8 * k , NGRID_IQ1S );
11628
+ grid_index = iq1_find_best_neighbour2 (neighbours , kgrid_q2xs , xb + 8 * k , weight + 8 * k , scale , xx , L + 8 * k , NGRID_IQ1S );
11602
11629
GGML_ASSERT (grid_index >= 0 );
11603
11630
}
11604
11631
index [k ] = grid_index ;
@@ -11609,7 +11636,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11609
11636
const int8_t * pg = (const int8_t * )(kgrid_q2xs + index [k ]);
11610
11637
for (int j = 0 ; j < 8 ; ++ j ) {
11611
11638
float w = weight [8 * k + j ];
11612
- float q = (pg [j ] - 3 )/2 ;
11639
+ float q = xx [ (pg [j ] - 1 )/2 ] ;
11613
11640
sumqx += w * q * xb [8 * k + j ];
11614
11641
sumq2 += w * q * q ;
11615
11642
}
@@ -11624,6 +11651,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11624
11651
y [ibl ].qh [ib ] = h ;
11625
11652
GGML_ASSERT (scale >= 0 );
11626
11653
scales [ib ] = scale ;
11654
+ shifts [ib ] = best_shift ;
11627
11655
max_scale = MAX (max_scale , scale );
11628
11656
}
11629
11657
@@ -11632,12 +11660,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11632
11660
continue ;
11633
11661
}
11634
11662
11635
- float d = max_scale /31 ;
11663
+ float d = max_scale /15 ;
11636
11664
y [ibl ].d = GGML_FP32_TO_FP16 (d * 1.125f ); // 1.085f is another fudge factor. Don't ask me why it is needed.
11637
11665
float id = 1 /d ;
11638
11666
for (int ib = 0 ; ib < QK_K /IQ1S_BLOCK_SIZE ; ++ ib ) {
11639
11667
int l = nearest_int (0.5f * (id * scales [ib ]- 1 ));
11640
- l = MAX (0 , MIN (15 , l ));
11668
+ l = MAX (0 , MIN (7 , l ));
11669
+ if (shifts [ib ] == -1 ) l |= 8 ;
11641
11670
y [ibl ].qh [ib ] |= (l << 12 );
11642
11671
}
11643
11672
}
0 commit comments