@@ -10716,14 +10716,14 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
10716
10716
10717
10717
for (int ib = 0; ib < QK_K/32; ib += 2) {
10718
10718
10719
- q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((( qh[ib+0] >> 0 ) & 7) << 8 )))),
10720
- vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((( qh[ib+0] >> 3 ) & 7) << 8 )))));
10721
- q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((( qh[ib+0] >> 6 ) & 7) << 8 )))),
10722
- vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((( qh[ib+0] >> 9 ) & 7) << 8 )))));
10723
- q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((( qh[ib+1] >> 0 ) & 7) << 8 )))),
10724
- vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((( qh[ib+1] >> 3 ) & 7) << 8 )))));
10725
- q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((( qh[ib+1] >> 6 ) & 7) << 8 )))),
10726
- vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((( qh[ib+1] >> 9 ) & 7) << 8 )))));
10719
+ q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8 ) & 0x700 )))),
10720
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5 ) & 0x700 )))));
10721
+ q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2 ) & 0x700 )))),
10722
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1 ) & 0x700 )))));
10723
+ q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8 ) & 0x700 )))),
10724
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5 ) & 0x700 )))));
10725
+ q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2 ) & 0x700 )))),
10726
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1 ) & 0x700 )))));
10727
10727
qs += 8;
10728
10728
10729
10729
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
0 commit comments