Skip to content

Commit ee9dc9a

Browse files
committed
iq1s_blocks16: slightly faster Neon dot product
1 parent e4ff3dc commit ee9dc9a

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

ggml-quants.c

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10716,14 +10716,14 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
10716 10716

10717 10717
for (int ib = 0; ib < QK_K/32; ib += 2) {
10718 10718

10719-
q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | (((qh[ib+0] >> 0) & 7) << 8)))),
10720-
vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | (((qh[ib+0] >> 3) & 7) << 8)))));
10721-
q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | (((qh[ib+0] >> 6) & 7) << 8)))),
10722-
vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | (((qh[ib+0] >> 9) & 7) << 8)))));
10723-
q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | (((qh[ib+1] >> 0) & 7) << 8)))),
10724-
vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | (((qh[ib+1] >> 3) & 7) << 8)))));
10725-
q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | (((qh[ib+1] >> 6) & 7) << 8)))),
10726-
vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | (((qh[ib+1] >> 9) & 7) << 8)))));
10719+
q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
10720+
vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
10721+
q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
10722+
vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
10723+
q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
10724+
vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
10725+
q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
10726+
vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
10727 10727
qs += 8;
10728 10728

10729 10729
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;

0 commit comments

Comments
 (0)