ggml : AVX2 optimizations for Q5_0, Q5_1 (#1195)

sw · web-flow · commit 2bfa1fe8e76c · 2023-04-26T22:38:15.000+03:00
diff --git a/ggml.c b/ggml.c
@@ -328,8 +328,18 @@ static ggml_fp16_t table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB)
 static float table_f32_f16[1 << 16];
 
-// precomputed table for expanding 8bits to 8 bytes (shl 4)
-static uint64_t table_b2b[1 << 8];
+#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
+#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
+
+// precomputed tables for expanding 8bits to 8 bytes (shl 4)
+static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) };
+static const uint64_t table_b2b_i[1 << 8] = { B8(F0, 00) };
 
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
@@ -688,7 +698,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 typedef struct {
     ggml_fp16_t d;         // delta
     ggml_fp16_t m;         // min
-    uint32_t qh;           // 5-th bit of quants
+    uint8_t qh[4];         // 5-th bit of quants
     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
@@ -1376,7 +1386,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
 
         y[i].d = GGML_FP32_TO_FP16(d);
         y[i].m = GGML_FP32_TO_FP16(min);
-        y[i].qh = 0;
+
+        uint32_t qh = 0;
 
         for (int l = 0; l < QK5_1; l += 2) {
             const float v0 = (x[i*QK5_1 + l + 0] - min)*id;
@@ -1388,9 +1399,11 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
             y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);
 
             // get the 5-th bit and store it in qh at the right position
-            y[i].qh |= ((vi0 & 0x10) >> 4) << (l + 0);
-            y[i].qh |= ((vi1 & 0x10) >> 4) << (l + 1);
+            qh |= ((vi0 & 0x10) >> 4) << (l + 0);
+            qh |= ((vi1 & 0x10) >> 4) << (l + 1);
         }
+
+        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
     }
 }
 
@@ -1966,7 +1979,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
 
         const uint8_t * restrict pp = x[i].qs;
 
-        const uint32_t qh = x[i].qh;
+        uint32_t qh;
+        memcpy(&qh, x[i].qh, sizeof(qh));
 
         for (int l = 0; l < QK5_1; l += 2) {
             const uint8_t vi = pp[l/2];
@@ -3297,10 +3311,10 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         uint32_t qh;
         memcpy(&qh, x0->qh, sizeof(qh));
 
-        tmp[0] = table_b2b[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b[(qh >> 24)       ];
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
 
         const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
         const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
@@ -3350,17 +3364,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; i++) {
         /* Compute combined scale for the block */
-        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
-        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
-        const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d));
-
-        __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
-        __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-        __m256i bx = _mm256_set_m128i(bx1, bx0);
+        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
 
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8(8);
-        bx = _mm256_sub_epi8(bx, off);
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = _mm256_set_epi64x(
+            table_b2b_i[x[i].qh[3]], table_b2b_i[x[i].qh[2]],
+            table_b2b_i[x[i].qh[1]], table_b2b_i[x[i].qh[0]]);
+        bx = _mm256_or_si256(bx, bxhi);
 
         __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
@@ -3379,7 +3389,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const  int8_t * restrict y0 = y[i].qs;
 
         uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
+        memcpy(&qh, x[i].qh, sizeof(qh));
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
 
@@ -3430,12 +3440,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
 
         // extract the 5th bit
-        const uint32_t qh = x0->qh;
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
 
-        tmp[0] = table_b2b[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b[(qh >> 24)       ];
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
 
         const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
         const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
@@ -3485,16 +3496,15 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 
     // Main loop
     for (int i = 0; i < nb; i++) {
-        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
-        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
-        const __m256 dx = _mm256_set_m128(d1, d0);
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
 
-        summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0
-               + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1;
+        summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1);
 
-        const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
-        const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-        const __m256i bx = _mm256_set_m128i(bx1, bx0);
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = _mm256_set_epi64x(
+            table_b2b_u[x[i].qh[3]], table_b2b_u[x[i].qh[2]],
+            table_b2b_u[x[i].qh[1]], table_b2b_u[x[i].qh[0]]);
+        bx = _mm256_or_si256(bx, bxhi);
 
         const __m256 dy = _mm256_broadcast_ss(&y[i].d);
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3512,7 +3522,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const uint8_t * restrict x0 = x[i].qs;
         const  int8_t * restrict y0 = y[i].qs;
 
-        const uint32_t qh = x[i].qh;
+        uint32_t qh;
+        memcpy(&qh, x[i].qh, sizeof(qh));
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
         const float m = GGML_FP16_TO_FP32(x[i].m);
@@ -4297,15 +4308,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
                 table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
             }
 
-            for (int i = 0; i < 256; ++i) {
-                table_b2b[i] = 0;
-                for (int b = 0; b < 8; ++b) {
-                    table_b2b[i] |= ((uint64_t)(((i >> b) & 0x01) << 4)) << (8*b);
-                }
-
-                //printf("%3d %016llx\n", i, table_b2b[i]);
-            }
-
             const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
             GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
@@ -12855,10 +12857,10 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
         quantize_row_q5_0_reference(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK5_0; l += 2) {
-                uint32_t qh;
-                memcpy(&qh, &y[i].qh, sizeof(qh));
+            uint32_t qh;
+            memcpy(&qh, &y[i].qh, sizeof(qh));
 
+            for (int l = 0; l < QK5_0; l += 2) {
                 const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
                 const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
 
@@ -12885,9 +12887,12 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
         quantize_row_q5_1_reference(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
+            uint32_t qh;
+            memcpy(&qh, &y[i].qh, sizeof(qh));
+
             for (int l = 0; l < QK5_1; l += 2) {
-                const uint8_t vh0 = ((y[i].qh & (1 << (l + 0))) >> (l + 0)) << 4;
-                const uint8_t vh1 = ((y[i].qh & (1 << (l + 1))) >> (l + 1)) << 4;
+                const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
+                const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
 
                 // cast to 16 bins
                 const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;