@@ -892,12 +892,16 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 #endif
 #endif

+// we know the values are in the [-1 .. 1] range, so abs(d) cannot be more than 1/8 when using 4 bits
+#define Q4_0DM (1.0f/8.0f)
+#define Q4_0D(x) (((x)*Q4_0DM) / 127.0f)
+
 #define QK4_0 32
 typedef struct {
-    ggml_fp16_t d;          // delta
+    int8_t  d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+static_assert(sizeof(block_q4_0) == sizeof(int8_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

 #define QK4_1 32
 typedef struct {
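Not part of the commit: a minimal round-trip sketch of the new `int8_t` delta encoding. It assumes only what the comment in the hunk above states (weights in [-1, 1], so `d = max / -8` stays within ±`Q4_0DM`), and checks that the code fits an `int8_t` and decodes back to within one quantization step (`Q4_0DM/127` ≈ 0.00098):

```c
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define Q4_0DM (1.0f/8.0f)
#define Q4_0D(x) (((x)*Q4_0DM) / 127.0f)

int main(void) {
    for (float max = -1.0f; max <= 1.0f; max += 0.001f) {
        const float  d = max / -8;                               // |d| <= 1/8
        const int8_t q = (int8_t)(ceilf((127.0f * d) / Q4_0DM)); // in [-127, 127]
        // ceilf never rounds below d, so the decoded delta is within one step
        assert(fabsf(Q4_0D(q) - d) <= Q4_0DM / 127.0f);
    }
    printf("int8 delta round-trips within 1/1016 over the whole range\n");
    return 0;
}
```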
@@ -915,14 +919,21 @@ typedef struct {
 } block_q5_0;
 static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

+// we know the values are in the [-1 .. 1] range, so:
+// - d is unsigned 4-bit that represents maximum value of 2.0/31 when using 5 bits
+// - m is unsigned 4-bit that represents offset from -1.0 which cannot be more than 2.0
+#define Q5_1DM (2.0f/31.0f)
+#define Q5_1MM (2.0f)
+#define Q5_1D(x) (        (((x) & 0x0F)*Q5_1DM) / 15.0f)
+#define Q5_1M(x) (-1.0f + (((x) >> 4)*Q5_1MM) / 15.0f)
+
 #define QK5_1 32
 typedef struct {
-    ggml_fp16_t d;          // delta
-    ggml_fp16_t m;          // min
+    uint8_t dm;             // 4-bit delta + 4-bit min
     uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+static_assert(sizeof(block_q5_1) == sizeof(uint8_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

 #define QK8_0 32
 typedef struct {
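An illustrative decode sketch (not from the commit), assuming only the macros above: the high nibble of `dm` selects the min (an offset from -1.0 in steps of 2/15), the low nibble selects the delta (steps of (2/31)/15), so decoding no longer touches any fp16 fields:

```c
#include <stdint.h>
#include <stdio.h>

#define Q5_1DM (2.0f/31.0f)
#define Q5_1MM (2.0f)
#define Q5_1D(x) (        (((x) & 0x0F)*Q5_1DM) / 15.0f)
#define Q5_1M(x) (-1.0f + (((x) >> 4)*Q5_1MM) / 15.0f)

int main(void) {
    // pack: high nibble = min code (0..15), low nibble = delta code (0..15)
    const uint8_t min_code = 9, d_code = 15;   // illustrative codes
    const uint8_t dm = (uint8_t)((min_code << 4) | d_code);

    printf("dm = 0x%02x\n", dm);      // 0x9f
    printf("d  = %f\n", Q5_1D(dm));   // 15/15 * 2/31  = 0.064516 (the max delta)
    printf("m  = %f\n", Q5_1M(dm));   // -1 + 9/15 * 2 = 0.200000
    return 0;
}
```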
@@ -959,10 +970,13 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
             }
         }

-        const float d  = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
+        float d = max / -8;

-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = (int8_t)(ceilf((127.0f * d) / Q4_0DM));
+
+        d = Q4_0D(y[i].d);
+
+        const float id = d ? 1.0f/d : 0.0f;

         for (int j = 0; j < qk/2; ++j) {
             const float x0 = x[i*qk + 0 + j]*id;
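Worth noting (annotation, not diff content): the reference quantizer now stores the rounded delta code and then recomputes `d` from it via `Q4_0D`, so `id` is derived from the delta the dequantizer will actually see and the two sides stay consistent. A worked example with an illustrative block maximum:

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define Q4_0DM (1.0f/8.0f)
#define Q4_0D(x) (((x)*Q4_0DM) / 127.0f)

int main(void) {
    const float max = 0.42f;   // illustrative: the largest-|x| value in the block
    float d = max / -8;        // -0.0525
    const int8_t q = (int8_t)(ceilf((127.0f * d) / Q4_0DM)); // ceilf(-53.34) = -53
    d = Q4_0D(q);              // -53/1016 ~= -0.05217, what decoding will use
    const float id = d ? 1.0f/d : 0.0f;
    printf("code = %d, effective d = %f, id = %f\n", q, d, id);
    return 0;
}
```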
@@ -1088,11 +1102,17 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
             if (v > max) max = v;
         }

-        const float d  = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
+        y[i].dm = (uint8_t)(floorf((15.0f * (min + 1.0f)) / Q5_1MM)) << 4;

-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
+        min = Q5_1M(y[i].dm);
+
+        float d = (max - min) / ((1 << 5) - 1);
+
+        y[i].dm |= (uint8_t)(ceilf((15.0f * d) / Q5_1DM));
+
+        d = Q5_1D(y[i].dm);
+
+        const float id = d ? 1.0f/d : 0.0f;

         uint32_t qh = 0;

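Annotation: the q5_1 path quantizes in two steps because `m` and `d` share one byte. The min is rounded down (`floorf`, so the decoded min never exceeds the true min), then the delta is sized against the *decoded* min and rounded up (`ceilf`), so the representable interval `[m, m + 31*d]` still covers the block's true `[min, max]`. A standalone sketch with illustrative inputs:

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define Q5_1DM (2.0f/31.0f)
#define Q5_1MM (2.0f)
#define Q5_1D(x) (        (((x) & 0x0F)*Q5_1DM) / 15.0f)
#define Q5_1M(x) (-1.0f + (((x) >> 4)*Q5_1MM) / 15.0f)

int main(void) {
    float min = -0.30f, max = 0.55f;   // illustrative block range

    // 1) quantize the min first (floorf: never above the true min)
    uint8_t dm = (uint8_t)(floorf((15.0f * (min + 1.0f)) / Q5_1MM)) << 4;
    min = Q5_1M(dm);                   // decoded min, <= original min

    // 2) size the delta against the *decoded* min so [min, max] stays covered
    float d = (max - min) / ((1 << 5) - 1);
    dm |= (uint8_t)(ceilf((15.0f * d) / Q5_1DM));
    d = Q5_1D(dm);                     // decoded delta, >= exact delta

    printf("dm = 0x%02x  min = %f  d = %f\n", dm, min, d);
    return 0;
}
```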
@@ -1530,7 +1550,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
     const int nb = k / qk;

     for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float d = Q4_0D(x[i].d);

         for (int j = 0; j < qk/2; ++j) {
             const int x0 = (x[i].qs[j] & 0x0F) - 8;
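A self-contained decode sketch for one block under the new layout (struct and delta code repeated here purely for illustration; the delta code is the one from the q4_0 example above):

```c
#include <stdint.h>
#include <stdio.h>

#define QK4_0 32
#define Q4_0DM (1.0f/8.0f)
#define Q4_0D(x) (((x)*Q4_0DM) / 127.0f)

typedef struct {
    int8_t  d;              // delta code
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
} block_q4_0;

int main(void) {
    block_q4_0 b = { .d = -53 };  // effective d ~= -0.05217
    for (int j = 0; j < QK4_0/2; ++j) {
        b.qs[j] = (uint8_t)((j % 16) | ((15 - j % 16) << 4));  // arbitrary test nibbles
    }

    const float d = Q4_0D(b.d);
    float y[QK4_0];
    for (int j = 0; j < QK4_0/2; ++j) {
        y[j]           = ((b.qs[j] & 0x0F) - 8)*d;  // low nibble
        y[j + QK4_0/2] = ((b.qs[j] >>   4) - 8)*d;  // high nibble
    }
    printf("y[0] = %f, y[16] = %f\n", y[0], y[16]);
    return 0;
}
```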
@@ -1597,8 +1617,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict
     const int nb = k / qk;

     for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
+        const float d = Q5_1D(x[i].dm);
+        const float m = Q5_1M(x[i].dm);

         uint32_t qh;
         memcpy(&qh, x[i].qh, sizeof(qh));
@@ -2407,8 +2427,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
         const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), Q4_0D(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), Q4_0D(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2425,8 +2445,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
         const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), Q4_0D(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), Q4_0D(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
     }

@@ -2438,7 +2458,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; ++i) {
         /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+        const __m256 d = _mm256_set1_ps( Q4_0D(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

         __m256i bx = bytes_from_nibbles_32(x[i].qs);

@@ -2462,7 +2482,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+        const __m256 d = _mm256_set1_ps( Q4_0D(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

         const __m128i lowMask = _mm_set1_epi8(0xF);
         const __m128i off = _mm_set1_epi8(8);
@@ -2504,7 +2524,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);

     // Compute combined scale for the block 0 and 1
-    const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
+    const __m128 d_0_1 = _mm_set1_ps( Q4_0D(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );

     const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);

@@ -2522,7 +2542,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);

     // Compute combined scale for the block 2 and 3
-    const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
+    const __m128 d_2_3 = _mm_set1_ps( Q4_0D(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );

     const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);

@@ -2555,7 +2575,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);

         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+        const __m128 d_0_1 = _mm_set1_ps( Q4_0D(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

         const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);

@@ -2573,7 +2593,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);

         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
+        const __m128 d_2_3 = _mm_set1_ps( Q4_0D(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );

         const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);

@@ -2621,7 +2641,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
             sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
         }

-        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+        sumf += sumi*Q4_0D(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
     }

     *s = sumf;
@@ -3026,8 +3046,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *

         const uint8x16_t m4b = vdupq_n_u8(0x0F);

-        summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
-        summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
+        summs0 += Q5_1M(x0->dm) * y0->s;
+        summs1 += Q5_1M(x1->dm) * y1->s;

         // extract the 5th bit via lookup table ((b) << 4)
         memcpy(&qh0, x0->qh, sizeof(qh0));
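Annotation: the `summs` terms work because each q5_1 weight decodes to `d*q + m`, so a block dot product splits as `d*dy*Σ(q*qy) + m*(dy*Σqy)`, and (as I read the q8_1 format) `y->s` pre-stores `dy*Σqy`. A numeric check of the identity with toy values:

```c
#include <stdio.h>

int main(void) {
    // toy block of 4 values: x decodes as d*qx + m, y as dy*qy
    const float d = 0.03f, m = -0.2f, dy = 0.01f;
    const int   qx[4] = {1, 7, 30, 12}, qy[4] = {-5, 9, 3, -1};

    float direct = 0.0f;
    int   dot = 0, sumy = 0;
    for (int j = 0; j < 4; ++j) {
        direct += (d*qx[j] + m) * (dy*qy[j]);  // dequantize, then multiply
        dot    += qx[j]*qy[j];                 // integer dot product
        sumy   += qy[j];
    }
    const float s = dy*sumy;                   // what q8_1 pre-stores as y->s
    printf("%f == %f\n", direct, d*dy*dot + m*s);  // same value, up to rounding
    return 0;
}
```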
@@ -3072,10 +3092,10 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 #if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), Q5_1D(x0->dm)*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), Q5_1D(x1->dm)*y1->d);
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -3092,8 +3112,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
         const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), Q5_1D(x0->dm)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), Q5_1D(x1->dm)*y1->d);
 #endif
     }

@@ -3111,7 +3131,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const block_q5_1 * restrict x0 = &x[i];
         const block_q8_1 * restrict y0 = &y[i];

-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
+        summs += Q5_1M(x0->dm) * y0->s;

         const v128_t m4b = wasm_i8x16_splat(0x0F);

@@ -3158,7 +3178,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
                                        wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                         wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                        wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
+                    wasm_f32x4_splat(Q5_1D(x0->dm) * y0->d)));
     }

     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -3171,9 +3191,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *

     // Main loop
     for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
+        const __m256 dx = _mm256_set1_ps(Q5_1D(x[i].dm));

-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
+        summs += Q5_1M(x[i].dm) * y[i].s;

         __m256i bx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -3198,9 +3218,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *

     // Main loop
     for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
+        const __m256 dx = _mm256_set1_ps(Q5_1D(x[i].dm));

-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
+        summs += Q5_1M(x[i].dm) * y[i].s;

         __m256i bx = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -3243,7 +3263,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
             sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
         }

-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+        sumf += (Q5_1D(x[i].dm)*y[i].d)*sumi + Q5_1M(x[i].dm)*y[i].s;
     }

     *s = sumf;
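The net effect on storage follows directly from the two static_asserts changed above: q4_0 blocks shrink from 18 to 17 bytes per 32 weights, and q5_1 blocks from 24 to 21. A quick arithmetic check:

```c
#include <stdio.h>

int main(void) {
    // old vs new block sizes implied by the static_asserts in the diff
    const int qk = 32;
    const int q4_0_old = 2 + qk/2;        // fp16 d + 16 nibble bytes    = 18
    const int q4_0_new = 1 + qk/2;        // int8 d + 16 nibble bytes    = 17
    const int q5_1_old = 2*2 + 4 + qk/2;  // two fp16 + qh[4] + nibbles  = 24
    const int q5_1_new = 1   + 4 + qk/2;  // one dm byte + qh[4] + nibbles = 21
    printf("q4_0: %.2f -> %.2f bits/weight\n", 8.0*q4_0_old/qk, 8.0*q4_0_new/qk);
    printf("q5_1: %.2f -> %.2f bits/weight\n", 8.0*q5_1_old/qk, 8.0*q5_1_new/qk);
    return 0;
}
```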
@@ -5470,7 +5490,7 @@ struct ggml_tensor * ggml_sum_rows(
     }

     int64_t ne[4] = {1,1,1,1};
-    for (int i=1; i<a->n_dims; ++i) {
+    for (int i = 1; i < a->n_dims; ++i) {
         ne[i] = a->ne[i];
     }
