
Commit a9559bf

ggml : workaround for missing _mm256_setr_m128i in GCC < 8 in k_quants.c (#2405)
1 parent ee1b497 commit a9559bf
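
The change swaps every direct call to _mm256_set_m128i(a, b) for a MM256_SET_M128I(a, b) macro built from _mm256_castsi128_si256 and _mm256_insertf128_si256, both of which older GCC releases do provide. A minimal standalone sketch of the equivalence (the file name, test values, and compile line are illustrative, not part of the commit):

#include <immintrin.h>
#include <stdio.h>

// Same definition as the one added to k_quants.c: put b in the low 128-bit lane
// and a in the high 128-bit lane, matching the lane order of _mm256_set_m128i(a, b).
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi32(1);       // intended low half
    const __m128i hi = _mm_set1_epi32(2);       // intended high half

    const __m256i v = MM256_SET_M128I(hi, lo);  // high half is the first argument

    int out[8];
    _mm256_storeu_si256((__m256i *)out, v);
    for (int i = 0; i < 8; ++i) printf("%d ", out[i]);  // expected output: 1 1 1 1 2 2 2 2
    printf("\n");
    return 0;
}

Built with something like gcc -mavx2 -O2 demo.c (hypothetical file name), this should print the four elements of the low half before the four of the high half; the call sites in k_quants.c rely on exactly that lane order.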

File tree

1 file changed: +32 -30 lines changed

k_quants.c

Lines changed: 32 additions & 30 deletions
@@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
 __m256i sumi = _mm256_setzero_si256();
 
@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
 
 // sumf += -dmin * summs in 32bits*8
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
 
 const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
 const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 // sumf += dall * isum - dmin * summs in 32bits
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
 }
 
@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * smin;
 
 const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
 
 const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
 const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
 const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
 
-const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
 
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
 // high bit
 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 // multiply with block scale and accumulate
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 
 }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 aux16[0] = a & 0x0f0f;
 aux16[1] = (a >> 4) & 0x0f0f;
 
-const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
 
 memcpy(&aux64, x[i].hmask, 8);
 
 const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-__m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+__m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
 __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
 q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
 q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
 
 // prepare low and high bits
-const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
 const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
 const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
 
@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
 p16_0 = _mm_add_epi32(p16_0, p16_2);
 p16_1 = _mm_add_epi32(p16_1, p16_3);
-__m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+__m256i p16 = MM256_SET_M128I(p16_1, p16_0);
 
 // multiply with block scale and accumulate
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
 
 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales = _mm256_set_m128i(sc128, sc128);
+const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
 __m256i sumi = _mm256_setzero_si256();
 
@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
 }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
 const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
 const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
 
 const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
 const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
 
 }
 
@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * _mm_extract_epi32(hsum, 0);
 
 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales = _mm256_set_m128i(sc128, sc128);
+const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
 __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
 }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
 
-const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
 
 int64_t aux64;
 memcpy(&aux64, x[i].qh, 8);
 const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
 
 const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
 const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
 const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
 
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
 
 }
 
@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
 }
 
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 }
 
@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
 const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
 
-const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
 
 const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
 const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
 sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
 
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
 }
 
 *s = hsum_float_8(acc);
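
For reference, at a call site such as the q6_K accumulation above, the macro expands by plain textual substitution to:

acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(sumi_0), (sumi_1), 1))), acc);

i.e. sumi_0 lands in the low 128-bit lane and sumi_1 in the high lane, the same layout the removed _mm256_set_m128i(sumi_1, sumi_0) produced, so the numerical result is unchanged.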
