Skip to content

Commit 15aee10

Browse files
committed
Remove broken AVX optimizations, fix Q2 scalar
1 parent 98c6278 commit 15aee10

File tree

1 file changed

+3
-28
lines changed

1 file changed

+3
-28
lines changed

ggml.c

Lines changed: 3 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -2176,7 +2176,7 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
2176 2176

2177 2177
float sumf = 0.0f;
2178 2178

2179-
#if __AVX__
2179+
#if defined(__AVX2__)
2180 2180
// Initialize accumulator with zeros
2181 2181
__m128 acc = _mm_setzero_ps();
2182 2182

@@ -2225,7 +2225,7 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
2225 2225
sumf = _mm_cvtss_f32(res);
2226 2226
#else
2227 2227
for (int i = 0; i < nb; i++) {
2228-
const float d0 = x[i].d;
2228+
const float d0 = GGML_FP16_TO_FP32(x[i].d);
2229 2229
const float d1 = y[i/2].d;
2230 2230

2231 2231
uint_fast32_t qs0 = x[i].qs;
@@ -2256,14 +2256,13 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void *
2256 2256

2257 2257
float sumf = 0.0f;
2258 2258

2259-
#if __AVX__ || __AVX2__
2259+
#if defined(__AVX2__)
2260 2260
// Initialize accumulator with zeros
2261 2261
__m128 acc = _mm_setzero_ps();
2262 2262
for (int i = 0; i < nb; i++) {
2263 2263
// Compute combined scale for the block
2264 2264
const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d);
2265 2265

2266-
#if __AVX2__
2267 2266
const __m256i shift_l = _mm256_set_epi64x(2*3, 64, 4*3, 0);
2268 2267
const __m256i shift_r = _mm256_set_epi64x( 64, 2*3, 64, 64);
2269 2268

@@ -2306,30 +2305,6 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void *
2306 2305
bxx = _mm256_shuffle_epi8(bxx, shufmask);
2307 2306

2308 2307
__m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1));
2309-
#elif __AVX__
2310-
// same as AVX2 but using 2 x 128-bit instructions, only slightly slower
2311-
const __m128i shift_l0 = _mm_set_epi64x( 4*3, 0);
2312-
const __m128i shift_l1 = _mm_set_epi64x(2*3, 64 );
2313-
const __m128i shift_r1 = _mm_set_epi64x( 64, 2*3 );
2314-
2315-
__m128i bx = _mm_set1_epi64x(x[i].qs);
2316-
2317-
// shift the copies to be able to reach all values
2318-
__m128i bx0 = _mm_sllv_epi64(bx, shift_l0);
2319-
__m128i bx1 = _mm_or_si128(_mm_sllv_epi64(bx, shift_l1), _mm_srlv_epi64(bx, shift_r1));
2320-
2321-
// add to itself in masked places to shift some values left one bit
2322-
const __m128i doublemask = _mm_set1_epi64x(0x078000078000);
2323-
bx0 = _mm_add_epi64(bx0, _mm_and_si128(doublemask, bx0));
2324-
bx1 = _mm_add_epi64(bx1, _mm_and_si128(doublemask, bx1));
2325-
2326-
// collect 16 bytes from 256 into 128 bits
2327-
const __m128i shufhi = _mm_set_epi8( 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1);
2328-
const __m128i shuflo = _mm_set_epi8(-1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0);
2329-
bx0 = _mm_shuffle_epi8(bx0, shuflo);
2330-
bx1 = _mm_shuffle_epi8(bx1, shufhi);
2331-
bx = _mm_or_si128(bx0, bx1);
2332-
#endif
2333 2308

2334 2309
const __m128i mask = _mm_set1_epi8(7);
2335 2310
bx = _mm_and_si128(mask, bx);

0 commit comments

Comments
 (0)