@@ -2176,7 +2176,7 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
 
     float sumf = 0.0f;
 
-#if __AVX__
+#if defined(__AVX2__)
     // Initialize accumulator with zeros
     __m128 acc = _mm_setzero_ps();
 
@@ -2225,7 +2225,7 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
     sumf = _mm_cvtss_f32(res);
 #else
     for (int i = 0; i < nb; i++) {
-        const float d0 = x[i].d;
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);
         const float d1 = y[i/2].d;
 
         uint_fast32_t qs0 = x[i].qs;
@@ -2256,14 +2256,13 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void *
 
     float sumf = 0.0f;
 
-#if __AVX__ || __AVX2__
+#if defined(__AVX2__)
     // Initialize accumulator with zeros
    __m128 acc = _mm_setzero_ps();
     for (int i = 0; i < nb; i++) {
         // Compute combined scale for the block
         const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d);
 
-#if __AVX2__
         const __m256i shift_l = _mm256_set_epi64x(2*3, 64, 4*3,  0);
         const __m256i shift_r = _mm256_set_epi64x( 64, 2*3,  64, 64);
 
@@ -2306,30 +2305,6 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void *
         bxx = _mm256_shuffle_epi8(bxx, shufmask);
 
         __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1));
-#elif __AVX__
-        // same as AVX2 but using 2 x 128-bit instructions, only slightly slower
-        const __m128i shift_l0 = _mm_set_epi64x( 4*3,   0);
-        const __m128i shift_l1 = _mm_set_epi64x(2*3,  64);
-        const __m128i shift_r1 = _mm_set_epi64x( 64, 2*3);
-
-        __m128i bx = _mm_set1_epi64x(x[i].qs);
-
-        // shift the copies to be able to reach all values
-        __m128i bx0 = _mm_sllv_epi64(bx, shift_l0);
-        __m128i bx1 = _mm_or_si128(_mm_sllv_epi64(bx, shift_l1), _mm_srlv_epi64(bx, shift_r1));
-
-        // add to itself in masked places to shift some values left one bit
-        const __m128i doublemask = _mm_set1_epi64x(0x078000078000);
-        bx0 = _mm_add_epi64(bx0, _mm_and_si128(doublemask, bx0));
-        bx1 = _mm_add_epi64(bx1, _mm_and_si128(doublemask, bx1));
-
-        // collect 16 bytes from 256 into 128 bits
-        const __m128i shufhi = _mm_set_epi8( 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1);
-        const __m128i shuflo = _mm_set_epi8(-1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0);
-        bx0 = _mm_shuffle_epi8(bx0, shuflo);
-        bx1 = _mm_shuffle_epi8(bx1, shufhi);
-        bx = _mm_or_si128(bx0, bx1);
-#endif
 
         const __m128i mask = _mm_set1_epi8(7);
         bx = _mm_and_si128(mask, bx);
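For context on the scalar-path change above: the block scale `d` is evidently stored in half precision (the q3_0 path already wraps it in GGML_FP16_TO_FP32), so the q2_0 fallback must convert it before multiplying. A minimal sketch of such a half-to-float conversion, assuming the value is the raw IEEE-754 half bits in a `uint16_t` and that F16C is available; the helper name is illustrative, not ggml's actual implementation:

```c
#include <immintrin.h>  // F16C intrinsics (compile with -mf16c)
#include <stdint.h>

// Hypothetical helper: convert one IEEE-754 half (raw bits) to float.
static inline float fp16_to_fp32(uint16_t h) {
    // Place the 16-bit value in the low lane, convert 4 packed halfs,
    // and extract the lowest resulting float.
    return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
}
```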