
Commit 4f46a13

Speedup the AVX-512 implementation of ggml_vec_dot_q4_0()
1 parent 0ad9646 commit 4f46a13

File tree: CMakeLists.txt, ggml.c, ggml.h, llama.cpp

4 files changed: 238 additions, 47 deletions

CMakeLists.txt

Lines changed: 19 additions & 3 deletions
@@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer"
 option(LLAMA_AVX "llama: enable AVX" ON)
 option(LLAMA_AVX2 "llama: enable AVX2" ON)
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
 option(LLAMA_FMA "llama: enable FMA" ON)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
@@ -220,6 +222,16 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
     if (MSVC)
         if (LLAMA_AVX512)
             add_compile_options(/arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions(__AVX512VBMI__)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions(__AVX512VNNI__)
+            endif()
         elseif (LLAMA_AVX2)
             add_compile_options(/arch:AVX2)
         elseif (LLAMA_AVX)
@@ -240,9 +252,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
         endif()
         if (LLAMA_AVX512)
             add_compile_options(-mavx512f)
-            # add_compile_options(-mavx512cd)
-            # add_compile_options(-mavx512dq)
-            # add_compile_options(-mavx512bw)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
         endif()
     endif()
 else()
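
For reference, the new options only control whether the `__AVX512VBMI__` and `__AVX512VNNI__` macros end up defined: directly via `add_compile_definitions` on MSVC, implicitly via `-mavx512vbmi` / `-mavx512vnni` elsewhere. A minimal sketch of consuming those macros at compile time (hypothetical standalone program, not from the repository):

    /* Print which of the AVX-512 extension macros this translation unit
     * was built with; these are the macros the new CMake options control. */
    #include <stdio.h>

    int main(void) {
    #ifdef __AVX512VBMI__
        puts("__AVX512VBMI__ defined (AVX512-VBMI code path enabled)");
    #else
        puts("__AVX512VBMI__ not defined");
    #endif
    #ifdef __AVX512VNNI__
        puts("__AVX512VNNI__ defined (AVX512-VNNI code path enabled)");
    #else
        puts("__AVX512VNNI__ not defined");
    #endif
        return 0;
    }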

ggml.c

Lines changed: 203 additions & 32 deletions
@@ -1977,33 +1977,187 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
 }
 
 #if __AVX512F__ && QK4_0 == 32
-static inline __m512 dot_q4_0_oneblock_avx512(
+static inline __m512i bytes_from_q4_0_twoblocks_avx512( const __m512i blocks ) {
+    // The 64 bytes of `blocks` contain two consecutive Q4_0 blocks loaded from memory:
+    // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+    // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32|
+    // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+    // | :. =_ () [] <> () Zz Yy|
+    // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+    // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+    // |Xx Ww Vv Uu Tt Ss Rr Qq Pp Oo Nn Mm Ll Kk Jj Ii Hh Gg Ff Ee Dd Cc Bb Aa |
+    // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+    //
+    // Bytes 04..19 (block #0) and 24..39 (block #1) both contain 32 nibbles (4-bit unsigned integers).
+    // We have exactly 64 nibbles, so we want to place each nibble into a separate byte.
+    // Bytes 00..03 and 20..23 contain scales, which are irrelevant to this function.
+    // Bytes 40..63 are masked when loading the data, so they are zeroed out.
+#ifdef __AVX512VBMI__
+    const __m512i byte_perm = _mm512_set_epi8(
+        39, 38, 39, 38, 37, 36, 37, 36, 35, 34, 35, 34, 33, 32, 33, 32,
+        31, 30, 31, 30, 29, 28, 29, 28, 27, 26, 27, 26, 25, 24, 25, 24,
+        19, 18, 19, 18, 17, 16, 17, 16, 15, 14, 15, 14, 13, 12, 13, 12,
+        11, 10, 11, 10, 9, 8, 9, 8, 7, 6, 7, 6, 5, 4, 5, 4
+    );
+    const __m512i permuted = _mm512_permutexvar_epi8( byte_perm, blocks );
+    // After applying VPERMB, `permuted` looks like this:
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |:. =_ :. =_ () [] () [] <> () <> () Zz Yy Zz Yy Xx Ww Xx Ww Vv Uu Vv Uu Tt Ss Tt Ss Rr Qq Rr Qq|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |Pp Oo Pp Oo Nn Mm Nn Mm Ll Kk Ll Kk Jj Ii Jj Ii Hh Gg Hh Gg Ff Ee Ff Ee Dd Cc Dd Cc Bb Aa Bb Aa|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+#else
+    const __m512i word_perm = _mm512_set_epi16(
+        19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+        9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2
+    );
+    const __m512i permuted = _mm512_permutexvar_epi16( word_perm, blocks );
+    // This is the fallback path for CPUs that don't support VPERMB. Since we permute 16-bit groups only,
+    // VPERMB can be replaced with VPERMW. We could always use VPERMW, but at least on Tiger Lake and
+    // Ice Lake VPERMW followed by a right shift is quite noticeably slower than VPERMB.
+#endif
+
+    // Shift every odd-numbered 16-bit group to the right by 4 bits.
+    const __mmask32 shift_mask = 0xaaaaaaaa;
+    const __m512i shifted = _mm512_mask_srai_epi16( permuted, shift_mask, permuted, 4 );
+    // After applying VPSRAW, `shifted` looks like this (the "empty" nibbles are filled with zeroes):
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // | : .= :. =_ ( )[ () [] < >( <> () Z zY Zz Yy X xW Xx Ww V vU Vv Uu T tS Tt Ss R rQ Rr Qq|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // | P pO Pp Oo N nM Nn Mm L lK Ll Kk J jI Jj Ii H hG Hh Gg F fE Ff Ee D dC Dd Cc B bA Bb Aa|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+
+    // Now we just need to zero out the higher nibble in each byte, and we're done.
+    const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
+    return _mm512_and_si512( low_nibble_mask, shifted );
+    // The final result looks like this:
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // | : = . _ ( [ ) ] < ( > ) Z Y z y X W x w V U v u T S t s R Q r q|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+    // | P O p o N M n m L K l k J I j i H G h g F E f e D C d c B A b a|
+    // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
+}
+
+static inline __m512 dot_q4_0_twoblocks_avx512(
     __m512 acc,
     const block_q4_0 * restrict x,
     const block_q4_0 * restrict y,
     int i
 ) {
-    // Compute combined scale for the block
-    __m512 d = _mm512_set1_ps( x[i].d * y[i].d );
-
-    __m256i bx = bytesFromNibbles( x[i].qs );
-    __m256i by = bytesFromNibbles( y[i].qs );
-
-    // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-    const __m256i off = _mm256_set1_epi8( 8 );
-    bx = _mm256_sub_epi8( bx, off );
-    by = _mm256_sub_epi8( by, off );
-
-    // Sign-extend 16 signed bytes into int16_t
-    __m512i x32 = _mm512_cvtepi8_epi16( bx );
-    __m512i y32 = _mm512_cvtepi8_epi16( by );
-    // Compute products of int16_t integers, add pairwise
-    __m512i i64 = _mm512_madd_epi16( x32, y32 );
+    // A pair of Q4_0 blocks spans 40 bytes, while an AVX-512 register has 64. The remaining 24 bytes
+    // can potentially be unaddressable, so we make sure to mask them out before the load, even though
+    // we don't use them at all. This might hurt the performance slightly, since the compiler is forced
+    // to use e.g. `VMOVDQU64 REG, MASK, [ADDR] + VPERMB ..., REG` instead of just `VPERMB ..., [ADDR]`.
+    const __mmask8 load_mask = 0x1f;
+    const __m512i blocks_0 = _mm512_maskz_loadu_epi64( load_mask, &x[i] );
+    const __m512i blocks_1 = _mm512_maskz_loadu_epi64( load_mask, &y[i] );
+
+    // We want to multiply the scales, so we interpret both registers as 16 32-bit floats:
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 |
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    // blocks_0_float
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    // | | | | | | | xx | xx | xx | xx | B | xx | xx | xx | xx | A |
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    // blocks_1_float
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    // | | | | | | | xx | xx | xx | xx | D | xx | xx | xx | xx | C |
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    const __m512 blocks_0_float = _mm512_castsi512_ps( blocks_0 );
+    const __m512 blocks_1_float = _mm512_castsi512_ps( blocks_1 );
+    // We absolutely shouldn't touch the floats marked with `xx`: they contain some
+    // random data, which might very well underflow. At least on Intel, this leads
+    // to a huge penalty that can't be ignored (easily 100x or more) unless you
+    // compile your code with something like `-ffast-math` to enable FTZ/DAZ flags.
+    // (and ggml can't assume that you do)...
+    const __mmask16 scale_mul_mask = 0x21;
+#ifdef __clang__
+    // ...however, clang decides to optimize the multiplication mask away:
+    // https://godbolt.org/z/P8PqdsfvW
+    // gcc and MSVC do the sane thing. This horrible workaround forces clang to emit the mask.
+    __m512i scales;
+    __asm__(
+        "vmulps %1, %2, %0%{%3%}"
+        : "=v" ( scales )
+        : "vm" ( blocks_0_float ), "v" ( blocks_1_float ), "Yk" ( scale_mul_mask )
+    );
+#else
+    const __m512 scales = _mm512_maskz_mul_ps( scale_mul_mask, blocks_0_float, blocks_1_float );
+#endif
+    const __m512i scale_perm = _mm512_set_epi32(
+        5, 5, 5, 5, 5, 5, 5, 5,
+        0, 0, 0, 0, 0, 0, 0, 0
+    );
+    const __m512 permuted_scales = _mm512_permutexvar_ps( scale_perm, scales );
+    // After VMULPS and VPERMPS, `permuted_scales` looks like this:
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 |
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+    // | B*D| B*D| B*D| B*D| B*D| B*D| B*D| B*D| A*C| A*C| A*C| A*C| A*C| A*C| A*C| A*C|
+    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
+
+    const __m512i bytes_0 = bytes_from_q4_0_twoblocks_avx512( blocks_0 );
+    const __m512i bytes_1 = bytes_from_q4_0_twoblocks_avx512( blocks_1 );
+
+    // Now we want to compute dot products of 4-element byte vectors and store them in
+    // 32-bit integers. That is (only one 4-element vector is shown for clarity):
+    // +----+----+----+----+
+    // ... | 03 | 02 | 01 | 00 |
+    // +----+----+----+----+
+    // bytes_0
+    // +----+----+----+----+
+    // ... | D | C | B | A |
+    // +----+----+----+----+
+    // bytes_1
+    // +----+----+----+----+
+    // ... | H | G | F | E |
+    // +----+----+----+----+
+    // final_res_int
+    // +----+----+----+----+
+    // ... | A*E+B*F+C*G+D*H |
+    // +----+----+----+----+
+    const __m512i plus_8 = _mm512_set1_epi8( 8 );
+    const __m512i bytes_1_minus_8 = _mm512_sub_epi8( bytes_1, plus_8 );
+
+#ifdef __AVX512VNNI__
+    // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch:
+    // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8
+    // from each nibble, so they can be negative. So, instead of `(bytes_0 - 8) * (bytes_1 - 8)`,
+    // we compute `bytes_0 * (bytes_1 - 8) + bytes_1 * (-8) + 64`. VPDPBUSDS uses an accumulator,
+    // which means we only need 2 instructions.
+    const __m512i dot_init = _mm512_set1_epi32( 4 * 64 );
+    const __m512i minus_8 = _mm512_set1_epi8( -8 );
+    const __m512i prod_0 = _mm512_dpbusds_epi32( dot_init, bytes_1, minus_8 );
+    const __m512i final_res_int = _mm512_dpbusds_epi32( prod_0, bytes_0, bytes_1_minus_8 );
+#else
+    // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones.
+    // It has the same catch as VPDPBUSDS: the left operand should be unsigned.
+    // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me
+    // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119
+    const __m512i one = _mm512_set1_epi16( 1 );
+    const __m512i prod_0 = _mm512_maddubs_epi16( bytes_0, bytes_1_minus_8 );
+    const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, bytes_1_minus_8 );
+    const __m512i diff = _mm512_sub_epi16( prod_0, prod_1 );
+    const __m512i final_res_int = _mm512_madd_epi16( diff, one );
+#endif
 
-    // Convert int32_t to float
-    __m512 p = _mm512_cvtepi32_ps( i64 );
-    // Apply the scale, and accumulate
-    return _mm512_fmadd_ps( d, p, acc );
+    // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate.
+    const __m512 final_res_float = _mm512_cvtepi32_ps( final_res_int );
+    return _mm512_fmadd_ps( permuted_scales, final_res_float, acc );
 }
 #endif
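
For reference, a scalar sketch (not part of the commit) of the per-block-pair quantity that the two routines above compute, assuming the Q4_0 layout implied by the comments: a float scale followed by QK4_0/2 = 16 bytes of packed nibbles, i.e. 20 bytes per block. The comment in the loop spells out the identity that lets the AVX512-VNNI path keep the left VPDPBUSDS operand unsigned.

    // Scalar sketch of what dot_q4_0_twoblocks_avx512() accumulates for one
    // pair of Q4_0 blocks (assumed layout: float scale + 16 bytes of nibbles).
    #include <stdint.h>

    #define QK4_0 32

    typedef struct {
        float   d;             // scale
        uint8_t qs[QK4_0 / 2]; // 32 quants, two 4-bit nibbles per byte
    } block_q4_0;

    static float dot_q4_0_pair_scalar(const block_q4_0 * x, const block_q4_0 * y) {
        int32_t sum = 0;
        for (int j = 0; j < QK4_0 / 2; ++j) {
            const int x0 = (x->qs[j] & 0x0f) - 8, x1 = (x->qs[j] >> 4) - 8;
            const int y0 = (y->qs[j] & 0x0f) - 8, y1 = (y->qs[j] >> 4) - 8;
            // The AVX512-VNNI path obtains the same products via the identity
            //   (a - 8) * (b - 8) == a * (b - 8) - 8 * b + 64,
            // whose right-hand side keeps the left VPDPBUSDS operand unsigned;
            // the "+ 64" terms are what dot_init = 4 * 64 pre-loads per 32-bit lane.
            sum += x0 * y0 + x1 * y1;
        }
        return x->d * y->d * (float) sum;
    }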

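The FTZ/DAZ flags mentioned in the comment above (the reason the scale multiplication is masked) are MXCSR control bits. As an aside, and not something ggml does or assumes, a program could set them explicitly like this:

    // Enable flush-to-zero / denormals-are-zero for the current thread.
    // Shown only to illustrate the flags the comment refers to; ggml
    // deliberately works without them.
    #include <xmmintrin.h> // _MM_SET_FLUSH_ZERO_MODE
    #include <pmmintrin.h> // _MM_SET_DENORMALS_ZERO_MODE

    static void enable_ftz_daz(void) {
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);         // subnormal results become +/-0
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // subnormal inputs treated as +/-0
    }
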
@@ -2135,25 +2289,26 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     __m512 acc0 = _mm512_setzero_ps();
     __m512 acc1 = _mm512_setzero_ps();
 
-    const int superblock_size = 8;
+    const int superblock_size = 16;
+
     const int superblock_count = nb / superblock_size;
 
     for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
         int i = superblock_ix * superblock_size;
 
-        acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+0 );
-        acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+1 );
-        acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+2 );
-        acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+3 );
-        acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+4 );
-        acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+5 );
-        acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+6 );
-        acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+7 );
+        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+0 );
+        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+2 );
+        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+4 );
+        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+6 );
+        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+8 );
+        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+10 );
+        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+12 );
+        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+14 );
     }
 
     // Remainders
-    for (int i = superblock_count * superblock_size; i < nb; ++i) {
-        acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i );
+    for (int i = superblock_count * superblock_size; i < nb; i += 2) {
+        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i );
     }
 
     // Horizontal sum of all lanes of the accumulator
@@ -11303,6 +11458,22 @@ int ggml_cpu_has_avx512(void) {
 #endif
 }
 
+int ggml_cpu_has_avx512_vbmi(void) {
+#if defined(__AVX512VBMI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vnni(void) {
+#if defined(__AVX512VNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;

ggml.h

Lines changed: 2 additions & 0 deletions
@@ -808,6 +808,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_avx512_vbmi(void);
+int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);

llama.cpp

Lines changed: 14 additions & 12 deletions
@@ -1914,18 +1914,20 @@ const char * llama_print_system_info(void) {
     static std::string s;
 
     s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "AVX         = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX2        = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+    s += "AVX512      = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "FMA         = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+    s += "NEON        = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "ARM_FMA     = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+    s += "F16C        = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+    s += "FP16_VA     = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD   = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS        = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+    s += "SSE3        = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "VSX         = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
 
     return s.c_str();
 }
