Commit 0a29fe9

llamafile : improve moe prompt eval speed on cpu
This change introduces a llamafile_mixmul() API that allows tinyBLAS to speed up "Mixture of Experts" models. On my Threadripper, Mixtral's 8x7b F16 weights now process prompts 2x faster. I'm also seeing a 60 percent improvement with Mixtral 8x22b Q4_0. The same applies to Q8_0, which is also supported by tinyBLAS. MoE models spend the majority of their time inside MUL_MAT_ID rather than MUL_MAT, which is why llamafile_sgemm was not able to help them before. llamafile_mixmul works by decomposing the mixmul operation into sgemm calls.
1 parent: 152da28 · commit: 0a29fe9
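
To make the idea in the commit message concrete, below is a minimal, single-threaded C sketch of decomposing a mixture-of-experts matmul into per-expert GEMM calls: tokens routed to the same expert are gathered into one contiguous block, the block is multiplied by that expert's weights in a single GEMM, and the results are scattered back. This is an illustration only, not the tinyBLAS implementation; the names sgemm_f32 and mixmul_f32, the float-only layout, and the one-expert-per-token routing are simplifying assumptions.

#include <stdlib.h>

/* Naive GEMM stand-in for a tinyBLAS sgemm kernel: C[m x n] = A[m x k] * B[k x n]. */
static void sgemm_f32(int m, int n, int k,
                      const float *A, const float *B, float *C) {
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++) {
            float acc = 0.0f;
            for (int l = 0; l < k; l++)
                acc += A[i * k + l] * B[l * n + j];
            C[i * n + j] = acc;
        }
}

/* One MoE matmul: every token is routed to a single expert (ids[t]).
 * Tokens that share an expert are gathered into a contiguous block so the
 * whole block goes through one GEMM instead of per-token dot products.
 * Error handling is omitted for brevity. */
static void mixmul_f32(int n_expert, int n_tok, int d_in, int d_out,
                       const float *experts, /* [n_expert][d_in][d_out]  */
                       const float *x,       /* [n_tok][d_in]            */
                       const int   *ids,     /* [n_tok] expert per token */
                       float       *y) {     /* [n_tok][d_out]           */
    float *xg  = malloc((size_t)n_tok * d_in  * sizeof(float));
    float *yg  = malloc((size_t)n_tok * d_out * sizeof(float));
    int   *tok = malloc((size_t)n_tok * sizeof(int));
    for (int e = 0; e < n_expert; e++) {
        int m = 0;
        for (int t = 0; t < n_tok; t++) {   /* gather this expert's tokens */
            if (ids[t] != e) continue;
            for (int i = 0; i < d_in; i++)
                xg[m * d_in + i] = x[t * d_in + i];
            tok[m++] = t;
        }
        if (m == 0) continue;
        /* one dense GEMM per expert over the gathered rows */
        sgemm_f32(m, d_out, d_in, xg, experts + (size_t)e * d_in * d_out, yg);
        for (int j = 0; j < m; j++)         /* scatter results back */
            for (int o = 0; o < d_out; o++)
                y[tok[j] * d_out + o] = yg[j * d_out + o];
    }
    free(xg); free(yg); free(tok);
}

During prompt evaluation many tokens land on each expert, so the per-expert GEMMs are large enough for a blocked kernel to pay off, which matches the commit's focus on prompt eval speed rather than single-token generation.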

6 files changed: 734 additions (+) and 219 deletions (−)


common/common.cpp

Lines changed: 3 additions & 3 deletions

@@ -78,7 +78,7 @@ using json = nlohmann::ordered_json;
 //
 
 int32_t cpu_get_num_physical_cores() {
-#ifdef __linux__
+#if defined(__linux__) || defined(__COSMOPOLITAN__)
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -113,7 +113,7 @@ int32_t cpu_get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -167,7 +167,7 @@ static int cpu_count_math_cpus(int n_cpu) {
  * Returns number of CPUs on system that are useful for math.
  */
 int32_t cpu_get_num_math() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
     int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();

ggml-impl.h

Lines changed: 3 additions & 0 deletions

@@ -17,6 +17,9 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 #if defined(_WIN32)
 
 #define m512bh(p) p

ggml-quants.c

Lines changed: 0 additions & 3 deletions

@@ -28,9 +28,6 @@
 
 #define UNUSED GGML_UNUSED
 
-// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
-#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
-
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {

ggml.c

Lines changed: 13 additions & 1 deletion

@@ -12615,11 +12615,16 @@ static void ggml_compute_forward_mul_mat_id(
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * ids = dst->src[2];
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+#if GGML_USE_LLAMAFILE
+    if (llamafile_mixmul(params, src0, src1, ids, dst))
+        return;
+#endif
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -19535,6 +19540,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     cur = 0;
                     const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
+#if GGML_USE_LLAMAFILE
+                    const struct ggml_tensor * src2 = node->src[2];
+#endif
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
@@ -19543,6 +19551,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
                     cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+#if GGML_USE_LLAMAFILE
+                    size_t cur2 = llamafile_mixmul_needs(src0, src1, src2);
+                    cur = cur > cur2 ? cur : cur2;
+#endif
                 } break;
             case GGML_OP_OUT_PROD:
                 {
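
The two new entry points are only called from ggml.c in the hunks shown here; their declarations live in the tinyBLAS sources among the changed files not included in this excerpt. Inferred from the call sites above, the contract looks roughly like the following sketch; the parameter names, comments, and qualifiers are assumptions, so see the full commit for the authoritative declarations. llamafile_mixmul() returns true when tinyBLAS handled the MUL_MAT_ID node (hence the early return), and llamafile_mixmul_needs() reports how many scratch bytes the fast path would need so ggml_graph_plan() can reserve the larger of the generic and tinyBLAS requirements.

// Sketch of the interface implied by the ggml.c call sites above.
#include <stdbool.h>
#include <stddef.h>

struct ggml_tensor;          // opaque here
struct ggml_compute_params;  // opaque here

// Returns true if tinyBLAS computed dst for this MUL_MAT_ID node,
// false if the op is unsupported and the generic kernel must run.
bool llamafile_mixmul(const struct ggml_compute_params * params,
                      const struct ggml_tensor * src0,   // expert weights
                      const struct ggml_tensor * src1,   // activations
                      const struct ggml_tensor * ids,    // expert routing
                      struct ggml_tensor * dst);

// Scratch bytes the fast path needs for this node; graph planning takes
// the max of the generic requirement and this value when sizing the
// work buffer.
size_t llamafile_mixmul_needs(const struct ggml_tensor * src0,
                              const struct ggml_tensor * src1,
                              const struct ggml_tensor * ids);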
