Commit f1a134a

llamafile : improve moe prompt eval speed on cpu
This change introduces a llamafile_mixmul() API that allows tinyBLAS to speed up "Mixture of Experts" models. On my Threadripper, Mixtral's 8x7b F16 weights now process prompts 2x faster. I'm also seeing a 60 percent improvement with Mixtral 8x22b Q4_0. The same applies to Q8_0, which is also supported by tinyBLAS.

MoE models spend the majority of their time inside MUL_MAT_ID rather than MUL_MAT, which is why llamafile_sgemm was not able to help them before. llamafile_mixmul works by decomposing the mixmul operation into sgemm calls.
1 parent 4b1c3c9 commit f1a134a
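
The decomposition described in the commit message (MUL_MAT_ID broken down into per-expert sgemm calls) can be pictured with a minimal sketch. Everything below is illustrative only: the float-only layout, top-1 routing, and helper names (sgemm, mixmul) are simplifying assumptions, not the actual tinyBLAS implementation, which also handles quantized types, multi-threading, and ggml's tensor strides.

/*
 * Sketch: a mixture-of-experts matmul decomposed into one GEMM per expert.
 * Illustrative only -- plain floats, top-1 routing, naive GEMM; the real
 * llamafile_mixmul also covers quantized types, threading, and ggml layout.
 */
#include <stdio.h>
#include <stdlib.h>

/* naive GEMM: Y[m x n] = X[m x k] * W[n x k]^T, row-major */
static void sgemm(int m, int n, int k, const float *X, const float *W, float *Y) {
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++) {
            float acc = 0.0f;
            for (int l = 0; l < k; l++)
                acc += X[i*k + l] * W[j*k + l];
            Y[i*n + j] = acc;
        }
}

/* for each expert: gather its tokens, run one GEMM, scatter the results back */
static void mixmul(int n_tok, int n_expert, int d_in, int d_out,
                   const float *x,   /* [n_tok][d_in]           activations      */
                   const float *w,   /* [n_expert][d_out][d_in] expert weights   */
                   const int   *ids, /* [n_tok]                 routed expert id */
                   float       *y) { /* [n_tok][d_out]          output           */
    int   *rows = malloc(sizeof(int)   * n_tok);
    float *Xe   = malloc(sizeof(float) * (size_t)n_tok * d_in);
    float *Ye   = malloc(sizeof(float) * (size_t)n_tok * d_out);
    for (int e = 0; e < n_expert; e++) {
        int m = 0;
        for (int i = 0; i < n_tok; i++) {        /* gather rows routed to expert e */
            if (ids[i] != e) continue;
            rows[m] = i;
            for (int l = 0; l < d_in; l++) Xe[m*d_in + l] = x[i*d_in + l];
            m++;
        }
        if (m == 0) continue;
        sgemm(m, d_out, d_in, Xe, w + (size_t)e*d_out*d_in, Ye);  /* one GEMM per expert */
        for (int r = 0; r < m; r++)              /* scatter back to original token order */
            for (int j = 0; j < d_out; j++)
                y[rows[r]*d_out + j] = Ye[r*d_out + j];
    }
    free(rows); free(Xe); free(Ye);
}

int main(void) {
    float x[2*3]   = {1,2,3, 4,5,6};             /* 2 tokens, d_in = 3 */
    float w[2*2*3] = {1,0,0, 0,1,0,              /* expert 0 */
                      0,0,1, 1,1,1};             /* expert 1 */
    int   ids[2]   = {0, 1};
    float y[2*2];
    mixmul(2, 2, 3, 2, x, w, ids, y);
    printf("%g %g | %g %g\n", y[0], y[1], y[2], y[3]);   /* prints: 1 2 | 6 15 */
    return 0;
}

Once each expert's tokens are packed contiguously, the bulk of the work becomes a handful of dense GEMMs, which is the case tinyBLAS is built to accelerate.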

File tree

4 files changed: +599 -216 lines

common/common.cpp

Lines changed: 3 additions & 3 deletions
@@ -73,7 +73,7 @@
 using json = nlohmann::ordered_json;
 
 int32_t get_num_physical_cores() {
-#ifdef __linux__
+#if defined(__linux__) || defined(__COSMOPOLITAN__)
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -108,7 +108,7 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -162,7 +162,7 @@ static int count_math_cpus(int cpu_count) {
  * Returns number of CPUs on system that are useful for math.
  */
 int get_math_cpu_count() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
     int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
     if (cpu_count < 1) {
         return get_num_physical_cores();

ggml.c

Lines changed: 7 additions & 1 deletion
@@ -10991,11 +10991,14 @@ static void ggml_compute_forward_mul_mat_id(
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * ids = dst->src[2];
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    if (llamafile_mixmul(params, src0, src1, ids, dst))
+        return;
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -18492,6 +18495,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur = 0;
                     const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
+                    const struct ggml_tensor * src2 = node->src[2];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
@@ -18500,6 +18504,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur += GGML_PAD(cur, sizeof(int64_t));       // align
                     cur += n_as * sizeof(int64_t);               // matrix_row_counts
                     cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    size_t cur2 = llamafile_mixmul_needs(src0, src1, src2);
+                    cur = cur > cur2 ? cur : cur2;
                 } break;
             case GGML_OP_OUT_PROD:
                 {
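
One note on the ggml_graph_plan hunk above: the node's work buffer is sized as the larger of the generic MUL_MAT_ID scratch requirement and whatever llamafile_mixmul_needs() reports, rather than their sum, because only one of the two code paths runs at compute time. A minimal sketch of that sizing pattern (the helper name below is hypothetical, not a ggml API):

#include <stddef.h>

/* Pick the larger of two scratch requirements: both paths share one buffer,
 * and only one of them will actually run for a given MUL_MAT_ID node. */
static size_t work_size_for_mul_mat_id(size_t generic_bytes, size_t mixmul_bytes) {
    return generic_bytes > mixmul_bytes ? generic_bytes : mixmul_bytes;
}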
