Commit f1a134a

llamafile : improve moe prompt eval speed on cpu
This change introduces a llamafile_mixmul() API that allows tinyBLAS to speed up "Mixture of Experts" models. On my Threadripper, Mixtral's 8x7b F16 weights now process prompts 2x faster. I'm also seeing a 60 percent improvement with Mixtral 8x22b Q4_0. The same applies to Q8_0, which is also supported by tinyBLAS.

MoE models spend the majority of their time inside MUL_MAT_ID rather than MUL_MAT, which is why llamafile_sgemm was not able to help them before. llamafile_mixmul works by decomposing the mixmul operation into sgemm calls.
1 parent 4b1c3c9 commit f1a134a
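
The decomposition described in the commit message (MUL_MAT_ID broken down into per-expert sgemm calls) can be pictured with a minimal sketch. Everything below is illustrative only: the float-only layout, top-1 routing, and helper names (sgemm, mixmul) are simplifying assumptions, not the actual tinyBLAS implementation, which also handles quantized types, multi-threading, and ggml's tensor strides.

/*
 * Sketch: a mixture-of-experts matmul decomposed into one GEMM per expert.
 * Illustrative only -- plain floats, top-1 routing, naive GEMM; the real
 * llamafile_mixmul also covers quantized types, threading, and ggml layout.
 */
#include <stdio.h>
#include <stdlib.h>

/* naive GEMM: Y[m x n] = X[m x k] * W[n x k]^T, row-major */
static void sgemm(int m, int n, int k, const float *X, const float *W, float *Y) {
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++) {
            float acc = 0.0f;
            for (int l = 0; l < k; l++)
                acc += X[i*k + l] * W[j*k + l];
            Y[i*n + j] = acc;
        }
}

/* for each expert: gather its tokens, run one GEMM, scatter the results back */
static void mixmul(int n_tok, int n_expert, int d_in, int d_out,
                   const float *x,   /* [n_tok][d_in]           activations      */
                   const float *w,   /* [n_expert][d_out][d_in] expert weights   */
                   const int   *ids, /* [n_tok]                 routed expert id */
                   float       *y) { /* [n_tok][d_out]          output           */
    int   *rows = malloc(sizeof(int)   * n_tok);
    float *Xe   = malloc(sizeof(float) * (size_t)n_tok * d_in);
    float *Ye   = malloc(sizeof(float) * (size_t)n_tok * d_out);
    for (int e = 0; e < n_expert; e++) {
        int m = 0;
        for (int i = 0; i < n_tok; i++) {        /* gather rows routed to expert e */
            if (ids[i] != e) continue;
            rows[m] = i;
            for (int l = 0; l < d_in; l++) Xe[m*d_in + l] = x[i*d_in + l];
            m++;
        }
        if (m == 0) continue;
        sgemm(m, d_out, d_in, Xe, w + (size_t)e*d_out*d_in, Ye);  /* one GEMM per expert */
        for (int r = 0; r < m; r++)              /* scatter back to original token order */
            for (int j = 0; j < d_out; j++)
                y[rows[r]*d_out + j] = Ye[r*d_out + j];
    }
    free(rows); free(Xe); free(Ye);
}

int main(void) {
    float x[2*3]   = {1,2,3, 4,5,6};             /* 2 tokens, d_in = 3 */
    float w[2*2*3] = {1,0,0, 0,1,0,              /* expert 0 */
                      0,0,1, 1,1,1};             /* expert 1 */
    int   ids[2]   = {0, 1};
    float y[2*2];
    mixmul(2, 2, 3, 2, x, w, ids, y);
    printf("%g %g | %g %g\n", y[0], y[1], y[2], y[3]);   /* prints: 1 2 | 6 15 */
    return 0;
}

Once each expert's tokens are packed contiguously, the bulk of the work becomes a handful of dense GEMMs, which is the case tinyBLAS is built to accelerate.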

File tree

4 files changed: +599 -216 lines

common/common.cpp

Lines changed: 3 additions & 3 deletions
@@ -73,7 +73,7 @@
 using json = nlohmann::ordered_json;
 
 int32_t get_num_physical_cores() {
-#ifdef __linux__
+#if defined(__linux__) || defined(__COSMOPOLITAN__)
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -108,7 +108,7 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -162,7 +162,7 @@ static int count_math_cpus(int cpu_count) {
  * Returns number of CPUs on system that are useful for math.
  */
 int get_math_cpu_count() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
     int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
     if (cpu_count < 1) {
         return get_num_physical_cores();

ggml.c

Lines changed: 7 additions & 1 deletion
@@ -10991,11 +10991,14 @@ static void ggml_compute_forward_mul_mat_id(
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * ids = dst->src[2];
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    if (llamafile_mixmul(params, src0, src1, ids, dst))
+        return;
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -18492,6 +18495,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur = 0;
                     const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
+                    const struct ggml_tensor * src2 = node->src[2];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
@@ -18500,6 +18504,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur += GGML_PAD(cur, sizeof(int64_t));       // align
                     cur += n_as * sizeof(int64_t);               // matrix_row_counts
                     cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    size_t cur2 = llamafile_mixmul_needs(src0, src1, src2);
+                    cur = cur > cur2 ? cur : cur2;
                 } break;
             case GGML_OP_OUT_PROD:
                 {
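
One note on the ggml_graph_plan hunk above: the node's work buffer is sized as the larger of the generic MUL_MAT_ID scratch requirement and whatever llamafile_mixmul_needs() reports, rather than their sum, because only one of the two code paths runs at compute time. A minimal sketch of that sizing pattern (the helper name below is hypothetical, not a ggml API):

#include <stddef.h>

/* Pick the larger of two scratch requirements: both paths share one buffer,
 * and only one of them will actually run for a given MUL_MAT_ID node. */
static size_t work_size_for_mul_mat_id(size_t generic_bytes, size_t mixmul_bytes) {
    return generic_bytes > mixmul_bytes ? generic_bytes : mixmul_bytes;
}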
