Skip to content

Commit 492b76d

Browse files
committed
Address review comments
1 parent 316df55 commit 492b76d

File tree

4 files changed

+30
-17
lines changed

4 files changed

+30
-17
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -219,6 +219,11 @@ ifdef LLAMA_DISABLE_LOGS
219219
MK_CPPFLAGS += -DLOG_DISABLE_LOGS
220220
endif # LLAMA_DISABLE_LOGS
221221

222+
# disable ggml.c's use of sgemm.cpp
223+
ifdef LLAMA_NO_LLAMAFILE
224+
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE=0
225+
endif
226+
222227
# warnings
223228
WARN_FLAGS = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
224229
MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \

common/common.cpp

Lines changed: 13 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -109,11 +109,11 @@ int32_t get_num_physical_cores() {
109109

110110
static void cpuid(unsigned leaf, unsigned subleaf,
111111
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
112-
__asm__("movq\t%%rbx,%%rsi\n\t"
113-
"cpuid\n\t"
114-
"xchgq\t%%rbx,%%rsi"
115-
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
116-
: "0"(leaf), "2"(subleaf));
112+
__asm__("movq\t%%rbx,%%rsi\n\t"
113+
"cpuid\n\t"
114+
"xchgq\t%%rbx,%%rsi"
115+
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
116+
: "0"(leaf), "2"(subleaf));
117117
}
118118

119119
static int pin_cpu(int cpu) {
@@ -140,10 +140,12 @@ static bool is_running_on_efficiency_core(void) {
140140
static int count_math_cpus(int cpu_count) {
141141
int result = 0;
142142
for (int cpu = 0; cpu < cpu_count; ++cpu) {
143-
if (pin_cpu(cpu))
143+
if (pin_cpu(cpu)) {
144144
return -1;
145-
if (is_running_on_efficiency_core())
145+
}
146+
if (is_running_on_efficiency_core()) {
146147
continue; // efficiency cores harm lockstep threading
148+
}
147149
++cpu; // hyperthreading isn't useful for linear algebra
148150
++result;
149151
}
@@ -158,15 +160,17 @@ static int count_math_cpus(int cpu_count) {
158160
int get_math_cpu_count() {
159161
#if defined(__x86_64__) && defined(__linux__)
160162
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
161-
if (cpu_count < 1)
163+
if (cpu_count < 1) {
162164
return get_num_physical_cores();
165+
}
163166
if (is_hybrid_cpu()) {
164167
cpu_set_t affinity;
165168
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
166169
int result = count_math_cpus(cpu_count);
167170
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
168-
if (result > 0)
171+
if (result > 0) {
169172
return result;
173+
}
170174
}
171175
}
172176
#endif

ggml.c

Lines changed: 10 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -33,6 +33,10 @@
3333
#include <unistd.h>
3434
#endif
3535

36+
#ifndef GGML_USE_LLAMAFILE
37+
#define GGML_USE_LLAMAFILE 1
38+
#endif
39+
3640
#if defined(_MSC_VER)
3741
// disable "possible loss of data" to avoid hundreds of casts
3842
// we should just be careful :)
@@ -10811,7 +10815,8 @@ static void ggml_compute_forward_mul_mat(
1081110815
}
1081210816
#endif
1081310817

10814-
if (src1_cont) {
10818+
#if GGML_USE_LLAMAFILE
10819+
if (nb10 == ggml_type_size(src1->type)) {
1081510820
for (int64_t j = 0; j < ne13; j++)
1081610821
for (int64_t i = 0; i < ne12; i++)
1081710822
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -10830,6 +10835,7 @@ static void ggml_compute_forward_mul_mat(
1083010835
return;
1083110836
}
1083210837
UseGgmlGemm1:;
10838+
#endif
1083310839

1083410840
if (params->type == GGML_TASK_TYPE_INIT) {
1083510841
if (ith != 0) {
@@ -10862,7 +10868,8 @@ UseGgmlGemm1:;
1086210868
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
1086310869
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
1086410870

10865-
if (src1_cont) {
10871+
#if GGML_USE_LLAMAFILE
10872+
if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
1086610873
for (int64_t j = 0; j < ne13; j++)
1086710874
for (int64_t i = 0; i < ne12; i++)
1086810875
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -10882,6 +10889,7 @@ UseGgmlGemm1:;
1088210889
return;
1088310890
}
1088410891
UseGgmlGemm2:;
10892+
#endif
1088510893

1088610894
const int64_t nr0 = ne01; // src0 rows
1088710895
const int64_t nr1 = ne1*ne12*ne13; // src1 rows

sgemm.cpp

Lines changed: 2 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -1079,10 +1079,8 @@ bool llamafile_sgemm(int m, int n, int k, const void *A, int lda, const void *B,
10791079
}
10801080

10811081
case GGML_TYPE_Q8_0: {
1082-
if (k % 32)
1083-
return false;
1084-
if (Btype != GGML_TYPE_Q8_0)
1085-
return false;
1082+
if (Btype != GGML_TYPE_Q8_0)
1083+
return false;
10861084
#if defined(__AVX2__) || defined(__AVX512F__)
10871085
tinyBLAS_Q0_AVX2<block_q8_0, block_q8_0, float> tb{
10881086
k, (const block_q8_0 *)A, lda,
@@ -1105,8 +1103,6 @@ bool llamafile_sgemm(int m, int n, int k, const void *A, int lda, const void *B,
11051103
}
11061104

11071105
case GGML_TYPE_Q4_0: {
1108-
if (k % 32)
1109-
return false;
11101106
if (Btype != GGML_TYPE_Q8_0)
11111107
return false;
11121108
#if defined(__AVX2__) || defined(__AVX512F__)

0 commit comments

Comments (0)