Alternate OpenCL support via the CLBlast Netlib BLAS API

trholding · trholding · commit edd43553db34 · 2023-05-08T11:04:42.000+05:30
Experimental alternate OpenCL support via the CLBlast Netlib BLAS API. Performance is quite similar to the CLBlast-optimized implementation when both are tested on the same low-end, older AMD A9 APU. CLBlast needs to be compiled with the ```-DNETLIB=ON``` flag. Rationale: support more hardware. This is meant to be used as a last resort for GPU acceleration when other methods don't work or are not compatible. Since the OpenCL 1.x EMBEDDED PROFILE is supported, I anticipate that this could enable acceleration on single-board computers and smartphones. It also serves as a template for pre-emptive OpenCL support in projects that use ggml: it could provide baseline GPU acceleration without custom OpenCL code or added effort, because CLBlast is a drop-in BLAS replacement when the Netlib API is enabled. More info: https://github.com/CNugteren/CLBlast/blob/master/doc/bindings.md and CNugteren/CLBlast#227. Usage: ``` Makefile: cd whisper.cpp ; WHISPER_CLBLAST_NETLIB=1 make CMake: cd whisper.cpp ; mkdir build ; cd build ; cmake -DWHISPER_CLBLAST_NETLIB=ON .. ; make ```
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -59,9 +59,10 @@ if (APPLE)
     option(WHISPER_COREML                "whisper: enable Core ML framework"     OFF)
     option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback"    OFF)
 else()
-    option(WHISPER_OPENBLAS              "whisper: support for OpenBLAS" OFF)
-    option(WHISPER_CUBLAS                "whisper: support for cuBLAS"   OFF)
-    option(WHISPER_CLBLAST               "whisper: use CLBlast"          OFF)
+    option(WHISPER_OPENBLAS              "whisper: support for OpenBLAS"            OFF)
+    option(WHISPER_CUBLAS                "whisper: support for cuBLAS"              OFF)
+    option(WHISPER_CLBLAST               "whisper: support for CLBlast"             OFF)
+    option(WHISPER_CLBLAST_NETLIB        "whisper: support for CLBlast Netlib API"  OFF)    
 endif()
 
 option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -180,6 +181,19 @@ if (WHISPER_CLBLAST)
     endif()
 endif()
 
+if (WHISPER_CLBLAST_NETLIB)
+    find_package(CLBlast)
+    if (CLBlast_FOUND)
+        message(STATUS "CLBlast found")
+
+        add_compile_definitions(GGML_USE_CLBLASTNETLIB)
+
+        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
+    else()
+        message(WARNING "CLBlast not found")
+    endif()
+endif()
+
 # compiler flags
 
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
diff --git a/Makefile b/Makefile
@@ -180,6 +180,11 @@ ggml-opencl.o: ggml-opencl.c ggml-opencl.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif
 
+ifdef WHISPER_CLBLAST_NETLIB
+	CFLAGS  += -DGGML_USE_CLBLASTNETLIB
+	LDFLAGS += -lclblast -lOpenCL
+endif
+
 ifdef WHISPER_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
diff --git a/ggml.c b/ggml.c
@@ -143,6 +143,8 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_CLBLASTNETLIB)
+#include <clblast_netlib_c.h>
 #endif
 
 #undef MIN
@@ -8187,7 +8189,7 @@ static void ggml_compute_forward_rms_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8228,7 +8230,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -8294,7 +8296,7 @@ static void ggml_compute_forward_mul_mat_f32(
     }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8469,7 +8471,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -8689,7 +8691,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -11738,7 +11740,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         else
 #endif
                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
@@ -11752,13 +11754,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                             }
 #endif
                         } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -13247,7 +13249,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
     return 1;
 #else
     return 0;
diff --git a/ggml.h b/ggml.h
@@ -901,6 +901,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_blas       (void);
     GGML_API int ggml_cpu_has_cublas     (void);
     GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_clblastnetlib    (void);    
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_vsx        (void);