Skip to content

Commit edd4355

Browse files
committed
Alternate OpenCL support via the CLBlast Netlib BLAS API
Experimental alternate OpenCL support via the CLBlast Netlib BLAS API. Performance is very close to that of the optimized CLBlast implementation when both are tested on the same low-end, older AMD A9 APU. CLBlast needs to be compiled with the `-DNETLIB=ON` flag.

Rationale: support more hardware. This is meant to be used as a last resort for GPU acceleration when other methods do not work or are not compatible. Since the OpenCL 1.x EMBEDDED PROFILE is supported, I anticipate that this could enable acceleration on single-board computers and smartphones. It also serves as a template for pre-emptive OpenCL support in projects that use ggml. This could provide baseline GPU acceleration without custom OpenCL code or added effort, because CLBlast is a drop-in BLAS when its Netlib API is enabled.

More info: https://github.com/CNugteren/CLBlast/blob/master/doc/bindings.md and CNugteren/CLBlast#227

Usage:
```
Makefile:
cd whisper.cpp
WHISPER_CLBLAST_NETLIB=1 make

CMake:
cd whisper.cpp ; mkdir build ; cd build
cmake -DWHISPER_CLBLAST_NETLIB=ON ..
make
```
1 parent 9b40b43 commit edd4355

File tree

4 files changed

+34
-12
lines changed

4 files changed

+34
-12
lines changed

CMakeLists.txt

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,10 @@ if (APPLE)
5959
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
6060
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
6161
else()
62-
option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
63-
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
64-
option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
62+
option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
63+
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
64+
option(WHISPER_CLBLAST "whisper: support for CLBlast" OFF)
65+
option(WHISPER_CLBLAST_NETLIB "whisper: support for CLBlast Netlib API" OFF)
6566
endif()
6667

6768
option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -180,6 +181,19 @@ if (WHISPER_CLBLAST)
180181
endif()
181182
endif()
182183

184+
# Optional backend: CLBlast used through its Netlib-style BLAS API.
# Runs only when the WHISPER_CLBLAST_NETLIB option is enabled.
if (WHISPER_CLBLAST_NETLIB)
185+
# Not REQUIRED: if CLBlast is missing, configuration continues and only the
# warning in the else() branch below is printed.
# NOTE(review): the option was explicitly requested, so a hard failure may be
# preferable here - confirm the intended behavior.
find_package(CLBlast)
186+
if (CLBlast_FOUND)
187+
message(STATUS "CLBlast found")
188+
189+
# Directory-scoped definition. ggml.c tests GGML_USE_CLBLASTNETLIB to include
# <clblast_netlib_c.h> and to enable its BLAS mul_mat code paths.
add_compile_definitions(GGML_USE_CLBLASTNETLIB)
190+
191+
# Append the clblast library to the extra-libs list.
# NOTE(review): presumably WHISPER_EXTRA_LIBS is linked into the whisper
# target elsewhere in this file - verify.
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
192+
else()
193+
# No define and no library are added in this branch.
message(WARNING "CLBlast not found")
194+
endif()
195+
endif()
196+
183197
# compiler flags
184198

185199
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,11 @@ ggml-opencl.o: ggml-opencl.c ggml-opencl.h
180180
$(CC) $(CFLAGS) -c $< -o $@
181181
endif
182182

183+
# Optional backend: CLBlast used through its Netlib-style BLAS API.
# Enabled when WHISPER_CLBLAST_NETLIB is defined, e.g. `WHISPER_CLBLAST_NETLIB=1 make`.
ifdef WHISPER_CLBLAST_NETLIB
184+
# ggml.c tests this macro to include <clblast_netlib_c.h> and to enable its BLAS code paths.
CFLAGS += -DGGML_USE_CLBLASTNETLIB
185+
# Link against CLBlast and the OpenCL library; no library search path is added here.
LDFLAGS += -lclblast -lOpenCL
186+
endif
187+
183188
ifdef WHISPER_GPROF
184189
CFLAGS += -pg
185190
CXXFLAGS += -pg

ggml.c

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ inline static void* ggml_aligned_malloc(size_t size) {
143143
#include "ggml-cuda.h"
144144
#elif defined(GGML_USE_CLBLAST)
145145
#include "ggml-opencl.h"
146+
#elif defined(GGML_USE_CLBLASTNETLIB)
147+
#include <clblast_netlib_c.h>
146148
#endif
147149

148150
#undef MIN
@@ -8187,7 +8189,7 @@ static void ggml_compute_forward_rms_norm(
81878189

81888190
// ggml_compute_forward_mul_mat
81898191

8190-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8192+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
81918193
// helper function to determine if it is better to use BLAS or not
81928194
// for large matrices, BLAS is faster
81938195
static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8228,7 +8230,7 @@ static void ggml_compute_forward_mul_mat_f32(
82288230
const int64_t ne02 = src0->ne[2];
82298231
const int64_t ne03 = src0->ne[3];
82308232

8231-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8233+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
82328234
const int64_t ne10 = src1->ne[0];
82338235
#endif
82348236
const int64_t ne11 = src1->ne[1];
@@ -8294,7 +8296,7 @@ static void ggml_compute_forward_mul_mat_f32(
82948296
}
82958297
#endif
82968298

8297-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8299+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
82988300
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
82998301
if (params->ith != 0) {
83008302
return;
@@ -8469,7 +8471,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
84698471
}
84708472
#endif
84718473

8472-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8474+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
84738475
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
84748476
GGML_ASSERT(nb10 == sizeof(float));
84758477

@@ -8689,7 +8691,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
86898691
}
86908692
#endif
86918693

8692-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8694+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
86938695
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
86948696
if (params->ith != 0) {
86958697
return;
@@ -11738,7 +11740,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
1173811740
else
1173911741
#endif
1174011742
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
11741-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11743+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
1174211744
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
1174311745
node->n_tasks = 1; // TODO: this actually is doing nothing
1174411746
// the threads are still spinning
@@ -11752,13 +11754,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
1175211754
#endif
1175311755
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
1175411756
cur = 0;
11755-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11757+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
1175611758
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
1175711759
node->n_tasks = 1;
1175811760
}
1175911761
#endif
1176011762
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
11761-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11763+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
1176211764
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
1176311765
node->n_tasks = 1;
1176411766
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -13247,7 +13249,7 @@ int ggml_cpu_has_wasm_simd(void) {
1324713249
}
1324813250

1324913251
int ggml_cpu_has_blas(void) {
13250-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
13252+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_CLBLASTNETLIB)
1325113253
return 1;
1325213254
#else
1325313255
return 0;

ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -901,6 +901,7 @@ extern "C" {
901901
GGML_API int ggml_cpu_has_blas (void);
902902
GGML_API int ggml_cpu_has_cublas (void);
903903
GGML_API int ggml_cpu_has_clblast (void);
904+
// NOTE(review): the ggml.c hunks visible in this commit add no definition for
// this function - confirm one exists, otherwise any caller will fail to link.
GGML_API int ggml_cpu_has_clblastnetlib (void);
904905
GGML_API int ggml_cpu_has_gpublas (void);
905906
GGML_API int ggml_cpu_has_sse3 (void);
906907
GGML_API int ggml_cpu_has_vsx (void);

0 commit comments

Comments
 (0)