
Commit f578b86

slaren and ggerganov authored
move BLAS to a separate backend (#6210)
* move BLAS to a separate backend
* rename GGML_USE_OPENBLAS to GGML_USE_BLAS
* alloc : reuse the same buffer when the same buffer type is used multiple times
* set the number of threads automatically for OpenBLAS and BLIS
* sched : print assignments when the GGML_SCHED_DEBUG env variable is set
* sched : allow ops with weights on an incompatible buffer type

  This will cause the weight to be copied to a backend that supports the op,
  which is very costly. The weight should have been stored in a buffer of a
  backend that can run the op, but llama.cpp cannot do this automatically at
  the moment.

---------

Co-authored-by: Georgi Gerganov <[email protected]>
Parent: 1c641e6

17 files changed: 821 additions, 379 deletions
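The headline change is that BLAS matrix multiplication now lives in its own ggml backend (ggml-blas.h / ggml-blas.cpp) instead of being wired directly into the CPU code. Below is a minimal sketch of how a host program might drive it, assuming the entry points declared in the new header (ggml_backend_blas_init, ggml_backend_blas_set_n_threads); the surrounding setup is illustrative only, not code from this commit, and requires a build with GGML_USE_BLAS defined.

// Sketch only: assumes the interface added in ggml-blas.h by this commit.
#include "ggml-backend.h"
#include "ggml-blas.h"
#include <stdio.h>

int main(void) {
    ggml_backend_t backend = ggml_backend_blas_init();
    if (backend == NULL) {
        fprintf(stderr, "failed to init BLAS backend\n");
        return 1;
    }

    // For OpenBLAS/BLIS the thread count is now picked automatically,
    // but it can still be overridden per backend instance:
    ggml_backend_blas_set_n_threads(backend, 4);

    // ... build a ggml_cgraph and evaluate it with
    // ggml_backend_graph_compute(backend, graph) ...

    ggml_backend_free(backend);
    return 0;
}

Per the commit message, setting the GGML_SCHED_DEBUG environment variable makes the scheduler print its node-to-backend assignments, which helps spot the costly weight copies mentioned above.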

CMakeLists.txt

Lines changed: 16 additions & 7 deletions
@@ -39,8 +39,12 @@ endif()
 if (APPLE)
     set(LLAMA_METAL_DEFAULT ON)
+    set(LLAMA_BLAS_DEFAULT ON)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
 else()
     set(LLAMA_METAL_DEFAULT OFF)
+    set(LLAMA_BLAS_DEFAULT OFF)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
 endif()

 set(LLAMA_LLAMAFILE_DEFAULT ON)
@@ -91,9 +95,10 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT})
+set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
+    "llama: BLAS library vendor")
 option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -311,17 +316,17 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()

     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)

     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
             # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
             # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
             find_package(PkgConfig REQUIRED)
@@ -374,12 +379,15 @@ if (LLAMA_BLAS)

         add_compile_options(${BLAS_LINKER_FLAGS})

-        add_compile_definitions(GGML_USE_OPENBLAS)
+        add_compile_definitions(GGML_USE_BLAS)

         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()

+        set(GGML_HEADERS_BLAS ggml-blas.h)
+        set(GGML_SOURCES_BLAS ggml-blas.cpp)
+
         set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()
@@ -1258,6 +1266,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
             ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
             ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )

Makefile

Lines changed: 20 additions & 7 deletions
@@ -440,10 +440,11 @@ ifndef LLAMA_NO_ACCELERATE
     # Mac OS - include Accelerate framework.
     # `-framework Accelerate` works both with Apple Silicon and Mac Intel
     ifeq ($(UNAME_S),Darwin)
-        MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+        MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
         MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
         MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
         MK_LDFLAGS  += -framework Accelerate
+        OBJS        += ggml-blas.o
     endif
 endif # LLAMA_NO_ACCELERATE
@@ -454,21 +455,30 @@ ifndef LLAMA_NO_OPENMP
 endif # LLAMA_NO_OPENMP

 ifdef LLAMA_OPENBLAS
-    MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
+    MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
     MK_LDFLAGS  += $(shell pkg-config --libs openblas)
+    OBJS        += ggml-blas.o
 endif # LLAMA_OPENBLAS

-ifndef LLAMA_NO_LLAMAFILE
-    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-    OBJS += sgemm.o
-endif
+ifdef LLAMA_OPENBLAS64
+    MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
+    MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
+    OBJS        += ggml-blas.o
+endif # LLAMA_OPENBLAS64

 ifdef LLAMA_BLIS
-    MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+    MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
     MK_LDFLAGS  += -lblis -L/usr/local/lib
+    OBJS        += ggml-blas.o
 endif # LLAMA_BLIS

+ifndef LLAMA_NO_LLAMAFILE
+    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+    OBJS += sgemm.o
+endif
+
 ifdef LLAMA_RPC
     MK_CPPFLAGS += -DGGML_USE_RPC
     OBJS        += ggml-rpc.o
@@ -776,6 +786,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@

+ggml-blas.o: ggml-blas.cpp ggml-blas.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
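Both build systems now define GGML_USE_BLAS where they previously defined GGML_USE_OPENBLAS, and both compile the new ggml-blas.o object. A hedged sketch of how downstream code might handle the rename; the #error branch is illustrative only, not from the commit:

// Sketch: gating on the renamed macro. After this commit GGML_USE_BLAS is
// defined for OpenBLAS, OpenBLAS64, BLIS, Accelerate, and MKL builds alike;
// GGML_USE_OPENBLAS is no longer defined by the build files shown above.
#if defined(GGML_USE_OPENBLAS)
#error "GGML_USE_OPENBLAS was renamed; build with GGML_USE_BLAS instead"
#elif defined(GGML_USE_BLAS)
#include "ggml-blas.h" // the BLAS code now lives in its own translation unit
#endif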

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 0 deletions
@@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format        = cmd_params_defaults.output_format;
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps                 = cmd_params_defaults.reps;
+    params.numa                 = cmd_params_defaults.numa;

     for (int i = 1; i < argc; i++) {
         arg = argv[i];

ggml-alloc.c

Lines changed: 77 additions & 21 deletions
@@ -339,6 +339,7 @@ struct hash_node {
 };

 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };

 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         }
     }

-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }

-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                 AT_PRINTF("view_src %s: %d children, %d views\n",
                     view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                    ggml_gallocr_free_node(galloc, view_src);
                 }
             }
             else if (p_hn->allocated) {
-                ggml_gallocr_free_node(galloc, parent, buffer_id);
+                ggml_gallocr_free_node(galloc, parent);
             }
         }
         AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -706,16 +741,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
     }

     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

     if (tensor->view_src != NULL) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
         }
     }

-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];

-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }

     return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
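Taken together, the ggml-alloc.c changes mean that passing the same buffer type to ggml_gallocr_new_n more than once now shares one dynamic allocator and one backend buffer between those slots, and ggml_gallocr_get_buffer_size reports the shared buffer only once. A usage sketch, assuming the public ggml-alloc.h / ggml-backend.h API is otherwise unchanged and that reserving a real graph would size the buffer:

// Sketch: two allocator slots backed by the same CPU buffer type now share
// a single buffer, so naive size summation no longer double counts it.
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdio.h>

int main(void) {
    ggml_backend_buffer_type_t cpu = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_type_t bufts[2] = { cpu, cpu }; // same type twice

    ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);

    // ... ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids, leaf_buffer_ids)
    //     would allocate the single shared buffer here ...

    size_t total = 0;
    for (int i = 0; i < 2; i++) {
        // after this commit the duplicate slot reports 0 instead of
        // repeating the shared buffer's size
        total += ggml_gallocr_get_buffer_size(galloc, i);
    }
    printf("total allocated: %zu bytes\n", total);

    ggml_gallocr_free(galloc); // frees the shared allocator and buffer once
    return 0;
}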
