
Commit ca91205

move BLAS to a separate backend
1 parent b90dc56 commit ca91205

16 files changed, +588 -338 lines

CMakeLists.txt

Lines changed: 7 additions & 3 deletions
@@ -311,9 +311,9 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()
 
     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)
@@ -380,6 +380,9 @@ if (LLAMA_BLAS)
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
 
+        set(GGML_HEADERS_BLAS ggml-blas.h)
+        set(GGML_SOURCES_BLAS ggml-blas.c)
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()
@@ -1255,6 +1258,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
             ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
             ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )
 
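
Note: the two new files registered here, ggml-blas.h and ggml-blas.c, are not shown in this excerpt. For orientation only, a ggml backend header usually exposes an init function and a type check along the lines of the sketch below; the exact declarations in ggml-blas.h may differ.

// sketch of a typical ggml backend header, for orientation only --
// the real ggml-blas.h added by this commit may declare a different interface
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// create a BLAS backend instance (runs matrix multiplication through the linked BLAS library)
GGML_API ggml_backend_t ggml_backend_blas_init(void);

// check whether a backend instance is the BLAS backend
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);

#ifdef __cplusplus
}
#endif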

Makefile

Lines changed: 17 additions & 4 deletions
@@ -408,6 +408,7 @@ ifndef LLAMA_NO_ACCELERATE
         MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
         MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
         MK_LDFLAGS  += -framework Accelerate
+        OBJS        += ggml-blas.o
     endif
 endif # LLAMA_NO_ACCELERATE
 
@@ -421,23 +422,35 @@ ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
     MK_LDFLAGS  += $(shell pkg-config --libs openblas)
+    OBJS        += ggml-blas.o
 endif # LLAMA_OPENBLAS
 
-ifndef LLAMA_NO_LLAMAFILE
-    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-    OBJS += sgemm.o
-endif
+ifdef LLAMA_OPENBLAS64
+    MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas64)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
+    MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
+    OBJS        += ggml-blas.o
+endif # LLAMA_OPENBLAS64
 
 ifdef LLAMA_BLIS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
     MK_LDFLAGS  += -lblis -L/usr/local/lib
+    OBJS        += ggml-blas.o
 endif # LLAMA_BLIS
 
+ifndef LLAMA_NO_LLAMAFILE
+    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+    OBJS += sgemm.o
+endif
+
 ifdef LLAMA_RPC
     MK_CPPFLAGS += -DGGML_USE_RPC
     OBJS        += ggml-rpc.o
 endif # LLAMA_RPC
 
+ggml-blas.o: ggml-blas.c ggml-blas.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
     LLAMA_CUDA := 1
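
Every BLAS flavor here (Accelerate, OpenBLAS, OpenBLAS64, BLIS) now links the same ggml-blas.o object, built by the single rule above; only the preprocessor defines and linker flags differ. A minimal sketch of how the new translation unit could pick a vendor header from those defines; the actual include logic in ggml-blas.c is not part of this excerpt and may differ:

// illustrative only: vendor header selection driven by the build flags set above
#if defined(GGML_BLAS_USE_MKL)
#    include <mkl.h>
#elif defined(GGML_USE_OPENBLAS)
#    include <cblas.h>
#else
// e.g. Accelerate on macOS, or another CBLAS-compatible implementation
#    include <Accelerate/Accelerate.h>
#endif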

ggml-alloc.c

Lines changed: 25 additions & 17 deletions
@@ -339,6 +339,7 @@ struct hash_node {
 };
 
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };
 
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -511,17 +511,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +627,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                 AT_PRINTF("view_src %s: %d children, %d views\n",
                     view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                    ggml_gallocr_free_node(galloc, view_src);
                 }
             }
             else if (p_hn->allocated) {
-                ggml_gallocr_free_node(galloc, parent, buffer_id);
+                ggml_gallocr_free_node(galloc, parent);
             }
         }
         AT_PRINTF("\n");
@@ -674,22 +675,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
+        //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
            node_alloc->dst.offset = SIZE_MAX;
            node_alloc->dst.size_max = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                node_alloc->src[j].offset = SIZE_MAX;
                node_alloc->src[j].size_max = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                node_alloc->src[j].offset = hn->offset;
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
@@ -706,9 +711,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
            galloc->leaf_allocs[i].leaf.size_max = 0;
        } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
            galloc->leaf_allocs[i].leaf.offset = hn->offset;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
@@ -740,7 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
@@ -768,8 +776,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
@@ -793,7 +801,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +813,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +854,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +865,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
ggml-backend-impl.h

Lines changed: 20 additions & 8 deletions
@@ -17,13 +17,15 @@ extern "C" {
1717

1818
struct ggml_backend_buffer_type_i {
1919
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
20+
// allocate a buffer of this type
2021
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
21-
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
22-
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
23-
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
24-
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
22+
// tensor alignment
23+
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
24+
// max buffer size that can be allocated
25+
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
26+
// data size needed to allocate the tensor, including padding
27+
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
2528
// check if tensor data is in host memory
26-
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
2729
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
2830
};
2931

@@ -92,27 +94,37 @@ extern "C" {
9294
void (*GGML_CALL synchronize)(ggml_backend_t backend);
9395

9496
// compute graph with a plan (not used currently)
97+
// create a new plan for a graph
9598
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
9699
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100+
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
101+
void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
102+
// compute the graph with the plan
103+
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
97104

98-
// compute graph with a plan
99-
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100105
// compute graph without a plan (async)
101106
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
102107

103-
// check if the backend supports an operation
108+
// check if the backend can compute an operation
104109
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
105110

111+
// check if the backend can use tensors allocated in a buffer type
112+
bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
113+
106114
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
107115
// these should be expensive operations with large batch sizes that may benefit from running on this backend
108116
// even if the weight has to be copied from the CPU temporarily
109117
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110118

111119
// (optional) event synchronization
120+
// create a new event that can record events on this backend instance
112121
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
113122
void (*GGML_CALL event_free) (ggml_backend_event_t event);
123+
// record an event on the backend instance that created it
114124
void (*GGML_CALL event_record) (ggml_backend_event_t event);
125+
// wait for an event on on a different backend instance
115126
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
127+
// block until an event is recorded
116128
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
117129
};
118130
