
Commit e712935

llama : ggml-backend integration
1 parent 7bed7eb

8 files changed: +753 −1511 lines

ggml-alloc.c

Lines changed: 11 additions & 0 deletions
@@ -779,10 +779,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     if (nbytes == 0) {
         // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
         return NULL;
     }

     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    if (buffer == NULL) {
+        // failed to allocate buffer
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+#endif
+        return NULL;
+    }
+
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
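Note: with this change, ggml_backend_alloc_ctx_tensors_from_buft returns NULL both when there is nothing left to allocate and when the backend buffer allocation fails, so callers should treat NULL as a condition to handle rather than assume success. A minimal caller-side sketch, assuming the usual ggml headers; the wrapper function and its name are hypothetical and not part of this commit:

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// hypothetical helper: allocate all tensors of a context in a given buffer type
// and surface the NULL case to the caller instead of failing later
static ggml_backend_buffer_t try_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    if (buf == NULL) {
        // either all tensors were already allocated or the allocation failed
        fprintf(stderr, "%s: no buffer allocated for context\n", __func__);
    }
    return buf;
}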

ggml-backend-impl.h

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ extern "C" {
         ggml_backend_buffer_type_t buft;
         ggml_backend_buffer_context_t context;
         size_t size;
+        enum ggml_backend_buffer_usage usage;
     };

     ggml_backend_buffer_t ggml_backend_buffer_init(

ggml-backend.c

Lines changed: 55 additions & 7 deletions
@@ -58,6 +58,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
         /* .buft    = */ buft,
         /* .context = */ context,
         /* .size    = */ size,
+        /* .usage   = */ GGML_BACKEND_BUFFER_USAGE_ANY
     };

     return buffer;
@@ -109,6 +110,10 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
 }

+void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    buffer->usage = usage;
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
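The new usage flag is what lets the scheduler change in sched_backend_from_cur below distinguish weight buffers from everything else. A minimal sketch of how a model loader might tag a weights buffer; the surrounding ctx and buft are assumed to exist and this call sequence is illustrative, not taken from this commit:

// allocate the tensors that hold the model weights, then mark the buffer
// so that ops reading from it are scheduled on the backend that owns it
ggml_backend_buffer_t weights_buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (weights_buf != NULL) {
    ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
}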
@@ -773,7 +778,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
 }

 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -808,17 +813,25 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
         if (src == NULL) {
             break;
         }
+
         ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
-        if (src_backend != NULL) {
+        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+            // operations with weights are always on the same backend as the weights
+            cur_backend = src_backend;
+            SET_CAUSE(node, "1.wgt%d", i);
+            break;
+        }
+
+        //if (src_backend != NULL) {
             int src_prio = sched_backend_prio(sched, src_backend);
             size_t src_size = ggml_nbytes(src);
-            if (src_prio < cur_prio && src_size >= cur_size) {
+            if (/*src_prio < cur_prio &&*/ src_size >= cur_size) {
                 cur_prio = src_prio;
                 cur_size = src_size;
                 cur_backend = src_backend;
                 SET_CAUSE(node, "1.src%d", i);
             }
-        }
+        //}
     }
     return cur_backend;
 }
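The new branch makes buffer usage override the size heuristic: as soon as a source tensor lives in a buffer tagged GGML_BACKEND_BUFFER_USAGE_WEIGHTS, the op is pinned to that source's backend and the remaining sources are not considered. A self-contained mirror of the rule with simplified stand-in types; these structs and names are illustrative only, not the actual scheduler types:

#include <stddef.h>

enum buf_usage { BUF_USAGE_ANY = 0, BUF_USAGE_WEIGHTS = 1 };

struct buf  { enum buf_usage usage; int backend_id; };
struct tens { struct buf * buffer; size_t nbytes;  };

// pick a backend for an op from its sources: a weights source wins outright,
// otherwise the largest source seen so far decides (mirroring the commented-out
// priority check in the diff above)
static int pick_backend(struct tens * const * srcs, int n_srcs, int fallback_id) {
    int    cur_backend = fallback_id;
    size_t cur_size    = 0;
    for (int i = 0; i < n_srcs; i++) {
        const struct tens * src = srcs[i];
        if (src == NULL || src->buffer == NULL) {
            continue;
        }
        if (src->buffer->usage == BUF_USAGE_WEIGHTS) {
            return src->buffer->backend_id; // weights pin the op to their backend
        }
        if (src->nbytes >= cur_size) {
            cur_size    = src->nbytes;
            cur_backend = src->buffer->backend_id;
        }
    }
    return cur_backend;
}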
@@ -929,6 +942,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     }
     //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);

+#if 0
     // pass 2: assign backends to ops from current assignments
     // TODO:
     //  - reuse sched_backend_from_cur
@@ -960,6 +974,23 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             }
         }
     }
+#else
+    // pass 2: assign backends to ops from current assignments
+    // start from the end and assign the same backend to previous ops
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+#endif
     //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);

     // pass 3: assign backends to remaining src from dst (should only be leafs)
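The replacement pass 2 walks the graph from the last node to the first and fills every unassigned node with the allocator of the nearest assigned node that comes after it. For example, if pass 1 assigned only node 2 (say, to CUDA) and node 4 (to CPU) in a 5-node graph, the backward sweep gives node 3 the CPU assignment and nodes 0 and 1 the CUDA assignment; any trailing nodes after the last assigned node would remain unassigned at this point.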
@@ -1025,9 +1056,21 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr != node_allocr) {
-                int n_inputs = sched->splits[cur_split].n_inputs++;
-                GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
-                sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
+                // check if the input is already in the split
+                bool found = false;
+                for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
+                    if (sched->splits[cur_split].inputs[k] == src) {
+                        found = true;
+                        break;
+                    }
+                }
+
+                if (!found) {
+                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                    //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
+                    GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                    sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
+                }

                 // create copies
                 size_t id = hash_id(src);
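The extra scan makes split inputs unique: when several nodes inside the same split read the same external tensor, it is recorded as an input only once instead of once per consumer, which appears to avoid redundant entries (and the associated copies across the backend boundary) and helps keep the count under GGML_MAX_SPLIT_INPUTS.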
@@ -1231,6 +1274,10 @@ void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
     sched_reset(sched);
 }

+int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+    return sched->n_splits;
+}
+
 ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = sched_backend_prio(sched, backend);
     return sched->tallocs[backend_index];
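ggml_backend_sched_get_n_splits exposes how many splits the scheduler produced, which the integration side can use to report how often the graph has to cross backend boundaries. A hypothetical usage fragment, assuming sched and measure_graph already exist; it is not taken from this commit:

// build the schedule from a measure graph, then report the number of splits
ggml_backend_sched_init_measure(sched, measure_graph);
int n_splits = ggml_backend_sched_get_n_splits(sched);
fprintf(stderr, "backend scheduler: graph partitioned into %d splits\n", n_splits);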
@@ -1316,6 +1363,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor

     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
+        graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
         ggml_backend_view_init(dst->view_src->buffer, dst);
     }
     else {
ggml-backend.h

Lines changed: 8 additions & 0 deletions
@@ -24,6 +24,11 @@ extern "C" {
     GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

     // buffer
+    enum ggml_backend_buffer_usage {
+        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
+        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+    };
+
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
     GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
@@ -32,8 +37,10 @@ extern "C" {
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API void   ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

+
     //
     // Backend
     //
@@ -146,6 +153,7 @@ extern "C" {

     // Initialize backend buffers from a measure graph
     GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API int  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

     GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
