Skip to content

Commit 9167c84

Browse files
committed
metal: enable ggml-alloc
Make ggml-alloc work with concurrent dispatch.
1 parent 3ebb009 commit 9167c84

File tree

5 files changed

+61
-26
lines changed

5 files changed

+61
-26
lines changed

ggml-alloc.c

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ struct ggml_allocr {
6767
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
6868
size_t max_size;
6969
bool measure;
70+
int concur_list[GGML_MAX_NODES];
71+
bool has_concur_list;
7072

7173
#ifdef GGML_ALLOCATOR_DEBUG
7274
struct ggml_tensor * allocated_tensors[1024];
@@ -229,6 +231,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
229231
alloc->n_free_blocks++;
230232
}
231233

234+
void ggml_allocr_set_concur_list(struct ggml_allocr * alloc, int * list, int n) {
235+
int pos = 0;
236+
for (int i = 0; i < n; i++) {
237+
if (list[i] != -1) {
238+
alloc->concur_list[pos] = list[i];
239+
pos++;
240+
}
241+
}
242+
alloc->has_concur_list = true;
243+
}
244+
232245
void ggml_allocr_reset(struct ggml_allocr * alloc) {
233246
alloc->n_free_blocks = 1;
234247
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +261,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
248261
/*.hash_table = */ {{0}},
249262
/*.max_size = */ 0,
250263
/*.measure = */ false,
264+
/*.concur_list = */ {0},
265+
/*.has_concur_list = */ false,
251266
#ifdef GGML_ALLOCATOR_DEBUG
252267
/*.allocated_tensors = */ = {0},
253268
#endif
@@ -275,6 +290,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
275290
/*.hash_table = */ {{0}},
276291
/*.max_size = */ 0,
277292
/*.measure = */ true,
293+
/*.concur_list = */ {0},
294+
/*.has_concur_list = */ false,
278295
#ifdef GGML_ALLOCATOR_DEBUG
279296
/*.allocated_tensors = */ = {0},
280297
#endif
@@ -473,7 +490,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
473490
allocate_node(alloc, input);
474491
}
475492
}
476-
for (int i = 0; i < gf->n_nodes; i++) {
493+
for (int ind = 0; ind < gf->n_nodes; ind++) {
494+
int i;
495+
if (alloc->has_concur_list) {
496+
i = alloc->concur_list[ind];
497+
} else {
498+
i = ind;
499+
}
477500
struct ggml_tensor * node = gf->nodes[i];
478501

479502
// allocate parents (leafs)

ggml-alloc.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ extern "C" {
1010
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
1111
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
1212

13+
// tell the allocator to parse nodes following the order described in the list
14+
// you should call this if your graph is optimized and executed out-of-order
15+
GGML_API void ggml_allocr_set_concur_list(struct ggml_allocr * alloc, int * list, int n);
16+
1317
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
1418
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
1519
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);

ggml-metal.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
6363

6464
// try to find operations that can be run concurrently in the graph
6565
// you should run it again if the topology of your graph changes
66-
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
66+
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
6767

68-
// if the graph has been optimized for concurrently dispatch
69-
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
68+
// if the graph has been optimized for concurrent dispatch, return the length of the concur_list if optimized
69+
int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
70+
71+
// output the concur_list for ggml_alloc
72+
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
7073

7174
// same as ggml_graph_compute but uses Metal
7275
// creates gf->n_threads command buffers in parallel

ggml-metal.m

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -228,11 +228,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
228228
ctx->n_cb = n_cb;
229229
}
230230

231-
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
232-
if (ctx->concur_list_len) {
233-
return true;
234-
}
235-
return false;
231+
int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
232+
return ctx->concur_list_len;
233+
}
234+
235+
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
236+
return ctx->concur_list;
236237
}
237238

238239
// finds the Metal buffer that contains the tensor data on the GPU device
@@ -375,7 +376,7 @@ void ggml_metal_get_tensor(
375376

376377
void ggml_metal_graph_find_concurrency(
377378
struct ggml_metal_context * ctx,
378-
struct ggml_cgraph * gf) {
379+
struct ggml_cgraph * gf, bool check_mem) {
379380
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
380381
int nodes_unused[GGML_MAX_CONCUR];
381382

@@ -422,7 +423,7 @@ void ggml_metal_graph_find_concurrency(
422423
}
423424
}
424425
}
425-
if (exe_flag) {
426+
if (exe_flag && check_mem) {
426427
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
427428
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
428429
int64_t data_start = (int64_t) gf->nodes[i]->data;

llama.cpp

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
6363
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
6464

6565

66-
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
66+
#if !defined(GGML_USE_CUBLAS)
6767
#include "ggml-alloc.h"
6868
#define LLAMA_USE_ALLOCATOR
6969
#else
@@ -1846,10 +1846,6 @@ static bool llama_eval_internal(
18461846

18471847
#ifdef GGML_USE_METAL
18481848
if (lctx.ctx_metal && N == 1) {
1849-
// TODO: disabled until #2413 is resolved
1850-
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
1851-
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
1852-
//}
18531849
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
18541850
ggml_metal_graph_compute(lctx.ctx_metal, gf);
18551851
ggml_metal_get_tensor (lctx.ctx_metal, res);
@@ -3303,7 +3299,18 @@ struct llama_context * llama_new_context_with_model(
33033299
int n_past = hparams.n_ctx - n_tokens;
33043300
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
33053301
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
3306-
3302+
#ifdef GGML_USE_METAL
3303+
if(params.n_gpu_layers > 0) {
3304+
ctx->ctx_metal = ggml_metal_init(1);
3305+
if (!ctx->ctx_metal) {
3306+
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
3307+
llama_free(ctx);
3308+
return NULL;
3309+
}
3310+
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
3311+
ggml_allocr_set_concur_list(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal),ggml_metal_if_optimized(ctx->ctx_metal));
3312+
}
3313+
#endif
33073314
// measure memory requirements for the graph
33083315
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
33093316

@@ -3321,6 +3328,11 @@ struct llama_context * llama_new_context_with_model(
33213328

33223329
ctx->buf_alloc.resize(alloc_size);
33233330
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
3331+
#ifdef GGML_USE_METAL
3332+
if(ctx->ctx_metal) {
3333+
ggml_allocr_set_concur_list(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal),ggml_metal_if_optimized(ctx->ctx_metal));
3334+
}
3335+
#endif
33243336
}
33253337
#else
33263338
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3335,13 +3347,6 @@ struct llama_context * llama_new_context_with_model(
33353347
#ifdef GGML_USE_METAL
33363348
if (params.n_gpu_layers > 0) {
33373349
// this allocates all Metal resources and memory buffers
3338-
ctx->ctx_metal = ggml_metal_init(1);
3339-
3340-
if (!ctx->ctx_metal) {
3341-
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
3342-
llama_free(ctx);
3343-
return NULL;
3344-
}
33453350

33463351
void * data_ptr = NULL;
33473352
size_t data_size = 0;
@@ -3370,8 +3375,7 @@ struct llama_context * llama_new_context_with_model(
33703375
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
33713376
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
33723377

3373-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
3374-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
3378+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
33753379
#undef LLAMA_METAL_CHECK_BUF
33763380
}
33773381
#endif

0 commit comments

Comments
 (0)