Skip to content

Commit 38b16df

Browse files
authored
metal : bug-fix when enabling ggml-alloc (#2757)
* metal: better memory alloc with concurrency dispatch — the ggml-alloc should only free tensors at memory barriers. * ggml-alloc: avoid returning silently — in certain cases, the allocate_node() function could silently return without performing any memory allocation.
1 parent 8f8c28e commit 38b16df

File tree

2 files changed

+74
-66
lines changed

2 files changed

+74
-66
lines changed

ggml-alloc.c

Lines changed: 74 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ struct ggml_allocr {
6868
size_t max_size;
6969
bool measure;
7070
int parse_seq[GGML_MAX_NODES];
71-
bool has_parse_seq;
71+
int parse_seq_len;
7272

7373
#ifdef GGML_ALLOCATOR_DEBUG
7474
struct ggml_tensor * allocated_tensors[1024];
@@ -239,14 +239,10 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
239239
}
240240

241241
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
242-
int pos = 0;
243242
for (int i = 0; i < n; i++) {
244-
if (list[i] != -1) {
245-
alloc->parse_seq[pos] = list[i];
246-
pos++;
247-
}
243+
alloc->parse_seq[i] = list[i];
248244
}
249-
alloc->has_parse_seq = true;
245+
alloc->parse_seq_len = n;
250246
}
251247

252248
void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +265,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
269265
/*.max_size = */ 0,
270266
/*.measure = */ false,
271267
/*.parse_seq = */ {0},
272-
/*.has_parse_seq = */ false,
268+
/*.parse_seq_len = */ 0,
273269
#ifdef GGML_ALLOCATOR_DEBUG
274270
/*.allocated_tensors = */ = {0},
275271
#endif
@@ -298,7 +294,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
298294
/*.max_size = */ 0,
299295
/*.measure = */ true,
300296
/*.parse_seq = */ {0},
301-
/*.has_parse_seq = */ false,
297+
/*.parse_seq_len = */ 0,
302298
#ifdef GGML_ALLOCATOR_DEBUG
303299
/*.allocated_tensors = */ = {0},
304300
#endif
@@ -445,8 +441,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
445441
else {
446442
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
447443
node->data = parent->data;
444+
return;
448445
}
449-
return;
450446
}
451447
}
452448
}
@@ -497,69 +493,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
497493
allocate_node(alloc, input);
498494
}
499495
}
500-
for (int ind = 0; ind < gf->n_nodes; ind++) {
501-
int i;
502-
if (alloc->has_parse_seq) {
503-
i = alloc->parse_seq[ind];
504-
} else {
505-
i = ind;
506-
}
507-
struct ggml_tensor * node = gf->nodes[i];
508-
509-
// allocate parents (leafs)
510-
for (int j = 0; j < GGML_MAX_SRC; j++) {
511-
struct ggml_tensor * parent = node->src[j];
512-
if (parent == NULL) {
513-
break;
496+
// if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
497+
int last_barrier_pos = 0;
498+
int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
499+
500+
for (int ind = 0; ind < n_nodes; ind++) {
501+
// allocate a node if there is no parse_seq or this is not a barrier
502+
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
503+
int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
504+
struct ggml_tensor * node = gf->nodes[i];
505+
506+
// allocate parents (leafs)
507+
for (int j = 0; j < GGML_MAX_SRC; j++) {
508+
struct ggml_tensor * parent = node->src[j];
509+
if (parent == NULL) {
510+
break;
511+
}
512+
allocate_node(alloc, parent);
514513
}
515-
allocate_node(alloc, parent);
516-
}
517514

518-
// allocate node
519-
allocate_node(alloc, node);
515+
// allocate node
516+
allocate_node(alloc, node);
520517

521-
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
522-
for (int j = 0; j < GGML_MAX_SRC; j++) {
523-
struct ggml_tensor * parent = node->src[j];
524-
if (parent == NULL) {
525-
break;
526-
}
527-
AT_PRINTF("%s", parent->name);
528-
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
529-
AT_PRINTF(", ");
518+
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
519+
for (int j = 0; j < GGML_MAX_SRC; j++) {
520+
struct ggml_tensor * parent = node->src[j];
521+
if (parent == NULL) {
522+
break;
523+
}
524+
AT_PRINTF("%s", parent->name);
525+
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
526+
AT_PRINTF(", ");
527+
}
530528
}
529+
AT_PRINTF("\n");
531530
}
532-
AT_PRINTF("\n");
531+
533532

534533
// update parents
535-
for (int j = 0; j < GGML_MAX_SRC; j++) {
536-
struct ggml_tensor * parent = node->src[j];
537-
if (parent == NULL) {
538-
break;
539-
}
540-
struct hash_node * p_hn = hash_get(ht, parent);
541-
p_hn->n_children -= 1;
542-
543-
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
544-
545-
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
546-
if (ggml_is_view(parent)) {
547-
struct ggml_tensor * view_src = get_view_source(parent);
548-
struct hash_node * view_src_hn = hash_get(ht, view_src);
549-
view_src_hn->n_views -= 1;
550-
AT_PRINTF("view_src %s\n", view_src->name);
551-
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
552-
ggml_allocator_free_tensor(alloc, view_src);
534+
// update immediately if there is no parse_seq
535+
// update only at barriers if there is parse_seq
536+
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
537+
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
538+
int update_end = alloc->parse_seq_len ? ind : ind + 1;
539+
for (int i = update_start; i < update_end; i++) {
540+
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
541+
struct ggml_tensor * node = gf->nodes[node_i];
542+
543+
for (int j = 0; j < GGML_MAX_SRC; j++) {
544+
struct ggml_tensor * parent = node->src[j];
545+
if (parent == NULL) {
546+
break;
553547
}
554-
}
555-
else {
556-
if (parent->data != node->data) {
557-
ggml_allocator_free_tensor(alloc, parent);
548+
struct hash_node * p_hn = hash_get(ht, parent);
549+
p_hn->n_children -= 1;
550+
551+
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
552+
553+
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
554+
if (ggml_is_view(parent)) {
555+
struct ggml_tensor * view_src = get_view_source(parent);
556+
struct hash_node * view_src_hn = hash_get(ht, view_src);
557+
view_src_hn->n_views -= 1;
558+
AT_PRINTF("view_src %s\n", view_src->name);
559+
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
560+
ggml_allocator_free_tensor(alloc, view_src);
561+
}
562+
}
563+
else {
564+
if (parent->data != node->data) {
565+
ggml_allocator_free_tensor(alloc, parent);
566+
}
567+
}
558568
}
559569
}
560570
}
571+
AT_PRINTF("\n");
572+
if (alloc->parse_seq_len) {
573+
last_barrier_pos = ind + 1;
574+
}
561575
}
562-
AT_PRINTF("\n");
563576
}
564577
// free graph outputs here that wouldn't be freed otherwise because they have no children
565578
if (outputs != NULL && outputs[g] != NULL) {

llama.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2707,11 +2707,6 @@ static struct ggml_cgraph * llm_build_falcon(
27072707
struct ggml_tensor * inpFF = attn_norm;
27082708

27092709
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
2710-
2711-
// TODO: this is temporary needed to introduce artificial dependency between FF and ATTN
2712-
// adding this, because there seems to be a bug in the Metal concurrency optimization
2713-
// without this line, the results are non-deterministic and wrong
2714-
cur->src[2] = attn_out;
27152710
offload_func(cur);
27162711

27172712
cur = ggml_gelu(ctx0, cur);

0 commit comments

Comments
 (0)