Commit 9c72e7e

rebase to master (except ggml-cuda)

1 parent 33ab185

5 files changed: +21 -30 lines changed

ggml-backend.c

Lines changed: 1 addition & 1 deletion

```diff
@@ -289,7 +289,7 @@ void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_t

     if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) {
         if (splits->n_splits > 0) {
-            char name[GGML_MAX_NAME - 1]; // silence -Wformat-truncation
+            char name[GGML_MAX_NAME];
             vsnprintf(name, sizeof(name), fmt, args);
             char new_name[GGML_MAX_NAME];
             snprintf(new_name, sizeof(new_name), "%s,%s", splits->splits[splits->n_splits - 1].name, name);
```
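For context, a minimal standalone snippet (illustrative only, not part of the commit) that triggers the warning the old workaround was silencing: both `%s` arguments can be up to 47 characters, so GCC cannot prove the combined output fits in 48 bytes.

```cpp
#include <cstdio>

#define GGML_MAX_NAME 48

int main() {
    char prev[GGML_MAX_NAME] = "split_0";
    char name[GGML_MAX_NAME] = "split_1";
    char new_name[GGML_MAX_NAME];
    // "%s,%s" can expand to 47 + 1 + 47 = 95 characters plus the terminator,
    // which cannot fit in 48 bytes, so -Wformat-truncation fires here.
    snprintf(new_name, sizeof(new_name), "%s,%s", prev, name);
    printf("%s\n", new_name);
    return 0;
}
```

Either way, `snprintf` guarantees NUL-terminated truncation rather than an overflow, so the worst case with the restored full-size buffer is a shortened split name.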

ggml-cuda.cu

Lines changed: 11 additions & 19 deletions

```diff
@@ -1475,8 +1475,8 @@ static void ggml_cuda_mul_mat(ggml_cuda_context * ctx, ggml_tensor * src0, ggml_
 }

 static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cudaStream_t stream) {
-    ggml_tensor * src0 = node->src0;
-    ggml_tensor * src1 = node->src1;
+    ggml_tensor * src0 = node->src[0];
+    ggml_tensor * src1 = node->src[1];
     ggml_tensor * dst = node;

 #if 0
@@ -1551,8 +1551,6 @@ static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cud
     }
 }

-static const int GGML_MAX_PARENTS = 2 + GGML_MAX_OPT;
-
 static bool ggml_is_noop(ggml_tensor * t) {
     return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
            t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE;
@@ -1581,26 +1579,20 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph *
         ggml_tensor * node = gf->nodes[i];
         const bool is_noop = ggml_is_noop(node);

-        // build a list of parents
-        ggml_tensor * parents[GGML_MAX_PARENTS] = { node->src0, node->src1 };
-        for (int j = 0; j < GGML_MAX_OPT; j++) {
-            parents[j + 2] = node->opt[j];
-        }
-
         // assign an stream for the node
         cudaStream_t stream = nullptr;

         // take a stream from a parent
-        for (int j = 0; j < GGML_MAX_PARENTS; j++) {
-            if (parents[j] && stream_map.count(parents[j]) && stream_map[parents[j]] != nullptr) {
-                stream = stream_map[parents[j]];
-                stream_map.erase(parents[j]);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] && stream_map.count(node->src[j]) && stream_map[node->src[j]] != nullptr) {
+                stream = stream_map[node->src[j]];
+                stream_map.erase(node->src[j]);

                 if (is_noop) {
                     // if this is a noop, we can use the parent's event
                     stream_map[node] = stream;
-                    if (event_map.count(parents[j]) > 0) {
-                        event_map[node] = event_map[parents[j]];
+                    if (event_map.count(node->src[j]) > 0) {
+                        event_map[node] = event_map[node->src[j]];
                     }
                 }
                 break;
@@ -1624,9 +1616,9 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph *

         // wait on parent streams
         bool waited = false;
-        for (int j = 0; j < GGML_MAX_PARENTS; j++) {
-            if (parents[j] && event_map.count(parents[j]) > 0) {
-                CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[parents[j]], 0));
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] && event_map.count(node->src[j]) > 0) {
+                CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[node->src[j]], 0));
                 waited = true;
             }
         }
```
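The parallel executor's cross-stream ordering relies on the standard CUDA event pattern: record an event on the producer's stream, then make the consumer's stream wait on it before launching dependent work. A minimal standalone sketch (hypothetical `producer`/`consumer` kernels, not from the commit):

```cuda
#include <cuda_runtime.h>

__global__ void producer(float * x) { x[threadIdx.x] = 1.0f; }
__global__ void consumer(float * x) { x[threadIdx.x] += 1.0f; }

int main() {
    float * x;
    cudaMalloc(&x, 32 * sizeof(float));

    cudaStream_t s0, s1;
    cudaStreamCreate(&s0);
    cudaStreamCreate(&s1);

    cudaEvent_t done;
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    producer<<<1, 32, 0, s0>>>(x);
    cudaEventRecord(done, s0);        // marks completion of work queued on s0
    cudaStreamWaitEvent(s1, done, 0); // s1 stalls until `done` fires
    consumer<<<1, 32, 0, s1>>>(x);    // safe: ordered after producer

    cudaDeviceSynchronize();
    cudaFree(x);
    cudaStreamDestroy(s0);
    cudaStreamDestroy(s1);
    cudaEventDestroy(done);
    return 0;
}
```

This is the same thing the `cudaStreamWaitEvent` loop above does for every parent that has a recorded event, now indexed through the unified `src[]` array instead of the removed `parents[]` scratch list.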

ggml.c

Lines changed: 5 additions & 6 deletions

```diff
@@ -6855,7 +6855,9 @@ struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

     // TODO: just use a struct
-    int32_t params[] = { n_past, n_dims, mode, n_ctx, *(int32_t*)&freq_base, *(int32_t*)&freq_scale};
+    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base, sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
     assert(GGML_MAX_OP_PARAMS >= sizeof(params));
     memcpy(result->params, &params, sizeof(params));

```
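The `memcpy` form is not just style: reading a `float` object through an `int32_t` pointer violates strict aliasing and is undefined behavior, while copying the bytes is well defined and typically compiles to the same move. A standalone illustration (not from the commit):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    float freq_base = 10000.0f;

    // Undefined behavior: accessing a float through an int32_t lvalue breaks
    // the strict-aliasing rule, so the optimizer may assume it never happens.
    // int32_t bits = *(int32_t *)&freq_base;

    // Well defined: copy the object representation instead.
    int32_t bits;
    memcpy(&bits, &freq_base, sizeof(bits));

    printf("freq_base bits: 0x%08x\n", bits); // prints 0x461c4000
    return 0;
}
```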

```diff
@@ -7127,13 +7129,11 @@ struct ggml_tensor* ggml_pool_1d(
     };
     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

-    ggml_scratch_save(ctx);
     struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
     ((int32_t*)c->data)[0] = op;
     ((int32_t*)c->data)[1] = k0;
     ((int32_t*)c->data)[2] = s0;
     ((int32_t*)c->data)[3] = p0;
-    ggml_scratch_load(ctx);

     result->op = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7170,7 +7170,6 @@ struct ggml_tensor* ggml_pool_2d(
     };
     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

-    ggml_scratch_save(ctx);
     struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
     ((int32_t*)c->data)[0] = op;
     ((int32_t*)c->data)[1] = k0;
@@ -7179,7 +7178,6 @@ struct ggml_tensor* ggml_pool_2d(
     ((int32_t*)c->data)[4] = s1;
     ((int32_t*)c->data)[5] = p0;
     ((int32_t*)c->data)[6] = p1;
-    ggml_scratch_load(ctx);

     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -15823,7 +15821,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         }
     }

-    if (node->op == GGML_OP_NONE && node->src0 == NULL && node->src1 == NULL && node->grad == NULL) {
+    // TODO: add ggml_dependency instead of checking for NULL
+    if (node->op == GGML_OP_NONE && node->src[0] == NULL && node->src[1] == NULL && node->grad == NULL) {
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);

```

ggml.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -199,7 +199,7 @@
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 48
-#define GGML_MAX_OP_PARAMS 16
+#define GGML_MAX_OP_PARAMS 32
 #define GGML_DEFAULT_N_THREADS 4

```
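The bump lines up with the `ggml_rope_impl` change above: `GGML_MAX_OP_PARAMS` is a byte count, and six 32-bit rope parameters occupy 24 bytes, which overflows the old 16-byte limit that the `assert` guards. An illustrative compile-time check (not in the commit):

```cpp
#include <cstdint>

// Six int32_t rope parameters take 24 bytes: too big for the old limit of 16,
// comfortably inside the new limit of 32.
static_assert(sizeof(int32_t[6]) > 16, "would have tripped the old GGML_MAX_OP_PARAMS");
static_assert(sizeof(int32_t[6]) <= 32, "fits the new GGML_MAX_OP_PARAMS");

int main() { return 0; }
```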

llama.cpp

Lines changed: 3 additions & 3 deletions

```diff
@@ -1168,7 +1168,7 @@ static ggml_graph_splits llama_build_graph(

     struct ggml_graph_splits splits = ggml_graph_split_init();

-    // initalize contexts for every backend
+    // initialize contexts for every backend

     struct ggml_context * ctx_cpu = nullptr;
     // TODO: don't create context if there are no CPU layers
@@ -1295,8 +1295,8 @@ static ggml_graph_splits llama_build_graph(
         // TODO: replace with ggml_dependency / ggml_depends_on
         k = ggml_view_tensor(ctx_kv, kv_self.k);
         v = ggml_view_tensor(ctx_kv, kv_self.v);
-        k->src0 = k_cpy;
-        v->src0 = v_cpy;
+        k->src[0] = k_cpy;
+        v->src[0] = v_cpy;
     }

     struct ggml_tensor * Q =
```
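Setting `src[0]` on a view is a manual dependency edge: the view reads no real inputs, but pointing it at the copy forces graph traversal to schedule the KV-cache write before anything that consumes the view. A hypothetical helper along the lines of the TODO above (`ggml_depends_on` is not an existing ggml API, only a sketch of what it could look like):

```cpp
// Sketch: make `t` depend on `dep` by occupying its first free src slot, so
// graph traversal orders `dep` before `t` without changing where t's data
// comes from.
static void ggml_depends_on(struct ggml_tensor * t, struct ggml_tensor * dep) {
    for (int j = 0; j < GGML_MAX_SRC; j++) {
        if (t->src[j] == NULL) {
            t->src[j] = dep;
            return;
        }
    }
    GGML_ASSERT(false && "no free src slot for dependency");
}
```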
