Skip to content

Commit db3db9e

Browse files
committed
metal : clean-up stuff, fix typos
1 parent b252acb commit db3db9e

File tree

3 files changed

+14
-29
lines changed

3 files changed

+14
-29
lines changed

examples/metal/metal.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Evaluate a statically export ggml computation graph with Metal
1+
// Evaluate a statically exported ggml computation graph with Metal
22
//
33
// - First, export a LLaMA graph:
44
//

ggml.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15049,7 +15049,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
1504915049

1505015050
// create the tensor
1505115051
// "view" operations are handled differently
15052-
// TODO: handle inplac ops - currentl a copy is always made
15052+
// TODO: handle inplace ops - currently a copy is always made
1505315053

1505415054
struct ggml_tensor * tensor = NULL;
1505515055

@@ -15084,10 +15084,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
1508415084
} break;
1508515085
}
1508615086

15087-
1508815087
memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
1508915088

15090-
// TODO: double-check this is needed
1509115089
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
1509215090
tensor->nb[j] = nb[j];
1509315091
}

llama.cpp

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,7 +1201,7 @@ static bool llama_model_load(
12011201
// - tokens: new batch of tokens to process
12021202
// - n_past: the context size so far
12031203
// - n_threads: number of threads to use
1204-
// - cgraph_fname: filename of the exported computation graph (TODO: TMP!!!)
1204+
// - cgraph_fname: filename of the exported computation graph
12051205
//
12061206
static bool llama_eval_internal(
12071207
llama_context & lctx,
@@ -1256,7 +1256,7 @@ static bool llama_eval_internal(
12561256
memcpy(embd->data, tokens, N*ggml_element_size(embd));
12571257

12581258
#ifdef GGML_USE_METAL
1259-
if (lctx.ctx_metal) {
1259+
if (lctx.ctx_metal && N == 1) {
12601260
ggml_metal_set_tensor(lctx.ctx_metal, embd);
12611261
}
12621262
#endif
@@ -1279,18 +1279,10 @@ static bool llama_eval_internal(
12791279

12801280
// self-attention
12811281
{
1282-
//auto * x = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1283-
//struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1284-
12851282
// compute Q and K and RoPE them
12861283

1287-
struct ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
1288-
struct ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
1289-
ggml_set_name(Qpre, "Qpre");
1290-
ggml_set_name(Kpre, "Kpre");
1291-
1292-
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, Qpre, n_past, n_rot, 0);
1293-
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, Kpre, n_past, n_rot, 0);
1284+
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1285+
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
12941286
ggml_set_name(Qcur, "Qcur");
12951287
ggml_set_name(Kcur, "Kcur");
12961288

@@ -1305,9 +1297,6 @@ static bool llama_eval_internal(
13051297
( n_ctx)*ggml_element_size(kv_self.v),
13061298
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
13071299

1308-
ggml_set_name(k, "k");
1309-
ggml_set_name(v, "v");
1310-
13111300
// important: storing RoPE-ed version of K in the KV cache!
13121301
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
13131302
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -2341,21 +2330,19 @@ struct llama_context * llama_init_from_file(
23412330
#ifdef GGML_USE_METAL
23422331
if (params.n_gpu_layers > 0) {
23432332
// this allocates all Metal resources and memory buffers
2333+
ctx->ctx_metal = ggml_metal_init();
2334+
23442335
if (params.use_mmap) {
2345-
ctx->ctx_metal = ggml_metal_init();
2346-
ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
2347-
ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
2348-
ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
2349-
ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
2350-
ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
2336+
ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
2337+
ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
23512338
} else {
2352-
ctx->ctx_metal = ggml_metal_init();
23532339
ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
23542340
ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
2355-
ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
2356-
ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
2357-
ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
23582341
}
2342+
2343+
ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
2344+
ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
2345+
ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
23592346
}
23602347
#endif
23612348

0 commit comments

Comments (0)