@@ -1201,7 +1201,7 @@ static bool llama_model_load(
1201
1201
// - tokens: new batch of tokens to process
1202
1202
// - n_past: the context size so far
1203
1203
// - n_threads: number of threads to use
1204
- // - cgraph_fname: filename of the exported computation graph (TODO: TMP!!!)
1204
+ // - cgraph_fname: filename of the exported computation graph
1205
1205
//
1206
1206
static bool llama_eval_internal (
1207
1207
llama_context & lctx,
@@ -1256,7 +1256,7 @@ static bool llama_eval_internal(
1256
1256
memcpy (embd->data , tokens, N*ggml_element_size (embd));
1257
1257
1258
1258
#ifdef GGML_USE_METAL
1259
- if (lctx.ctx_metal ) {
1259
+ if (lctx.ctx_metal && N == 1 ) {
1260
1260
ggml_metal_set_tensor (lctx.ctx_metal , embd);
1261
1261
}
1262
1262
#endif
@@ -1279,18 +1279,10 @@ static bool llama_eval_internal(
1279
1279
1280
1280
// self-attention
1281
1281
{
1282
- // auto * x = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1283
- // struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1284
-
1285
1282
// compute Q and K and RoPE them
1286
1283
1287
- struct ggml_tensor * Qpre = ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model.layers [il].wq , cur), n_embd/n_head, n_head, N);
1288
- struct ggml_tensor * Kpre = ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model.layers [il].wk , cur), n_embd/n_head, n_head, N);
1289
- ggml_set_name (Qpre, " Qpre" );
1290
- ggml_set_name (Kpre, " Kpre" );
1291
-
1292
- struct ggml_tensor * Qcur = ggml_rope_inplace (ctx0, Qpre, n_past, n_rot, 0 );
1293
- struct ggml_tensor * Kcur = ggml_rope_inplace (ctx0, Kpre, n_past, n_rot, 0 );
1284
+ struct ggml_tensor * Qcur = ggml_rope_inplace (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model.layers [il].wq , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
1285
+ struct ggml_tensor * Kcur = ggml_rope_inplace (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model.layers [il].wk , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
1294
1286
ggml_set_name (Qcur, " Qcur" );
1295
1287
ggml_set_name (Kcur, " Kcur" );
1296
1288
@@ -1305,9 +1297,6 @@ static bool llama_eval_internal(
1305
1297
( n_ctx)*ggml_element_size (kv_self.v ),
1306
1298
(il*n_ctx)*ggml_element_size (kv_self.v )*n_embd + n_past*ggml_element_size (kv_self.v ));
1307
1299
1308
- ggml_set_name (k, " k" );
1309
- ggml_set_name (v, " v" );
1310
-
1311
1300
// important: storing RoPE-ed version of K in the KV cache!
1312
1301
ggml_build_forward_expand (&gf, ggml_cpy (ctx0, Kcur, k));
1313
1302
ggml_build_forward_expand (&gf, ggml_cpy (ctx0, Vcur, v));
@@ -2341,21 +2330,19 @@ struct llama_context * llama_init_from_file(
2341
2330
#ifdef GGML_USE_METAL
2342
2331
if (params.n_gpu_layers > 0 ) {
2343
2332
// this allocates all Metal resources and memory buffers
2333
+ ctx->ctx_metal = ggml_metal_init ();
2334
+
2344
2335
if (params.use_mmap ) {
2345
- ctx->ctx_metal = ggml_metal_init ();
2346
- ggml_metal_add_buffer (ctx->ctx_metal , " data" , ctx->model .mapping ->addr , ctx->model .mapping ->size );
2347
- ggml_metal_add_buffer (ctx->ctx_metal , " eval" , ctx->buf_compute .addr , ctx->buf_compute .size );
2348
- ggml_metal_add_buffer (ctx->ctx_metal , " kv" , ctx->model .kv_self .buf .addr , ctx->model .kv_self .buf .size );
2349
- ggml_metal_add_buffer (ctx->ctx_metal , " scr0" , ctx->buf_scratch [0 ].addr , ctx->buf_scratch [0 ].size );
2350
- ggml_metal_add_buffer (ctx->ctx_metal , " scr1" , ctx->buf_scratch [1 ].addr , ctx->buf_scratch [1 ].size );
2336
+ ggml_metal_add_buffer (ctx->ctx_metal , " data" , ctx->model .mapping ->addr , ctx->model .mapping ->size );
2337
+ ggml_metal_add_buffer (ctx->ctx_metal , " eval" , ctx->buf_compute .addr , ctx->buf_compute .size );
2351
2338
} else {
2352
- ctx->ctx_metal = ggml_metal_init ();
2353
2339
ggml_metal_add_buffer (ctx->ctx_metal , " data" , ggml_get_mem_buffer (ctx->model .ctx ), ggml_get_mem_size (ctx->model .ctx ));
2354
2340
ggml_metal_add_buffer (ctx->ctx_metal , " eval" , ctx->buf_compute .addr , ctx->buf_compute .size );
2355
- ggml_metal_add_buffer (ctx->ctx_metal , " kv" , ctx->model .kv_self .buf .addr , ctx->model .kv_self .buf .size );
2356
- ggml_metal_add_buffer (ctx->ctx_metal , " scr0" , ctx->buf_scratch [0 ].addr , ctx->buf_scratch [0 ].size );
2357
- ggml_metal_add_buffer (ctx->ctx_metal , " scr1" , ctx->buf_scratch [1 ].addr , ctx->buf_scratch [1 ].size );
2358
2341
}
2342
+
2343
+ ggml_metal_add_buffer (ctx->ctx_metal , " kv" , ctx->model .kv_self .buf .addr , ctx->model .kv_self .buf .size );
2344
+ ggml_metal_add_buffer (ctx->ctx_metal , " scr0" , ctx->buf_scratch [0 ].addr , ctx->buf_scratch [0 ].size );
2345
+ ggml_metal_add_buffer (ctx->ctx_metal , " scr1" , ctx->buf_scratch [1 ].addr , ctx->buf_scratch [1 ].size );
2359
2346
}
2360
2347
#endif
2361
2348
0 commit comments