@@ -240,8 +240,8 @@ struct llama_model {
 #endif
 
     // backend assigned to each layer
-    ggml_backend * backend_input = NULL;
-    ggml_backend * backend_output = NULL;
+    ggml_backend * backend_inp = NULL;
+    ggml_backend * backend_out = NULL;
     std::vector<ggml_backend *> backend_layers;
 
     ~llama_model() {
@@ -965,15 +965,15 @@ static void llama_model_load_internal(
 #endif
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_metal_init();
+        model.backend_metal = ggml_backend_cpu_init();
         backend_gpu = &model.backend_metal;
     }
 #endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
-    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
-    model.backend_output = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
+    model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
+    model.backend_out = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
 
     model.backend_layers.resize(n_layer);
     std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu);
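
Aside (not part of the patch): a standalone sketch of how the split above behaves, using hypothetical values n_layer = 32 and n_gpu_layers = 20. With those numbers i_gpu_start = 12, so layers 0-11 stay on the CPU backend and layers 12-31 go to the GPU backend; backend_inp only moves to the GPU when n_gpu_layers exceeds n_layer, while backend_out moves as soon as any layer is offloaded.

// Illustrative sketch only; backend names are stand-in strings, not real ggml_backend objects.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int n_layer      = 32;   // hypothetical
    const int n_gpu_layers = 20;   // hypothetical

    const int i_gpu_start = std::max(0, n_layer - n_gpu_layers);   // 12

    std::vector<const char *> backend_layers(n_layer, "GPU");
    std::fill(backend_layers.begin(), backend_layers.begin() + i_gpu_start, "CPU");

    const char * backend_inp = n_gpu_layers > n_layer ? "GPU" : "CPU";   // CPU here
    const char * backend_out = n_gpu_layers > 0       ? "GPU" : "CPU";   // GPU here

    printf("input: %s, layers 0-%d: %s, layers %d-%d: %s, output: %s\n",
           backend_inp, i_gpu_start - 1, backend_layers[0],
           i_gpu_start, n_layer - 1, backend_layers[n_layer - 1], backend_out);
    return 0;
}
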
@@ -983,10 +983,10 @@ static void llama_model_load_internal(
     std::unordered_map<struct ggml_backend *, size_t> ctx_sizes;
     for (const llama_load_tensor & lt : ml->tensors_map.tensors) {
         if (lt.name == "tok_embeddings.weight") {
-            ctx_sizes[model.backend_input] += lt.size;
+            ctx_sizes[model.backend_inp] += lt.size;
         }
         else if (lt.name == "norm.weight" || lt.name == "output.weight") {
-            ctx_sizes[model.backend_output] += lt.size;
+            ctx_sizes[model.backend_out] += lt.size;
         }
         else {
             // parse layer number from name
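
Aside (not part of the patch): the hunk cuts off before the actual layer-number parsing, which is not shown in this excerpt. One hypothetical way to do it, assuming per-layer tensors are named "layers.<N>.<suffix>", is:

// Hypothetical sketch; the real parsing code is outside this diff.
#include <cstdio>
#include <string>

static int parse_layer_index(const std::string & name) {
    int il = -1;
    // assumes names of the form "layers.<N>.<suffix>"
    if (sscanf(name.c_str(), "layers.%d.", &il) != 1) {
        return -1;   // not a per-layer tensor
    }
    return il;
}

int main() {
    printf("%d\n", parse_layer_index("layers.17.attention.wq.weight"));   // prints 17
}
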
@@ -1032,6 +1032,7 @@ static void llama_model_load_internal(
     }
 
     ggml_context * ctx_gpu = model.ctx_cpu;
+
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         size_t gpu_num_tensors = ml->tensors_map.tensors.size();
@@ -1043,15 +1044,35 @@ static void llama_model_load_internal(
         if (!model.ctx_cuda) {
             throw std::runtime_error(format("ggml_init() failed for CUDA backend"));
         }
+
         ctx_gpu = model.ctx_cuda;
     }
 #endif
 
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        // the metal context is actually a CPU context because we have unified memory
+        const size_t ctx_size  = ctx_sizes[&model.backend_metal];
+        const size_t n_tensors = ml->tensors_map.tensors.size();
+
+        model.buf_metal = ggml_backend_alloc_buffer(&model.backend_metal, ctx_size, n_tensors);
+
+        struct ggml_init_params params = ggml_init_params_default();
+        params.buffer   = &model.buf_metal;
+        params.no_alloc = ml->use_mmap;
+
+        model.ctx_metal = ggml_init(params);
+        if (!model.ctx_metal) {
+            throw std::runtime_error(format("ggml_init() failed for CPU backend"));
+        }
+
+        ctx_gpu = model.ctx_metal;
+    }
+#endif
+
     // TODO: clean this
-    ggml_context * ctx_input = model.ctx_cpu;
-    if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
-    ggml_context * ctx_output = model.ctx_cpu;
-    if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
+    ggml_context * ctx_input  = (model.backend_inp == backend_gpu) ? ctx_gpu : model.ctx_cpu;
+    ggml_context * ctx_output = (model.backend_out == backend_gpu) ? ctx_gpu : model.ctx_cpu;
 
     std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
     for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1102,7 +1123,6 @@ static void llama_model_load_internal(
     (void) low_vram;
     (void) n_batch;
 
-
     // print memory requirements
     {
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
@@ -1224,29 +1244,30 @@ static ggml_graph_splits llama_build_graph(
 #endif
 
     // TODO: clean this
-    struct ggml_context * ctx_i = nullptr;
+    struct ggml_context * ctx_i      = nullptr;
     struct ggml_context * ctx_ls[80] = {nullptr};
-    struct ggml_context * ctx_o = nullptr;
-    struct ggml_context * ctx_kv = nullptr;
+    struct ggml_context * ctx_o      = nullptr;
+    struct ggml_context * ctx_kv     = nullptr;
 
-    if (lctx.model.backend_input == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
-    if (lctx.model.backend_output == &lctx.model.backend_cpu) ctx_o = ctx_cpu;
+    if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
+    if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu;
 #ifdef GGML_USE_CUDA
-    if (lctx.model.backend_input == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
-    if (lctx.model.backend_output == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
+    if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
+    if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
 #endif
+
     for (int il = 0; il < n_layer; il++) {
-        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
+        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu)  ctx_ls[il] = ctx_cpu;
 #ifdef GGML_USE_CUDA
         if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda;
 #endif
     }
-    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;
+
+    if (lctx.backend_kv == &lctx.model.backend_cpu)  ctx_kv = ctx_cpu;
 #ifdef GGML_USE_CUDA
     if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda;
 #endif
 
-
     struct ggml_tensor * inpL;
 
     if (embeddings_input) {
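
Aside (not part of the patch): the "TODO: clean this" block above repeats the same backend-to-context lookup for the input, the output, every layer, and the KV cache. A possible cleanup, sketched here with an invented helper name and assuming only the members visible in this diff, would be:

// Hypothetical helper, not from the patch; it would live in llama.cpp next to
// llama_build_graph and map a backend pointer to the ggml_context that holds
// the tensors assigned to that backend.
static struct ggml_context * ctx_for_backend(
        const llama_context & lctx,
        const struct ggml_backend * backend,
        struct ggml_context * ctx_cpu,
        struct ggml_context * ctx_cuda) {
    if (backend == &lctx.model.backend_cpu) {
        return ctx_cpu;
    }
#ifdef GGML_USE_CUDA
    if (backend == &lctx.model.backend_cuda) {
        return ctx_cuda;
    }
#endif
    (void) ctx_cuda;   // unused when CUDA support is compiled out
    return nullptr;
}

With such a helper, ctx_i, ctx_o, ctx_kv and each ctx_ls[il] become single calls instead of per-backend #ifdef blocks.
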
@@ -2678,7 +2699,7 @@ struct llama_context * llama_new_context_with_model(
         buf_input_size += hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input tokens
         // TODO: input embeddings should be optional to save memory
         buf_input_size += hparams.n_embd * hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input embeddings
-        ctx->buf_input = ggml_backend_alloc_buffer(model->backend_input, buf_input_size, 2);
+        ctx->buf_input = ggml_backend_alloc_buffer(model->backend_inp, buf_input_size, 2);
 
         struct ggml_init_params ggml_params = ggml_init_params_default();
         ggml_params.buffer = &ctx->buf_input;
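
For scale (illustrative numbers, not from the patch): with hparams.n_ctx = 512 and hparams.n_embd = 4096, buf_input_size comes to 512 * 4 bytes for the input tokens plus 4096 * 512 * 4 bytes for the input embeddings, roughly 8 MiB, allocated on model->backend_inp with room for those 2 tensors.
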
@@ -2702,7 +2723,7 @@ struct llama_context * llama_new_context_with_model(
         if (params.embedding) {
             buf_output_size += hparams.n_embd * ggml_type_size(GGML_TYPE_F32);
         }
-        ctx->buf_output = ggml_backend_alloc_buffer(model->backend_output, buf_output_size, 2);
+        ctx->buf_output = ggml_backend_alloc_buffer(model->backend_out, buf_output_size, 2);
 
         struct ggml_init_params ggml_params = ggml_init_params_default();
         ggml_params.buffer = &ctx->buf_output;
@@ -2731,7 +2752,7 @@ struct llama_context * llama_new_context_with_model(
     }
 
     fprintf(stderr, "%s: layer backends: ", __func__);
-    fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_input));
+    fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp));
 
     int start = 0;
     struct ggml_backend * prev_backend = ctx->model.backend_layers[0];
@@ -2746,7 +2767,7 @@ struct llama_context * llama_new_context_with_model(
             prev_backend = ctx->model.backend_layers[i];
         }
     }
-    fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_output));
+    fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_out));
     fprintf(stderr, "kv: %s\n", ggml_backend_name(ctx->backend_kv));
 
 #ifdef GGML_USE_MPI