@@ -233,6 +233,11 @@ struct llama_model {
     ggml_buffer buf_cuda;
     ggml_context * ctx_cuda = NULL;
 #endif
+#ifdef GGML_USE_METAL
+    ggml_backend backend_metal;
+    ggml_buffer buf_metal;
+    ggml_context * ctx_metal = NULL;
+#endif
 
     // backend assigned to each layer
     ggml_backend * backend_input = NULL;
@@ -249,6 +254,12 @@ struct llama_model {
             ggml_free(ctx_cuda);
             ggml_backend_free_buffer(&buf_cuda);
         }
+#endif
+#ifdef GGML_USE_METAL
+        if (ctx_metal) {
+            ggml_free(ctx_metal);
+            ggml_backend_free_buffer(&buf_metal);
+        }
 #endif
     }
 };
@@ -290,6 +301,9 @@ struct llama_context {
 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
+#ifdef GGML_USE_METAL
+    ggml_buffer buf_compute_metal = {};
+#endif
 
     // input tensors
     struct ggml_tensor * graph_tokens_in = nullptr;
@@ -940,21 +954,30 @@ static void llama_model_load_internal(
     const uint32_t n_layer = hparams.n_layer;
 
     model.backend_cpu = ggml_backend_cpu_init();
+
+    ggml_backend * backend_cpu = &model.backend_cpu;
     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         model.backend_cuda = ggml_backend_cuda_init();
         backend_gpu = &model.backend_cuda;
     }
 #endif
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        model.backend_metal = ggml_backend_metal_init();
+        backend_gpu = &model.backend_metal;
+    }
+#endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int) n_layer - n_gpu_layers);
-    model.backend_input  = n_gpu_layers > (int) n_layer ? backend_gpu : &model.backend_cpu;
-    model.backend_output = n_gpu_layers > 0             ? backend_gpu : &model.backend_cpu;
+    model.backend_input  = n_gpu_layers > (int) n_layer ? backend_gpu : backend_cpu;
+    model.backend_output = n_gpu_layers > 0             ? backend_gpu : backend_cpu;
+
     model.backend_layers.resize(n_layer);
-    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, &model.backend_cpu);
-    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(),   backend_gpu);
+    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu);
+    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(),   backend_gpu);
 
     // calculate the size of each context
     std::unordered_map<struct ggml_backend *, size_t> ctx_sizes;
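The split above hands the last n_gpu_layers layers to whichever GPU backend was initialized (CUDA or Metal) and everything before i_gpu_start to the CPU. The following is a minimal, self-contained sketch of that partitioning only; std::string stands in for ggml_backend, and n_layer / n_gpu_layers are made-up example values, not taken from this change.

```cpp
// Sketch of the layer split: the last n_gpu_layers layers go to the GPU
// backend, the rest stay on the CPU.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const int n_layer      = 32; // example value
    const int n_gpu_layers = 20; // example value

    std::string cpu = "CPU", gpu = "GPU"; // stand-ins for ggml_backend *
    std::vector<std::string *> backend_layers(n_layer);

    // Same arithmetic as the diff: clamp so i_gpu_start never goes negative.
    const int i_gpu_start = std::max(0, n_layer - n_gpu_layers);
    std::fill(backend_layers.begin(), backend_layers.begin() + i_gpu_start, &cpu);
    std::fill(backend_layers.begin() + i_gpu_start, backend_layers.end(),   &gpu);

    for (int i = 0; i < n_layer; ++i) {
        std::printf("layer %2d -> %s\n", i, backend_layers[i]->c_str());
    }
    return 0;
}
```

With n_gpu_layers greater than n_layer, i_gpu_start clamps to 0 and every layer lands on the GPU, which is also when the ternary above moves backend_input off the CPU.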
@@ -977,17 +1000,18 @@ static void llama_model_load_internal(
             ctx_sizes[model.backend_layers[layer]] += lt.size;
         }
     }
+
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[&model.backend_cpu];
-        ctx_sizes[&model.backend_cpu] = 0;
+        mmap_size = ctx_sizes[backend_cpu];
+        ctx_sizes[backend_cpu] = 0;
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == &model.backend_cpu && ml->use_mmap) {
+        if (it.first == backend_cpu && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
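The mmap bookkeeping above zeroes the CPU entry of ctx_sizes because mmapped tensors are backed by the file mapping rather than a freshly allocated buffer; the mapped size is still reported separately in the log line. Below is a small stand-alone sketch of that accounting, with stand-in backend handles and example sizes that are not taken from this change.

```cpp
// Sketch of the per-backend size accounting and the mmap special case: when
// mmap is used, CPU tensors live in the mapped file, so the CPU context needs
// no allocation of its own and its size is reported separately.
#include <cstdio>
#include <string>
#include <unordered_map>

int main() {
    std::string cpu = "CPU", gpu = "GPU"; // stand-ins for ggml_backend *
    const bool use_mmap = true;

    std::unordered_map<std::string *, size_t> ctx_sizes;
    ctx_sizes[&cpu] += 512u * 1024 * 1024; // tensors assigned to CPU layers (example)
    ctx_sizes[&gpu] += 256u * 1024 * 1024; // tensors assigned to GPU layers (example)

    size_t mmap_size = 0;
    if (use_mmap) {
        mmap_size       = ctx_sizes[&cpu];
        ctx_sizes[&cpu] = 0;
    }

    for (const auto & it : ctx_sizes) {
        std::printf("%8s = %7.2f MB", it.first->c_str(), it.second / 1024.0 / 1024.0);
        if (it.first == &cpu && use_mmap) {
            std::printf(" + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
        }
        std::printf("\n");
    }
    return 0;
}
```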
@@ -996,8 +1020,8 @@ static void llama_model_load_internal(
     // create the buffers and contexts
     {
         size_t cpu_num_tensors = ml->tensors_map.tensors.size();
-        size_t ctx_size = ctx_sizes[&model.backend_cpu];
-        model.buf_cpu = ggml_backend_alloc_buffer(&model.backend_cpu, ctx_size, cpu_num_tensors);
+        size_t ctx_size = ctx_sizes[backend_cpu];
+        model.buf_cpu = ggml_backend_alloc_buffer(backend_cpu, ctx_size, cpu_num_tensors);
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &model.buf_cpu;
         params.no_alloc = ml->use_mmap;
@@ -1028,6 +1052,7 @@ static void llama_model_load_internal(
     if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
     ggml_context * ctx_output = model.ctx_cpu;
     if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
+
     std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
     for (uint32_t i = 0; i < n_layer; ++i) {
         if (model.backend_layers[i] == backend_gpu) {
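For reference, the per-layer context selection in the last hunk simply follows the backend assignment made earlier: layers whose backend ended up on the GPU get ctx_gpu, everything else keeps ctx_cpu. A minimal sketch of that step, again with std::string standing in for ggml_backend / ggml_context and a made-up assignment:

```cpp
// Sketch of the per-layer context selection: a layer uses the GPU context
// only if its backend was assigned to the GPU.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::string backend_cpu = "CPU",     backend_gpu = "GPU";
    std::string ctx_cpu     = "ctx_cpu", ctx_gpu     = "ctx_gpu";

    // Example assignment (first half CPU, second half GPU).
    std::vector<std::string *> backend_layers = { &backend_cpu, &backend_cpu, &backend_gpu, &backend_gpu };
    const int n_layer = (int) backend_layers.size();

    // Default every layer to the CPU context, then switch GPU-assigned layers over.
    std::vector<std::string *> ctx_layers(n_layer, &ctx_cpu);
    for (int i = 0; i < n_layer; ++i) {
        if (backend_layers[i] == &backend_gpu) {
            ctx_layers[i] = &ctx_gpu;
        }
    }

    for (int i = 0; i < n_layer; ++i) {
        std::printf("layer %d -> %s\n", i, ctx_layers[i]->c_str());
    }
    return 0;
}
```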