@@ -233,6 +233,11 @@ struct llama_model {
     ggml_buffer buf_cuda;
     ggml_context * ctx_cuda = NULL;
 #endif
+#ifdef GGML_USE_METAL
+    ggml_backend backend_metal;
+    ggml_buffer buf_metal;
+    ggml_context * ctx_metal = NULL;
+#endif
 
     // backend assigned to each layer
     ggml_backend * backend_input = NULL;
@@ -249,6 +254,12 @@ struct llama_model {
             ggml_free(ctx_cuda);
             ggml_backend_free_buffer(&buf_cuda);
         }
+#endif
+#ifdef GGML_USE_METAL
+        if (ctx_metal) {
+            ggml_free(ctx_metal);
+            ggml_backend_free_buffer(&buf_metal);
+        }
 #endif
     }
 };
@@ -290,6 +301,9 @@ struct llama_context {
 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
+#ifdef GGML_USE_METAL
+    ggml_buffer buf_compute_metal = {};
+#endif
 
     // input tensors
     struct ggml_tensor * graph_tokens_in = nullptr;
@@ -940,21 +954,30 @@ static void llama_model_load_internal(
     const uint32_t n_layer = hparams.n_layer;
 
     model.backend_cpu = ggml_backend_cpu_init();
+
+    ggml_backend * backend_cpu = &model.backend_cpu;
     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         model.backend_cuda = ggml_backend_cuda_init();
         backend_gpu = &model.backend_cuda;
     }
 #endif
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        model.backend_metal = ggml_backend_metal_init();
+        backend_gpu = &model.backend_metal;
+    }
+#endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int) n_layer - n_gpu_layers);
-    model.backend_input  = n_gpu_layers > (int) n_layer ? backend_gpu : &model.backend_cpu;
-    model.backend_output = n_gpu_layers > 0             ? backend_gpu : &model.backend_cpu;
+    model.backend_input  = n_gpu_layers > (int) n_layer ? backend_gpu : backend_cpu;
+    model.backend_output = n_gpu_layers > 0             ? backend_gpu : backend_cpu;
+
     model.backend_layers.resize(n_layer);
-    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, &model.backend_cpu);
-    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(),   backend_gpu);
+    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu);
+    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(),   backend_gpu);
 
     // calculate the size of each context
     std::unordered_map<struct ggml_backend *, size_t> ctx_sizes;
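The split above hands the last n_gpu_layers layers to whichever GPU backend was initialized (CUDA or Metal) and everything before i_gpu_start to the CPU. The following is a minimal, self-contained sketch of that partitioning only; std::string stands in for ggml_backend, and n_layer / n_gpu_layers are made-up example values, not taken from this change.

```cpp
// Sketch of the layer split: the last n_gpu_layers layers go to the GPU
// backend, the rest stay on the CPU.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const int n_layer      = 32; // example value
    const int n_gpu_layers = 20; // example value

    std::string cpu = "CPU", gpu = "GPU"; // stand-ins for ggml_backend *
    std::vector<std::string *> backend_layers(n_layer);

    // Same arithmetic as the diff: clamp so i_gpu_start never goes negative.
    const int i_gpu_start = std::max(0, n_layer - n_gpu_layers);
    std::fill(backend_layers.begin(), backend_layers.begin() + i_gpu_start, &cpu);
    std::fill(backend_layers.begin() + i_gpu_start, backend_layers.end(),   &gpu);

    for (int i = 0; i < n_layer; ++i) {
        std::printf("layer %2d -> %s\n", i, backend_layers[i]->c_str());
    }
    return 0;
}
```

With n_gpu_layers greater than n_layer, i_gpu_start clamps to 0 and every layer lands on the GPU, which is also when the ternary above moves backend_input off the CPU.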
@@ -977,17 +1000,18 @@ static void llama_model_load_internal(
             ctx_sizes[model.backend_layers[layer]] += lt.size;
         }
     }
+
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[&model.backend_cpu];
-        ctx_sizes[&model.backend_cpu] = 0;
+        mmap_size = ctx_sizes[backend_cpu];
+        ctx_sizes[backend_cpu] = 0;
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == &model.backend_cpu && ml->use_mmap) {
+        if (it.first == backend_cpu && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
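The mmap bookkeeping above zeroes the CPU entry of ctx_sizes because mmapped tensors are backed by the file mapping rather than a freshly allocated buffer; the mapped size is still reported separately in the log line. Below is a small stand-alone sketch of that accounting, with stand-in backend handles and example sizes that are not taken from this change.

```cpp
// Sketch of the per-backend size accounting and the mmap special case: when
// mmap is used, CPU tensors live in the mapped file, so the CPU context needs
// no allocation of its own and its size is reported separately.
#include <cstdio>
#include <string>
#include <unordered_map>

int main() {
    std::string cpu = "CPU", gpu = "GPU"; // stand-ins for ggml_backend *
    const bool use_mmap = true;

    std::unordered_map<std::string *, size_t> ctx_sizes;
    ctx_sizes[&cpu] += 512u * 1024 * 1024; // tensors assigned to CPU layers (example)
    ctx_sizes[&gpu] += 256u * 1024 * 1024; // tensors assigned to GPU layers (example)

    size_t mmap_size = 0;
    if (use_mmap) {
        mmap_size       = ctx_sizes[&cpu];
        ctx_sizes[&cpu] = 0;
    }

    for (const auto & it : ctx_sizes) {
        std::printf("%8s = %7.2f MB", it.first->c_str(), it.second / 1024.0 / 1024.0);
        if (it.first == &cpu && use_mmap) {
            std::printf(" + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
        }
        std::printf("\n");
    }
    return 0;
}
```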
@@ -996,8 +1020,8 @@ static void llama_model_load_internal(
     // create the buffers and contexts
     {
         size_t cpu_num_tensors = ml->tensors_map.tensors.size();
-        size_t ctx_size = ctx_sizes[&model.backend_cpu];
-        model.buf_cpu = ggml_backend_alloc_buffer(&model.backend_cpu, ctx_size, cpu_num_tensors);
+        size_t ctx_size = ctx_sizes[backend_cpu];
+        model.buf_cpu = ggml_backend_alloc_buffer(backend_cpu, ctx_size, cpu_num_tensors);
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &model.buf_cpu;
         params.no_alloc = ml->use_mmap;
@@ -1028,6 +1052,7 @@ static void llama_model_load_internal(
     if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
     ggml_context * ctx_output = model.ctx_cpu;
     if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
+
     std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
     for (uint32_t i = 0; i < n_layer; ++i) {
         if (model.backend_layers[i] == backend_gpu) {
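For reference, the per-layer context selection in the last hunk simply follows the backend assignment made earlier: layers whose backend ended up on the GPU get ctx_gpu, everything else keeps ctx_cpu. A minimal sketch of that step, again with std::string standing in for ggml_backend / ggml_context and a made-up assignment:

```cpp
// Sketch of the per-layer context selection: a layer uses the GPU context
// only if its backend was assigned to the GPU.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::string backend_cpu = "CPU",     backend_gpu = "GPU";
    std::string ctx_cpu     = "ctx_cpu", ctx_gpu     = "ctx_gpu";

    // Example assignment (first half CPU, second half GPU).
    std::vector<std::string *> backend_layers = { &backend_cpu, &backend_cpu, &backend_gpu, &backend_gpu };
    const int n_layer = (int) backend_layers.size();

    // Default every layer to the CPU context, then switch GPU-assigned layers over.
    std::vector<std::string *> ctx_layers(n_layer, &ctx_cpu);
    for (int i = 0; i < n_layer; ++i) {
        if (backend_layers[i] == &backend_gpu) {
            ctx_layers[i] = &ctx_gpu;
        }
    }

    for (int i = 0; i < n_layer; ++i) {
        std::printf("layer %d -> %s\n", i, ctx_layers[i]->c_str());
    }
    return 0;
}
```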