Commit 0a3861c

metal : adapting to ggml_backend (WIP)
Parent: 1102ff5

3 files changed: 99 additions & 41 deletions

ggml-metal.h

Lines changed: 37 additions & 32 deletions
@@ -19,51 +19,56 @@
 
 #pragma once
 
+#include "ggml.h"
+
 #include <stddef.h>
 #include <stdbool.h>
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
 
-struct ggml_tensor;
-struct ggml_cgraph;
+//struct ggml_tensor;
+//struct ggml_cgraph;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct ggml_metal_context;
-
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
-
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+// GG: maybe return ptr and avoid the "ggml.h" include
+struct ggml_backend ggml_backend_metal_init();
 
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
+//struct ggml_metal_context;
 //
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                       const char * name,
-                             void * data,
-                           size_t   size,
-                           size_t   max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+//// number of command buffers to use
+//struct ggml_metal_context * ggml_metal_init(int n_cb);
+//void ggml_metal_free(struct ggml_metal_context * ctx);
+//
+//// set the number of command buffers to use
+//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+//
+//// creates a mapping between a host memory buffer and a device memory buffer
+//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+//// - the mapping is used during computation to determine the arguments of the compute kernels
+//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+//// - max_size specifies the maximum size of a tensor and is used to create shared views such
+////   that it is guaranteed that the tensor will fit in at least one of the views
+////
+//bool ggml_metal_add_buffer(
+//        struct ggml_metal_context * ctx,
+//                       const char * name,
+//                             void * data,
+//                           size_t   size,
+//                           size_t   max_size);
+//
+//// set data from host memory into the device
+//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// get data from the device into host memory
+//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// same as ggml_graph_compute but uses Metal
+//// creates gf->n_threads command buffers in parallel
+//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
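
For orientation, a minimal usage sketch of the new entry point (not part of this commit): it assumes the generic backend helpers behave as they are used in the llama.cpp hunks below, and the buffer size, tensor count, and the struct ggml_buffer spelling are placeholders.

// Usage sketch (assumption, not from this commit): obtain the Metal backend via
// the unified API and allocate a buffer on it, mirroring how llama.cpp treats
// the CPU and CUDA backends later in this diff.
#include "ggml.h"
#include "ggml-metal.h"

static void example_metal_backend_usage(void) {
    // one call now returns a ggml_backend value bundling the interface vtable
    // with the (still opaque) ggml_metal_context
    struct ggml_backend backend_metal = ggml_backend_metal_init();

    // buffers and contexts are then created through the generic backend API
    struct ggml_buffer buf = ggml_backend_alloc_buffer(&backend_metal,
                                                       /*size       =*/ 64u*1024u*1024u,
                                                       /*num_tensors=*/ 128);

    struct ggml_init_params params = ggml_init_params_default();
    params.buffer = &buf;

    struct ggml_context * ctx = ggml_init(params);

    // ... build tensors and graphs in ctx, compute through the backend ...

    ggml_free(ctx);
    ggml_backend_free_buffer(&buf);
}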

ggml-metal.m

Lines changed: 28 additions & 0 deletions
@@ -992,3 +992,31 @@ void ggml_metal_graph_compute(
         }
     }
 }
+
+static struct ggml_backend_interface metal_backend_interface = {
+    /* .get_name = */ //ggml_backend_metal_name,
+    /* .free_context = */ //ggml_backend_metal_free_context,
+    /* .alloc_buffer = */ //ggml_backend_metal_alloc_buffer,
+    /* .free_buffer = */ //ggml_backend_metal_free_buffer,
+    /* .reset_buffer = */ //ggml_backend_metal_reset_buffer,
+    /* .alloc_tensor = */ //ggml_backend_metal_alloc_tensor,
+    /* .set_tensor_async = */ //ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async = */ //ggml_backend_metal_get_tensor_async,
+    /* .synchronize = */ //ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from = */ //nullptr,
+    /* .cpy_tensor_to = */ //nullptr,
+    /* .graph_plan_create = */ //ggml_backend_metal_graph_plan_create,
+    /* .graph_plan_free = */ //ggml_backend_metal_graph_plan_free,
+    /* .graph_plan_compute = */ //ggml_backend_metal_graph_plan_compute,
+    /* .graph_compute = */ //ggml_backend_metal_graph_compute
+};
+
+struct ggml_backend ggml_backend_metal_init(void) {
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+
+    struct ggml_backend metal_backend = {
+        /* .interface = */ &metal_backend_interface,
+        /* .context = */ ctx
+    };
+    return metal_backend;
+}
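
Every slot in the interface table is still commented out. As an illustration only, one of the simplest slots could eventually look like the sketch below; the callback signature is a guess based on how ggml_backend_name(it.first) is called in llama.cpp and is not defined by this commit.

// Illustrative sketch (assumed signature, not from this commit): a possible
// implementation of the .get_name slot referenced above.
static const char * ggml_backend_metal_name(struct ggml_backend * backend) {
    (void) backend; // a single fixed name is enough for the Metal backend
    return "Metal";
}

The remaining slots would presumably wrap the existing ggml_metal_* functions that this commit comments out in ggml-metal.h. Note also that in this WIP state ggml_backend_metal_init() only malloc()s the context; the device and kernel setup done by the old ggml_metal_init() is not wired up yet.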

llama.cpp

Lines changed: 34 additions & 9 deletions
@@ -233,6 +233,11 @@ struct llama_model {
     ggml_buffer buf_cuda;
     ggml_context * ctx_cuda = NULL;
 #endif
+#ifdef GGML_USE_METAL
+    ggml_backend backend_metal;
+    ggml_buffer buf_metal;
+    ggml_context * ctx_metal = NULL;
+#endif
 
     // backend assigned to each layer
     ggml_backend * backend_input = NULL;
@@ -249,6 +254,12 @@ struct llama_model {
             ggml_free(ctx_cuda);
             ggml_backend_free_buffer(&buf_cuda);
         }
+#endif
+#ifdef GGML_USE_METAL
+        if (ctx_metal) {
+            ggml_free(ctx_metal);
+            ggml_backend_free_buffer(&buf_metal);
+        }
 #endif
     }
 };
@@ -290,6 +301,9 @@ struct llama_context {
 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
+#ifdef GGML_USE_METAL
+    ggml_buffer buf_compute_metal = {};
+#endif
 
     // input tensors
     struct ggml_tensor * graph_tokens_in = nullptr;
@@ -940,21 +954,30 @@ static void llama_model_load_internal(
     const uint32_t n_layer = hparams.n_layer;
 
     model.backend_cpu = ggml_backend_cpu_init();
+
+    ggml_backend * backend_cpu = &model.backend_cpu;
     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         model.backend_cuda = ggml_backend_cuda_init();
         backend_gpu = &model.backend_cuda;
     }
 #endif
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        model.backend_metal = ggml_backend_metal_init();
+        backend_gpu = &model.backend_metal;
+    }
+#endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
-    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : &model.backend_cpu;
-    model.backend_output = n_gpu_layers > 0 ? backend_gpu : &model.backend_cpu;
+    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
+    model.backend_output = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
+
     model.backend_layers.resize(n_layer);
-    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, &model.backend_cpu);
-    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu);
+    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu);
+    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu);
 
     // calculate the size of each context
     std::unordered_map<struct ggml_backend *, size_t> ctx_sizes;
@@ -977,17 +1000,18 @@ static void llama_model_load_internal(
             ctx_sizes[model.backend_layers[layer]] += lt.size;
         }
     }
+
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[&model.backend_cpu];
-        ctx_sizes[&model.backend_cpu] = 0;
+        mmap_size = ctx_sizes[backend_cpu];
+        ctx_sizes[backend_cpu] = 0;
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == &model.backend_cpu && ml->use_mmap) {
+        if (it.first == backend_cpu && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
@@ -996,8 +1020,8 @@
     // create the buffers and contexts
     {
         size_t cpu_num_tensors = ml->tensors_map.tensors.size();
-        size_t ctx_size = ctx_sizes[&model.backend_cpu];
-        model.buf_cpu = ggml_backend_alloc_buffer(&model.backend_cpu, ctx_size, cpu_num_tensors);
+        size_t ctx_size = ctx_sizes[backend_cpu];
+        model.buf_cpu = ggml_backend_alloc_buffer(backend_cpu, ctx_size, cpu_num_tensors);
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &model.buf_cpu;
         params.no_alloc = ml->use_mmap;
@@ -1028,6 +1052,7 @@
         if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
         ggml_context * ctx_output = model.ctx_cpu;
         if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
+
         std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
         for (uint32_t i = 0; i < n_layer; ++i) {
            if (model.backend_layers[i] == backend_gpu) {
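
The backend-selection hunks above boil down to a simple split policy: layers below i_gpu_start stay on the CPU backend, layers from i_gpu_start onward go to the GPU backend (CUDA or Metal, whichever is compiled in), the output backend switches as soon as any layer is offloaded, and the input backend only when every layer is. Restated as a standalone sketch (illustrative only, not code from this commit):

// Standalone restatement of the split logic above (illustrative sketch only).
// backend_layers must point to n_layer entries.
struct ggml_backend; // opaque here

static void assign_backends(
        struct ggml_backend *  backend_cpu,
        struct ggml_backend *  backend_gpu,
        int                    n_layer,
        int                    n_gpu_layers,
        struct ggml_backend ** backend_input,
        struct ggml_backend ** backend_output,
        struct ggml_backend ** backend_layers) {
    const int i_gpu_start = n_layer - n_gpu_layers > 0 ? n_layer - n_gpu_layers : 0;

    *backend_input  = n_gpu_layers > n_layer ? backend_gpu : backend_cpu;
    *backend_output = n_gpu_layers > 0       ? backend_gpu : backend_cpu;

    for (int i = 0; i < n_layer; ++i) {
        backend_layers[i] = (i < i_gpu_start) ? backend_cpu : backend_gpu;
    }
}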
