
Commit 90503f1

llama : init metal backend as CPU backend for now
1 parent 0a3861c commit 90503f1

2 files changed: 48 additions & 27 deletions

ggml-backend.h

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ extern "C" {
     static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); }

     // buffer and tensor allocation
-    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors);
+    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); // GG: probably return ptr
     GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer);
     static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); }
     static inline void ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); }
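
For reference, the intended call pattern for this buffer API can be pieced together from the llama.cpp hunks below. The following is a hedged sketch only: the backend choice, buffer size, and tensor count are illustrative placeholders, not part of this commit.

// Sketch: allocate a backend buffer, attach it to a ggml context, then create
// tensors in that context so they are placed in the buffer.
struct ggml_backend backend = ggml_backend_cpu_init();   // any backend works the same way
struct ggml_buffer  buf     = ggml_backend_alloc_buffer(&backend, 16*1024*1024, /*max_tensors*/ 128);

struct ggml_init_params params = ggml_init_params_default();
params.buffer   = &buf;      // tensors created in the context land in this backend buffer
params.no_alloc = false;     // set to true when the tensor data is mapped elsewhere (e.g. mmap)

struct ggml_context * ctx = ggml_init(params);
// ... ggml_new_tensor_*(ctx, ...) ...
ggml_backend_free_buffer(&buf);   // the "GG: probably return ptr" note above suggests the
                                  // by-value return of ggml_backend_alloc_buffer may change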

llama.cpp

Lines changed: 47 additions & 26 deletions
@@ -240,8 +240,8 @@ struct llama_model {
 #endif

     // backend assigned to each layer
-    ggml_backend * backend_input = NULL;
-    ggml_backend * backend_output = NULL;
+    ggml_backend * backend_inp = NULL;
+    ggml_backend * backend_out = NULL;
     std::vector<ggml_backend *> backend_layers;

     ~llama_model() {
@@ -965,15 +965,15 @@ static void llama_model_load_internal(
 #endif
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_metal_init();
+        model.backend_metal = ggml_backend_cpu_init();
         backend_gpu = &model.backend_metal;
     }
 #endif

     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
-    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
-    model.backend_output = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
+    model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
+    model.backend_out = n_gpu_layers > 0 ? backend_gpu : backend_cpu;

     model.backend_layers.resize(n_layer);
     std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu);
@@ -983,10 +983,10 @@ static void llama_model_load_internal(
     std::unordered_map<struct ggml_backend *, size_t> ctx_sizes;
     for (const llama_load_tensor & lt : ml->tensors_map.tensors) {
         if (lt.name == "tok_embeddings.weight") {
-            ctx_sizes[model.backend_input] += lt.size;
+            ctx_sizes[model.backend_inp] += lt.size;
         }
         else if (lt.name == "norm.weight" || lt.name == "output.weight") {
-            ctx_sizes[model.backend_output] += lt.size;
+            ctx_sizes[model.backend_out] += lt.size;
         }
         else {
             // parse layer number from name
@@ -1032,6 +1032,7 @@ static void llama_model_load_internal(
     }

     ggml_context * ctx_gpu = model.ctx_cpu;
+
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         size_t gpu_num_tensors = ml->tensors_map.tensors.size();
@@ -1043,15 +1044,35 @@ static void llama_model_load_internal(
         if (!model.ctx_cuda) {
             throw std::runtime_error(format("ggml_init() failed for CUDA backend"));
         }
+
         ctx_gpu = model.ctx_cuda;
     }
 #endif

+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        // the metal context is actually a CPU context because we have unified memory
+        const size_t ctx_size = ctx_sizes[&model.backend_metal];
+        const size_t n_tensors = ml->tensors_map.tensors.size();
+
+        model.buf_metal = ggml_backend_alloc_buffer(&model.backend_metal, ctx_size, n_tensors);
+
+        struct ggml_init_params params = ggml_init_params_default();
+        params.buffer   = &model.buf_metal;
+        params.no_alloc = ml->use_mmap;
+
+        model.ctx_metal = ggml_init(params);
+        if (!model.ctx_metal) {
+            throw std::runtime_error(format("ggml_init() failed for CPU backend"));
+        }
+
+        ctx_gpu = model.ctx_metal;
+    }
+#endif
+
     // TODO: clean this
-    ggml_context * ctx_input = model.ctx_cpu;
-    if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
-    ggml_context * ctx_output = model.ctx_cpu;
-    if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
+    ggml_context * ctx_input  = (model.backend_inp == backend_gpu) ? ctx_gpu : model.ctx_cpu;
+    ggml_context * ctx_output = (model.backend_out == backend_gpu) ? ctx_gpu : model.ctx_cpu;

     std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
     for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1102,7 +1123,6 @@ static void llama_model_load_internal(
     (void) low_vram;
     (void) n_batch;

-
     // print memory requirements
     {
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
@@ -1224,29 +1244,30 @@ static ggml_graph_splits llama_build_graph(
 #endif

     // TODO: clean this
-    struct ggml_context * ctx_i = nullptr;
+    struct ggml_context * ctx_i      = nullptr;
     struct ggml_context * ctx_ls[80] = {nullptr};
-    struct ggml_context * ctx_o = nullptr;
-    struct ggml_context * ctx_kv = nullptr;
+    struct ggml_context * ctx_o      = nullptr;
+    struct ggml_context * ctx_kv     = nullptr;

-    if (lctx.model.backend_input == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
-    if (lctx.model.backend_output == &lctx.model.backend_cpu) ctx_o = ctx_cpu;
+    if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
+    if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu;
 #ifdef GGML_USE_CUDA
-    if (lctx.model.backend_input == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
-    if (lctx.model.backend_output == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
+    if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
+    if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
 #endif
+
     for (int il = 0; il < n_layer; il++) {
-        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
+        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu)  ctx_ls[il] = ctx_cpu;
 #ifdef GGML_USE_CUDA
         if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda;
 #endif
     }
-    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;
+
+    if (lctx.backend_kv == &lctx.model.backend_cpu)  ctx_kv = ctx_cpu;
 #ifdef GGML_USE_CUDA
     if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda;
 #endif

-
     struct ggml_tensor * inpL;

     if (embeddings_input) {
@@ -2678,7 +2699,7 @@ struct llama_context * llama_new_context_with_model(
         buf_input_size += hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input tokens
         // TODO: input embeddings should be optional to save memory
         buf_input_size += hparams.n_embd * hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input embeddings
-        ctx->buf_input = ggml_backend_alloc_buffer(model->backend_input, buf_input_size, 2);
+        ctx->buf_input = ggml_backend_alloc_buffer(model->backend_inp, buf_input_size, 2);

         struct ggml_init_params ggml_params = ggml_init_params_default();
         ggml_params.buffer = &ctx->buf_input;
@@ -2702,7 +2723,7 @@ struct llama_context * llama_new_context_with_model(
         if (params.embedding) {
             buf_output_size += hparams.n_embd * ggml_type_size(GGML_TYPE_F32);
         }
-        ctx->buf_output = ggml_backend_alloc_buffer(model->backend_output, buf_output_size, 2);
+        ctx->buf_output = ggml_backend_alloc_buffer(model->backend_out, buf_output_size, 2);

         struct ggml_init_params ggml_params = ggml_init_params_default();
         ggml_params.buffer = &ctx->buf_output;
@@ -2731,7 +2752,7 @@ struct llama_context * llama_new_context_with_model(
     }

     fprintf(stderr, "%s: layer backends: ", __func__);
-    fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_input));
+    fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp));

     int start = 0;
     struct ggml_backend * prev_backend = ctx->model.backend_layers[0];
@@ -2746,7 +2767,7 @@ struct llama_context * llama_new_context_with_model(
             prev_backend = ctx->model.backend_layers[i];
         }
     }
-    fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_output));
+    fprintf(stderr, "output: %s, ", ggml_backend_name(ctx->model.backend_out));
     fprintf(stderr, "kv: %s\n", ggml_backend_name(ctx->backend_kv));

 #ifdef GGML_USE_MPI
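
Taken together, the llama.cpp changes keep the per-layer backend assignment intact and simply route the Metal path through a CPU backend for now. Below is a condensed sketch of that assignment logic as it stands after this commit, simplified and with the surrounding loader code omitted; the explicit loop replaces the resize/std::fill calls from the hunks above but produces the same assignment.

// pick the "GPU" backend; with GGML_USE_METAL it is a CPU backend for now,
// since Apple silicon has unified memory and the weights can stay in host memory
ggml_backend * backend_cpu = &model.backend_cpu;
ggml_backend * backend_gpu = backend_cpu;
#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
    model.backend_metal = ggml_backend_cpu_init();
    backend_gpu = &model.backend_metal;
}
#endif

// the input/output tensors and the last n_gpu_layers layers go to backend_gpu
const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
model.backend_out = n_gpu_layers > 0            ? backend_gpu : backend_cpu;
model.backend_layers.resize(n_layer);
for (int il = 0; il < (int)n_layer; ++il) {
    model.backend_layers[il] = il < i_gpu_start ? backend_cpu : backend_gpu;
}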
