
Commit 927fac3

remove extra, backend from ggml.c, ggml.h
1 parent e645d12 commit 927fac3

File tree

4 files changed, +41 -56 lines changed

    ggml-cuda.cu
    ggml.c
    ggml.h
    llama.cpp


ggml-cuda.cu

Lines changed: 6 additions & 2 deletions
@@ -1196,7 +1196,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
 }
 
-void ggml_cuda_noop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
     (void) dst;
@@ -1287,6 +1287,10 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+    if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
+        ggml_cuda_assign_buffers(tensor);
+    }
+
     const size_t size = ggml_nbytes(tensor);
     const size_t scratch_size = g_n_batch * GGML_CUDA_SCRATCH_SIZE_PER_BATCH;
     GGML_ASSERT(size <= scratch_size);
@@ -1367,7 +1371,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             if (!any_on_device) {
                 return false;
             }
-            func = ggml_cuda_noop;
+            func = ggml_cuda_nop;
             break;
         case GGML_OP_ROPE:
             if (!any_on_device) {

ggml.c

Lines changed: 1 addition & 29 deletions
@@ -3639,8 +3639,6 @@ struct ggml_context {
 
     struct ggml_scratch scratch;
     struct ggml_scratch scratch_save;
-
-    enum ggml_backend default_backend;
 };
 
 struct ggml_context_container {
@@ -3967,7 +3965,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.objects_end =*/ NULL,
         /*.scratch =*/ { 0, 0, NULL, },
         /*.scratch_save =*/ { 0, 0, NULL, },
-        /*.default_backend =*/ GGML_BACKEND_CPU,
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL);
@@ -4026,10 +4023,6 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
 
-void ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend) {
-    ctx->default_backend = backend;
-}
-
 void * ggml_get_mem_buffer(struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
@@ -4141,7 +4134,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *result = (struct ggml_tensor) {
         /*.type =*/ type,
-        /*.backend =*/ ctx->default_backend,
+        /*.backend =*/ GGML_BACKEND_CPU,
         /*.n_dims =*/ n_dims,
         /*.ne =*/ { 1, 1, 1, 1 },
         /*.nb =*/ { 0, 0, 0, 0 },
@@ -4174,15 +4167,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
 
-#ifdef GGML_USE_CUBLAS
-    if (result->backend == GGML_BACKEND_GPU) {
-        ggml_cuda_assign_buffers(result);
-    }
-#else
-    GGML_ASSERT(result->backend == GGML_BACKEND_CPU);
-#endif // GGML_USE_CUBLAS
-    GGML_ASSERT(result->backend != GGML_BACKEND_GPU_SPLIT);
-
     ctx->n_objects++;
 
     return result;
@@ -4537,8 +4521,6 @@ struct ggml_tensor * ggml_view_tensor(
     result->nb[1] = src->nb[1];
     result->nb[2] = src->nb[2];
     result->nb[3] = src->nb[3];
-    result->backend = src->backend;
-    result->extra = src->extra;
 
     return result;
 }
@@ -5691,8 +5673,6 @@ struct ggml_tensor * ggml_reshape(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5717,8 +5697,6 @@ struct ggml_tensor * ggml_reshape_1d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5744,8 +5722,6 @@ struct ggml_tensor * ggml_reshape_2d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5772,8 +5748,6 @@ struct ggml_tensor * ggml_reshape_3d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5802,8 +5776,6 @@ struct ggml_tensor * ggml_reshape_4d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }

ggml.h

Lines changed: 0 additions & 1 deletion
@@ -479,7 +479,6 @@ extern "C" {
 
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
-    GGML_API void ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend);
 
     GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
     GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);

llama.cpp

Lines changed: 34 additions & 24 deletions
@@ -60,6 +60,12 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 // needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -1300,10 +1306,11 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
 
     for (int il = 0; il < n_layer; ++il) {
-        ggml_backend backend_offload = GGML_BACKEND_CPU;
+        offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            backend_offload = GGML_BACKEND_GPU;
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
         }
 #endif // GGML_USE_CUBLAS
 
@@ -1313,40 +1320,31 @@ static bool llama_eval_internal(
 
         // norm
        {
-            ggml_set_default_backend(ctx0, backend_offload);
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
             ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpq, "tmpq");
             struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpk, "tmpk");
-            ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
-#ifdef GGML_USE_CUBLAS
-            struct ggml_tensor * Kcur;
-            struct ggml_tensor * Qcur;
-            if (backend_offload == GGML_BACKEND_GPU) {
-                Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
-            } else {
-                Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-            }
-#else
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
+            ggml_set_name(Kcur, "Kcur");
+
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-#endif // GGML_USE_CUBLAS
             ggml_set_name(Qcur, "Qcur");
-            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {
@@ -1430,62 +1428,70 @@ static bool llama_eval_internal(
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
             ggml_set_name(cur, "KQV_merged_contiguous");
 
-            ggml_set_default_backend(ctx0, backend_offload);
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
         //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
         ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
                 ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
-            ggml_set_name(cur, "result_w3");
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
             ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
             ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
         ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
 
-        ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
     }
 
     lctx.use_buf(ctx0, 0);
@@ -1494,28 +1500,32 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
-    }
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
 #endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
     }
 
-    ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);

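The net effect of the llama.cpp hunks above is to replace the context-wide ggml_set_default_backend switch with a per-layer function pointer: offload_func_t is either llama_nop or ggml_cuda_assign_buffers, and it is called on each intermediate tensor right after that tensor is created. The standalone program below is a minimal sketch of that callback pattern only; toy_tensor, toy_nop, and toy_assign_gpu_buffers are hypothetical stand-ins written for this page, not ggml or llama.cpp API, and only the control flow mirrors llama_eval_internal.

// Standalone sketch of the offload-callback pattern introduced in this commit.
// All names here are illustrative stand-ins; only the control flow mirrors the diff.
#include <stdio.h>

enum toy_backend { TOY_BACKEND_CPU, TOY_BACKEND_GPU };

struct toy_tensor {
    enum toy_backend backend;
};

// Same shape as offload_func_t in the diff: takes the freshly created tensor.
typedef void (*offload_func_t)(struct toy_tensor * tensor);

// Counterpart of llama_nop: layers that stay on the CPU do nothing here.
static void toy_nop(struct toy_tensor * tensor) {
    (void) tensor; // don't offload by default
}

// Stand-in for ggml_cuda_assign_buffers: marks the tensor's output as GPU-resident.
static void toy_assign_gpu_buffers(struct toy_tensor * tensor) {
    tensor->backend = TOY_BACKEND_GPU;
}

int main(void) {
    const int n_layer      = 4;
    const int n_gpu_layers = 2;
    const int i_gpu_start  = n_layer - n_gpu_layers;

    for (int il = 0; il < n_layer; ++il) {
        // Per-layer decision, as in the diff: default to the no-op and switch
        // to the GPU-assignment callback for layers past i_gpu_start.
        offload_func_t offload_func = toy_nop;
        if (il >= i_gpu_start) {
            offload_func = toy_assign_gpu_buffers;
        }

        struct toy_tensor cur = { TOY_BACKEND_CPU }; // "freshly created" tensor
        offload_func(&cur); // called once per intermediate tensor in the layer

        printf("layer %d -> %s\n", il, cur.backend == TOY_BACKEND_GPU ? "GPU" : "CPU");
    }
    return 0;
}

Compared with the removed default-backend path, this keeps ggml_new_tensor_impl backend-agnostic (every new tensor starts as GGML_BACKEND_CPU) and moves the offloading decision to the call sites in llama.cpp, which is what the ggml.c and ggml.h hunks delete.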