@@ -60,6 +60,12 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
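The new offload_func_t callback replaces the per-layer ggml_set_default_backend calls: every freshly created tensor is handed to a function pointer that is either a no-op (llama_nop) or the CUDA buffer-assignment routine. A minimal standalone sketch of the same pattern, using hypothetical stand-ins (fake_tensor, assign_to_gpu) rather than the real ggml API:

    #include <stdio.h>

    // Hypothetical stand-in for ggml_tensor, just enough to show the pattern.
    struct fake_tensor {
        const char * name;
        int on_gpu; // 1 once the tensor's output has been assigned to the GPU
    };

    typedef void (*offload_func_t)(struct fake_tensor * tensor);

    // Default: do nothing, the tensor stays on the CPU.
    static void nop(struct fake_tensor * tensor) { (void) tensor; }

    // Stand-in for ggml_cuda_assign_buffers: mark the tensor as GPU-resident.
    static void assign_to_gpu(struct fake_tensor * tensor) { tensor->on_gpu = 1; }

    int main(void) {
        const int n_layer = 4, n_gpu_layers = 2;
        const int i_gpu_start = n_layer - n_gpu_layers;

        for (int il = 0; il < n_layer; ++il) {
            offload_func_t offload_func = nop;
            if (il >= i_gpu_start) {
                offload_func = assign_to_gpu; // only the last n_gpu_layers layers offload
            }

            struct fake_tensor cur = { "attention_norm_0", 0 };
            offload_func(&cur); // called right after the tensor is created
            printf("layer %d: %s on_gpu=%d\n", il, cur.name, cur.on_gpu);
        }
        return 0;
    }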
@@ -1300,10 +1306,11 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
 
     for (int il = 0; il < n_layer; ++il) {
-        ggml_backend backend_offload = GGML_BACKEND_CPU;
+        offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            backend_offload = GGML_BACKEND_GPU;
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
         }
 #endif // GGML_USE_CUBLAS
 
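For example (illustrative numbers, not from the patch itself): with n_layer = 32 and n_gpu_layers = 20, the hunk above gives i_gpu_start = 32 - 20 = 12, so layers 12 through 31 pick up ggml_cuda_assign_buffers while layers 0 through 11 keep llama_nop and stay on the CPU.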
@@ -1313,40 +1320,31 @@ static bool llama_eval_internal(
 
         // norm
         {
-            ggml_set_default_backend(ctx0, backend_offload);
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
             ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpq, "tmpq");
             struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpk, "tmpk");
-            ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
-#ifdef GGML_USE_CUBLAS
-            struct ggml_tensor * Kcur;
-            struct ggml_tensor * Qcur;
-            if (backend_offload == GGML_BACKEND_GPU) {
-                Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
-            } else {
-                Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-            }
-#else
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
+            ggml_set_name(Kcur, "Kcur");
+
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-#endif // GGML_USE_CUBLAS
             ggml_set_name(Qcur, "Qcur");
-            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {
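Note that the CUBLAS-only branch that chose between ggml_rope and ggml_rope_inplace based on backend_offload is removed above: Kcur and Qcur are now always built with ggml_rope_inplace, and where the result ends up is decided solely by the offload callback installed at the top of the layer loop.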
@@ -1430,62 +1428,70 @@ static bool llama_eval_internal(
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
             ggml_set_name(cur, "KQV_merged_contiguous");
 
-            ggml_set_default_backend(ctx0, backend_offload);
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
         // ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
         ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
                 ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
-            ggml_set_name(cur, "result_w3");
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
             ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
             ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
         ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
 
-        ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
     }
 
     lctx.use_buf(ctx0, 0);
@@ -1494,28 +1500,32 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
-    }
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
 #endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
     }
 
-    ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);