@@ -60,6 +60,9 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+void llama_nop(struct ggml_tensor * tensor) {} // do nothing by default
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
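Aside (not part of the diff): the hunk above introduces the pattern used throughout this change. Instead of switching a per-context default backend, each layer picks a function pointer that is either a no-op or a call that assigns a tensor's buffers to the GPU, and that pointer is applied to every tensor the layer creates. A minimal, self-contained sketch of the idea, using hypothetical stand-in types (toy_tensor and toy_assign_gpu are illustrative only, not ggml API):

#include <cstdio>

struct toy_tensor { bool on_gpu = false; };                 // hypothetical stand-in for ggml_tensor

typedef void (*offload_func_t)(toy_tensor * tensor);        // same shape as the typedef in the hunk

void toy_nop(toy_tensor *) {}                                // default: do nothing (cf. llama_nop)
void toy_assign_gpu(toy_tensor * t) { t->on_gpu = true; }    // stand-in for ggml_cuda_assign_buffers

int main() {
    offload_func_t offload_func = toy_nop;   // chosen once per layer...
    offload_func = toy_assign_gpu;           // ...switched only when the layer should be offloaded

    toy_tensor cur;
    offload_func(&cur);                      // call sites stay identical either way
    std::printf("cur lives on the %s\n", cur.on_gpu ? "GPU" : "CPU");
    return 0;
}

Because the default is a no-op, the call sites need no #ifdefs or backend checks; non-CUBLAS builds simply leave every tensor on the CPU.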
@@ -1300,10 +1303,11 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
 
     for (int il = 0; il < n_layer; ++il) {
-        ggml_backend backend_offload = GGML_BACKEND_CPU;
+        offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            backend_offload = GGML_BACKEND_GPU;
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
         }
 #endif // GGML_USE_CUBLAS
 
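The loop above offloads only the last n_gpu_layers of the n_layer repeating blocks: i_gpu_start = n_layer - n_gpu_layers, and layer il is offloaded when il >= i_gpu_start. A quick check of that arithmetic with example numbers (the values are illustrative, not taken from the diff):

#include <cassert>

int main() {
    const int n_layer      = 32;                      // e.g. LLaMA 7B has 32 repeating layers
    const int n_gpu_layers = 8;                       // requested offload count (example value)
    const int i_gpu_start  = n_layer - n_gpu_layers;  // = 24

    int offloaded = 0;
    for (int il = 0; il < n_layer; ++il) {
        if (il >= i_gpu_start) {                      // same test as in the diff: layers 24..31
            ++offloaded;
        }
    }
    assert(offloaded == n_gpu_layers);
    return 0;
}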
@@ -1313,40 +1317,31 @@ static bool llama_eval_internal(
 
         // norm
         {
-            ggml_set_default_backend(ctx0, backend_offload);
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
             ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpq, "tmpq");
             struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpk, "tmpk");
-            ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
-#ifdef GGML_USE_CUBLAS
-            struct ggml_tensor * Kcur;
-            struct ggml_tensor * Qcur;
-            if (backend_offload == GGML_BACKEND_GPU) {
-                Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
-            } else {
-                Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-            }
-#else
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
+            ggml_set_name(Kcur, "Kcur");
+
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-#endif // GGML_USE_CUBLAS
             ggml_set_name(Qcur, "Qcur");
-            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {
@@ -1430,62 +1425,70 @@ static bool llama_eval_internal(
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
             ggml_set_name(cur, "KQV_merged_contiguous");
 
-            ggml_set_default_backend(ctx0, backend_offload);
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
         // ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
         ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
                 ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
-            ggml_set_name(cur, "result_w3");
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
             ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
             ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
         ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
 
-        ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
     }
 
     lctx.use_buf(ctx0, 0);
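For reference, the feed-forward block in this hunk is the LLaMA-style (SwiGLU) FFN: it computes w2 · (silu(w1 · x) ⊙ (w3 · x)), and the change applies offload_func to every intermediate tensor it creates. A scalar sketch of the same data flow, independent of ggml (weight matrices reduced to single floats purely for illustration):

#include <cmath>
#include <cstdio>

// silu(x) = x * sigmoid(x), matching ggml_silu
static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// Scalar sketch of the FFN data flow above; w1, w2, w3 stand in for the weight matrices.
static float ffn(float x, float w1, float w2, float w3) {
    float tmp = w3 * x;       // ggml_mul_mat(w3, cur)   -> "result_w3"
    float cur = w1 * x;       // ggml_mul_mat(w1, cur)   (the diff labels this "result_w2" too)
    cur = silu(cur);          // ggml_silu               -> "silu"
    cur = cur * tmp;          // ggml_mul(cur, tmp)      -> "silu_x_result_w3"
    return w2 * cur;          // ggml_mul_mat(w2, cur)   -> "result_w2"
}

int main() {
    std::printf("%f\n", ffn(0.5f, 1.0f, 1.0f, 1.0f));
    return 0;
}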
@@ -1494,28 +1497,32 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
-    }
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
 #endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
     }
 
-    ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
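Note the two different offload conditions in this change: the repeating layers use il >= n_layer - n_gpu_layers (earlier hunks), while the final RMS norm above is offloaded only when n_gpu_layers > n_layer, i.e. when the requested count exceeds the number of repeating layers; the lm_head matmul itself gets no offload_func here. A small sketch tabulating both conditions for illustrative values (not taken from the diff):

#include <cstdio>
#include <initializer_list>

int main() {
    const int n_layer = 32;                                    // number of repeating layers (example)

    for (int n_gpu_layers : {0, 16, 32, 33}) {
        const int i_gpu_start = n_layer - n_gpu_layers;

        int layers_on_gpu = 0;
        for (int il = 0; il < n_layer; ++il) {
            if (il >= i_gpu_start) {                           // per-layer test from the layer loop
                ++layers_on_gpu;
            }
        }
        const bool norm_on_gpu = n_gpu_layers > n_layer;       // test used for the final norm above

        std::printf("n_gpu_layers=%2d: %2d/%d layers offloaded, output norm on %s\n",
                    n_gpu_layers, layers_on_gpu, n_layer, norm_on_gpu ? "GPU" : "CPU");
    }
    return 0;
}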