@@ -1860,31 +1860,54 @@ static void llm_load_tensors(

         // output
         {
-            model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          GGML_BACKEND_CPU);
-            model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          GGML_BACKEND_CPU);
-            model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+            ggml_backend backend_norm;
+            ggml_backend backend_output;
+
+            if (n_gpu_layers > int(n_layer)) {
+                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            } else {
+                backend_norm   = GGML_BACKEND_CPU;
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+            model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+            model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
         }

         const uint32_t n_ff = hparams.n_ff;

+        const int i_gpu_start = n_layer - n_gpu_layers;
+
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;       // NOLINT
+            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
             auto & layer = model.layers[i];

-            layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-            layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+            layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+            layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);

             if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
-                layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-                layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+                layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
+                layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, backend);
             }

-            layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, GGML_BACKEND_CPU);
-            layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},               GGML_BACKEND_CPU);
+            layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+            layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},               backend_split);

-            layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, GGML_BACKEND_CPU);
-            layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, GGML_BACKEND_CPU);
+            layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+            layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
         }
     } break;
 default:
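For orientation, here is a minimal standalone sketch of the layer-partitioning rule the tensor-loading hunk above applies: layers with index below `i_gpu_start = n_layer - n_gpu_layers` keep their tensors on the CPU, the remaining layers get the offload/split backends, and the non-repeating output tensors are only moved once `n_gpu_layers` exceeds `n_layer`. This is illustration only, not llama.cpp code; the enum and every name in it are invented.

```cpp
// Standalone illustration of the i_gpu_start partitioning used in the hunk above.
// The enum and the names are stand-ins, not the real ggml/llama.cpp types.
#include <cstdio>

enum class backend { cpu, gpu_split };

int main() {
    const int n_layer      = 8; // hypothetical layer count
    const int n_gpu_layers = 5; // hypothetical user setting

    const int i_gpu_start = n_layer - n_gpu_layers; // first layer placed on the GPU

    for (int i = 0; i < n_layer; ++i) {
        const backend b = i < i_gpu_start ? backend::cpu : backend::gpu_split;
        std::printf("layer %d -> %s\n", i, b == backend::cpu ? "CPU" : "GPU (split)");
    }

    // the output norm / output matrix move only once every repeating layer is already offloaded
    std::printf("output tensors on GPU: %s\n", n_gpu_layers > n_layer ? "yes" : "no");
    return 0;
}
```

Treating the non-repeating tensors as "extra" layers beyond `n_layer` is what lets the single `n_gpu_layers` knob also control the odds and ends around the repeating blocks, as the next hunks show.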
@@ -2390,6 +2413,8 @@ static struct ggml_cgraph * llm_build_falcon(
     const float freq_scale = hparams.rope_freq_scale;
     const float norm_eps   = hparams.f_norm_eps;

+    const int n_gpu_layers = model.n_gpu_layers;
+
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
@@ -2430,6 +2455,30 @@ static struct ggml_cgraph * llm_build_falcon(
         }
     }

+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
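The hunk above sets up the function-pointer indirection the rest of the graph builder relies on: every newly created tensor is passed through one of the `offload_func_*` callbacks, which stay at the no-op `llama_nop` unless CUDA support is compiled in and enough layers were requested. A rough sketch of the same pattern with dummy types (all names here are invented for illustration and are not the ggml API):

```cpp
// Sketch of the no-op-vs-offload callback pattern, with dummy stand-in types.
#include <cstdio>

struct tensor { const char * name; bool on_gpu = false; };

using offload_func_t = void (*)(tensor &);

static void nop_offload(tensor &)   { }                  // plays the role of llama_nop
static void gpu_offload(tensor & t) { t.on_gpu = true; } // plays the role of the CUDA assign function

int main() {
    const int n_layer      = 8;
    const int n_gpu_layers = 10; // hypothetical: more than n_layer + 1

    offload_func_t offload_func_nr = nop_offload; // non-repeating tensors (final norm)
    offload_func_t offload_func_kq = nop_offload; // K*Q branch of the attention
    offload_func_t offload_func_v  = nop_offload; // V branch and attention output

    if (n_gpu_layers > n_layer)     { offload_func_nr = gpu_offload; }
    if (n_gpu_layers > n_layer + 1) { offload_func_v  = gpu_offload; }
    if (n_gpu_layers > n_layer + 2) { offload_func_kq = gpu_offload; }

    tensor norm{"norm"};
    tensor kq  {"KQ"};
    tensor v   {"KQV"};

    offload_func_nr(norm);
    offload_func_kq(kq);
    offload_func_v (v);

    std::printf("norm on GPU: %d, KQ on GPU: %d, KQV on GPU: %d\n",
                (int) norm.on_gpu, (int) kq.on_gpu, (int) v.on_gpu);
    return 0;
}
```

Because the default target is a no-op, CPU-only builds and non-offloaded tensors run through exactly the same graph-construction code; only the callback target changes.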
@@ -2440,28 +2489,43 @@ static struct ggml_cgraph * llm_build_falcon(
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * attn_norm;

+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
         // self-attention
         // TODO: refactor into common function (shared with LLaMA)
         {
             attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);

             attn_norm = ggml_add(ctx0,
                     ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
                     model.layers[il].attn_norm_b);
+            offload_func(attn_norm->src[0]);
+            offload_func(attn_norm);

             if (model.layers[il].attn_norm_2) { // Falcon-40B
                 cur = ggml_norm(ctx0, inpL, norm_eps);
+                offload_func(cur);

                 cur = ggml_add(ctx0,
                         ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
                         model.layers[il].attn_norm_2_b);
+                offload_func(cur->src[0]);
+                offload_func(cur);
             } else { // Falcon 7B
                 cur = attn_norm;
             }

             // compute QKV

             cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);

             // Note that the strides for Kcur, Vcur are set up so that the
             // resulting views are misaligned with the tensor's storage
@@ -2479,39 +2543,49 @@ static struct ggml_cgraph * llm_build_falcon(
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
                 0);
+            offload_func_kq(tmpq);

             struct ggml_tensor * tmpk = ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head_kv, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
                 wsize * n_embd_head * n_head);
+            offload_func_kq(tmpk);

             struct ggml_tensor * tmpv = ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head_kv, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
                 wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_v(tmpv);

             // using mode = 2 for neox mode
             struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(Qcur);
             struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(Kcur);

             {
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
                 ggml_set_name(Vcur, "Vcur");

                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");

                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
                         ( n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }

             struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");

             struct ggml_tensor * K =
@@ -2520,18 +2594,23 @@ static struct ggml_cgraph * llm_build_falcon(
                     ggml_element_size(kv_self.k)*n_embd_gqa,
                     ggml_element_size(kv_self.k)*n_embd_head,
                     ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
             ggml_set_name(K, "K");

             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");

             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");

             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");

             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

             struct ggml_tensor * V =
@@ -2540,18 +2619,23 @@ static struct ggml_cgraph * llm_build_falcon(
                     ggml_element_size(kv_self.v)*n_ctx,
                     ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                     ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
             ggml_set_name(V, "V");

             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");

             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

             cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");

             cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_wo");
         }

@@ -2567,13 +2651,18 @@ static struct ggml_cgraph * llm_build_falcon(
             // adding this, because there seems to be a bug in the Metal concurrency optimization
             // without this line, the results are non-deterministic and wrong
             cur->src[2] = attn_out;
+            offload_func(cur);

             cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
             cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
         }

         cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
         cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);

         // input for next layer
         inpL = cur;
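To put illustrative numbers on these thresholds (hypothetical values, not taken from the patch): with a 60-layer model, `n_gpu_layers = 20` gives `i_gpu_start = 40`, so only layers 40 through 59 get a real per-layer `offload_func`; `n_gpu_layers = 61` additionally routes the final norm through `offload_func_nr`, 62 activates `offload_func_v`, and 63 activates `offload_func_kq`, mirroring the `n_layer`, `n_layer + 1` and `n_layer + 2` checks introduced earlier.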
@@ -2584,6 +2673,7 @@ static struct ggml_cgraph * llm_build_falcon(
     // norm
     {
         cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);

         cur = ggml_add(ctx0,
                 ggml_mul(ctx0, cur, model.output_norm),