@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "llama.h"
 #include "common.h"
 #include "train.h"
@@ -13,8 +14,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static const size_t tensor_alignment = 32;
-
 struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;
@@ -128,7 +127,7 @@ struct my_llama_lora_layer {
 
 struct my_llama_lora {
     struct ggml_context * ctx = NULL;
-    std::vector<uint8_t> data;
+    ggml_backend_buffer_t data;
 
     my_llama_lora_hparams hparams;
 
@@ -372,63 +371,6 @@ static void set_param_lora(struct my_llama_lora * lora) {
     }
 }
 
-static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
-    ggml_allocr_alloc(alloc, lora->norm_a);
-    ggml_allocr_alloc(alloc, lora->norm_b);
-    ggml_allocr_alloc(alloc, lora->output_a);
-    ggml_allocr_alloc(alloc, lora->output_b);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b);
-        ggml_allocr_alloc(alloc, layer.wq_a);
-        ggml_allocr_alloc(alloc, layer.wq_b);
-        ggml_allocr_alloc(alloc, layer.wk_a);
-        ggml_allocr_alloc(alloc, layer.wk_b);
-        ggml_allocr_alloc(alloc, layer.wv_a);
-        ggml_allocr_alloc(alloc, layer.wv_b);
-        ggml_allocr_alloc(alloc, layer.wo_a);
-        ggml_allocr_alloc(alloc, layer.wo_b);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b);
-        ggml_allocr_alloc(alloc, layer.w1_a);
-        ggml_allocr_alloc(alloc, layer.w1_b);
-        ggml_allocr_alloc(alloc, layer.w2_a);
-        ggml_allocr_alloc(alloc, layer.w2_b);
-        ggml_allocr_alloc(alloc, layer.w3_a);
-        ggml_allocr_alloc(alloc, layer.w3_b);
-    }
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
-    ggml_allocr_alloc(alloc, lora->norm_a->grad);
-    ggml_allocr_alloc(alloc, lora->norm_b->grad);
-    ggml_allocr_alloc(alloc, lora->output_a->grad);
-    ggml_allocr_alloc(alloc, lora->output_b->grad);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.wq_a->grad);
-        ggml_allocr_alloc(alloc, layer.wq_b->grad);
-        ggml_allocr_alloc(alloc, layer.wk_a->grad);
-        ggml_allocr_alloc(alloc, layer.wk_b->grad);
-        ggml_allocr_alloc(alloc, layer.wv_a->grad);
-        ggml_allocr_alloc(alloc, layer.wv_b->grad);
-        ggml_allocr_alloc(alloc, layer.wo_a->grad);
-        ggml_allocr_alloc(alloc, layer.wo_b->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.w1_a->grad);
-        ggml_allocr_alloc(alloc, layer.w1_b->grad);
-        ggml_allocr_alloc(alloc, layer.w2_a->grad);
-        ggml_allocr_alloc(alloc, layer.w2_b->grad);
-        ggml_allocr_alloc(alloc, layer.w3_a->grad);
-        ggml_allocr_alloc(alloc, layer.w3_b->grad);
-    }
-}
-
 static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) {
     const auto & lparams = lora->hparams;
 
@@ -522,18 +464,8 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
 
     set_param_lora(lora);
 
-    // measure data size
-    size_t size = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
-    }
-
-    // allocate data
-    struct ggml_allocr * alloc = NULL;
-    lora->data.resize(size + tensor_alignment);
-    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
-    alloc_lora(alloc, lora);
-    ggml_allocr_free(alloc);
+    // allocate data for lora tensors
+    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
 }
 
 static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
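For context on the one-call replacement above: a minimal, self-contained sketch of how ggml_backend_alloc_ctx_tensors_from_buft is typically used. The context setup and tensor shapes here are illustrative and not taken from the patch.

// sketch: create a no_alloc context, define tensors, then back them all with one backend buffer
#include <cstdio>
#include "ggml.h"
#include "ggml-backend.h"

static ggml_backend_buffer_t alloc_ctx_example() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8, // metadata only
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                       // tensor data lives in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    // illustrative tensors; in the patch these are the LoRA A/B matrices created by init_lora
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 16);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);

    // replaces the manual GGML_PAD size accounting and per-tensor ggml_allocr_alloc calls
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    printf("allocated %zu bytes\n", ggml_backend_buffer_get_size(buf));
    return buf; // caller frees with ggml_backend_buffer_free(buf) and ggml_free(ctx)
}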
@@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
 static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct my_llama_model * model,
         struct my_llama_lora * lora,
-        struct ggml_allocr * alloc,
+        ggml_gallocr_t alloc,
         struct ggml_context * ctx,
         struct ggml_cgraph * gf,
         struct ggml_cgraph * gb,
@@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         const int n_tokens,
         const int n_batch,
         const bool enable_flash_attn,
-        const bool enable_checkpointing) {
+        const bool enable_checkpointing,
+        const bool measure_only) {
 
     ggml_set_scratch(ctx, { 0, 0, nullptr, });
     const int n_past = 0;
@@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(alloc, KQ_pos);
-    if (!ggml_allocr_is_measure(alloc)) {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
+    ggml_set_input(KQ_pos);
 
     // rope has so much parameters that we make a custom function for it
     auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     // input gradient
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
     GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-    ggml_allocr_alloc(alloc, t36->grad);
+    ggml_set_input(t36->grad);
     // KQ_pos
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
 
@@ -805,11 +732,23 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     // note: they will be freed in reverse order
     for (unsigned int i = 0; i < checkpoints.size(); ++i) {
         if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-            ggml_allocr_alloc(alloc, checkpoints[i]);
+            ggml_set_input(checkpoints[i]);
         }
     }
 
-    ggml_allocr_alloc_graph(alloc, gb);
+    if (measure_only) {
+        ggml_gallocr_reserve(alloc, gb);
+    } else {
+        ggml_gallocr_alloc_graph(alloc, gb);
+
+        // set KQ_pos
+        {
+            int * data = (int *) KQ_pos->data;
+            for (int i = 0; i < N; ++i) {
+                data[i] = n_past + i;
+            }
+        }
+    }
 
     // remove the additional nodes and leafs
     for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
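A condensed sketch of the measure/allocate split introduced above, assuming a context created with no_alloc = true and enough mem_size for the graph metadata (the graph itself is illustrative): inputs are only marked with ggml_set_input at build time, and their data is written only after ggml_gallocr_alloc_graph has assigned real buffers.

// sketch: reserve only sizes the buffers; alloc_graph makes ->data pointers valid
static void build_example(struct ggml_context * ctx, bool measure_only) {
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_input(x); // mark as graph input instead of allocating it by hand

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_sqr(ctx, x));

    if (measure_only) {
        ggml_gallocr_reserve(galloc, gf);     // measure, as in the best-order search below
    } else {
        ggml_gallocr_alloc_graph(galloc, gf); // allocate; now it is safe to fill inputs
        float * data = (float *) x->data;
        for (int i = 0; i < 16; ++i) {
            data[i] = (float) i;
        }
    }
    ggml_gallocr_free(galloc);
}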
@@ -1663,7 +1602,7 @@ int main(int argc, char ** argv) {
     printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
     printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
     printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
+    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
 
     if (params.only_write_lora) {
         save_train_files_data save_data;
@@ -1690,10 +1629,6 @@
     int n_vocab = model.hparams.n_vocab;
     int n_batch = params.common.n_batch;
 
-
-    std::vector<uint8_t> mem_input_data;
-    std::vector<uint8_t> mem_compute_data;
-
     // context for input tensors without their data
     struct ggml_init_params ctx_input_params = {
         ggml_tensor_overhead() * 2, // mem_size
@@ -1706,18 +1641,12 @@
     struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
     struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
 
+    // allocate input tensors
     // measure required memory for input tensors
-    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
-                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
-                            tensor_alignment;
+    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
+    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
     printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
 
-    // allocate input tensors
-    mem_input_data.resize(max_input_size);
-    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc_inps, tokens_input);
-    ggml_allocr_alloc(alloc_inps, target_probs);
-
     // context for compute tensors without their data
     const size_t estimated_compute_size_wo_data = (
             2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
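A small, hypothetical follow-up (not part of the patch) showing how data can be written into tensors that now live in a backend buffer; with the CPU buffer type a direct ->data write also works, but the backend-agnostic helper is shown here. The function name and token values are made up.

#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

// sketch: fill a backend-allocated I32 tensor such as tokens_input
static void fill_tokens_example(struct ggml_tensor * tokens_input) {
    const int64_t n = ggml_nelements(tokens_input);
    std::vector<int32_t> tokens(n, 1); // illustrative token ids
    ggml_backend_tensor_set(tokens_input, tokens.data(), 0, n * sizeof(int32_t));
}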
@@ -1743,7 +1672,7 @@
     // find best evaluation order
     for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
         ctx_compute = ggml_init(ctx_compute_params);
-        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
         gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
         gf->order = (enum ggml_cgraph_eval_order) order;
         gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1756,14 +1685,15 @@
             &logits, tokens_input, target_probs,
             n_tokens, n_batch,
             params.common.use_flash,
-            params.common.use_checkpointing
+            params.common.use_checkpointing,
+            true
         );
-        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
         if (max_compute_size < best_compute_size) {
             best_compute_size = max_compute_size;
             best_order = gf->order;
         }
-        ggml_allocr_free(alloc);
+        ggml_gallocr_free(alloc);
         ggml_free(ctx_compute);
     }
     size_t max_compute_size = best_compute_size;
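The loop above only needs the size of each candidate graph; a compact sketch of that measurement step with the new allocator follows, subject to the FIXME noted in the diff (ggml_gallocr_reserve also allocates its buffer rather than purely measuring). The helper name is illustrative.

// sketch: size a backward graph without running it; its inputs must already be
// marked with ggml_set_input so the allocator keeps them alive
static size_t measure_graph_size(struct ggml_cgraph * gb) {
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_reserve(galloc, gb);
    size_t size = ggml_gallocr_get_buffer_size(galloc, 0); // size of the allocator's buffer #0
    ggml_gallocr_free(galloc);
    return size;
}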
@@ -1774,9 +1704,8 @@
         "invalid");
 
     // allocate compute tensors
-    mem_compute_data.resize(max_compute_size);
     ctx_compute = ggml_init(ctx_compute_params);
-    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
     gf->order = best_order;
     gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1789,11 +1718,9 @@
         &logits, tokens_input, target_probs,
         n_tokens, n_batch,
         params.common.use_flash,
-        params.common.use_checkpointing
+        params.common.use_checkpointing,
+        false
     );
-    ggml_allocr_free(alloc);
-    ggml_allocr_free(alloc_inps);
-
 
     // tokenize data
     std::vector<llama_token> train_tokens;
@@ -1908,6 +1835,8 @@
     ggml_free(ctx_work);
     ggml_free(ctx_compute);
     ggml_free(ctx_input);
+    ggml_gallocr_free(alloc);
+
 
     int64_t t1 = ggml_time_ms();
     printf("%s: total training time: ", __func__);