
Commit 4b481cd

ggerganov and slaren authored and committed
sync : ggml (ggml-org#5452)
* ggml-alloc : v3 (ggml/727)
* ggml-alloc v3 (ggml-ci)
* fix ci (ggml-ci)
* whisper : check for backend buffer allocation failures
* whisper : avoid leaks when initialization fails
* cleanup (ggml-ci)
* style fixes (ggml-ci)
* sync : ggml
* update llama.cpp, clip.cpp, export-lora.cpp
* update finetune.cpp, train-text-from-scratch.cpp (ggml-ci)
* ggml-backend : reduce alignment to 32 to match gguf and fix mmap

Co-authored-by: slaren <[email protected]>
1 parent f0752eb · commit 4b481cd

File tree

12 files changed: +1120 -1195 lines changed


examples/export-lora/export-lora.cpp

Lines changed: 5 additions & 14 deletions
@@ -337,24 +337,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
     params.mem_buffer = NULL;
     params.no_alloc = true;
     struct ggml_context * ctx = NULL;
-    struct ggml_allocr * alloc = NULL;
-    struct ggml_cgraph * gf = NULL;
+    struct ggml_gallocr * alloc = NULL;
+    struct ggml_cgraph * gf = NULL;

     ctx = ggml_init(params);
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
     gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
-    ggml_allocr_free(alloc);
-    ggml_free(ctx);
-
-    static std::vector<uint8_t> data_compute;
-    data_compute.resize(alloc_size + tensor_alignment);

-    ctx = ggml_init(params);
-    alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
-    gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-    ggml_allocr_alloc_graph(alloc, gf);
-    ggml_allocr_free(alloc);
+    ggml_gallocr_alloc_graph(alloc, gf);

     struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
     static std::vector<uint8_t> data_work;
@@ -363,6 +353,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int

     ggml_graph_compute(gf, &cplan);

+    ggml_gallocr_free(alloc);
     ggml_free(ctx);
     return true;
 }
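The old code needed two passes: a measure allocator to learn the graph size, then a second ggml_allocr bound to a manually aligned std::vector. With ggml-alloc v3 a single graph allocator owns its backend buffer, so one ggml_gallocr_alloc_graph call replaces both passes. A minimal sketch of that pattern, assuming illustrative tensors and shapes rather than the real build_graph_lora graph:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void gallocr_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data will live in a backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    ggml_set_input(a); // keep inputs from being overwritten by buffer reuse
    ggml_set_input(b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_mul_mat(ctx, a, b));

    // one allocator, one pass: no separate measure step, no manual alignment
    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_alloc_graph(alloc, gf);

    // ... fill a and b, then run ggml_graph_plan()/ggml_graph_compute() as above ...

    ggml_gallocr_free(alloc); // also releases the compute buffer
    ggml_free(ctx);
}

Because the allocator owns the buffer, ggml_gallocr_free has to come after the graph has been computed, which is why the free moves to the end of apply_lora in the hunk above.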

examples/finetune/finetune.cpp

Lines changed: 37 additions & 108 deletions
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "llama.h"
 #include "common.h"
 #include "train.h"
@@ -13,8 +14,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const size_t tensor_alignment = 32;
-
 struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;
@@ -128,7 +127,7 @@ struct my_llama_lora_layer {

 struct my_llama_lora {
     struct ggml_context * ctx = NULL;
-    std::vector<uint8_t> data;
+    ggml_backend_buffer_t data;

     my_llama_lora_hparams hparams;

@@ -372,63 +371,6 @@ static void set_param_lora(struct my_llama_lora * lora) {
     }
 }

-static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
-    ggml_allocr_alloc(alloc, lora->norm_a);
-    ggml_allocr_alloc(alloc, lora->norm_b);
-    ggml_allocr_alloc(alloc, lora->output_a);
-    ggml_allocr_alloc(alloc, lora->output_b);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b);
-        ggml_allocr_alloc(alloc, layer.wq_a);
-        ggml_allocr_alloc(alloc, layer.wq_b);
-        ggml_allocr_alloc(alloc, layer.wk_a);
-        ggml_allocr_alloc(alloc, layer.wk_b);
-        ggml_allocr_alloc(alloc, layer.wv_a);
-        ggml_allocr_alloc(alloc, layer.wv_b);
-        ggml_allocr_alloc(alloc, layer.wo_a);
-        ggml_allocr_alloc(alloc, layer.wo_b);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b);
-        ggml_allocr_alloc(alloc, layer.w1_a);
-        ggml_allocr_alloc(alloc, layer.w1_b);
-        ggml_allocr_alloc(alloc, layer.w2_a);
-        ggml_allocr_alloc(alloc, layer.w2_b);
-        ggml_allocr_alloc(alloc, layer.w3_a);
-        ggml_allocr_alloc(alloc, layer.w3_b);
-    }
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
-    ggml_allocr_alloc(alloc, lora->norm_a->grad);
-    ggml_allocr_alloc(alloc, lora->norm_b->grad);
-    ggml_allocr_alloc(alloc, lora->output_a->grad);
-    ggml_allocr_alloc(alloc, lora->output_b->grad);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.wq_a->grad);
-        ggml_allocr_alloc(alloc, layer.wq_b->grad);
-        ggml_allocr_alloc(alloc, layer.wk_a->grad);
-        ggml_allocr_alloc(alloc, layer.wk_b->grad);
-        ggml_allocr_alloc(alloc, layer.wv_a->grad);
-        ggml_allocr_alloc(alloc, layer.wv_b->grad);
-        ggml_allocr_alloc(alloc, layer.wo_a->grad);
-        ggml_allocr_alloc(alloc, layer.wo_b->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.w1_a->grad);
-        ggml_allocr_alloc(alloc, layer.w1_b->grad);
-        ggml_allocr_alloc(alloc, layer.w2_a->grad);
-        ggml_allocr_alloc(alloc, layer.w2_b->grad);
-        ggml_allocr_alloc(alloc, layer.w3_a->grad);
-        ggml_allocr_alloc(alloc, layer.w3_b->grad);
-    }
-}
-
 static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) {
     const auto & lparams = lora->hparams;

@@ -522,18 +464,8 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora

     set_param_lora(lora);

-    // measure data size
-    size_t size = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
-    }
-
-    // allocate data
-    struct ggml_allocr * alloc = NULL;
-    lora->data.resize(size + tensor_alignment);
-    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
-    alloc_lora(alloc, lora);
-    ggml_allocr_free(alloc);
+    // allocate data for lora tensors
+    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
 }

 static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
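The same simplification applies to the LoRA weights themselves: instead of measuring sizes with GGML_PAD, resizing a std::vector and walking every tensor and gradient through ggml_allocr_alloc, init_lora now lets ggml_backend_alloc_ctx_tensors_from_buft place every tensor of the no_alloc context into one backend buffer, and the returned buffer handle is what my_llama_lora::data now stores. A rough, self-contained sketch of that call in isolation (the tensor names and shapes are hypothetical stand-ins for the LoRA weight pairs):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void ctx_alloc_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*4, // metadata only, no tensor data
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-ins for the *_a/*_b weight pairs created by init_lora
    struct ggml_tensor * w_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 16);
    struct ggml_tensor * w_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 4096);
    (void) w_a; (void) w_b;

    // allocates data for every tensor in ctx inside a single CPU buffer;
    // this replaces the measure loop and all the ggml_allocr_alloc calls
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());

    printf("lora buffer: %zu bytes\n", ggml_backend_buffer_get_size(buf));

    ggml_backend_buffer_free(buf); // finetune.cpp keeps the buffer in lora->data instead
    ggml_free(ctx);
}

In finetune.cpp the buffer is kept alive in lora->data, and its size is what the lora_size printout later in this diff reports via ggml_backend_buffer_get_size.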
@@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
 static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct my_llama_model * model,
         struct my_llama_lora * lora,
-        struct ggml_allocr * alloc,
+        ggml_gallocr_t alloc,
         struct ggml_context * ctx,
         struct ggml_cgraph * gf,
         struct ggml_cgraph * gb,
@@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         const int n_tokens,
         const int n_batch,
         const bool enable_flash_attn,
-        const bool enable_checkpointing) {
+        const bool enable_checkpointing,
+        const bool measure_only) {

     ggml_set_scratch(ctx, { 0, 0, nullptr, });
     const int n_past = 0;
@@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(

     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(alloc, KQ_pos);
-    if (!ggml_allocr_is_measure(alloc)) {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
+    ggml_set_input(KQ_pos);

     // rope has so much parameters that we make a custom function for it
     auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     // input gradient
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
     GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-    ggml_allocr_alloc(alloc, t36->grad);
+    ggml_set_input(t36->grad);
     // KQ_pos
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));

@@ -805,11 +732,23 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     // note: they will be freed in reverse order
     for (unsigned int i = 0; i < checkpoints.size(); ++i) {
         if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-            ggml_allocr_alloc(alloc, checkpoints[i]);
+            ggml_set_input(checkpoints[i]);
         }
     }

-    ggml_allocr_alloc_graph(alloc, gb);
+    if (measure_only) {
+        ggml_gallocr_reserve(alloc, gb);
+    } else {
+        ggml_gallocr_alloc_graph(alloc, gb);
+
+        // set KQ_pos
+        {
+            int * data = (int *) KQ_pos->data;
+            for (int i = 0; i < N; ++i) {
+                data[i] = n_past + i;
+            }
+        }
+    }

     // remove the additional nodes and leafs
     for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
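Two things change in the graph-building path: inputs are only flagged with ggml_set_input while the graph is constructed, and the new measure_only argument chooses between ggml_gallocr_reserve, which just sizes the buffers, and ggml_gallocr_alloc_graph, which assigns real addresses so inputs such as KQ_pos can finally be written. A compilable sketch of that split under assumed shapes, with a plain F32 input and a trivial op standing in for the finetune graph:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void reserve_or_alloc(bool measure_only) {
    const int N = 64;

    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
    ggml_set_input(inp); // like KQ_pos above: mark it so the allocator never reuses its memory

    struct ggml_cgraph * gb = ggml_new_graph(ctx);
    ggml_build_forward_expand(gb, ggml_scale(ctx, inp, 2.0f));

    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    if (measure_only) {
        ggml_gallocr_reserve(alloc, gb);      // buffer is sized, tensor data stays unset
    } else {
        ggml_gallocr_alloc_graph(alloc, gb);  // data pointers are valid from here on
        float * data = (float *) inp->data;   // inputs are filled only after allocation
        for (int i = 0; i < N; ++i) {
            data[i] = (float) i;
        }
    }

    ggml_gallocr_free(alloc);
    ggml_free(ctx);
}

The ordering constraint is the same one visible in the hunk above: input data is written only after ggml_gallocr_alloc_graph, never during graph construction.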
@@ -1663,7 +1602,7 @@ int main(int argc, char ** argv) {
     printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
     printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
     printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
+    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));

     if (params.only_write_lora) {
         save_train_files_data save_data;
@@ -1690,10 +1629,6 @@ int main(int argc, char ** argv) {
     int n_vocab = model.hparams.n_vocab;
     int n_batch = params.common.n_batch;

-
-    std::vector<uint8_t> mem_input_data;
-    std::vector<uint8_t> mem_compute_data;
-
     // context for input tensors without their data
     struct ggml_init_params ctx_input_params = {
         ggml_tensor_overhead() * 2, // mem_size
@@ -1706,18 +1641,12 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
     struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);

+    // allocate input tensors
     // measure required memory for input tensors
-    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
-                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
-                            tensor_alignment;
+    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
+    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
     printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

-    // allocate input tensors
-    mem_input_data.resize(max_input_size);
-    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc_inps, tokens_input);
-    ggml_allocr_alloc(alloc_inps, target_probs);
-
     // context for compute tensors without their data
     const size_t estimated_compute_size_wo_data = (
         2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
@@ -1743,7 +1672,7 @@ int main(int argc, char ** argv) {
     // find best evaluation order
     for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
         ctx_compute = ggml_init(ctx_compute_params);
-        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
         gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
         gf->order = (enum ggml_cgraph_eval_order) order;
         gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1756,14 +1685,15 @@ int main(int argc, char ** argv) {
             &logits, tokens_input, target_probs,
             n_tokens, n_batch,
             params.common.use_flash,
-            params.common.use_checkpointing
+            params.common.use_checkpointing,
+            true
         );
-        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
         if (max_compute_size < best_compute_size) {
            best_compute_size = max_compute_size;
            best_order = gf->order;
         }
-        ggml_allocr_free(alloc);
+        ggml_gallocr_free(alloc);
         ggml_free(ctx_compute);
     }
     size_t max_compute_size = best_compute_size;
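The evaluation-order search keeps its structure, but the measurement now goes through the graph allocator: each candidate graph is built with measure_only = true so the allocator only reserves, and the required size is read back with ggml_gallocr_get_buffer_size(alloc, 0) (index 0 because a single buffer type is used); the FIXME notes that, unlike the old measure allocator, this reserve still allocates the buffer. A condensed, self-contained sketch of that loop, where build_candidate_graph is a hypothetical, deliberately trivial stand-in for llama_build_lora_finetune_graphs:

#include <stdint.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Trivial stand-in for llama_build_lora_finetune_graphs(..., measure_only = true):
// builds a one-op graph with the requested order and reserves space for it.
static void build_candidate_graph(struct ggml_context * ctx, ggml_gallocr_t alloc,
                                  enum ggml_cgraph_eval_order order) {
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_set_input(x);
    struct ggml_cgraph * gb = ggml_new_graph(ctx);
    gb->order = order;
    ggml_build_forward_expand(gb, ggml_scale(ctx, x, 2.0f));
    ggml_gallocr_reserve(alloc, gb);
}

static enum ggml_cgraph_eval_order find_best_order(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    size_t best_compute_size = SIZE_MAX;
    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;

    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        struct ggml_context * ctx = ggml_init(params);
        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

        build_candidate_graph(ctx, alloc, (enum ggml_cgraph_eval_order) order);

        // size of buffer 0, the only buffer when a single buffer type is used
        size_t size = ggml_gallocr_get_buffer_size(alloc, 0);
        if (size < best_compute_size) {
            best_compute_size = size;
            best_order = (enum ggml_cgraph_eval_order) order;
        }

        ggml_gallocr_free(alloc);
        ggml_free(ctx);
    }
    return best_order;
}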
@@ -1774,9 +1704,8 @@ int main(int argc, char ** argv) {
         "invalid");

     // allocate compute tensors
-    mem_compute_data.resize(max_compute_size);
     ctx_compute = ggml_init(ctx_compute_params);
-    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
     gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
     gf->order = best_order;
     gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1789,11 +1718,9 @@ int main(int argc, char ** argv) {
         &logits, tokens_input, target_probs,
         n_tokens, n_batch,
         params.common.use_flash,
-        params.common.use_checkpointing
+        params.common.use_checkpointing,
+        false
     );
-    ggml_allocr_free(alloc);
-    ggml_allocr_free(alloc_inps);
-

     // tokenize data
     std::vector<llama_token> train_tokens;
@@ -1908,6 +1835,8 @@ int main(int argc, char ** argv) {
     ggml_free(ctx_work);
     ggml_free(ctx_compute);
     ggml_free(ctx_input);
+    ggml_gallocr_free(alloc);
+

     int64_t t1 = ggml_time_ms();
     printf("%s: total training time: ", __func__);
