From 91eb33585b3c2ff33b33cc129a0365befca1fcab Mon Sep 17 00:00:00 2001
From: Andrew Godfrey
Date: Sun, 12 Nov 2023 18:23:06 -0800
Subject: [PATCH 1/3] finetune : zero the loraB initial vectors

Without this, the first iteration starts out far from the base model
instead of exactly on it. Zeroing loraB is what the paper recommends.
loralib also zeroes at least one of the init vector pairs (though in
some cases it departs from the paper by using a different distribution
for the other vector).
---
 common/train.cpp               | 45 ++++++++++++++++++++++++++++++++++
 common/train.h                 |  1 +
 examples/finetune/finetune.cpp | 24 +++++++++---------
 3 files changed, 58 insertions(+), 12 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index bc15b7a03c0cd..ceb9b57b05f95 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -68,6 +68,51 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
     free(rnd);
 }
 
+struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) {
+    float scale = 1.0f; // xavier
+    switch (tensor->n_dims) {
+        case 1:
+            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
+                *dst = 0.0f;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                    float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+                    *dst = 0.0f;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                        float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
+                        *dst = 0.0f;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
+                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                            float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
+                            *dst = 0.0f;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            die("Unsupported tensor->n_dims");
+    };
+    return tensor;
+}
+
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
diff --git a/common/train.h b/common/train.h
index d86c93cc4f147..e1758ddf25cfd 100644
--- a/common/train.h
+++ b/common/train.h
@@ -127,6 +127,7 @@ struct random_uniform_distribution * init_random_uniform_distribution(int seed,
 void free_random_normal_distribution (struct random_normal_distribution  * rnd);
 void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
 
+struct ggml_tensor * zero_tensor             (struct ggml_tensor * tensor);
 struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index fa7dbe496b2c5..04a7a986bf7ef 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
     struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
 
     randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    randomize_tensor_normal(lora->tok_embeddings_b, rnd);
+    zero_tensor(lora->tok_embeddings_b);
     randomize_tensor_normal(lora->norm_a, rnd);
-    randomize_tensor_normal(lora->norm_b, rnd);
+    zero_tensor(lora->norm_b);
     randomize_tensor_normal(lora->output_a, rnd);
-    randomize_tensor_normal(lora->output_b, rnd);
+    zero_tensor(lora->output_b);
 
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = lora->layers[i];
         randomize_tensor_normal(layer.attention_norm_a, rnd);
-        randomize_tensor_normal(layer.attention_norm_b, rnd);
+        zero_tensor(layer.attention_norm_b);
 
         randomize_tensor_normal(layer.wq_a, rnd);
-        randomize_tensor_normal(layer.wq_b, rnd);
+        zero_tensor(layer.wq_b);
         randomize_tensor_normal(layer.wk_a, rnd);
-        randomize_tensor_normal(layer.wk_b, rnd);
+        zero_tensor(layer.wk_b);
         randomize_tensor_normal(layer.wv_a, rnd);
-        randomize_tensor_normal(layer.wv_b, rnd);
+        zero_tensor(layer.wv_b);
         randomize_tensor_normal(layer.wo_a, rnd);
-        randomize_tensor_normal(layer.wo_b, rnd);
+        zero_tensor(layer.wo_b);
 
         randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        randomize_tensor_normal(layer.ffn_norm_b, rnd);
+        zero_tensor(layer.ffn_norm_b);
 
         randomize_tensor_normal(layer.w1_a, rnd);
-        randomize_tensor_normal(layer.w1_b, rnd);
+        zero_tensor(layer.w1_b);
         randomize_tensor_normal(layer.w2_a, rnd);
-        randomize_tensor_normal(layer.w2_b, rnd);
+        zero_tensor(layer.w2_b);
         randomize_tensor_normal(layer.w3_a, rnd);
-        randomize_tensor_normal(layer.w3_b, rnd);
+        zero_tensor(layer.w3_b);
     }
 
     free_random_normal_distribution(rnd);

From c72c1b37de729c95d2d7b0b893f47e34c63c1854 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey
Date: Tue, 14 Nov 2023 16:57:28 -0800
Subject: [PATCH 2/3] tabs to spaces

---
 common/train.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index ceb9b57b05f95..62aaa2638a361 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -81,7 +81,7 @@ struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) {
             for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
                 for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                     float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-                    *dst = 0.0f;
+                    *dst = 0.0f;
                 }
             }
             break;
@@ -90,7 +90,7 @@ struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) {
                 for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
                     for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                         float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
-                        *dst = 0.0f;
+                        *dst = 0.0f;
                     }
                 }
             }
@@ -101,7 +101,7 @@ struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) {
                     for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
                         for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                             float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
-                            *dst = 0.0f;
+                            *dst = 0.0f;
                         }
                     }
                 }

From 4571bcc17fec0a75b6a3833b083462c87b497600 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey
Date: Wed, 15 Nov 2023 08:05:40 -0800
Subject: [PATCH 3/3] Use ggml_set_zero instead of adding a new function

---
 common/train.cpp               | 45 ----------------------------------
 common/train.h                 |  1 -
 examples/finetune/finetune.cpp | 24 +++++++++---------
 3 files changed, 12 insertions(+), 58 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index 62aaa2638a361..bc15b7a03c0cd 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -68,51 +68,6 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
     free(rnd);
 }
 
-struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) {
-    float scale = 1.0f; // xavier
-    switch (tensor->n_dims) {
-        case 1:
-            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
-                *dst = 0.0f;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-                    *dst = 0.0f;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
-                        *dst = 0.0f;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
-                            *dst = 0.0f;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            die("Unsupported tensor->n_dims");
-    };
-    return tensor;
-}
-
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
diff --git a/common/train.h b/common/train.h
index e1758ddf25cfd..d86c93cc4f147 100644
--- a/common/train.h
+++ b/common/train.h
@@ -127,7 +127,6 @@ struct random_uniform_distribution * init_random_uniform_distribution(int seed,
 void free_random_normal_distribution (struct random_normal_distribution  * rnd);
 void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
 
-struct ggml_tensor * zero_tensor             (struct ggml_tensor * tensor);
 struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 04a7a986bf7ef..d6e75fc370a6d 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
     struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
 
     randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    zero_tensor(lora->tok_embeddings_b);
+    ggml_set_zero(lora->tok_embeddings_b);
     randomize_tensor_normal(lora->norm_a, rnd);
-    zero_tensor(lora->norm_b);
+    ggml_set_zero(lora->norm_b);
     randomize_tensor_normal(lora->output_a, rnd);
-    zero_tensor(lora->output_b);
+    ggml_set_zero(lora->output_b);
 
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = lora->layers[i];
         randomize_tensor_normal(layer.attention_norm_a, rnd);
-        zero_tensor(layer.attention_norm_b);
+        ggml_set_zero(layer.attention_norm_b);
 
         randomize_tensor_normal(layer.wq_a, rnd);
-        zero_tensor(layer.wq_b);
+        ggml_set_zero(layer.wq_b);
         randomize_tensor_normal(layer.wk_a, rnd);
-        zero_tensor(layer.wk_b);
+        ggml_set_zero(layer.wk_b);
         randomize_tensor_normal(layer.wv_a, rnd);
-        zero_tensor(layer.wv_b);
+        ggml_set_zero(layer.wv_b);
         randomize_tensor_normal(layer.wo_a, rnd);
-        zero_tensor(layer.wo_b);
+        ggml_set_zero(layer.wo_b);
 
         randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        zero_tensor(layer.ffn_norm_b);
+        ggml_set_zero(layer.ffn_norm_b);
 
         randomize_tensor_normal(layer.w1_a, rnd);
-        zero_tensor(layer.w1_b);
+        ggml_set_zero(layer.w1_b);
         randomize_tensor_normal(layer.w2_a, rnd);
-        zero_tensor(layer.w2_b);
+        ggml_set_zero(layer.w2_b);
         randomize_tensor_normal(layer.w3_a, rnd);
-        zero_tensor(layer.w3_b);
+        ggml_set_zero(layer.w3_b);
     }
 
     free_random_normal_distribution(rnd);
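
A note on the zero-init reasoning in PATCH 1/3: in LoRA, each adapted weight is W' = W + loraB * loraA (times a scaling factor). If loraB starts at zero, the product vanishes and training begins exactly on the base model, while the randomly initialized loraA still gives loraB a nonzero gradient, so learning can proceed. The sketch below is a standalone illustration of this, not part of the patch series; the matrix names W, A, B and the toy sizes n, r are hypothetical, chosen only for the demo.

// Standalone illustration (not from the patch) of why loraB is zeroed.
// With B = 0, the adapter delta B*A vanishes, so W' = W + B*A equals W
// exactly and the fine-tune starts on the base model; A keeps a random
// init so the gradient w.r.t. B is nonzero and learning can proceed.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    // Toy sizes (hypothetical): W is n x n, A is r x n, B is n x r.
    const int n = 4, r = 2;
    std::mt19937 gen(42);
    std::normal_distribution<float> dist(0.0f, 0.02f);

    std::vector<float> W(n * n), A(r * n), B(n * r, 0.0f); // B zeroed
    for (float & w : W) w = dist(gen);
    for (float & a : A) a = dist(gen); // A keeps its random normal init

    // Compute the adapter delta B*A and track its largest entry.
    float max_delta = 0.0f;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            float delta = 0.0f;
            for (int k = 0; k < r; k++) {
                delta += B[i * r + k] * A[k * n + j];
            }
            max_delta = std::max(max_delta, std::fabs(delta));
        }
    }
    std::printf("max |W' - W| = %f\n", max_delta); // prints 0.000000
    return 0;
}

ggml_set_zero, adopted in PATCH 3/3, zeroes a tensor's entire data buffer regardless of its number of dimensions, which is why it can replace the per-dimension loops of the zero_tensor helper introduced in PATCH 1/3.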