From facb1a3e0fe539a079664d0e12ddfdca49c5c58e Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Sat, 21 Oct 2023 22:41:24 -0700
Subject: [PATCH 01/14] Add '-ngl' support to finetune.cpp

---
 common/train.cpp               |  2 ++
 common/train.h                 |  2 ++
 examples/finetune/finetune.cpp | 12 ++++++++++++
 3 files changed, 16 insertions(+)

diff --git a/common/train.cpp b/common/train.cpp
index 3cce5da269637..cd73e297b853a 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1080,6 +1080,8 @@ struct train_params_common get_default_train_params_common() {
     params.adam_beta2          = 0.999f;
     params.adam_gclip          = 1.0f;
     params.adam_eps_f          = 0.0f;
+
+    params.n_gpu_layers        = 0;
     return params;
 }
 
diff --git a/common/train.h b/common/train.h
index 42fa704b897ae..00dee04eeacbf 100644
--- a/common/train.h
+++ b/common/train.h
@@ -80,6 +80,8 @@ struct train_params_common {
     float adam_beta2;
     float adam_gclip;
     float adam_eps_f;
+
+	int32_t n_gpu_layers;
 };
 
 typedef void (*save_train_files_callback)(void * data, struct train_state * train);
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 35824cd2d786a..d158c2327a253 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1459,6 +1459,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
             }
             params->n_rank_w3 = std::stoi(argv[i]);
             params->custom_n_rank_w3 = true;
+		} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+			if (++i >= argc) {
+				invalid_param = true;
+				break;
+			}
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+			params->common.n_gpu_layers = std::stoi(argv[i]);
+#else
+			fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+			fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             train_print_usage(argc, argv, &default_params);
@@ -1545,6 +1556,7 @@ int main(int argc, char ** argv) {
     srand(params.common.seed);
 
     struct llama_model_params llama_mparams = llama_model_default_params();
+	llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
     llama_mparams.vocab_only = false;
 
     printf("%s: model base = '%s'\n", __func__, params.fn_model_base);

From 4d452dbc1068e88979197077903a105d6aa60b74 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Sat, 21 Oct 2023 22:40:40 -0700
Subject: [PATCH 02/14] Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning, following the README, I hit the assert here.
Print the src0 and dst tensor types before asserting so the unsupported combination is visible.
This probably isn't an important case, because inference later gives a warning saying you should use an f16 or f32 model instead when using a LoRA.
---
 ggml-cuda.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index db053e3b8a9d8..63685100d337e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5910,6 +5910,7 @@ inline void ggml_cuda_op_add(
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
     } else {
+        fprintf(stderr, "%d, %d\n", src0->type, dst->type);
         GGML_ASSERT(false);
     }
 

From e1ebce03d61801c4b0f0137c8a072041ce556463 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Sat, 21 Oct 2023 22:42:37 -0700
Subject: [PATCH 03/14] Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"
---
 examples/finetune/finetune.sh | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 examples/finetune/finetune.sh

diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
new file mode 100644
index 0000000000000..b79b107531c1a
--- /dev/null
+++ b/examples/finetune/finetune.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+cd "$(dirname "$0")"  # quoted: the script's directory may contain spaces
+cd ../..
+
+EXE="./finetune"
+
+MODEL="openllama-3b-v2.gguf"
+
+while getopts "dg" opt; do
+  case $opt in
+    d)
+      DEBUGGER="gdb --args"
+      ;;
+    g)
+      # GPU. The makefile doesn't support CUDA on Windows, so I have to use CMake and so main is built to a different location.
+      # Note: "-d" doesn't really work with this - it will run under gdb, but there are no debugging symbols (in a format gdb understands). I think the easiest workaround is to use WinDbg instead.
+      EXE="./build/bin/Release/finetune"
+      GPUARG="--gpu-layers 25"
+      ;;
+  esac
+done
+
+$DEBUGGER $EXE \
+        --model-base c:/models/$MODEL \
+	$GPUARG \
+        --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
+        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
+        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
+        --train-data "c:\training\shakespeare.txt" \
+        --save-every 10 \
+        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
+        --use-checkpointing

From 1758d0abef9bf03a237df48f98cd675d9b370788 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Mon, 23 Oct 2023 09:59:43 -0700
Subject: [PATCH 04/14] tweak finetune.sh

---
 examples/finetune/finetune.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
index b79b107531c1a..9a9d02848255a 100644
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -4,7 +4,7 @@ cd ../..
 
 EXE="./finetune"
 
-MODEL="openllama-3b-v2.gguf"
+MODEL="c:/models/openllama-3b-v2.gguf"
 
 while getopts "dg" opt; do
   case $opt in
@@ -21,7 +21,7 @@ while getopts "dg" opt; do
 done
 
 $DEBUGGER $EXE \
-        --model-base c:/models/$MODEL \
+        --model-base $MODEL \
 	$GPUARG \
         --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
         --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \

From 9ea91ceaf2b3c5cb4f6d790c911eb2eaaca5de6e Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Mon, 23 Oct 2023 13:01:31 -0700
Subject: [PATCH 05/14] Suppress some warnings in ggml.c

---
 ggml.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml.c b/ggml.c
index 17f0ce4877592..e477f3640cad5 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1276,6 +1276,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
     }
 #else
+	UNUSED(nb);
     // scalar
     quantize_row_q8_0_reference(x, y, k);
 #endif
@@ -1529,6 +1530,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         y[i].s = sum*d;
     }
 #else
+	UNUSED(nb);
     // scalar
     quantize_row_q8_1_reference(x, y, k);
 #endif

From 19097c97a84083637dd84ee0861f661e23efec9f Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Mon, 23 Oct 2023 13:21:17 -0700
Subject: [PATCH 06/14] Add f16 implementation to
 ggml_compute_forward_add_f16_f32

---
 ggml.c | 55 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 16 deletions(-)

diff --git a/ggml.c b/ggml.c
index e477f3640cad5..1bc77737cbb1e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9358,9 +9358,15 @@ static void ggml_compute_forward_add_f16_f32(
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
 
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+	if (dst->type == GGML_TYPE_F32) {
+		GGML_ASSERT( nb0 == sizeof(float));
+	}
+	else {
+		GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+		GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+	}
+
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
 
     // rows per thread
@@ -9371,20 +9377,37 @@ static void ggml_compute_forward_add_f16_f32(
     const int ir1 = MIN(ir0 + dr, nr);
 
     if (nb10 == sizeof(float)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-            for (int i = 0; i < ne0; i++) {
-                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-            }
-        }
+		if (dst->type == GGML_TYPE_F16) {
+			for (int ir = ir0; ir < ir1; ++ir) {
+				// src0, src1 and dst are same shape => same indices
+				const int i3 = ir/(ne2*ne1);
+				const int i2 = (ir - i3*ne2*ne1)/ne1;
+				const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+				ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+				ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+				float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+				for (int i = 0; i < ne0; i++) {
+					dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+				}
+			}
+		} else {
+			for (int ir = ir0; ir < ir1; ++ir) {
+				// src0, src1 and dst are same shape => same indices
+				const int i3 = ir/(ne2*ne1);
+				const int i2 = (ir - i3*ne2*ne1)/ne1;
+				const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+				float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+				ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+				float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+				for (int i = 0; i < ne0; i++) {
+					dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+				}
+			}
+		}
     }
     else {
         // src1 is not contiguous

From 7cbf5b282c7fcc3f2500355936bd81f45815b6d8 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Mon, 23 Oct 2023 18:31:06 -0700
Subject: [PATCH 07/14] Add an f16 case to ggml_add_cast_impl and
 llama_build_lora_finetune_graphs

---
 examples/finetune/finetune.cpp | 2 +-
 ggml.c                         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index d158c2327a253..29354db20f3d3 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -652,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
 
     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type)) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
             return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
         } else if (a->type == GGML_TYPE_F32) {
             return ggml_add(ctx, a, b);
diff --git a/ggml.c b/ggml.c
index 1bc77737cbb1e..a0501fbdd46ef 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5636,7 +5636,7 @@ static struct ggml_tensor * ggml_add_cast_impl(
     // TODO: support less-strict constraint
     //       GGML_ASSERT(ggml_can_repeat(b, a));
     GGML_ASSERT(ggml_can_repeat_rows(b, a));
-    GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input
+    GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
 
     bool is_node = false;
 

From 9587ab4c731e3ea4b4b11a12a1f40afb391f9bf9 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Mon, 23 Oct 2023 19:15:40 -0700
Subject: [PATCH 08/14] finetune.sh: Edit comments

---
 examples/finetune/finetune.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
index 9a9d02848255a..32ba303606d60 100644
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -4,7 +4,8 @@ cd ../..
 
 EXE="./finetune"
 
-MODEL="c:/models/openllama-3b-v2.gguf"
+# MODEL="c:/models/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
+MODEL="c:/models/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
 
 while getopts "dg" opt; do
   case $opt in
@@ -12,8 +13,6 @@ while getopts "dg" opt; do
       DEBUGGER="gdb --args"
       ;;
     g)
-      # GPU. The makefile doesn't support CUDA on Windows, so I have to use CMake and so main is built to a different location.
-      # Note: "-d" doesn't really work with this - it will run under gdb, but there are no debugging symbols (in a format gdb understands). I think the easiest workaround is to use WinDbg instead.
       EXE="./build/bin/Release/finetune"
       GPUARG="--gpu-layers 25"
       ;;

From 86ceda4275a99f24fe7c6816e9a18d1e54bfdc77 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Mon, 23 Oct 2023 19:26:20 -0700
Subject: [PATCH 09/14] Add "add_f16_f32_f32_cuda"

---
 ggml-cuda.cu | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 63685100d337e..960076f4d47df 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -496,6 +496,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
     dst[i] = __hadd(x[i], __float2half(y[i]));
 }
 
+static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __half2float(x[i]) + y[i];
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -4616,6 +4625,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
     add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5909,8 +5923,10 @@ inline void ggml_cuda_op_add(
         add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
     } else {
-        fprintf(stderr, "%d, %d\n", src0->type, dst->type);
+        fprintf(stderr, "src0->type: %d  dst->type: %d\n", src0->type, dst->type);
         GGML_ASSERT(false);
     }
 

From 81dabd8edd4c74abb668d6f8211e4f42cd7c62c0 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Mon, 23 Oct 2023 19:28:05 -0700
Subject: [PATCH 10/14] Tweak an error message

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 61f30c3982f18..9d26b2dd2084e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8605,7 +8605,7 @@ static int llama_apply_lora_from_file_internal(
             if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
                 if (dest_t->type != GGML_TYPE_F16) {
                     throw std::runtime_error(format(
-                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
                 }
                 offload_func = ggml_cuda_assign_buffers;
                 offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;

From 6359c15174df8741f042a063b71905e45726323a Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Tue, 24 Oct 2023 19:14:17 -0700
Subject: [PATCH 11/14] finetune.sh: Add an optional LLAMA_MODEL_DIR variable

---
 examples/finetune/finetune.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
index 32ba303606d60..9c56bb8f5d5c5 100644
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -4,8 +4,10 @@ cd ../..
 
 EXE="./finetune"
 
-# MODEL="c:/models/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="c:/models/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
+if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
+
+# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
+MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
 
 while getopts "dg" opt; do
   case $opt in

From fe44ded01adc3d15b267d868b956d0386086af7f Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Tue, 24 Oct 2023 19:31:52 -0700
Subject: [PATCH 12/14] finetune.sh: Add an optional LLAMA_TRAINING_DIR
 variable

---
 examples/finetune/finetune.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
index 9c56bb8f5d5c5..f567d4736c36a 100644
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -5,6 +5,7 @@ cd ../..
 EXE="./finetune"
 
 if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
+if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
 
 # MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
 MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
@@ -27,7 +28,7 @@ $DEBUGGER $EXE \
         --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
         --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
         --lora-out lora-ol3b-shakespeare-ITERATION.bin \
-        --train-data "c:\training\shakespeare.txt" \
+        --train-data "$LLAMA_TRAINING_DIR/shakespeare.txt" \
         --save-every 10 \
         --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
         --use-checkpointing

From c5c54d105760cbdf3bd4c6dad61861e4ef774c68 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 28 Oct 2023 13:54:46 +0300
Subject: [PATCH 13/14] train : minor

---
 common/train.cpp | 2 +-
 common/train.h   | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index cd73e297b853a..bc15b7a03c0cd 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1045,6 +1045,7 @@ struct train_params_common get_default_train_params_common() {
     params.n_batch    =    8;
     params.n_gradient_accumulation = 1;
     params.n_epochs   = -1;
+    params.n_gpu_layers = 0;
 
     params.custom_n_ctx = false;
 
@@ -1081,7 +1082,6 @@ struct train_params_common get_default_train_params_common() {
     params.adam_gclip          = 1.0f;
     params.adam_eps_f          = 0.0f;
 
-    params.n_gpu_layers        = 0;
     return params;
 }
 
diff --git a/common/train.h b/common/train.h
index 00dee04eeacbf..0545be2884f44 100644
--- a/common/train.h
+++ b/common/train.h
@@ -44,6 +44,7 @@ struct train_params_common {
     int n_batch;
     int n_gradient_accumulation;
     int n_epochs;
+	int n_gpu_layers;
 
     bool custom_n_ctx;
 
@@ -80,8 +81,6 @@ struct train_params_common {
     float adam_beta2;
     float adam_gclip;
     float adam_eps_f;
-
-	int32_t n_gpu_layers;
 };
 
 typedef void (*save_train_files_callback)(void * data, struct train_state * train);

From 998a548a30d06cc63a432567fa0a42d35a5e2d69 Mon Sep 17 00:00:00 2001
From: Andrew Godfrey <andrew@gfreys.com>
Date: Sun, 29 Oct 2023 19:14:27 -0700
Subject: [PATCH 14/14] tabs to spaces

---
 common/train.h                 |  2 +-
 examples/finetune/finetune.cpp | 18 ++++----
 examples/finetune/finetune.sh  |  2 +-
 ggml.c                         | 80 +++++++++++++++++-----------------
 4 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/common/train.h b/common/train.h
index 0545be2884f44..d86c93cc4f147 100644
--- a/common/train.h
+++ b/common/train.h
@@ -44,7 +44,7 @@ struct train_params_common {
     int n_batch;
     int n_gradient_accumulation;
     int n_epochs;
-	int n_gpu_layers;
+    int n_gpu_layers;
 
     bool custom_n_ctx;
 
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 29354db20f3d3..60c7faa797028 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1459,16 +1459,16 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
             }
             params->n_rank_w3 = std::stoi(argv[i]);
             params->custom_n_rank_w3 = true;
-		} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-			if (++i >= argc) {
-				invalid_param = true;
-				break;
-			}
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-			params->common.n_gpu_layers = std::stoi(argv[i]);
+            params->common.n_gpu_layers = std::stoi(argv[i]);
 #else
-			fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-			fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -1556,7 +1556,7 @@ int main(int argc, char ** argv) {
     srand(params.common.seed);
 
     struct llama_model_params llama_mparams = llama_model_default_params();
-	llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
+    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
     llama_mparams.vocab_only = false;
 
     printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
index f567d4736c36a..079bfa1139d5b 100644
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -24,7 +24,7 @@ done
 
 $DEBUGGER $EXE \
         --model-base $MODEL \
-	$GPUARG \
+        $GPUARG \
         --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
         --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
         --lora-out lora-ol3b-shakespeare-ITERATION.bin \
diff --git a/ggml.c b/ggml.c
index a0501fbdd46ef..96e0fad6f84d8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1276,7 +1276,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
     }
 #else
-	UNUSED(nb);
+    UNUSED(nb);
     // scalar
     quantize_row_q8_0_reference(x, y, k);
 #endif
@@ -1530,7 +1530,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         y[i].s = sum*d;
     }
 #else
-	UNUSED(nb);
+    UNUSED(nb);
     // scalar
     quantize_row_q8_1_reference(x, y, k);
 #endif
@@ -9359,13 +9359,13 @@ static void ggml_compute_forward_add_f16_f32(
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-	if (dst->type == GGML_TYPE_F32) {
-		GGML_ASSERT( nb0 == sizeof(float));
-	}
-	else {
-		GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-		GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-	}
+    if (dst->type == GGML_TYPE_F32) {
+        GGML_ASSERT( nb0 == sizeof(float));
+    }
+    else {
+        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    }
 
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
 
@@ -9377,37 +9377,37 @@ static void ggml_compute_forward_add_f16_f32(
     const int ir1 = MIN(ir0 + dr, nr);
 
     if (nb10 == sizeof(float)) {
-		if (dst->type == GGML_TYPE_F16) {
-			for (int ir = ir0; ir < ir1; ++ir) {
-				// src0, src1 and dst are same shape => same indices
-				const int i3 = ir/(ne2*ne1);
-				const int i2 = (ir - i3*ne2*ne1)/ne1;
-				const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-				ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-				ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-				float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-				for (int i = 0; i < ne0; i++) {
-					dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-				}
-			}
-		} else {
-			for (int ir = ir0; ir < ir1; ++ir) {
-				// src0, src1 and dst are same shape => same indices
-				const int i3 = ir/(ne2*ne1);
-				const int i2 = (ir - i3*ne2*ne1)/ne1;
-				const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-				float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-				ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-				float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-				for (int i = 0; i < ne0; i++) {
-					dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
-				}
-			}
-		}
+        if (dst->type == GGML_TYPE_F16) {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+                }
+            }
+        } else {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+                }
+            }
+        }
     }
     else {
         // src1 is not contiguous