From facb1a3e0fe539a079664d0e12ddfdca49c5c58e Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Sat, 21 Oct 2023 22:41:24 -0700 Subject: [PATCH 01/14] Add '-ngl' support to finetune.cpp --- common/train.cpp | 2 ++ common/train.h | 2 ++ examples/finetune/finetune.cpp | 12 ++++++++++++ 3 files changed, 16 insertions(+) diff --git a/common/train.cpp b/common/train.cpp index 3cce5da269637..cd73e297b853a 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1080,6 +1080,8 @@ struct train_params_common get_default_train_params_common() { params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; params.adam_eps_f = 0.0f; + + params.n_gpu_layers = 0; return params; } diff --git a/common/train.h b/common/train.h index 42fa704b897ae..00dee04eeacbf 100644 --- a/common/train.h +++ b/common/train.h @@ -80,6 +80,8 @@ struct train_params_common { float adam_beta2; float adam_gclip; float adam_eps_f; + + int32_t n_gpu_layers; }; typedef void (*save_train_files_callback)(void * data, struct train_state * train); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 35824cd2d786a..d158c2327a253 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1459,6 +1459,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_w3 = std::stoi(argv[i]); params->custom_n_rank_w3 = true; + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params->common.n_gpu_layers = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1545,6 +1556,7 @@ int main(int argc, char 
** argv) { srand(params.common.seed); struct llama_model_params llama_mparams = llama_model_default_params(); + llama_mparams.n_gpu_layers = params.common.n_gpu_layers; llama_mparams.vocab_only = false; printf("%s: model base = '%s'\n", __func__, params.fn_model_base); From 4d452dbc1068e88979197077903a105d6aa60b74 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Sat, 21 Oct 2023 22:40:40 -0700 Subject: [PATCH 02/14] Add fprintf in ggml_cuda_op_add When I tried CUDA offloading during finetuning following the readme, I got an assert here. This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index db053e3b8a9d8..63685100d337e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5910,6 +5910,7 @@ inline void ggml_cuda_op_add( } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); } else { + fprintf(stderr, "%d, %d\n", src0->type, dst->type); GGML_ASSERT(false); } From e1ebce03d61801c4b0f0137c8a072041ce556463 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Sat, 21 Oct 2023 22:42:37 -0700 Subject: [PATCH 03/14] Add 'finetune.sh', which currently fails when using GPU "error: operator (): Finetuning on tensors with type 'f16' is not yet supported" --- examples/finetune/finetune.sh | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 examples/finetune/finetune.sh diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh new file mode 100644 index 0000000000000..b79b107531c1a --- /dev/null +++ b/examples/finetune/finetune.sh @@ -0,0 +1,32 @@ +#!/bin/bash +cd `dirname $0` +cd ../.. + +EXE="./finetune" + +MODEL="openllama-3b-v2.gguf" + +while getopts "dg" opt; do + case $opt in + d) + DEBUGGER="gdb --args" + ;; + g) + # GPU. 
The makefile doesn't support CUDA on Windows, so I have to use CMake and so main is built to a different location. + # Note: "-d" doesn't really work with this - it will run under gdb, but there are no debugging symbols (in a format gdb understands). I think the easiest workaround is to use WinDbg instead. + EXE="./build/bin/Release/finetune" + GPUARG="--gpu-layers 25" + ;; + esac +done + +$DEBUGGER $EXE \ + --model-base c:/models/$MODEL \ + $GPUARG \ + --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ + --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ + --lora-out lora-ol3b-shakespeare-ITERATION.bin \ + --train-data "c:\training\shakespeare.txt" \ + --save-every 10 \ + --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ + --use-checkpointing From 1758d0abef9bf03a237df48f98cd675d9b370788 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Mon, 23 Oct 2023 09:59:43 -0700 Subject: [PATCH 04/14] tweak finetune.sh --- examples/finetune/finetune.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh index b79b107531c1a..9a9d02848255a 100644 --- a/examples/finetune/finetune.sh +++ b/examples/finetune/finetune.sh @@ -4,7 +4,7 @@ cd ../.. 
EXE="./finetune" -MODEL="openllama-3b-v2.gguf" +MODEL="c:/models/openllama-3b-v2.gguf" while getopts "dg" opt; do case $opt in @@ -21,7 +21,7 @@ while getopts "dg" opt; do done $DEBUGGER $EXE \ - --model-base c:/models/$MODEL \ + --model-base $MODEL \ $GPUARG \ --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ From 9ea91ceaf2b3c5cb4f6d790c911eb2eaaca5de6e Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Mon, 23 Oct 2023 13:01:31 -0700 Subject: [PATCH 05/14] Suppress some warnings in ggml.c --- ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.c b/ggml.c index 17f0ce4877592..e477f3640cad5 100644 --- a/ggml.c +++ b/ggml.c @@ -1276,6 +1276,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int __riscv_vse8_v_i8m1(y[i].qs , vs, vl); } #else + UNUSED(nb); // scalar quantize_row_q8_0_reference(x, y, k); #endif @@ -1529,6 +1530,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].s = sum*d; } #else + UNUSED(nb); // scalar quantize_row_q8_1_reference(x, y, k); #endif From 19097c97a84083637dd84ee0861f661e23efec9f Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Mon, 23 Oct 2023 13:21:17 -0700 Subject: [PATCH 06/14] Add f16 implementation to ggml_compute_forward_add_f16_f32 --- ggml.c | 55 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/ggml.c b/ggml.c index e477f3640cad5..1bc77737cbb1e 100644 --- a/ggml.c +++ b/ggml.c @@ -9358,9 +9358,15 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + if (dst->type == GGML_TYPE_F32) { + GGML_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + } + GGML_ASSERT(nb00 == 
sizeof(ggml_fp16_t)); // rows per thread @@ -9371,20 +9377,37 @@ static void ggml_compute_forward_add_f16_f32( const int ir1 = MIN(ir0 + dr, nr); if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); - } - } + if (dst->type == GGML_TYPE_F16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + 
src1_ptr[i]; + } + } + } } else { // src1 is not contiguous From 7cbf5b282c7fcc3f2500355936bd81f45815b6d8 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Mon, 23 Oct 2023 18:31:06 -0700 Subject: [PATCH 07/14] Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs --- examples/finetune/finetune.cpp | 2 +- ggml.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index d158c2327a253..29354db20f3d3 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -652,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type)) { + if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) { return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); } else if (a->type == GGML_TYPE_F32) { return ggml_add(ctx, a, b); diff --git a/ggml.c b/ggml.c index 1bc77737cbb1e..a0501fbdd46ef 100644 --- a/ggml.c +++ b/ggml.c @@ -5636,7 +5636,7 @@ static struct ggml_tensor * ggml_add_cast_impl( // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); GGML_ASSERT(ggml_can_repeat_rows(b, a)); - GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input + GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16 bool is_node = false; From 9587ab4c731e3ea4b4b11a12a1f40afb391f9bf9 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Mon, 23 Oct 2023 19:15:40 -0700 Subject: [PATCH 08/14] finetune.sh: Edit comments --- examples/finetune/finetune.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh index 9a9d02848255a..32ba303606d60 100644 --- a/examples/finetune/finetune.sh +++ 
b/examples/finetune/finetune.sh @@ -4,7 +4,8 @@ cd ../.. EXE="./finetune" -MODEL="c:/models/openllama-3b-v2.gguf" +# MODEL="c:/models/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. +MODEL="c:/models/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. while getopts "dg" opt; do case $opt in @@ -12,8 +13,6 @@ while getopts "dg" opt; do DEBUGGER="gdb --args" ;; g) - # GPU. The makefile doesn't support CUDA on Windows, so I have to use CMake and so main is built to a different location. - # Note: "-d" doesn't really work with this - it will run under gdb, but there are no debugging symbols (in a format gdb understands). I think the easiest workaround is to use WinDbg instead. EXE="./build/bin/Release/finetune" GPUARG="--gpu-layers 25" ;; From 86ceda4275a99f24fe7c6816e9a18d1e54bfdc77 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Mon, 23 Oct 2023 19:26:20 -0700 Subject: [PATCH 09/14] Add "add_f16_f32_f32_cuda" --- ggml-cuda.cu | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 63685100d337e..960076f4d47df 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -496,6 +496,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d dst[i] = __hadd(x[i], __float2half(y[i])); } +static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = __half2float(x[i]) + y[i]; +} + static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -4616,6 +4625,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k); } +static void add_f16_f32_f32_cuda(const half 
* x, const float * y, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k); +} + static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE; mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky); @@ -5909,8 +5923,10 @@ inline void ggml_cuda_op_add( add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream); } else { - fprintf(stderr, "%d, %d\n", src0->type, dst->type); + fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type); GGML_ASSERT(false); } From 81dabd8edd4c74abb668d6f8211e4f42cd7c62c0 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Mon, 23 Oct 2023 19:28:05 -0700 Subject: [PATCH 10/14] Tweak an error message --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 61f30c3982f18..9d26b2dd2084e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8605,7 +8605,7 @@ static int llama_apply_lora_from_file_internal( if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); + "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. 
dest_t->type: %d", __func__, dest_t->type)); } offload_func = ggml_cuda_assign_buffers; offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; From 6359c15174df8741f042a063b71905e45726323a Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Tue, 24 Oct 2023 19:14:17 -0700 Subject: [PATCH 11/14] finetune.sh: Add an optional LLAMA_MODEL_DIR variable --- examples/finetune/finetune.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh index 32ba303606d60..9c56bb8f5d5c5 100644 --- a/examples/finetune/finetune.sh +++ b/examples/finetune/finetune.sh @@ -4,8 +4,10 @@ cd ../.. EXE="./finetune" -# MODEL="c:/models/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. -MODEL="c:/models/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. +if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi + +# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. +MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. while getopts "dg" opt; do case $opt in From fe44ded01adc3d15b267d868b956d0386086af7f Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Tue, 24 Oct 2023 19:31:52 -0700 Subject: [PATCH 12/14] finetune.sh: Add an optional LLAMA_TRAINING_DIR variable --- examples/finetune/finetune.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh index 9c56bb8f5d5c5..f567d4736c36a 100644 --- a/examples/finetune/finetune.sh +++ b/examples/finetune/finetune.sh @@ -5,6 +5,7 @@ cd ../.. EXE="./finetune" if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi +if [[ ! 
$LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi # MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. @@ -27,7 +28,7 @@ $DEBUGGER $EXE \ --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ --lora-out lora-ol3b-shakespeare-ITERATION.bin \ - --train-data "c:\training\shakespeare.txt" \ + --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ --save-every 10 \ --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ --use-checkpointing From c5c54d105760cbdf3bd4c6dad61861e4ef774c68 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 13:54:46 +0300 Subject: [PATCH 13/14] train : minor --- common/train.cpp | 2 +- common/train.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index cd73e297b853a..bc15b7a03c0cd 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1045,6 +1045,7 @@ struct train_params_common get_default_train_params_common() { params.n_batch = 8; params.n_gradient_accumulation = 1; params.n_epochs = -1; + params.n_gpu_layers = 0; params.custom_n_ctx = false; @@ -1081,7 +1082,6 @@ struct train_params_common get_default_train_params_common() { params.adam_gclip = 1.0f; params.adam_eps_f = 0.0f; - params.n_gpu_layers = 0; return params; } diff --git a/common/train.h b/common/train.h index 00dee04eeacbf..0545be2884f44 100644 --- a/common/train.h +++ b/common/train.h @@ -44,6 +44,7 @@ struct train_params_common { int n_batch; int n_gradient_accumulation; int n_epochs; + int n_gpu_layers; bool custom_n_ctx; @@ -80,8 +81,6 @@ struct train_params_common { float adam_beta2; float adam_gclip; float adam_eps_f; - - int32_t n_gpu_layers; }; typedef void (*save_train_files_callback)(void * data, struct 
train_state * train); From 998a548a30d06cc63a432567fa0a42d35a5e2d69 Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Sun, 29 Oct 2023 19:14:27 -0700 Subject: [PATCH 14/14] tabs to spaces --- common/train.h | 2 +- examples/finetune/finetune.cpp | 18 ++++---- examples/finetune/finetune.sh | 2 +- ggml.c | 80 +++++++++++++++++----------------- 4 files changed, 51 insertions(+), 51 deletions(-) diff --git a/common/train.h b/common/train.h index 0545be2884f44..d86c93cc4f147 100644 --- a/common/train.h +++ b/common/train.h @@ -44,7 +44,7 @@ struct train_params_common { int n_batch; int n_gradient_accumulation; int n_epochs; - int n_gpu_layers; + int n_gpu_layers; bool custom_n_ctx; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 29354db20f3d3..60c7faa797028 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1459,16 +1459,16 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_w3 = std::stoi(argv[i]); params->custom_n_rank_w3 = true; - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params->common.n_gpu_layers = std::stoi(argv[i]); + params->common.n_gpu_layers = std::stoi(argv[i]); #else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -1556,7 +1556,7 @@ 
int main(int argc, char ** argv) { srand(params.common.seed); struct llama_model_params llama_mparams = llama_model_default_params(); - llama_mparams.n_gpu_layers = params.common.n_gpu_layers; + llama_mparams.n_gpu_layers = params.common.n_gpu_layers; llama_mparams.vocab_only = false; printf("%s: model base = '%s'\n", __func__, params.fn_model_base); diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh index f567d4736c36a..079bfa1139d5b 100644 --- a/examples/finetune/finetune.sh +++ b/examples/finetune/finetune.sh @@ -24,7 +24,7 @@ done $DEBUGGER $EXE \ --model-base $MODEL \ - $GPUARG \ + $GPUARG \ --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ --lora-out lora-ol3b-shakespeare-ITERATION.bin \ diff --git a/ggml.c b/ggml.c index a0501fbdd46ef..96e0fad6f84d8 100644 --- a/ggml.c +++ b/ggml.c @@ -1276,7 +1276,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int __riscv_vse8_v_i8m1(y[i].qs , vs, vl); } #else - UNUSED(nb); + UNUSED(nb); // scalar quantize_row_q8_0_reference(x, y, k); #endif @@ -1530,7 +1530,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].s = sum*d; } #else - UNUSED(nb); + UNUSED(nb); // scalar quantize_row_q8_1_reference(x, y, k); #endif @@ -9359,13 +9359,13 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - if (dst->type == GGML_TYPE_F32) { - GGML_ASSERT( nb0 == sizeof(float)); - } - else { - GGML_ASSERT(dst->type == GGML_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - } + if (dst->type == GGML_TYPE_F32) { + GGML_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + } GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -9377,37 +9377,37 @@ static void ggml_compute_forward_add_f16_f32( const int ir1 = MIN(ir0 + dr, nr); if (nb10 == 
sizeof(float)) { - if (dst->type == GGML_TYPE_F16) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); - } - } - } else { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; - } - } - } + if (dst->type == GGML_TYPE_F16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices 
+ const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } + } + } } else { // src1 is not contiguous