From 2cf4f62e12c36c7ba81efd8db3cb68a84e3121dd Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 18 Aug 2023 01:46:20 +0100 Subject: [PATCH 01/10] Skip computation of unused logits during batch prompt eval (drop other batch positions after writing their kv to cache) --- llama.cpp | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index f2dc4da1db344..838f47e3621d7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2117,7 +2117,8 @@ static struct ggml_cgraph * llm_build_llama( GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - const int N = n_tokens; + // Non-const to allow short-circuiting to the last token in the last layer in prompt eval mode. + int N = n_tokens; const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -2245,18 +2246,10 @@ static struct ggml_cgraph * llm_build_llama( offload_func_kq(tmpk); ggml_set_name(tmpk, "tmpk"); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - // store key and value to memory { // compute the transposed [N, n_embd] V matrix @@ -2284,6 +2277,35 @@ static struct ggml_cgraph * llm_build_llama( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } + if (il == n_layer - 1 && !lctx.logits_all) + { + // From here on, we only care about the last token and its logits. + // We do as if N = 1 (from the end), which means we only keep + // the last column of cur and inpSA ((n_embd, N) -> (n_embd, 1)). + // + // Note that we do this even when N==1 so that we don't change the # nodes in the graph, + // otherwise for Metal we'd have to rebuild the concurrency list. 
+ + cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); + offload_func_nr(cur); + ggml_set_name(cur, "cur-lastpos"); + + inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); + offload_func_nr(inpSA); + ggml_set_name(inpSA, "inpSA-lastpos"); + + n_past += N - 1; + N = 1; + } + + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); + + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); offload_func_kq(Q); ggml_set_name(Q, "Q"); @@ -2902,11 +2924,13 @@ static bool llama_eval_internal( if (lctx.logits_all) { logits_out.resize(n_vocab * N); + GGML_ASSERT(ggml_nelements(res) == n_vocab * N); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token + GGML_ASSERT(ggml_nelements(res) == n_vocab); logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab); } } From 7ec7ef94a92161d8cf65866e0666001ba9fc8ac9 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 23 Aug 2023 21:36:56 +0100 Subject: [PATCH 02/10] skip-unused: disable skipping on ROCm / when LLAMA_USE_HIPBLAS --- llama.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 838f47e3621d7..5692ced566296 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,6 +56,12 @@ #include // for _fseeki64 #endif +// TODO: Fix unused logit skipping crashes on ROCm +// (see https://github.com/ggerganov/llama.cpp/pull/2700#issuecomment-1689548127) +#ifndef LLAMA_USE_HIPBLAS +#define LLAMA_SKIP_UNUSED_LOGITS +#endif + #include #include #include @@ -2277,6 +2283,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } +#ifdef LLAMA_SKIP_UNUSED_LOGITS if (il == n_layer - 1 && !lctx.logits_all) { // From here on, we only care about the last token and its logits. @@ -2297,6 +2304,7 @@ static struct ggml_cgraph * llm_build_llama( n_past += N - 1; N = 1; } +#endif // LLAMA_SKIP_UNUSED_LOGITS struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); offload_func_kq(tmpq); @@ -2928,9 +2936,14 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token - GGML_ASSERT(ggml_nelements(res) == n_vocab); logits_out.resize(n_vocab); +#ifdef LLAMA_SKIP_UNUSED_LOGITS + GGML_ASSERT(ggml_nelements(res) == n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab); +#else + GGML_ASSERT(ggml_nelements(res) == n_vocab * N); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); +#endif } } From 5553820d90701455b41daab8ecab7282fba7f1d1 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 25 Aug 2023 14:00:24 +0100 Subject: [PATCH 03/10] Allow disabling unused logit skipping code w/ cmake / make options cmake -DLLAMA_SKIP_UNUSED_LOGITS=OFF ... LLAMA_NO_SKIP_UNUSED_LOGITS=1 make ... 
--- CMakeLists.txt | 5 +++++ Makefile | 5 +++++ llama.cpp | 6 ------ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb63ef98e3013..de389f3507118 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ option(LLAMA_METAL "llama: use Metal" option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) +option(LLAMA_SKIP_UNUSED_LOGITS "llama: skip computation of unused logits" ON) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -352,6 +353,10 @@ if (LLAMA_CLBLAST) endif() endif() +if (LLAMA_SKIP_UNUSED_LOGITS) + add_compile_definitions(LLAMA_SKIP_UNUSED_LOGITS) +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags diff --git a/Makefile b/Makefile index d31acc450b261..85e9869c793ae 100644 --- a/Makefile +++ b/Makefile @@ -302,6 +302,11 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS +ifndef LLAMA_NO_SKIP_UNUSED_LOGITS + CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS + CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS +endif + # # Print build information # diff --git a/llama.cpp b/llama.cpp index 5692ced566296..7cb5e158ff1ea 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,12 +56,6 @@ #include // for _fseeki64 #endif -// TODO: Fix unused logit skipping crashes on ROCm -// (see https://github.com/ggerganov/llama.cpp/pull/2700#issuecomment-1689548127) -#ifndef LLAMA_USE_HIPBLAS -#define LLAMA_SKIP_UNUSED_LOGITS -#endif - #include #include #include From 3be6e8d36f9c630e1f52eb931cfcb9ebad6f5f93 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 25 Aug 2023 14:02:04 +0100 Subject: [PATCH 04/10] Tweak GPU offload when skipping unused logits computations --- llama.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7cb5e158ff1ea..e72997385e743 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2191,9 +2191,10 @@ static struct ggml_cgraph * llm_build_llama( // // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func_skip = llama_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { @@ -2205,6 +2206,9 @@ static struct ggml_cgraph * llm_build_llama( if (n_gpu_layers > n_layer + 2) { offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } + if (n_gpu_layers > 0) { + offload_func_skip = ggml_cuda_assign_buffers_no_alloc; + } #endif // GGML_USE_CUBLAS struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -2288,11 +2292,11 @@ static struct ggml_cgraph * llm_build_llama( // otherwise for Metal we'd have to rebuild the concurrency list. 
cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); - offload_func_nr(cur); + offload_func_skip(cur); ggml_set_name(cur, "cur-lastpos"); inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); - offload_func_nr(inpSA); + offload_func_skip(inpSA); ggml_set_name(inpSA, "inpSA-lastpos"); n_past += N - 1; From 21df40d0c4cc4941b00f8fc6eeb24a69285d83ad Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Sun, 27 Aug 2023 11:21:26 +0200 Subject: [PATCH 05/10] fix offloading logic --- llama.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index eaa3d11d66e2a..8e4ae9fbb0aee 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2233,7 +2233,6 @@ static struct ggml_cgraph * llm_build_llama( offload_func_t offload_func_nr = llama_nop; // nr = non-repeating offload_func_t offload_func_kq = llama_nop; offload_func_t offload_func_v = llama_nop; - offload_func_t offload_func_skip = llama_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { @@ -2245,9 +2244,6 @@ static struct ggml_cgraph * llm_build_llama( if (n_gpu_layers > n_layer + 2) { offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } - if (n_gpu_layers > 0) { - offload_func_skip = ggml_cuda_assign_buffers_no_alloc; - } #endif // GGML_USE_CUBLAS struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -2331,11 +2327,11 @@ static struct ggml_cgraph * llm_build_llama( // otherwise for Metal we'd have to rebuild the concurrency list. cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); - offload_func_skip(cur); + offload_func_kq(cur); ggml_set_name(cur, "cur-lastpos"); inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); - offload_func_skip(inpSA); + offload_func(inpSA); ggml_set_name(inpSA, "inpSA-lastpos"); n_past += N - 1; From 2eaeb7e872639902673d5991dd4511d3c371f887 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 28 Aug 2023 14:27:54 +0100 Subject: [PATCH 06/10] skip-unused: fix brackets & tabs --- Makefile | 4 ++-- llama.cpp | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 178ccb914aa07..50f1b8f39fa39 100644 --- a/Makefile +++ b/Makefile @@ -327,8 +327,8 @@ k_quants.o: k_quants.c k_quants.h endif # LLAMA_NO_K_QUANTS ifndef LLAMA_NO_SKIP_UNUSED_LOGITS - CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS - CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS + CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS + CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS endif # diff --git a/llama.cpp b/llama.cpp index 8e4ae9fbb0aee..0de4aaa2fd8d7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2317,8 +2317,7 @@ static struct ggml_cgraph * llm_build_llama( } #ifdef LLAMA_SKIP_UNUSED_LOGITS - if (il == n_layer - 1 && !lctx.logits_all) - { + if (il == n_layer - 1 && !lctx.logits_all) { // From here on, we only care about the last token and its logits. // We do as if N = 1 (from the end), which means we only keep // the last column of cur and inpSA ((n_embd, N) -> (n_embd, 1)). 
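
A note on the mechanism that patches 01-06 put in place (this aside and the
sketch below are illustrative only, not part of any patch): once every prompt
position has written its K/V to the cache, only the hidden state of the last
position is needed to produce logits, so the graph narrows cur and inpSA to a
single column right before the Q projection of the final layer. The sketch
reuses the names ctx0, cur, n_embd and N from the diffs above and simply
spells out what the ggml_view_2d arguments mean.

    // Illustrative sketch (not from the patches): keep only the last
    // position of an (n_embd, N) activation tensor.  ggml_view_2d takes
    // the new shape (ne0, ne1), the stride nb1, and a byte offset.
    struct ggml_tensor * last = ggml_view_2d(
        ctx0, cur,
        n_embd, 1,                              // shape: one column of n_embd values
        cur->nb[1],                             // stride between columns is unchanged
        (N - 1)*ggml_element_size(cur)*n_embd); // byte offset of the last column
    // Every op that consumes this view afterwards (the Q projection, the
    // attention output, the feed-forward block, the n_vocab output head)
    // now runs for one token instead of N, which is where the prompt-eval
    // saving comes from.
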
From f6a446ec01419c89f72649ee3f6d596aaf9d1c2d Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 28 Aug 2023 14:30:29 +0100 Subject: [PATCH 07/10] skip-unused: revert extra spaces --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0de4aaa2fd8d7..2c39c41bc96fb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2230,9 +2230,9 @@ static struct ggml_cgraph * llm_build_llama( // // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { From 9f5b7813c6f03673b0faf92a2a63c55a3d1a5518 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 28 Aug 2023 15:44:55 +0100 Subject: [PATCH 08/10] skip-unused: fix -ngl=1 case by ensure input & of view are offloaded consistently --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 2c39c41bc96fb..664aed88e6d6a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2325,10 +2325,12 @@ static struct ggml_cgraph * llm_build_llama( // Note that we do this even when N==1 so that we don't change the # nodes in the graph, // otherwise for Metal we'd have to rebuild the concurrency list. + offload_func(cur); cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); - offload_func_kq(cur); + offload_func(cur); ggml_set_name(cur, "cur-lastpos"); + offload_func(inpSA); inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); offload_func(inpSA); ggml_set_name(inpSA, "inpSA-lastpos"); From e9e8ac4c826c8c2242b2e9e90e65755d8c1760b6 Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Mon, 28 Aug 2023 17:44:10 +0200 Subject: [PATCH 09/10] Fix multiple offloading --- ggml-cuda.cu | 4 ++++ llama.cpp | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 83d53c13c1a54..9bca8551d4877 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6463,6 +6463,10 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo return; } + if (tensor->backend != GGML_BACKEND_CPU) { + return; + } + // recursively assign CUDA buffers until a compute tensor is found if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; diff --git a/llama.cpp b/llama.cpp index 664aed88e6d6a..3f7df330fd065 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2325,7 +2325,6 @@ static struct ggml_cgraph * llm_build_llama( // Note that we do this even when N==1 so that we don't change the # nodes in the graph, // otherwise for Metal we'd have to rebuild the concurrency list. 
- offload_func(cur); cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); offload_func(cur); ggml_set_name(cur, "cur-lastpos"); From 58bb7d563149448c70c9cb8e4a7801660675f602 Mon Sep 17 00:00:00 2001 From: ochafik Date: Thu, 21 Sep 2023 00:52:24 +0100 Subject: [PATCH 10/10] Makefile: move unused logits flags where they don't interfere w/ targets (and also fix bad merge) --- Makefile | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index b7acf8f839922..32c319bad73e1 100644 --- a/Makefile +++ b/Makefile @@ -172,6 +172,10 @@ ifdef LLAMA_DISABLE_LOGS MK_CPPFLAGS += -DLOG_DISABLE_LOGS endif # LLAMA_DISABLE_LOGS +ifndef LLAMA_NO_SKIP_UNUSED_LOGITS + MK_CPPFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS +endif # LLAMA_NO_SKIP_UNUSED_LOGITS + # warnings MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function @@ -447,15 +451,6 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS -ifndef LLAMA_NO_SKIP_UNUSED_LOGITS - CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS - CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS -endif # LLAMA_NO_SKIP_UNUSED_LOGITS - -ifdef LLAMA_DISABLE_LOGS - CFLAGS += -DLOG_DISABLE_LOGS - CXXFLAGS += -DLOG_DISABLE_LOGS -endif # LLAMA_DISABLE_LOGS # combine build flags with cmdline overrides override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
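
A closing note on why dropping the other logit rows is safe for normal
generation (this aside is not part of the patches): outside of
perplexity-style runs, which set logits_all and keep the full n_vocab*N
output, a caller only ever reads the logits of the last evaluated token.
The loop below is a rough sketch against the public llama.h API of that
period; llama_eval, llama_get_logits and llama_n_vocab are assumed from the
library rather than taken from these diffs, and ctx, prompt_tokens,
n_prompt_tokens and n_threads are placeholders.

    // Rough sketch of a prompt-then-sample step: only n_vocab floats are
    // consumed after the prompt batch, exactly what the skipped graph
    // still produces.
    llama_eval(ctx, prompt_tokens, n_prompt_tokens, /*n_past=*/0, n_threads);

    const float * logits  = llama_get_logits(ctx); // logits of the last position only
    const int     n_vocab = llama_n_vocab(ctx);

    int best = 0;                                  // greedy pick, for illustration
    for (int i = 1; i < n_vocab; ++i) {
        if (logits[i] > logits[best]) {
            best = i;
        }
    }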