From 2cf4f62e12c36c7ba81efd8db3cb68a84e3121dd Mon Sep 17 00:00:00 2001 From: ochafik Date: Fri, 18 Aug 2023 01:46:20 +0100 Subject: [PATCH 01/10] Skip computation of unused logits during batch prompt eval (drop other batch positions after writing their kv to cache) --- llama.cpp | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index f2dc4da1db344..838f47e3621d7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2117,7 +2117,8 @@ static struct ggml_cgraph * llm_build_llama( GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - const int N = n_tokens; + // Non-const to allow short-circuiting to the last token in the last layer in prompt eval mode. + int N = n_tokens; const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -2245,18 +2246,10 @@ static struct ggml_cgraph * llm_build_llama( offload_func_kq(tmpk); ggml_set_name(tmpk, "tmpk"); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - // store key and value to memory { // compute the transposed [N, n_embd] V matrix @@ -2284,6 +2277,35 @@ static struct ggml_cgraph * llm_build_llama( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } + if (il == n_layer - 1 && !lctx.logits_all) + { + // From here on, we only care about the last token and its logits. + // We do as if N = 1 (from the end), which means we only keep + // the last column of cur and inpSA ((n_embd, N) -> (n_embd, 1)). + // + // Note that we do this even when N==1 so that we don't change the # nodes in the graph, + // otherwise for Metal we'd have to rebuild the concurrency list. 
+ + cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); + offload_func_nr(cur); + ggml_set_name(cur, "cur-lastpos"); + + inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); + offload_func_nr(inpSA); + ggml_set_name(inpSA, "inpSA-lastpos"); + + n_past += N - 1; + N = 1; + } + + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); + + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); offload_func_kq(Q); ggml_set_name(Q, "Q"); @@ -2902,11 +2924,13 @@ static bool llama_eval_internal( if (lctx.logits_all) { logits_out.resize(n_vocab * N); + GGML_ASSERT(ggml_nelements(res) == n_vocab * N); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token + GGML_ASSERT(ggml_nelements(res) == n_vocab); logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab); } } From 7ec7ef94a92161d8cf65866e0666001ba9fc8ac9 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 23 Aug 2023 21:36:56 +0100 Subject: [PATCH 02/10] skip-unused: disable skipping on ROCm / when LLAMA_USE_HIPBLAS --- llama.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 838f47e3621d7..5692ced566296 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,6 +56,12 @@ #include // for _fseeki64 #endif +// TODO: Fix unused logit skipping crashes on ROCm +// (see https://github.com/ggerganov/llama.cpp/pull/2700#issuecomment-1689548127) +#ifndef LLAMA_USE_HIPBLAS +#define LLAMA_SKIP_UNUSED_LOGITS +#endif + #include #include #include @@ -2277,6 +2283,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } +#ifdef LLAMA_SKIP_UNUSED_LOGITS if (il == n_layer - 1 && !lctx.logits_all) { // From here on, we only care about the last token and its logits. @@ -2297,6 +2304,7 @@ static struct ggml_cgraph * llm_build_llama( n_past += N - 1; N = 1; } +#endif // LLAMA_SKIP_UNUSED_LOGITS struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); offload_func_kq(tmpq); @@ -2928,9 +2936,14 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token - GGML_ASSERT(ggml_nelements(res) == n_vocab); logits_out.resize(n_vocab); +#ifdef LLAMA_SKIP_UNUSED_LOGITS + GGML_ASSERT(ggml_nelements(res) == n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab); +#else + GGML_ASSERT(ggml_nelements(res) == n_vocab * N); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); +#endif } } From 5553820d90701455b41daab8ecab7282fba7f1d1 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 25 Aug 2023 14:00:24 +0100 Subject: [PATCH 03/10] Allow disabling unused logit skipping code w/ cmake / make options cmake -DLLAMA_SKIP_UNUSED_LOGITS=OFF ... LLAMA_NO_SKIP_UNUSED_LOGITS=1 make ... 
--- CMakeLists.txt | 5 +++++ Makefile | 5 +++++ llama.cpp | 6 ------ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb63ef98e3013..de389f3507118 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ option(LLAMA_METAL "llama: use Metal" option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) +option(LLAMA_SKIP_UNUSED_LOGITS "llama: skip computation of unused logits" ON) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -352,6 +353,10 @@ if (LLAMA_CLBLAST) endif() endif() +if (LLAMA_SKIP_UNUSED_LOGITS) + add_compile_definitions(LLAMA_SKIP_UNUSED_LOGITS) +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags diff --git a/Makefile b/Makefile index d31acc450b261..85e9869c793ae 100644 --- a/Makefile +++ b/Makefile @@ -302,6 +302,11 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS +ifndef LLAMA_NO_SKIP_UNUSED_LOGITS + CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS + CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS +endif + # # Print build information # diff --git a/llama.cpp b/llama.cpp index 5692ced566296..7cb5e158ff1ea 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,12 +56,6 @@ #include // for _fseeki64 #endif -// TODO: Fix unused logit skipping crashes on ROCm -// (see https://github.com/ggerganov/llama.cpp/pull/2700#issuecomment-1689548127) -#ifndef LLAMA_USE_HIPBLAS -#define LLAMA_SKIP_UNUSED_LOGITS -#endif - #include #include #include From 3be6e8d36f9c630e1f52eb931cfcb9ebad6f5f93 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 25 Aug 2023 14:02:04 +0100 Subject: [PATCH 04/10] Tweak GPU offload when skipping unused logits computations --- llama.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7cb5e158ff1ea..e72997385e743 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2191,9 +2191,10 @@ static struct ggml_cgraph * llm_build_llama( // // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func_skip = llama_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { @@ -2205,6 +2206,9 @@ static struct ggml_cgraph * llm_build_llama( if (n_gpu_layers > n_layer + 2) { offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } + if (n_gpu_layers > 0) { + offload_func_skip = ggml_cuda_assign_buffers_no_alloc; + } #endif // GGML_USE_CUBLAS struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -2288,11 +2292,11 @@ static struct ggml_cgraph * llm_build_llama( // otherwise for Metal we'd have to rebuild the concurrency list. 
cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); - offload_func_nr(cur); + offload_func_skip(cur); ggml_set_name(cur, "cur-lastpos"); inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); - offload_func_nr(inpSA); + offload_func_skip(inpSA); ggml_set_name(inpSA, "inpSA-lastpos"); n_past += N - 1; From 21df40d0c4cc4941b00f8fc6eeb24a69285d83ad Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Sun, 27 Aug 2023 11:21:26 +0200 Subject: [PATCH 05/10] fix offloading logic --- llama.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index eaa3d11d66e2a..8e4ae9fbb0aee 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2233,7 +2233,6 @@ static struct ggml_cgraph * llm_build_llama( offload_func_t offload_func_nr = llama_nop; // nr = non-repeating offload_func_t offload_func_kq = llama_nop; offload_func_t offload_func_v = llama_nop; - offload_func_t offload_func_skip = llama_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { @@ -2245,9 +2244,6 @@ static struct ggml_cgraph * llm_build_llama( if (n_gpu_layers > n_layer + 2) { offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } - if (n_gpu_layers > 0) { - offload_func_skip = ggml_cuda_assign_buffers_no_alloc; - } #endif // GGML_USE_CUBLAS struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -2331,11 +2327,11 @@ static struct ggml_cgraph * llm_build_llama( // otherwise for Metal we'd have to rebuild the concurrency list. cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); - offload_func_skip(cur); + offload_func_kq(cur); ggml_set_name(cur, "cur-lastpos"); inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); - offload_func_skip(inpSA); + offload_func(inpSA); ggml_set_name(inpSA, "inpSA-lastpos"); n_past += N - 1; From 2eaeb7e872639902673d5991dd4511d3c371f887 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 28 Aug 2023 14:27:54 +0100 Subject: [PATCH 06/10] skip-unused: fix brackets & tabs --- Makefile | 4 ++-- llama.cpp | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 178ccb914aa07..50f1b8f39fa39 100644 --- a/Makefile +++ b/Makefile @@ -327,8 +327,8 @@ k_quants.o: k_quants.c k_quants.h endif # LLAMA_NO_K_QUANTS ifndef LLAMA_NO_SKIP_UNUSED_LOGITS - CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS - CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS + CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS + CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS endif # diff --git a/llama.cpp b/llama.cpp index 8e4ae9fbb0aee..0de4aaa2fd8d7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2317,8 +2317,7 @@ static struct ggml_cgraph * llm_build_llama( } #ifdef LLAMA_SKIP_UNUSED_LOGITS - if (il == n_layer - 1 && !lctx.logits_all) - { + if (il == n_layer - 1 && !lctx.logits_all) { // From here on, we only care about the last token and its logits. // We do as if N = 1 (from the end), which means we only keep // the last column of cur and inpSA ((n_embd, N) -> (n_embd, 1)). 
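
A note on the mechanism that patches 01-06 put in place (this aside and the
sketch below are illustrative only, not part of any patch): once every prompt
position has written its K/V to the cache, only the hidden state of the last
position is needed to produce logits, so the graph narrows cur and inpSA to a
single column right before the Q projection of the final layer. The sketch
reuses the names ctx0, cur, n_embd and N from the diffs above and simply
spells out what the ggml_view_2d arguments mean.

    // Illustrative sketch (not from the patches): keep only the last
    // position of an (n_embd, N) activation tensor.  ggml_view_2d takes
    // the new shape (ne0, ne1), the stride nb1, and a byte offset.
    struct ggml_tensor * last = ggml_view_2d(
        ctx0, cur,
        n_embd, 1,                              // shape: one column of n_embd values
        cur->nb[1],                             // stride between columns is unchanged
        (N - 1)*ggml_element_size(cur)*n_embd); // byte offset of the last column
    // Every op that consumes this view afterwards (the Q projection, the
    // attention output, the feed-forward block, the n_vocab output head)
    // now runs for one token instead of N, which is where the prompt-eval
    // saving comes from.
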
From f6a446ec01419c89f72649ee3f6d596aaf9d1c2d Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 28 Aug 2023 14:30:29 +0100 Subject: [PATCH 07/10] skip-unused: revert extra spaces --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0de4aaa2fd8d7..2c39c41bc96fb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2230,9 +2230,9 @@ static struct ggml_cgraph * llm_build_llama( // // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { From 9f5b7813c6f03673b0faf92a2a63c55a3d1a5518 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 28 Aug 2023 15:44:55 +0100 Subject: [PATCH 08/10] skip-unused: fix -ngl=1 case by ensure input & of view are offloaded consistently --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 2c39c41bc96fb..664aed88e6d6a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2325,10 +2325,12 @@ static struct ggml_cgraph * llm_build_llama( // Note that we do this even when N==1 so that we don't change the # nodes in the graph, // otherwise for Metal we'd have to rebuild the concurrency list. + offload_func(cur); cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); - offload_func_kq(cur); + offload_func(cur); ggml_set_name(cur, "cur-lastpos"); + offload_func(inpSA); inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd); offload_func(inpSA); ggml_set_name(inpSA, "inpSA-lastpos"); From e9e8ac4c826c8c2242b2e9e90e65755d8c1760b6 Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Mon, 28 Aug 2023 17:44:10 +0200 Subject: [PATCH 09/10] Fix multiple offloading --- ggml-cuda.cu | 4 ++++ llama.cpp | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 83d53c13c1a54..9bca8551d4877 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6463,6 +6463,10 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo return; } + if (tensor->backend != GGML_BACKEND_CPU) { + return; + } + // recursively assign CUDA buffers until a compute tensor is found if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; diff --git a/llama.cpp b/llama.cpp index 664aed88e6d6a..3f7df330fd065 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2325,7 +2325,6 @@ static struct ggml_cgraph * llm_build_llama( // Note that we do this even when N==1 so that we don't change the # nodes in the graph, // otherwise for Metal we'd have to rebuild the concurrency list. 
- offload_func(cur); cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd); offload_func(cur); ggml_set_name(cur, "cur-lastpos"); From 58bb7d563149448c70c9cb8e4a7801660675f602 Mon Sep 17 00:00:00 2001 From: ochafik Date: Thu, 21 Sep 2023 00:52:24 +0100 Subject: [PATCH 10/10] Makefile: move unused logits flags where they don't interfere w/ targets (and also fix bad merge) --- Makefile | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index b7acf8f839922..32c319bad73e1 100644 --- a/Makefile +++ b/Makefile @@ -172,6 +172,10 @@ ifdef LLAMA_DISABLE_LOGS MK_CPPFLAGS += -DLOG_DISABLE_LOGS endif # LLAMA_DISABLE_LOGS +ifndef LLAMA_NO_SKIP_UNUSED_LOGITS + MK_CPPFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS +endif # LLAMA_NO_SKIP_UNUSED_LOGITS + # warnings MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function @@ -447,15 +451,6 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS -ifndef LLAMA_NO_SKIP_UNUSED_LOGITS - CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS - CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS -endif # LLAMA_NO_SKIP_UNUSED_LOGITS - -ifdef LLAMA_DISABLE_LOGS - CFLAGS += -DLOG_DISABLE_LOGS - CXXFLAGS += -DLOG_DISABLE_LOGS -endif # LLAMA_DISABLE_LOGS # combine build flags with cmdline overrides override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
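
A closing note on why dropping the other logit rows is safe for normal
generation (this aside is not part of the patches): outside of
perplexity-style runs, which set logits_all and keep the full n_vocab*N
output, a caller only ever reads the logits of the last evaluated token.
The loop below is a rough sketch against the public llama.h API of that
period; llama_eval, llama_get_logits and llama_n_vocab are assumed from the
library rather than taken from these diffs, and ctx, prompt_tokens,
n_prompt_tokens and n_threads are placeholders.

    // Rough sketch of a prompt-then-sample step: only n_vocab floats are
    // consumed after the prompt batch, exactly what the skipped graph
    // still produces.
    llama_eval(ctx, prompt_tokens, n_prompt_tokens, /*n_past=*/0, n_threads);

    const float * logits  = llama_get_logits(ctx); // logits of the last position only
    const int     n_vocab = llama_n_vocab(ctx);

    int best = 0;                                  // greedy pick, for illustration
    for (int i = 1; i < n_vocab; ++i) {
        if (logits[i] > logits[best]) {
            best = i;
        }
    }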