From 59864fd5b3c0e8226116972fed59c3b0158f214e Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 13:22:29 -0400
Subject: [PATCH 1/4] remove deprecated kv view & rename kv cache -> kv self
 APIs

* bump llama.cpp to b5474 (259469c), latest release as of 2025-05-24
  - https://github.com/ggml-org/llama.cpp/commits/259469c4b57c1a32606353bcac52ba683424a990
---
 llama-cpp-2/src/context/kv_cache.rs | 141 +++-------------------------
 llama-cpp-sys-2/llama.cpp           |   2 +-
 2 files changed, 12 insertions(+), 131 deletions(-)

diff --git a/llama-cpp-2/src/context/kv_cache.rs b/llama-cpp-2/src/context/kv_cache.rs
index d90a6b8a..14f5b5a6 100644
--- a/llama-cpp-2/src/context/kv_cache.rs
+++ b/llama-cpp-2/src/context/kv_cache.rs
@@ -28,7 +28,7 @@ impl LlamaContext<'_> {
     /// * `dest` - The sequence id to copy the cache to.
     /// * `size` - The size of the cache to copy.
     pub fn copy_cache(&mut self, src: i32, dest: i32, size: i32) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_cp(self.context.as_ptr(), src, dest, 0, size) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_cp(self.context.as_ptr(), src, dest, 0, size) }
     }
 
     /// Copy the cache from one sequence to another.
@@ -58,7 +58,7 @@ impl LlamaContext<'_> {
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
         unsafe {
-            llama_cpp_sys_2::llama_kv_cache_seq_cp(self.context.as_ptr(), src, dest, p0, p1);
+            llama_cpp_sys_2::llama_kv_self_seq_cp(self.context.as_ptr(), src, dest, p0, p1);
         }
         Ok(())
     }
@@ -92,18 +92,18 @@ impl LlamaContext<'_> {
         let p1 = p1
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
-        Ok(unsafe { llama_cpp_sys_2::llama_kv_cache_seq_rm(self.context.as_ptr(), src, p0, p1) })
+        Ok(unsafe { llama_cpp_sys_2::llama_kv_self_seq_rm(self.context.as_ptr(), src, p0, p1) })
     }
 
     /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     #[must_use]
     pub fn get_kv_cache_used_cells(&self) -> i32 {
-        unsafe { llama_cpp_sys_2::llama_get_kv_cache_used_cells(self.context.as_ptr()) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_used_cells(self.context.as_ptr()) }
     }
 
     /// Clear the KV cache
     pub fn clear_kv_cache(&mut self) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_clear(self.context.as_ptr()) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_clear(self.context.as_ptr()) }
     }
 
     /// Removes all tokens that do not belong to the specified sequence
@@ -112,7 +112,7 @@ impl LlamaContext<'_> {
     ///
     /// * `seq_id` - The sequence id to keep
     pub fn llama_kv_cache_seq_keep(&mut self, seq_id: i32) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_keep(self.context.as_ptr(), seq_id) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_keep(self.context.as_ptr(), seq_id) }
     }
 
     #[allow(clippy::doc_markdown)]
@@ -147,7 +147,7 @@ impl LlamaContext<'_> {
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
         unsafe {
-            llama_cpp_sys_2::llama_kv_cache_seq_add(self.context.as_ptr(), seq_id, p0, p1, delta);
+            llama_cpp_sys_2::llama_kv_self_seq_add(self.context.as_ptr(), seq_id, p0, p1, delta);
        }
         Ok(())
     }
@@ -183,7 +183,7 @@ impl LlamaContext<'_> {
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
         let d = c_int::from(d.get());
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_div(self.context.as_ptr(), seq_id, p0, p1, d) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_div(self.context.as_ptr(), seq_id, p0, p1, d) }
         Ok(())
     }
 
@@ -194,7 +194,7 @@ impl LlamaContext<'_> {
     /// * `seq_id` - The sequence id to get the max position for
     #[must_use]
     pub fn kv_cache_seq_pos_max(&self, seq_id: i32) -> i32 {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_pos_max(self.context.as_ptr(), seq_id) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_pos_max(self.context.as_ptr(), seq_id) }
     }
 
     /// Defragment the KV cache
@@ -202,130 +202,11 @@ impl LlamaContext<'_> {
     /// - lazily on next [`LlamaContext::decode`]
     /// - explicitly with [`Self::kv_cache_update`]
     pub fn kv_cache_defrag(&mut self) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_defrag(self.context.as_ptr()) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_defrag(self.context.as_ptr()) }
     }
 
     /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
     pub fn kv_cache_update(&mut self) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_update(self.context.as_ptr()) }
-    }
-
-    /// Returns the number of tokens in the KV cache (slow, use only for debug)
-    /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    #[must_use]
-    pub fn get_kv_cache_token_count(&self) -> i32 {
-        unsafe { llama_cpp_sys_2::llama_get_kv_cache_token_count(self.context.as_ptr()) }
-    }
-
-    /// Create an empty KV cache view. (use only for debugging purposes)
-    ///
-    /// # Parameters
-    ///
-    /// * `n_max_seq` - Maximum number of sequences that can exist in a cell. It's not an error
-    ///                 if there are more sequences in a cell than this value, however they will
-    ///                 not be visible in the view `cells_sequences`.
-    #[must_use]
-    pub fn new_kv_cache_view(&self, n_max_seq: i32) -> KVCacheView {
-        let view =
-            unsafe { llama_cpp_sys_2::llama_kv_cache_view_init(self.context.as_ptr(), n_max_seq) };
-        KVCacheView { view, ctx: self }
-    }
-}
-
-/// Information associated with an individual cell in the KV cache view.
-#[derive(Debug)]
-pub struct KVCacheViewCell {
-    /// The position for this cell. Takes KV cache shifts into account.
-    /// May be negative if the cell is not populated.
-    pub pos: llama_cpp_sys_2::llama_pos,
-}
-
-/// An updateable view of the KV cache. (use only for debugging purposes)
-#[derive(Debug)]
-pub struct KVCacheView<'a> {
-    ctx: &'a LlamaContext<'a>,
-    view: llama_cpp_sys_2::llama_kv_cache_view,
-}
-
-impl KVCacheView<'_> {
-    /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    pub fn update(&mut self) {
-        unsafe {
-            llama_cpp_sys_2::llama_kv_cache_view_update(self.ctx.context.as_ptr(), &mut self.view);
-        }
-    }
-
-    /// Number of KV cache cells. This will be the same as the context size.
-    #[must_use]
-    pub fn n_cells(&self) -> i32 {
-        self.view.n_cells
-    }
-
-    /// Number of tokens in the cache. For example, if there are two populated
-    /// cells, the first with 1 sequence id in it and the second with 2 sequence
-    /// ids then you'll have 3 tokens.
-    #[must_use]
-    pub fn token_count(&self) -> i32 {
-        self.view.token_count
-    }
-
-    /// Number of populated cache cells.
-    #[must_use]
-    pub fn used_cells(&self) -> i32 {
-        self.view.used_cells
-    }
-
-    /// Maximum contiguous empty slots in the cache.
-    #[must_use]
-    pub fn max_contiguous(&self) -> i32 {
-        self.view.max_contiguous
-    }
-
-    /// Index to the start of the `max_contiguous` slot range. Can be negative
-    /// when cache is full.
-    #[must_use]
-    pub fn max_contiguous_idx(&self) -> i32 {
-        self.view.max_contiguous_idx
-    }
-
-    /// Information for individual cells.
-    ///
-    /// # Panics
-    ///
-    /// - if `n_cells` does not fit into usize.
-    pub fn cells(&self) -> impl Iterator<Item = KVCacheViewCell> + '_ {
-        unsafe {
-            std::slice::from_raw_parts(
-                self.view.cells,
-                usize::try_from(self.view.n_cells).expect("failed to fit n_cells into usize"),
-            )
-        }
-        .iter()
-        .map(|&cell| KVCacheViewCell { pos: cell.pos })
-    }
-
-    /// The sequences for each cell. There will be `n_max_seq` items per cell.
-    ///
-    /// # Panics
-    ///
-    /// - if `n_cells * n_max_seq` does not fit into usize.
-    /// - if `n_max_seq` does not fit into usize.
-    pub fn cells_sequences(&self) -> impl Iterator<Item = &[llama_cpp_sys_2::llama_seq_id]> {
-        unsafe {
-            std::slice::from_raw_parts(
-                self.view.cells_sequences,
-                usize::try_from(self.view.n_cells * self.view.n_seq_max)
-                    .expect("failed to fit n_cells * n_max_seq into usize"),
-            )
-        }
-        .chunks(usize::try_from(self.view.n_seq_max).expect("failed to fit n_max_seq into usize"))
-    }
-}
-
-impl Drop for KVCacheView<'_> {
-    fn drop(&mut self) {
-        unsafe {
-            llama_cpp_sys_2::llama_kv_cache_view_free(&mut self.view);
-        }
+        unsafe { llama_cpp_sys_2::llama_kv_self_update(self.context.as_ptr()) }
     }
 }
diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp
index ceda28ef..259469c4 160000
--- a/llama-cpp-sys-2/llama.cpp
+++ b/llama-cpp-sys-2/llama.cpp
@@ -1 +1 @@
-Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c
+Subproject commit 259469c4b57c1a32606353bcac52ba683424a990

From b0839c391ebbb74efda2d2852603f595c94e7ff3 Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 13:24:37 -0400
Subject: [PATCH 2/4] update llama.cpp org-ref

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 625b54c7..0dfa7e0d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "llama-cpp-sys-2/llama.cpp"]
 	path = llama-cpp-sys-2/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp
+	url = https://github.com/ggml-org/llama.cpp

From f8d986b3f699e04b3fba3a1da7dadb9c9773fc62 Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 14:44:19 -0400
Subject: [PATCH 3/4] disable building tools post upstream reorganization

* https://github.com/ggml-org/llama.cpp/pull/13249
---
 llama-cpp-sys-2/build.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index df654053..156eb4b4 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -268,6 +268,7 @@ fn main() {
     config.define("LLAMA_BUILD_TESTS", "OFF");
     config.define("LLAMA_BUILD_EXAMPLES", "OFF");
     config.define("LLAMA_BUILD_SERVER", "OFF");
+    config.define("LLAMA_BUILD_TOOLS", "OFF");
     config.define("LLAMA_CURL", "OFF");
 
     config.define(

From ff4784e62db6fe15446f325d430a876454b3ec0e Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 14:45:34 -0400
Subject: [PATCH 4/4] cargo fmt in build.rs

---
 llama-cpp-sys-2/build.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 156eb4b4..f545ff9a 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -280,7 +280,11 @@ fn main() {
         config.define("GGML_BLAS", "OFF");
     }
 
-    if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc)) && matches!(profile.as_str(), "Release" | "RelWithDebInfo" | "MinSizeRel"))
+    if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc))
+        && matches!(
+            profile.as_str(),
+            "Release" | "RelWithDebInfo" | "MinSizeRel"
+        ))
     {
         // Debug Rust builds under MSVC turn off optimization even though we're ideally building the release profile of llama.cpp.
         // Looks like an upstream bug:
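
Note on PATCH 1/4, added for reviewers: the rename is confined to the `llama-cpp-sys-2` FFI symbols, while the safe wrapper methods on `LlamaContext` keep their existing names, so most downstream code should compile unchanged. Below is a minimal sketch of the renamed call paths; every method used appears in the PATCH 1/4 diff, but the function name `kv_cache_maintenance` is illustrative only, and it assumes a `LlamaContext` has already been constructed elsewhere (model loading and context setup omitted).

```rust
use llama_cpp_2::context::LlamaContext;

// Sketch: each call below now dispatches to a `llama_kv_self_*` symbol
// instead of the old `llama_kv_cache_*` name; the Rust-facing API is
// unchanged by the rename.
fn kv_cache_maintenance(ctx: &mut LlamaContext<'_>) {
    // Duplicate the first 32 positions of sequence 0 into sequence 1
    // (backed by `llama_kv_self_seq_cp`).
    ctx.copy_cache(0, 1, 32);

    // Number of KV cells with at least one sequence assigned
    // (backed by `llama_kv_self_used_cells`).
    let used = ctx.get_kv_cache_used_cells();

    // Largest position stored for sequence 0
    // (backed by `llama_kv_self_seq_pos_max`).
    let max_pos = ctx.kv_cache_seq_pos_max(0);
    eprintln!("{used} cells used, max pos in seq 0: {max_pos}");

    // Schedule defragmentation, then apply pending K-shifts/defrag
    // (backed by `llama_kv_self_defrag` / `llama_kv_self_update`).
    ctx.kv_cache_defrag();
    ctx.kv_cache_update();

    // Drop all cached tokens (backed by `llama_kv_self_clear`).
    ctx.clear_kv_cache();
}
```

The one API surface that does disappear is the `KVCacheView` debugging helper (`new_kv_cache_view`, `cells`, `cells_sequences`, and friends): PATCH 1/4 deletes it outright rather than renaming it, consistent with the "remove deprecated kv view" commit subject, so callers that used it for cache inspection need to drop that code when picking up b5474.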