From 59864fd5b3c0e8226116972fed59c3b0158f214e Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 13:22:29 -0400
Subject: [PATCH 1/4] remove deprecated kv view & rename kv cache -> kv self
 APIs

* bump llama.cpp to b5474 (259469c), latest release as of 2025-05-24
  - https://github.com/ggml-org/llama.cpp/commits/259469c4b57c1a32606353bcac52ba683424a990
---
 llama-cpp-2/src/context/kv_cache.rs | 141 +++-------------------------
 llama-cpp-sys-2/llama.cpp           |   2 +-
 2 files changed, 12 insertions(+), 131 deletions(-)

diff --git a/llama-cpp-2/src/context/kv_cache.rs b/llama-cpp-2/src/context/kv_cache.rs
index d90a6b8a..14f5b5a6 100644
--- a/llama-cpp-2/src/context/kv_cache.rs
+++ b/llama-cpp-2/src/context/kv_cache.rs
@@ -28,7 +28,7 @@ impl LlamaContext<'_> {
     /// * `dest` - The sequence id to copy the cache to.
     /// * `size` - The size of the cache to copy.
     pub fn copy_cache(&mut self, src: i32, dest: i32, size: i32) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_cp(self.context.as_ptr(), src, dest, 0, size) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_cp(self.context.as_ptr(), src, dest, 0, size) }
     }
 
     /// Copy the cache from one sequence to another.
@@ -58,7 +58,7 @@ impl LlamaContext<'_> {
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
         unsafe {
-            llama_cpp_sys_2::llama_kv_cache_seq_cp(self.context.as_ptr(), src, dest, p0, p1);
+            llama_cpp_sys_2::llama_kv_self_seq_cp(self.context.as_ptr(), src, dest, p0, p1);
         }
         Ok(())
     }
@@ -92,18 +92,18 @@ impl LlamaContext<'_> {
         let p1 = p1
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
-        Ok(unsafe { llama_cpp_sys_2::llama_kv_cache_seq_rm(self.context.as_ptr(), src, p0, p1) })
+        Ok(unsafe { llama_cpp_sys_2::llama_kv_self_seq_rm(self.context.as_ptr(), src, p0, p1) })
     }
 
     /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     #[must_use]
     pub fn get_kv_cache_used_cells(&self) -> i32 {
-        unsafe { llama_cpp_sys_2::llama_get_kv_cache_used_cells(self.context.as_ptr()) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_used_cells(self.context.as_ptr()) }
     }
 
     /// Clear the KV cache
     pub fn clear_kv_cache(&mut self) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_clear(self.context.as_ptr()) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_clear(self.context.as_ptr()) }
     }
 
     /// Removes all tokens that do not belong to the specified sequence
@@ -112,7 +112,7 @@ impl LlamaContext<'_> {
     ///
     /// * `seq_id` - The sequence id to keep
     pub fn llama_kv_cache_seq_keep(&mut self, seq_id: i32) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_keep(self.context.as_ptr(), seq_id) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_keep(self.context.as_ptr(), seq_id) }
     }
 
     #[allow(clippy::doc_markdown)]
@@ -147,7 +147,7 @@ impl LlamaContext<'_> {
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
         unsafe {
-            llama_cpp_sys_2::llama_kv_cache_seq_add(self.context.as_ptr(), seq_id, p0, p1, delta);
+            llama_cpp_sys_2::llama_kv_self_seq_add(self.context.as_ptr(), seq_id, p0, p1, delta);
        }
         Ok(())
     }
@@ -183,7 +183,7 @@ impl LlamaContext<'_> {
             .map_or(Ok(-1), i32::try_from)
             .map_err(KvCacheConversionError::P1TooLarge)?;
         let d = c_int::from(d.get());
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_div(self.context.as_ptr(), seq_id, p0, p1, d) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_div(self.context.as_ptr(), seq_id, p0, p1, d) }
         Ok(())
     }
 
@@ -194,7 +194,7 @@ impl LlamaContext<'_> {
     /// * `seq_id` - The sequence id to get the max position for
     #[must_use]
     pub fn kv_cache_seq_pos_max(&self, seq_id: i32) -> i32 {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_seq_pos_max(self.context.as_ptr(), seq_id) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_seq_pos_max(self.context.as_ptr(), seq_id) }
     }
 
     /// Defragment the KV cache
@@ -202,130 +202,11 @@ impl LlamaContext<'_> {
     /// - lazily on next [`LlamaContext::decode`]
     /// - explicitly with [`Self::kv_cache_update`]
     pub fn kv_cache_defrag(&mut self) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_defrag(self.context.as_ptr()) }
+        unsafe { llama_cpp_sys_2::llama_kv_self_defrag(self.context.as_ptr()) }
     }
 
     /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
     pub fn kv_cache_update(&mut self) {
-        unsafe { llama_cpp_sys_2::llama_kv_cache_update(self.context.as_ptr()) }
-    }
-
-    /// Returns the number of tokens in the KV cache (slow, use only for debug)
-    /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    #[must_use]
-    pub fn get_kv_cache_token_count(&self) -> i32 {
-        unsafe { llama_cpp_sys_2::llama_get_kv_cache_token_count(self.context.as_ptr()) }
-    }
-
-    /// Create an empty KV cache view. (use only for debugging purposes)
-    ///
-    /// # Parameters
-    ///
-    /// * `n_max_seq` - Maximum number of sequences that can exist in a cell. It's not an error
-    ///                 if there are more sequences in a cell than this value, however they will
-    ///                 not be visible in the view `cells_sequences`.
-    #[must_use]
-    pub fn new_kv_cache_view(&self, n_max_seq: i32) -> KVCacheView {
-        let view =
-            unsafe { llama_cpp_sys_2::llama_kv_cache_view_init(self.context.as_ptr(), n_max_seq) };
-        KVCacheView { view, ctx: self }
-    }
-}
-
-/// Information associated with an individual cell in the KV cache view.
-#[derive(Debug)]
-pub struct KVCacheViewCell {
-    /// The position for this cell. Takes KV cache shifts into account.
-    /// May be negative if the cell is not populated.
-    pub pos: llama_cpp_sys_2::llama_pos,
-}
-
-/// An updateable view of the KV cache. (use only for debugging purposes)
-#[derive(Debug)]
-pub struct KVCacheView<'a> {
-    ctx: &'a LlamaContext<'a>,
-    view: llama_cpp_sys_2::llama_kv_cache_view,
-}
-
-impl KVCacheView<'_> {
-    /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    pub fn update(&mut self) {
-        unsafe {
-            llama_cpp_sys_2::llama_kv_cache_view_update(self.ctx.context.as_ptr(), &mut self.view);
-        }
-    }
-
-    /// Number of KV cache cells. This will be the same as the context size.
-    #[must_use]
-    pub fn n_cells(&self) -> i32 {
-        self.view.n_cells
-    }
-
-    /// Number of tokens in the cache. For example, if there are two populated
-    /// cells, the first with 1 sequence id in it and the second with 2 sequence
-    /// ids then you'll have 3 tokens.
-    #[must_use]
-    pub fn token_count(&self) -> i32 {
-        self.view.token_count
-    }
-
-    /// Number of populated cache cells.
-    #[must_use]
-    pub fn used_cells(&self) -> i32 {
-        self.view.used_cells
-    }
-
-    /// Maximum contiguous empty slots in the cache.
-    #[must_use]
-    pub fn max_contiguous(&self) -> i32 {
-        self.view.max_contiguous
-    }
-
-    /// Index to the start of the `max_contiguous` slot range. Can be negative
-    /// when cache is full.
-    #[must_use]
-    pub fn max_contiguous_idx(&self) -> i32 {
-        self.view.max_contiguous_idx
-    }
-
-    /// Information for individual cells.
-    ///
-    /// # Panics
-    ///
-    /// - if `n_cells` does not fit into usize.
-    pub fn cells(&self) -> impl Iterator<Item = KVCacheViewCell> + '_ {
-        unsafe {
-            std::slice::from_raw_parts(
-                self.view.cells,
-                usize::try_from(self.view.n_cells).expect("failed to fit n_cells into usize"),
-            )
-        }
-        .iter()
-        .map(|&cell| KVCacheViewCell { pos: cell.pos })
-    }
-
-    /// The sequences for each cell. There will be `n_max_seq` items per cell.
-    ///
-    /// # Panics
-    ///
-    /// - if `n_cells * n_max_seq` does not fit into usize.
-    /// - if `n_max_seq` does not fit into usize.
-    pub fn cells_sequences(&self) -> impl Iterator<Item = &[llama_cpp_sys_2::llama_seq_id]> {
-        unsafe {
-            std::slice::from_raw_parts(
-                self.view.cells_sequences,
-                usize::try_from(self.view.n_cells * self.view.n_seq_max)
-                    .expect("failed to fit n_cells * n_max_seq into usize"),
-            )
-        }
-        .chunks(usize::try_from(self.view.n_seq_max).expect("failed to fit n_max_seq into usize"))
-    }
-}
-
-impl Drop for KVCacheView<'_> {
-    fn drop(&mut self) {
-        unsafe {
-            llama_cpp_sys_2::llama_kv_cache_view_free(&mut self.view);
-        }
+        unsafe { llama_cpp_sys_2::llama_kv_self_update(self.context.as_ptr()) }
     }
 }
diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp
index ceda28ef..259469c4 160000
--- a/llama-cpp-sys-2/llama.cpp
+++ b/llama-cpp-sys-2/llama.cpp
@@ -1 +1 @@
-Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c
+Subproject commit 259469c4b57c1a32606353bcac52ba683424a990

From b0839c391ebbb74efda2d2852603f595c94e7ff3 Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 13:24:37 -0400
Subject: [PATCH 2/4] update llama.cpp org-ref

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 625b54c7..0dfa7e0d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "llama-cpp-sys-2/llama.cpp"]
 	path = llama-cpp-sys-2/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp
+	url = https://github.com/ggml-org/llama.cpp

From f8d986b3f699e04b3fba3a1da7dadb9c9773fc62 Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 14:44:19 -0400
Subject: [PATCH 3/4] disable building tools post upstream reorganization

* https://github.com/ggml-org/llama.cpp/pull/13249
---
 llama-cpp-sys-2/build.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index df654053..156eb4b4 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -268,6 +268,7 @@ fn main() {
     config.define("LLAMA_BUILD_TESTS", "OFF");
     config.define("LLAMA_BUILD_EXAMPLES", "OFF");
     config.define("LLAMA_BUILD_SERVER", "OFF");
+    config.define("LLAMA_BUILD_TOOLS", "OFF");
     config.define("LLAMA_CURL", "OFF");
 
     config.define(

From ff4784e62db6fe15446f325d430a876454b3ec0e Mon Sep 17 00:00:00 2001
From: Britt Lewis
Date: Sat, 24 May 2025 14:45:34 -0400
Subject: [PATCH 4/4] cargo fmt in build.rs

---
 llama-cpp-sys-2/build.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 156eb4b4..f545ff9a 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -280,7 +280,11 @@ fn main() {
         config.define("GGML_BLAS", "OFF");
     }
 
-    if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc)) && matches!(profile.as_str(), "Release" | "RelWithDebInfo" | "MinSizeRel"))
+    if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc))
+        && matches!(
+            profile.as_str(),
+            "Release" | "RelWithDebInfo" | "MinSizeRel"
+        ))
     {
         // Debug Rust builds under MSVC turn off optimization even though we're ideally building the release profile of llama.cpp.
         // Looks like an upstream bug:
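
Note on PATCH 1/4, added for reviewers: the rename is confined to the `llama-cpp-sys-2` FFI symbols, while the safe wrapper methods on `LlamaContext` keep their existing names, so most downstream code should compile unchanged. Below is a minimal sketch of the renamed call paths; every method used appears in the PATCH 1/4 diff, but the function name `kv_cache_maintenance` is illustrative only, and it assumes a `LlamaContext` has already been constructed elsewhere (model loading and context setup omitted).

```rust
use llama_cpp_2::context::LlamaContext;

// Sketch: each call below now dispatches to a `llama_kv_self_*` symbol
// instead of the old `llama_kv_cache_*` name; the Rust-facing API is
// unchanged by the rename.
fn kv_cache_maintenance(ctx: &mut LlamaContext<'_>) {
    // Duplicate the first 32 positions of sequence 0 into sequence 1
    // (backed by `llama_kv_self_seq_cp`).
    ctx.copy_cache(0, 1, 32);

    // Number of KV cells with at least one sequence assigned
    // (backed by `llama_kv_self_used_cells`).
    let used = ctx.get_kv_cache_used_cells();

    // Largest position stored for sequence 0
    // (backed by `llama_kv_self_seq_pos_max`).
    let max_pos = ctx.kv_cache_seq_pos_max(0);
    eprintln!("{used} cells used, max pos in seq 0: {max_pos}");

    // Schedule defragmentation, then apply pending K-shifts/defrag
    // (backed by `llama_kv_self_defrag` / `llama_kv_self_update`).
    ctx.kv_cache_defrag();
    ctx.kv_cache_update();

    // Drop all cached tokens (backed by `llama_kv_self_clear`).
    ctx.clear_kv_cache();
}
```

The one API surface that does disappear is the `KVCacheView` debugging helper (`new_kv_cache_view`, `cells`, `cells_sequences`, and friends): PATCH 1/4 deletes it outright rather than renaming it, consistent with the "remove deprecated kv view" commit subject, so callers that used it for cache inspection need to drop that code when picking up b5474.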