ggml-org
diff --git a/common/common.cpp b/common/common.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/common/speculative.cpp b/common/speculative.cpp
Lines changed: 6 additions & 4 deletions
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
Lines changed: 1 addition & 2 deletions
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
Lines changed: 1 addition & 1 deletion
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
Lines changed: 18 additions & 3 deletions
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
Lines changed: 4 additions & 4 deletions
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
Lines changed: 8 additions & 6 deletions
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
Lines changed: 1 addition & 1 deletion
@@ -942,7 +942,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -1049,7 +1049,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_self_clear(lctx);
+        llama_memory_clear(llama_get_memory(lctx), true);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
 
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
     auto & smpl   = spec->smpl;
     auto & prompt = spec->prompt;
 
+    auto * mem = llama_get_memory(ctx);
+
     int reuse_i = 0;
     int reuse_n = 0;
 
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(mem, false);
 
         prompt.clear();
     } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
         }
 
         if (reuse_i > 0) {
-            llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-            llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+            llama_memory_seq_rm (mem, 0, 0, reuse_i);
+            llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
 
             prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
         }
 
         if (reuse_n < (int) prompt.size()) {
-            llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
+            llama_memory_seq_rm (mem, 0, reuse_n, -1);
 
             prompt.erase(prompt.begin() + reuse_n, prompt.end());
         }
 
@@ -3709,8 +3709,7 @@ def set_gguf_parameters(self):
         self._try_set_pooling_type()
 
         if self.cls_out_labels:
-            key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
-            self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
+            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
 
     def set_vocab(self):
         tokens, toktypes, tokpre = self.get_vocab_base()
 
@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
 }
 
 for i in 1 ..< n_parallel {
-    llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+    llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
 }
 
 if n_parallel > 1 {
 
@@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
@@ -236,9 +236,24 @@ int main(int argc, char ** argv) {
                 LOG("\n");
             }
         } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            const uint32_t n_cls_out = llama_model_n_cls_out(model);
+            std::vector<std::string> cls_out_labels;
+
+            for (uint32_t i = 0; i < n_cls_out; i++) {
+                const char * label = llama_model_cls_label(model, i);
+                const std::string label_i(label == nullptr ? "" : label);
+                cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
+            }
+
             for (int j = 0; j < n_embd_count; j++) {
-                // NOTE: if you change this log - update the tests in ci/run.sh
-                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                for (uint32_t i = 0; i < n_cls_out; i++) {
+                    // NOTE: if you change this log - update the tests in ci/run.sh
+                    if (n_cls_out == 1) {
+                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                    } else {
+                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+                    }
+                }
             }
         } else {
             // print the first part of the embeddings or for a single prompt, the full embedding
 
@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         }
 
         // clear previous kv_cache values (irrelevant for embeddings)
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);
         llama_set_embeddings(ctx, true);
         llama_set_causal_attn(ctx, false);
 
@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
 
     llama_token eos_token = llama_vocab_eos(vocab);
 
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
 
@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
         }
 
         batch->logits[batch->n_tokens - 1] = true;
-        llama_kv_self_clear(context);
+        llama_memory_clear(llama_get_memory(context), false);
 
         const auto t_pp_start = ggml_time_us();
         if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
         LOGi("Benchmark text generation (tg)");
 
-        llama_kv_self_clear(context);
+        llama_memory_clear(llama_get_memory(context), false);
         const auto t_tg_start = ggml_time_us();
         for (i = 0; i < tg; i++) {
 
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
         const auto t_tg_end = ggml_time_us();
 
-        llama_kv_self_clear(context);
+        llama_memory_clear(llama_get_memory(context), false);
 
         const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
         const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
+    llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
 }
@@ -210,7 +210,7 @@ actor LlamaContext {
             }
             batch.logits[Int(batch.n_tokens) - 1] = 1 // true
 
-            llama_kv_self_clear(context)
+            llama_memory_clear(llama_get_memory(context), false)
 
             let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -223,7 +223,7 @@ actor LlamaContext {
 
             // bench text generation
 
-            llama_kv_self_clear(context)
+            llama_memory_clear(llama_get_memory(context), false)
 
             let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -242,7 +242,7 @@ actor LlamaContext {
 
             let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
-            llama_kv_self_clear(context)
+            llama_memory_clear(llama_get_memory(context), false)
 
             let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
             let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@@ -292,7 +292,7 @@ actor LlamaContext {
     func clear() {
         tokens_list.removeAll()
         temporary_invalid_cchars.removeAll()
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), true)
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
@@ -60,6 +60,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // Tokenize the prompt
@@ -94,7 +96,7 @@ int main(int argc, char ** argv) {
     llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
 
     for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+        llama_memory_seq_cp(mem, 0, s, -1, -1);
     }
 
     const auto t_enc_end = ggml_time_us();
@@ -427,17 +429,17 @@ int main(int argc, char ** argv) {
 
         // KV cache management
         // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-        llama_kv_self_seq_rm(ctx, -1, n_past, -1);
+        llama_memory_seq_rm(mem, -1, n_past, -1);
 
         if (seq_id_best != 0) {
             // if a verification token matched, we keep the best sequence and remove the rest
             // this leads to some KV cache fragmentation
-            llama_kv_self_seq_keep(ctx, seq_id_best);
-            llama_kv_self_seq_cp  (ctx, seq_id_best, 0, -1, -1);
-            llama_kv_self_seq_rm  (ctx, seq_id_best,    -1, -1);
+            llama_memory_seq_keep(mem, seq_id_best);
+            llama_memory_seq_cp  (mem, seq_id_best, 0, -1, -1);
+            llama_memory_seq_rm  (mem, seq_id_best,    -1, -1);
 
             for (int s = 1; s < W + G + 1; ++s) {
-                llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+                llama_memory_seq_cp(mem, 0, s, -1, -1);
             }
         }
     }
 
@@ -181,7 +181,7 @@ int main(int argc, char ** argv){
 
         // KV cache management
         // clean the cache of draft tokens that weren't accepted
-        llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+        llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
 
         common_batch_clear(batch_tgt);
         common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
Original file line number	Diff line number	Diff line change
`@@ -942,7 +942,7 @@ struct common_init_result common_init_from_params(common_params & params) {`
`942`	`942`	`return iparams;`
`943`	`943`	`}`
`944`	`944`
`945`		`- if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {`
	`945`	`+ if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {`
`946`	`946`	`LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);`
`947`	`947`	`params.ctx_shift = false;`
`948`	`948`	`}`
`@@ -1049,7 +1049,7 @@ struct common_init_result common_init_from_params(common_params & params) {`
`1049`	`1049`	`if (llama_model_has_decoder(model)) {`
`1050`	`1050`	`llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));`
`1051`	`1051`	`}`
`1052`		`- llama_kv_self_clear(lctx);`
	`1052`	`+ llama_memory_clear(llama_get_memory(lctx), true);`
`1053`	`1053`	`llama_synchronize(lctx);`
`1054`	`1054`	`llama_perf_context_reset(lctx);`
`1055`	`1055`	`llama_set_warmup(lctx, false);`
Original file line number	Diff line number	Diff line change
`@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {`
`116`	`116`	`}`
`117`	`117`
`118`	`118`	`for i in 1 ..< n_parallel {`
`119`		`- llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)`
	`119`	`+ llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)`
`120`	`120`	`}`
`121`	`121`
`122`	`122`	`if n_parallel > 1 {`