
Commit e1a502c

kv_cache : provide rope factors
ggml-ci
1 parent 10ea682 commit e1a502c

File tree: 5 files changed (+55 −23 lines)

src/llama-context.cpp

Lines changed: 3 additions & 3 deletions

@@ -1729,7 +1729,7 @@ llama_context_kv_self::llama_context_kv_self(
 
     const auto & hparams = model.hparams;
 
-    kv_self = std::make_unique<llama_kv_cache_unified>(hparams);
+    kv_self.reset(static_cast<llama_kv_cache_unified *>(model.create_memory()));
 
     LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
 
@@ -1885,7 +1885,7 @@ llm_graph_result_ptr llama_context_kv_self::build_kv_self_shift(
     const int64_t n_head_kv    = hparams.n_head_kv(il);
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
 
-    ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq(), il);
+    ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il);
 
     ggml_tensor * k =
         ggml_view_3d(ctx0, kv_self->k_l[il],
 
@@ -2665,7 +2665,7 @@ llama_context_recurrent::llama_context_recurrent(
 
     const auto & hparams = model.hparams;
 
-    kv_self = std::make_unique<llama_kv_cache_recurrent>(hparams);
+    kv_self.reset(static_cast<llama_kv_cache_recurrent *>(model.create_memory()));
 
     LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
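
Both constructor changes follow the same pattern: instead of constructing the concrete cache itself, the context asks the model to build it via create_memory() and then takes ownership of the returned base-interface pointer, downcasting it to the concrete type it expects. A minimal sketch of that hand-off, using made-up stand-in types (Memory, UnifiedCache, Model are illustrative, not the actual llama.cpp classes):

    #include <memory>

    struct Memory { virtual ~Memory() = default; };   // stand-in for llama_memory_i
    struct UnifiedCache : Memory { };                  // stand-in for llama_kv_cache_unified

    struct Model {
        // factory: picks the concrete cache type and returns it through the base interface
        Memory * create_memory() const { return new UnifiedCache(); }
    };

    int main() {
        Model model;
        std::unique_ptr<UnifiedCache> kv_self;

        // the context knows which concrete cache it works with, so it downcasts
        // the factory result and takes ownership of it
        kv_self.reset(static_cast<UnifiedCache *>(model.create_memory()));
    }

The downcast is only safe as long as create_memory() returns the matching cache type for this context; that invariant is maintained by the arch switch shown in the src/llama-model.cpp diff below.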

src/llama-kv-cache.cpp

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@
 
 static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
 
-llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams) : hparams(hparams) {
+llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
 }
 
 bool llama_kv_cache_unified::init(
src/llama-kv-cache.h

Lines changed: 12 additions & 2 deletions

@@ -6,9 +6,9 @@
 
 #include "ggml-cpp.h"
 
+#include <functional>
 #include <set>
 #include <vector>
-#include <algorithm>
 
 struct llama_cparams;
 struct llama_hparams;
 
@@ -62,7 +62,15 @@ struct llama_kv_cache_slot_info {
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    llama_kv_cache_unified(const llama_hparams & hparams);
+    // can be used to query data from the model if needed
+    struct callbacks {
+        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
+    };
+
+    llama_kv_cache_unified(
+            const llama_hparams & hparams,
+            callbacks cbs);
+
     virtual ~llama_kv_cache_unified() = default;
 
     // TODO: become constructor
 
@@ -129,6 +137,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
 
     const llama_hparams & hparams;
 
+    callbacks cbs;
+
     bool has_shift = false;
     bool do_defrag = false;
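
The new callbacks struct decouples the cache from llama_model: the cache only needs a std::function it can call to obtain per-layer RoPE frequency factors. A hedged sketch of wiring up and invoking such a callback, with simplified stand-in types (RopeTensor and the 4096 threshold are invented for illustration; the real callback compares against hparams.n_ctx_orig_yarn, and the recurrent path installs nullptr, so a consumer that might see an empty function should check before calling):

    #include <cstdint>
    #include <functional>
    #include <iostream>

    struct RopeTensor { const char * name; };   // stand-in for ggml_tensor

    // mirrors the shape of llama_kv_cache_unified::callbacks
    struct Callbacks {
        std::function<RopeTensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
    };

    int main() {
        static RopeTensor rope_long  = { "rope_long"  };
        static RopeTensor rope_short = { "rope_short" };

        Callbacks cbs;
        cbs.get_rope_factors = [](uint32_t n_ctx_per_seq, int /*il*/) -> RopeTensor * {
            // illustrative threshold only
            return n_ctx_per_seq > 4096 ? &rope_long : &rope_short;
        };

        if (cbs.get_rope_factors) {                                        // empty on the recurrent path
            std::cout << cbs.get_rope_factors(8192, 0)->name << "\n";      // prints "rope_long"
        }
    }

In this commit the callback is installed by llama_model::create_memory() (see src/llama-model.cpp below), which is what allows build_rope_factors to move off the model's public interface.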

src/llama-model.cpp

Lines changed: 36 additions & 15 deletions

@@ -3824,7 +3824,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 
@@ -3998,7 +3998,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 
@@ -6156,7 +6156,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 struct ggml_tensor* attn_norm_output = build_norm(inpL,
                     model.layers[il].attn_norm,
 
@@ -6879,7 +6879,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
-            struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
             // norm
             cur = build_norm(inpL,
 
@@ -7801,7 +7801,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 
@@ -8715,7 +8715,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 
@@ -9872,7 +9872,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 
@@ -10682,17 +10682,38 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
     }
 };
 
-ggml_tensor * llama_model::build_rope_factors(uint32_t n_ctx_per_seq, int il) const {
-    // choose long/short freq factors based on the context size
-    if (layers[il].rope_freqs != nullptr) {
-        return layers[il].rope_freqs;
-    }
+llama_memory_i * llama_model::create_memory() const {
+    llama_memory_i * res;
 
-    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-        return layers[il].rope_long;
+    switch (arch) {
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_MAMBA:
+            {
+                res = new llama_kv_cache_recurrent(hparams, {
+                    /*.get_rope_factors =*/ nullptr
+                });
+            } break;
+        default:
+            {
+                res = new llama_kv_cache_unified(hparams, {
+                    /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
+                        // choose long/short freq factors based on the context size
+                        if (layers[il].rope_freqs != nullptr) {
+                            return layers[il].rope_freqs;
+                        }
+
+                        if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+                            return layers[il].rope_long;
+                        }
+
+                        return layers[il].rope_short;
+                    }
+                });
+            }
     }
 
-    return layers[il].rope_short;
+    return res;
 }
 
 llm_graph_result_ptr llama_model::build_graph(
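
create_memory() is now the single place that decides which memory implementation a model gets: recurrent architectures (Mamba and the RWKV6 variants) get a recurrent cache with no rope-factors callback, and everything else gets the unified cache with a lambda that captures the model and selects frequency factors by context length. A simplified sketch of that factory-plus-callback pattern, using invented stand-in types rather than the real llama.cpp classes:

    #include <cstdint>
    #include <functional>
    #include <memory>
    #include <utility>

    enum class Arch { LLAMA, MAMBA, RWKV6 };

    struct Tensor { };                                // stand-in for ggml_tensor
    struct Memory { virtual ~Memory() = default; };   // stand-in for llama_memory_i

    struct Callbacks {
        std::function<Tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
    };

    struct UnifiedCache : Memory {
        Callbacks cbs;
        explicit UnifiedCache(Callbacks cbs) : cbs(std::move(cbs)) {}
    };

    struct RecurrentCache : Memory {
        Callbacks cbs;
        explicit RecurrentCache(Callbacks cbs) : cbs(std::move(cbs)) {}
    };

    struct Model {
        Arch     arch       = Arch::LLAMA;
        uint32_t n_ctx_orig = 8192;        // plays the role of hparams.n_ctx_orig_yarn
        Tensor * rope_long  = nullptr;     // per-layer tensors in the real code
        Tensor * rope_short = nullptr;

        Memory * create_memory() const {
            switch (arch) {
                case Arch::MAMBA:
                case Arch::RWKV6:
                    // recurrent caches never shift RoPE, so no factors callback is installed
                    return new RecurrentCache({ /*.get_rope_factors =*/ nullptr });
                default:
                    return new UnifiedCache({
                        /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int /*il*/) {
                            // pick long or short factors based on the per-sequence context size
                            return n_ctx_per_seq > n_ctx_orig ? rope_long : rope_short;
                        }
                    });
            }
        }
    };

    int main() {
        Model model;
        std::unique_ptr<Memory> mem(model.create_memory());   // unified cache for Arch::LLAMA
    }

Because the lookup lives behind the callback, llama_kv_cache_unified no longer needs any knowledge of llama_model, and the graph builders reach the same logic through the cache's cbs member instead of calling the model directly.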

src/llama-model.h

Lines changed: 3 additions & 2 deletions

@@ -2,8 +2,9 @@
 
 #include "llama.h"
 #include "llama-arch.h"
-#include "llama-hparams.h"
 #include "llama-graph.h"
+#include "llama-hparams.h"
+#include "llama-memory.h"
 #include "llama-vocab.h"
 
 #include <memory>
 
@@ -366,7 +367,7 @@ struct llama_model {
     const struct ggml_tensor * get_tensor(const char * name) const;
 
     // TODO: move this to new llm_arch_model_i interface
-    ggml_tensor * build_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+    llama_memory_i * create_memory() const; // TODO: params
 
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(

0 commit comments
