@@ -3824,7 +3824,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -3998,7 +3998,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -6156,7 +6156,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 struct ggml_tensor* attn_norm_output = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -6879,7 +6879,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
-            struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
             // norm
             cur = build_norm(inpL,
@@ -7801,7 +7801,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
            {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -8715,7 +8715,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9872,7 +9872,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = model.build_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10682,17 +10682,38 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
     }
 };
 
-ggml_tensor * llama_model::build_rope_factors(uint32_t n_ctx_per_seq, int il) const {
-    // choose long/short freq factors based on the context size
-    if (layers[il].rope_freqs != nullptr) {
-        return layers[il].rope_freqs;
-    }
+llama_memory_i * llama_model::create_memory() const {
+    llama_memory_i * res;
 
-    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-        return layers[il].rope_long;
+    switch (arch) {
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_MAMBA:
+            {
+                res = new llama_kv_cache_recurrent(hparams, {
+                    /*.get_rope_factors =*/ nullptr
+                });
+            } break;
+        default:
+            {
+                res = new llama_kv_cache_unified(hparams, {
+                    /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
+                        // choose long/short freq factors based on the context size
+                        if (layers[il].rope_freqs != nullptr) {
+                            return layers[il].rope_freqs;
+                        }
+
+                        if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+                            return layers[il].rope_long;
+                        }
+
+                        return layers[il].rope_short;
+                    }
+                });
+            }
     }
 
-    return layers[il].rope_short;
+    return res;
 }
 
 llm_graph_result_ptr llama_model::build_graph(
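
Note (not part of the diff): the change above moves rope-factor selection out of `llama_model::build_rope_factors` and into a `get_rope_factors` callback that `create_memory()` installs on the KV cache, which the graph builders then reach via `cbs.get_rope_factors`. Below is a minimal, self-contained sketch of that callback pattern under stated assumptions: the `tensor`, `layer`, and `kv_cache_callbacks` types and the `n_ctx_orig_yarn` value are hypothetical stand-ins for `ggml_tensor`, `llama_layer`, and the real cache callback struct, not llama.cpp code.

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// stand-in for ggml_tensor; only a name is needed for this demo
struct tensor { const char * name; };

// stand-in for a model layer's rope frequency-factor tensors
struct layer {
    tensor * rope_freqs = nullptr; // explicit per-layer factors (llama3-style), if present
    tensor * rope_long  = nullptr; // long-context factors
    tensor * rope_short = nullptr; // short-context factors
};

// mirrors the shape of the cbs.get_rope_factors member referenced in the diff
struct kv_cache_callbacks {
    std::function<tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
};

int main() {
    tensor t_long  = { "rope_long"  };
    tensor t_short = { "rope_short" };

    std::vector<layer> layers(1);
    layers[0].rope_long  = &t_long;
    layers[0].rope_short = &t_short;

    const uint32_t n_ctx_orig_yarn = 8192; // hypothetical original training context

    // the model installs this lambda when it constructs the unified KV cache
    kv_cache_callbacks cbs = {
        /*.get_rope_factors =*/ [&](uint32_t n_ctx_per_seq, int il) -> tensor * {
            // choose long/short freq factors based on the context size
            if (layers[il].rope_freqs != nullptr) {
                return layers[il].rope_freqs;
            }
            if (n_ctx_per_seq > n_ctx_orig_yarn) {
                return layers[il].rope_long;
            }
            return layers[il].rope_short;
        }
    };

    // a graph builder now calls the callback instead of a model method
    tensor * f = cbs.get_rope_factors(131072, 0); // large context -> long factors
    printf("selected: %s\n", f ? f->name : "(none)");
    return 0;
}
```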