@@ -9061,6 +9061,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;

+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
@@ -9087,15 +9092,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }

-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;
@@ -9125,11 +9121,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
        bool cont = false;

+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];

             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
@@ -9156,6 +9160,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }

+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);

         i0 += nh - 1;
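A minimal standalone sketch (not the llama.cpp implementation) of the capping logic this diff introduces: each scheduled cell move costs 6*n_layer graph nodes, so the pass computes `max_moves = LLAMA_MAX_NODES/(6*n_layer)` once and stops scheduling further moves when that budget is reached, leaving the remaining cells for a later defrag pass. The `LLAMA_MAX_NODES` value, `n_layer`, and `n_holes` below are illustrative placeholders, not values from a real model.

```c
// Standalone sketch of the per-graph move budget; values are illustrative.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LLAMA_MAX_NODES 8192   // assumed stand-in for the llama.cpp constant

int main(void) {
    const uint32_t n_layer   = 32;                            // hypothetical layer count
    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);   // moves that fit in one graph

    const uint32_t n_holes = 1000;   // pretend fragmentation: holes left to fill
    uint32_t n_moves = 0;

    for (uint32_t hole = 0; hole < n_holes; ++hole) {
        // should we stop searching for the next move?
        bool stop = false;

        // stand-in for the inner scan that fills one hole from the end of the cache
        if (n_moves == max_moves) {
            stop = true;       // budget exhausted mid-scan
        } else {
            n_moves++;         // one move scheduled for this hole
        }

        if (stop || n_moves == max_moves) {
            break;             // graph is full; remaining holes wait for the next defrag pass
        }
    }

    printf("scheduled %u moves (budget %u)\n", n_moves, max_moves);
    return 0;
}
```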