Commit c56b6dd

fix(llama.cpp): disable infinite context shifting (#1704)
Unbounded context shifting can turn into an endless loop of shifts if the model hallucinates and never stops answering. This has the unpleasant effect that the prediction never terminates, which happens especially with small models, which tend to hallucinate. Works around #1333 by removing context shifting. See also the upstream issue: ggml-org/llama.cpp#3969
1 parent 2e61ff3 commit c56b6dd

1 file changed (+13 −23)

backend/cpp/llama/grpc-server.cpp

Lines changed: 13 additions & 23 deletions
@@ -1387,30 +1387,20 @@ struct llama_server_context
         {
             if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
             {
-                // Shift context
-                const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
-                const int n_discard = n_left / 2;
-
-                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
-                {
-                    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                }
-
-                slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                slot.n_past -= n_discard;
-
-                slot.truncated = true;
+                // START LOCALAI changes
+                // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
+                // See: https://github.com/mudler/LocalAI/issues/1333
+                // Context is exhausted, release the slot
+                slot.release();
+                send_final_response(slot);
+                slot.cache_tokens.clear();
+                slot.n_past = 0;
+                slot.truncated = false;
+                slot.has_next_token = true;
+                LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
 
-                LOG_VERBOSE("context shift", {
-                    { "n_ctx",  n_ctx },
-                    { "n_keep", params.n_keep },
-                    { "n_left", n_left },
-                });
+                continue;
+                // END LOCALAI changes
             }
         }
     }
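
To make the failure mode concrete, below is a minimal, self-contained sketch. It is not the grpc-server.cpp code: n_ctx, n_keep, and the generation loop are invented illustration values, and only the n_left / n_discard arithmetic mirrors the removed block. Each time the cache fills, half of the non-kept tokens are discarded and generation simply continues, so a model that never emits a stop token keeps producing output indefinitely.

// Sketch only: illustrates why the removed context-shift loop never terminates
// when the model keeps emitting tokens. Values are made up for illustration.
#include <cstdio>
#include <vector>

int main() {
    const int n_ctx  = 8;  // hypothetical context window
    const int n_keep = 2;  // tokens always kept at the start of the cache

    std::vector<int> cache;
    for (int i = 0; i < n_ctx; ++i) cache.push_back(i);  // cache is already full

    // A model that "never stops answering" always has another token to append.
    for (int step = 0; step < 6; ++step) {
        if ((int) cache.size() >= n_ctx) {
            // Same arithmetic as the removed block: discard half of the
            // non-kept tokens and slide the remainder down.
            const int n_left    = (int) cache.size() - n_keep - 1;
            const int n_discard = n_left / 2;
            cache.erase(cache.begin() + n_keep + 1,
                        cache.begin() + n_keep + 1 + n_discard);
            std::printf("step %d: shifted, discarded %d tokens, cache size %zu\n",
                        step, n_discard, cache.size());
        }
        cache.push_back(1000 + step);  // the model emits yet another token
    }
    // The loop never reaches a terminal state: every time the cache fills it is
    // shifted and generation continues, so a hallucinating model runs forever.
    return 0;
}

Releasing the slot and sending the final response, as the commit does instead, trades truncated long answers for guaranteed termination once the context window is exhausted.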
