@@ -405,13 +405,17 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
+
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
             printf("we have to evaluate at least 1 token to generate logits\n");
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                         {"n_past", n_past},
                         {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,16 +465,16 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
 
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
         embd = prompt_tokens;
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                         {"n_past", n_past},
                         {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},