@@ -1225,7 +1225,7 @@ struct llama_server_context
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
-                llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true);
+                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                 slot.n_past += 1;
             }
         }
@@ -1376,12 +1376,12 @@ struct llama_server_context
             if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
             {
                 // Shift context
-                const int n_left    = slot.n_past - slot.params.n_keep - 1;
+                const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
                 const int n_discard = n_left / 2;

                 LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
                 llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
+                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);

                 for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
                 {
@@ -1426,6 +1426,8 @@ struct llama_server_context

             slot.i_batch = batch.n_tokens;

+            // TODO: we always have to take into account the "system_tokens"
+            //       this is not great and needs to be improved somehow
             llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);

             slot.n_past += 1;
@@ -1478,8 +1480,8 @@ struct llama_server_context

                 prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
                 prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
-                prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+                prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
+                prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
                 prefix_tokens.push_back(llama_token_middle(model));
                 prompt_tokens = prefix_tokens;
             }
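
Note: the common thread in these hunks is that each slot's tokens sit in the KV cache after the shared system prompt, so the position passed to llama_batch_add and the upper bound given to llama_kv_cache_seq_shift must be offset by system_tokens.size() instead of using slot.n_past alone. A minimal standalone sketch of that position arithmetic, with made-up constants for illustration (this is not llama.cpp code):

    // Illustrative sketch only: mirrors the patched arithmetic with hypothetical numbers.
    #include <cstdio>

    int main() {
        const int n_system = 32;   // tokens of the shared system prompt (system_tokens.size())
        const int n_past   = 480;  // slot-local tokens processed so far (slot.n_past)
        const int n_keep   = 4;    // tokens kept at the start of the slot's context (slot.params.n_keep)

        // Absolute KV cache position of the next token for this slot:
        // the slot-local position is offset by the shared system prompt.
        const int pos = n_system + n_past;

        // Context shift: after the fix, n_left is measured from the absolute
        // position, so the discard size reflects the real cache usage.
        const int n_left    = n_system + n_past - n_keep - 1;
        const int n_discard = n_left / 2;

        printf("next token position: %d\n", pos);
        printf("n_left = %d, n_discard = %d\n", n_left, n_discard);
        printf("removed cache range: [%d, %d)\n", n_keep + 1, n_keep + n_discard + 1);
        printf("shifted cache range: [%d, %d) by %d\n", n_keep + 1 + n_discard, n_system + n_past, -n_discard);
        return 0;
    }

As the TODO added in the third hunk notes, this offsetting is repeated at every call site, which is why the patch flags it as something to factor out later.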