@@ -1586,12 +1586,19 @@ struct server_prompt_cache {
15861586 }
15871587
15881588 if (limit_tokens > 0 ) {
1589- while (states.size () > 1 && n_tokens () > limit_tokens) {
1589+ // average size per token
1590+ const float size_per_token = std::max<float >(1 .0f , float (size ()) / (std::max<size_t >(1 , n_tokens ())));
1591+
1592+ // dynamically increase the token limit if it can fit in the memory limit
1593+ const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t >(limit_tokens, limit_size/size_per_token) : limit_tokens;
1594+
1595+ while (states.size () > 1 && n_tokens () > limit_tokens_cur) {
15901596 if (states.empty ()) {
15911597 break ;
15921598 }
15931599
1594- SRV_WRN (" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n " , states.front ().size () / (1024.0 * 1024.0 ));
1600+ SRV_WRN (" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n " ,
1601+ limit_tokens, limit_tokens_cur, states.front ().size () / (1024.0 * 1024.0 ));
15951602
15961603 states.pop_front ();
15971604 }
@@ -1601,7 +1608,8 @@ struct server_prompt_cache {
16011608 states.size (), size () / (1024.0 * 1024.0 ), limit_size / (1024.0 * 1024.0 ), limit_tokens);
16021609
16031610 for (const auto & state : states) {
1604- SRV_WRN (" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n " , (const void *)&state, state.n_tokens (), state.checkpoints .size (), state.size () / (1024.0 * 1024.0 ));
1611+ SRV_WRN (" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n " ,
1612+ (const void *)&state, state.n_tokens (), state.checkpoints .size (), state.size () / (1024.0 * 1024.0 ));
16051613 }
16061614 }
16071615};
0 commit comments