server : dynamic token limit for prompt cache

ggerganov · ggerganov · commit 07d6954f6374 · 2025-10-13T13:10:01.000+03:00
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -1586,12 +1586,19 @@ struct server_prompt_cache {
         }
 
         if (limit_tokens > 0) {
-            while (states.size() > 1 && n_tokens() > limit_tokens) {
+            // average size per token
+            const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
+
+            // dynamically increase the token limit if it can fit in the memory limit
+            const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
+
+            while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
                 if (states.empty()) {
                     break;
                 }
 
-                SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
+                SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
+                        limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
 
                 states.pop_front();
             }
@@ -1601,7 +1608,8 @@ struct server_prompt_cache {
                 states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);
 
         for (const auto & state : states) {
-            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
+            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
+                    (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
         }
     }
 };

Original file line number	Diff line number	Diff line change
`@@ -1586,12 +1586,19 @@ struct server_prompt_cache {`
`1586`	`1586`	`}`
`1587`	`1587`
`1588`	`1588`	`if (limit_tokens > 0) {`
`1589`		`- while (states.size() > 1 && n_tokens() > limit_tokens) {`
	`1589`	`+ // average size per token`
	`1590`	`+ const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));`
	`1591`	`+`
	`1592`	`+ // dynamically increase the token limit if it can fit in the memory limit`
	`1593`	`+ const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;`
	`1594`	`+`
	`1595`	`+ while (states.size() > 1 && n_tokens() > limit_tokens_cur) {`
`1590`	`1596`	`if (states.empty()) {`
`1591`	`1597`	`break;`
`1592`	`1598`	`}`
`1593`	`1599`
`1594`		`- SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));`
	`1600`	`+ SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",`
	`1601`	`+ limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));`
`1595`	`1602`
`1596`	`1603`	`states.pop_front();`
`1597`	`1604`	`}`
`@@ -1601,7 +1608,8 @@ struct server_prompt_cache {`
`1601`	`1608`	`states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);`
`1602`	`1609`
`1603`	`1610`	`for (const auto & state : states) {`
`1604`		`- SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));`
	`1611`	`+ SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",`
	`1612`	`+ (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));`
`1605`	`1613`	`}`
`1606`	`1614`	`}`
`1607`	`1615`	`};`