@@ -247,6 +247,7 @@ struct server_slot {
247
247
if (is_processing ()) {
248
248
SLT_INF (*this , " stop processing: n_past = %d, truncated = %d\n " , n_past, truncated);
249
249
250
+ t_last_used = ggml_time_us ();
250
251
t_token_generation = (ggml_time_us () - t_start_generation) / 1e3 ;
251
252
state = SLOT_STATE_IDLE;
252
253
callback_on_release (id);
@@ -730,7 +731,7 @@ struct server_context {
730
731
731
732
// find the slot that has at least n% prompt similarity
732
733
if (ret == nullptr && slot_prompt_similarity != 0 .0f ) {
733
- int max_lcs_len = 0 ;
734
+ int lcs_len = 0 ;
734
735
float similarity = 0 ;
735
736
736
737
for (server_slot & slot : slots) {
@@ -745,20 +746,21 @@ struct server_context {
745
746
}
746
747
747
748
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
748
- int lcs_len = longest_common_subsequence (slot.cache_tokens , task.prompt_tokens );
749
+ int cur_lcs_len = longest_common_subsequence (slot.cache_tokens , task.prompt_tokens );
749
750
750
751
// fraction of the common subsequence length compared to the current slot's prompt length
751
- similarity = static_cast <float >(lcs_len ) / static_cast <int >(slot.cache_tokens .size ());
752
+ float cur_similarity = static_cast <float >(cur_lcs_len ) / static_cast <int >(slot.cache_tokens .size ());
752
753
753
754
// select the current slot if the criteria match
754
- if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
755
- max_lcs_len = lcs_len;
755
+ if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
756
+ lcs_len = cur_lcs_len;
757
+ similarity = cur_similarity;
756
758
ret = &slot;
757
759
}
758
760
}
759
761
760
762
if (ret != nullptr ) {
761
- SLT_DBG (*ret, " selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n " , max_lcs_len , similarity);
763
+ SLT_DBG (*ret, " selected slot by lcs similarity, lcs_len = %d, similarity = %f\n " , lcs_len , similarity);
762
764
}
763
765
}
764
766
0 commit comments