@@ -395,7 +395,7 @@ struct llama_client_slot
     }

     bool isProcessing() {
-        return (state == IDLE || state == SLEEPING) && command == LOAD_PROMPT || state == PROCESSING;
+        return ((state == IDLE || state == SLEEPING) && command == LOAD_PROMPT) || state == PROCESSING;
     }

     completion_token_output next() {
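A quick standalone sketch (not part of the patch; the booleans A..D are made-up stand-ins) of what the added parentheses in isProcessing() do: since && already binds tighter than ||, the old and new expressions parse identically, so the change spells out the intended grouping and avoids "suggest parentheses" style compiler warnings rather than altering behaviour.

#include <cassert>

int main() {
    // exhaustively check all 16 combinations of four stand-in conditions
    for (int m = 0; m < 16; ++m) {
        const bool A = m & 1, B = m & 2, C = m & 4, D = m & 8;
        const bool implicit_grouping = (A || B) && C || D;   // old form, relies on && binding tighter than ||
        const bool explicit_grouping = ((A || B) && C) || D; // new form, same parse spelled out
        assert(implicit_grouping == explicit_grouping);
    }
    return 0;
}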
@@ -1041,26 +1041,22 @@ struct llama_server_context
                 // if input prompt is too big, truncate like normal
                 if (slot.num_prompt_tokens >= (size_t)n_ctx)
                 {
-                    const int n_left = (n_ctx - params.n_keep) / 2;
+                    const int n_left = n_ctx - params.n_keep;
                     std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
-                    const int erased_blocks = (slot.num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
-                    new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
-                    std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), slot.last_n_tokens.begin());
-
+                    // Use half the left-over space in the context for the prompt
+                    new_tokens.insert(new_tokens.end(), prompt_tokens.end() - n_left / 2, prompt_tokens.end());
                     LOG_VERBOSE("input truncated", {
                         {"n_ctx", n_ctx},
                         {"n_keep", params.n_keep},
                         {"n_left", n_left},
                         {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                     });
-
                     slot.truncated = true;
                     prompt_tokens = new_tokens;
-                } else {
-                    const size_t ps = slot.num_prompt_tokens;
-                    std::fill(slot.last_n_tokens.begin(), slot.last_n_tokens.end() - ps, 0);
-                    std::copy(prompt_tokens.begin(), prompt_tokens.end(), slot.last_n_tokens.end() - ps);
                 }
+                const size_t ps = slot.num_prompt_tokens;
+                std::fill(slot.last_n_tokens.begin(), slot.last_n_tokens.end() - ps, 0);
+                std::copy(prompt_tokens.begin(), prompt_tokens.end(), slot.last_n_tokens.end() - ps);
             }

             llama_kv_cache_seq_rm(ctx, slot.id, num_tokens_system + slot.n_past, -1);
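For context on the truncation change above, here is a minimal self-contained sketch (plain ints stand in for llama_token; the helper name truncate_prompt is made up for illustration) of the new rule: when the prompt no longer fits in the context, keep the first n_keep tokens plus the last (n_ctx - n_keep) / 2 tokens and drop the middle, replacing the old block-erasure arithmetic. The unconditional refill of last_n_tokens after the if is not modelled here.

#include <cstdio>
#include <vector>

static std::vector<int> truncate_prompt(const std::vector<int> & prompt, int n_ctx, int n_keep) {
    if ((int) prompt.size() < n_ctx) {
        return prompt;                        // fits as-is, nothing to do
    }
    const int n_left = n_ctx - n_keep;        // space left after the kept prefix
    std::vector<int> out(prompt.begin(), prompt.begin() + n_keep);
    // use half the left-over space in the context for the prompt tail
    out.insert(out.end(), prompt.end() - n_left / 2, prompt.end());
    return out;
}

int main() {
    std::vector<int> prompt(600);
    for (int i = 0; i < 600; ++i) prompt[i] = i;
    const auto out = truncate_prompt(prompt, /*n_ctx=*/512, /*n_keep=*/32);
    // 32 prefix tokens + (512 - 32) / 2 = 240 tail tokens -> 272 total
    printf("%zu tokens kept\n", out.size());
    return 0;
}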
@@ -1925,7 +1921,7 @@ static void beam_search_callback(void *callback_data, llama_beams_state beams_st
         llama.slot->generated_token_probs.resize(llama.slot->generated_token_probs.size() + n);
         assert(0u < beams_state.n_beams);
         const llama_token * tokens = beams_state.beam_views[0].tokens;
-        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
+        const auto map = [](llama_token tok) { return completion_token_output{{},tok, ""}; };
         std::transform(tokens, tokens + n, llama.slot->generated_token_probs.end() - n, map);
         printf("%zu", n);
     }
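A small hedged sketch of why the beam-search lambda now passes a third initializer: assuming completion_token_output gained a std::string member (the struct and field names below are hypothetical stand-ins), the trailing member would be value-initialized to an empty string either way, so the explicit "" mainly keeps the aggregate initializer exhaustive and quiets missing-field-initializer warnings.

#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-in for the server's completion_token_output.
struct token_output {
    std::vector<float> probs;
    int                tok;
    std::string        text;
};

int main() {
    token_output a{{}, 42};      // trailing member is value-initialized (empty string)
    token_output b{{}, 42, ""};  // same result, spelled out explicitly
    assert(a.tok == b.tok && a.text == b.text && a.text.empty());
    return 0;
}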