@@ -316,6 +316,7 @@ struct llama_client_slot
    struct slot_params params;
    struct llama_sampling_params sparams;
    llama_sampling_context ctx_sampling;
+   bool has_next_token = true;

    // grammar props
    grammar_parser::parse_state parsed_grammar;
@@ -710,9 +711,14 @@ struct llama_server_context
            if (pos != std::string::npos &&
                (stop_pos == std::string::npos || pos < stop_pos))
            {
+               if (type == STOP_FULL)
+               {
+                   slot.stopped_word = true;
+                   slot.stopping_word = word;
+                   slot.has_next_token = false;
+               }
                stop_pos = pos;
-               slot.stopped_word = true;
-               slot.stopping_word = word;
+
            }
        }
        return stop_pos;
@@ -727,6 +733,8 @@ struct llama_server_context

        // search stop word and delete it
        slot.generated_text += token_str;
+       slot.has_next_token = true;
+
        size_t pos = std::min(slot.sent_count, slot.generated_text.size());
        const std::string str_test = slot.generated_text.substr(pos);
        bool is_stop_full = false;
@@ -744,15 +752,13 @@ struct llama_server_context
        }

        // check if there is any token to predict
-       bool has_next_token = !is_stop_full && stop_pos > 0;
-       if (stop_pos == std::string::npos) {
+       if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
            // no send the stop word in the response
            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
            slot.sent_count += result.text_to_send.size();
-           has_next_token = true;
+           // add the token to slot queue and cache
+           slot.addTokenString(result);
        }
-       // add the token to slot queue and cache
-       slot.addTokenString(result);
        if (slot.multibyte_pending > 0)
        {
            slot.multibyte_pending -= token_str.size();
@@ -781,37 +787,37 @@ struct llama_server_context
            }
        }

-       if (slot.multibyte_pending > 0 && !has_next_token)
+       if (slot.multibyte_pending > 0 && !slot.has_next_token)
        {
-           has_next_token = true;
+           slot.has_next_token = true;
        }

        // check the limits
        if (
-           slot.n_decoded > 2 && has_next_token && !slot.hasBudget(params))
+           slot.n_decoded > 2 && slot.has_next_token && !slot.hasBudget(params))
        {
            slot.stopped_limit = true;
-           has_next_token = false;
+           slot.has_next_token = false;
        }

        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx)){
            slot.stopped_eos = true;
-           has_next_token = false;
+           slot.has_next_token = false;
            LOG_VERBOSE("eos token found", {});
        }

        LOG_VERBOSE("next token", {
            {"token", result.tok},
            {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
-           {"has_next_token", has_next_token},
+           {"has_next_token", slot.has_next_token},
            {"n_remain", slot.n_remaining},
            {"num_tokens_predicted", slot.num_tokens_predicted},
            {"stopped_eos", slot.stopped_eos},
            {"stopped_word", slot.stopped_word},
            {"stopped_limit", slot.stopped_limit},
            {"stopping_word", slot.stopping_word},
        });
-       return has_next_token; // continue
+       return slot.has_next_token; // continue
    }

#ifdef SERVER_MULTIMODAL_SUPPORT
@@ -2293,7 +2299,6 @@ int main(int argc, char **argv)
            const json body = json::parse(req.body);
            llama_client_slot* slot = llama.getSlot(-1);
            slot->reset();
-           // llama_reset_timings(llama.ctx);
            if (body.count("content") != 0)
            {
                slot->prompt = body["content"];
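For readers skimming the patch, here is a minimal standalone sketch of the pattern it applies (simplified, hypothetical names such as `client_slot`, `process_token`, and `n_predict_max`; not the actual server.cpp API): the continue-generating decision is stored on the per-client slot, so the full stop-word scan, the prediction-budget check, and the caller's loop all read and write the same `has_next_token` flag instead of a function-local variable.

```cpp
// Minimal, self-contained sketch (hypothetical names; not the real server.cpp API).
#include <iostream>
#include <string>
#include <vector>

struct client_slot {
    std::string generated_text;
    std::string stopping_word;
    bool has_next_token = true;   // reset to true for every sampled token
    bool stopped_word   = false;  // set when a full stop string is matched
    bool stopped_limit  = false;  // set when the prediction budget runs out
    int  n_decoded      = 0;
    int  n_predict_max  = 16;     // hypothetical per-slot budget
};

// Full-match stop-string scan: on a hit, record which word stopped generation
// and clear the slot's continue flag, mirroring the STOP_FULL branch in the diff.
static size_t find_stopping_string(const std::vector<std::string> &stop_words, client_slot &slot) {
    size_t stop_pos = std::string::npos;
    for (const std::string &word : stop_words) {
        const size_t pos = slot.generated_text.find(word);
        if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
            slot.stopped_word   = true;
            slot.stopping_word  = word;
            slot.has_next_token = false;
            stop_pos = pos;
        }
    }
    return stop_pos;
}

// Per-token step: append the token, assume generation continues, then let the
// stop-word and budget checks turn the flag off. The caller keeps generating
// while this returns true.
static bool process_token(const std::string &token_str, const std::vector<std::string> &stop_words, client_slot &slot) {
    slot.generated_text += token_str;
    slot.has_next_token = true;
    slot.n_decoded++;

    find_stopping_string(stop_words, slot);

    if (slot.has_next_token && slot.n_decoded >= slot.n_predict_max) {
        slot.stopped_limit  = true;
        slot.has_next_token = false;
    }
    return slot.has_next_token;
}

int main() {
    client_slot slot;
    const std::vector<std::string> stop_words = {"</s>"};
    const std::vector<std::string> tokens     = {"Hello", ", ", "world", "</s>"};

    for (const std::string &tok : tokens) {
        if (!process_token(tok, stop_words, slot)) {
            break;
        }
    }
    std::cout << "generated:     " << slot.generated_text << "\n";
    std::cout << "stopped_word:  " << std::boolalpha << slot.stopped_word
              << " (\"" << slot.stopping_word << "\")\n";
    std::cout << "stopped_limit: " << slot.stopped_limit << "\n";
    return 0;
}
```

The practical effect visible in the diff itself is that every stop condition (stop word, prediction budget, EOS) now converges on `slot.has_next_token`, which `processToken` returns to its caller.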