@@ -382,7 +382,7 @@ struct llama_server_context
     gpt_params params;
     int n_ctx;
     int n_vocab;
-
+    bool clean_kv_cache = true;
     std::mutex mutex;
 
     std::unique_lock<std::mutex> lock()
@@ -484,6 +484,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
+            printf("----------------------\nprompt:\n%s-----------------------\n", s.c_str());
             prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
         }
 
@@ -622,17 +623,22 @@ struct llama_server_context
         // has_next_token = true;
     }
 
+    void cleanKVCache() {
+        // clear the entire KV cache
+        for (int i = 0; i < params.n_parallel; ++i)
+        {
+            llama_kv_cache_seq_rm(ctx, i, 0, -1);
+        }
+        clean_kv_cache = false;
+    }
+
     void updateSystemPrompt() {
         tokens_system = ::llama_tokenize(ctx, system_prompt, true);
         n_tokens_system = tokens_system.size();
 
         batch.n_tokens = n_tokens_system;
 
-        // clear the entire KV cache
-        for (int i = 0; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_rm(ctx, i, 0, -1);
-        }
+        cleanKVCache();
 
         for (int32_t i = 0; i < batch.n_tokens; ++i)
         {
@@ -732,6 +738,7 @@ struct llama_server_context
         slot.last_n_tokens.erase(slot.last_n_tokens.begin());
         slot.last_n_tokens.push_back(result.tok);
         const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        printf("%s", token_str.c_str());
         slot.sampled = result.tok;
 
         size_t stop_pos =
@@ -819,6 +826,9 @@ struct llama_server_context
         int kv_cache_free = (n_ctx - n_tokens_system);
 
         if (all_slots_are_idle) {
+            if (system_prompt.empty() && clean_kv_cache) {
+                cleanKVCache();
+            }
             // avoid 100% usage of cpu all time
             std::this_thread::sleep_for(std::chrono::milliseconds(5));
         }
@@ -865,6 +875,7 @@ struct llama_server_context
             // need process the prompt
             bool keep_gen = slot.state == SLEEPING; // remember generation
             if ((slot.state == IDLE || keep_gen) && slot.command == LOAD_PROMPT) {
+                LOG_TEE("processing prompt\n");
                 slot.state = PROCESSING;
                 slot.command = NONE;
 
@@ -881,8 +892,12 @@ struct llama_server_context
                     {"to_eval", tokens_to_str(ctx, slot.context_tokens.cbegin() + slot.n_past, slot.context_tokens.cend())},
                 });
 
-                std::fill(slot.last_n_tokens.begin(), slot.last_n_tokens.end(), 0);
+                if (system_prompt.empty()) {
+                    LOG_TEE("cleaning kv: %i\n", slot.n_past);
+                    llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+                }
 
+                std::fill(slot.last_n_tokens.begin(), slot.last_n_tokens.end(), 0);
                 for (size_t i = slot.n_past; i < slot.context_tokens.size(); ++i) {
                     batch.token[batch.n_tokens] = slot.context_tokens[i];
                     batch.pos[batch.n_tokens] = i + n_tokens_system;
@@ -912,7 +927,6 @@ struct llama_server_context
 
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
             llama_batch batch_view = {
                 n_tokens,
                 batch.token + i,
@@ -1773,55 +1787,56 @@ int main(int argc, char **argv)
             // res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
             //                 "application/json");
         } else {
-            auto chunked_content_provider = [&](size_t /* offset*/, DataSink &sink) {
+            printf("processing -> %s\n", slot->isProcessing() ? "true" : "false");
+            const auto chunked_content_provider = [slot](size_t, DataSink & sink) {
                 size_t sent_count = 0;
                 size_t sent_token_probs_index = 0;
                 while (slot->isProcessing()) {
                     if (slot->hasNewToken()) { // new token notification
-                        const completion_token_output token = slot->next();
-                        std::string token_str = llama_token_to_piece(llama.ctx, token.tok);
-
-                        std::vector<completion_token_output> probs_output = {};
-
-                        const json data = format_partial_response(llama, slot, token_str, probs_output);
-                        const std::string str =
-                            "data: " +
-                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                            "\n\n";
-
-                        LOG_VERBOSE("data stream", {
-                            { "to_send", str }
-                        });
-                        if (!sink.write(str.c_str(), str.size())) {
-                            slot->release();
-                            return false;
-                        }
+                        // const completion_token_output token = slot->next();
+                        // std::string token_str = llama_token_to_piece(llama.ctx, token.tok);
+
+                        // std::vector<completion_token_output> probs_output = {};
+
+                        // const json data = format_partial_response(llama, slot, token_str, probs_output);
+                        // const std::string str =
+                        //     "data: " +
+                        //     data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        //     "\n\n";
+
+                        // LOG_VERBOSE("data stream", {
+                        //     { "to_send", str }
+                        // });
+                        // if(!sink.write(str.c_str(), str.size())) {
+                        //     slot->release();
+                        //     return false;
+                        // }
                     } else {
                         std::this_thread::sleep_for(std::chrono::milliseconds(5));
                     }
                 }
-                const json data = format_final_response(
-                    llama, slot,
-                    "",
-                    std::vector<completion_token_output>(
-                        slot->generated_token_probs.begin(),
-                        slot->generated_token_probs.begin() + sent_token_probs_index)
-                );
-
-                const std::string str =
-                    "data: " +
-                    data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                    "\n\n";
-
-                LOG_VERBOSE("data stream", {
-                    { "to_send", str }
-                });
-
-                if (!sink.write(str.data(), str.size())) {
-                    LOG_VERBOSE("stream closed", {});
-                    llama_print_timings(llama.ctx);
-                    return false;
-                }
+                // const json data = format_final_response(
+                //     llama, slot,
+                //     "",
+                //     std::vector<completion_token_output>(
+                //         slot->generated_token_probs.begin(),
+                //         slot->generated_token_probs.begin() + sent_token_probs_index)
+                // );
+
+                // const std::string str =
+                //     "data: " +
+                //     data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                //     "\n\n";
+
+                // LOG_VERBOSE("data stream", {
+                //     { "to_send", str }
+                // });
+
+                // if (!sink.write(str.data(), str.size())) {
+                //     LOG_VERBOSE("stream closed", {});
+                //     llama_print_timings(llama.ctx);
+                //     return false;
+                // }
                 sink.done();
                 return true;
             };
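
For reference, a minimal standalone sketch (not part of this commit) of the two KV-cache invalidation patterns the diff relies on. It assumes only llama.h and the llama_kv_cache_seq_rm(ctx, seq_id, p0, p1) call already used above, which removes the cached cells of sequence seq_id whose positions fall in [p0, p1), a negative p1 meaning "to the end of the sequence"; the helper names here are illustrative, not part of the server code.

#include "llama.h"

// Sketch: drop the entire cache for every parallel slot, mirroring the
// cleanKVCache() helper added above; n_parallel stands in for params.n_parallel.
static void clear_all_sequences(llama_context * ctx, int n_parallel) {
    for (int seq_id = 0; seq_id < n_parallel; ++seq_id) {
        llama_kv_cache_seq_rm(ctx, seq_id, 0, -1);
    }
}

// Sketch: keep the first n_past cached tokens of one slot (the prefix shared
// with the new prompt) and drop everything after them, mirroring the
// llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1) call added above.
static void truncate_sequence(llama_context * ctx, int seq_id, int n_past) {
    llama_kv_cache_seq_rm(ctx, seq_id, n_past, -1);
}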