
Commit 7850421

save dev progress
1 parent 4712302 commit 7850421

2 files changed (+65 -49 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
+*.bat
 *.metallib
 .DS_Store
 .build/

examples/server/server.cpp

Lines changed: 64 additions & 49 deletions
@@ -382,7 +382,7 @@ struct llama_server_context
     gpt_params params;
     int n_ctx;
     int n_vocab;
-
+    bool clean_kv_cache = true;
     std::mutex mutex;

     std::unique_lock<std::mutex> lock()
@@ -484,6 +484,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
+            printf("----------------------\nprompt:\n%s-----------------------\n", s.c_str());
             prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
         }

@@ -622,17 +623,22 @@ struct llama_server_context
         // has_next_token = true;
     }

+    void cleanKVCache() {
+        // clear the entire KV cache
+        for (int i = 0; i < params.n_parallel; ++i)
+        {
+            llama_kv_cache_seq_rm(ctx, i, 0, -1);
+        }
+        clean_kv_cache = false;
+    }
+
     void updateSystemPrompt() {
         tokens_system = ::llama_tokenize(ctx, system_prompt, true);
         n_tokens_system = tokens_system.size();

         batch.n_tokens = n_tokens_system;

-        // clear the entire KV cache
-        for (int i = 0; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_rm(ctx, i, 0, -1);
-        }
+        cleanKVCache();

         for (int32_t i = 0; i < batch.n_tokens; ++i)
         {
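
Note on the hunk above: the new cleanKVCache() helper centralizes the per-sequence cache reset that updateSystemPrompt() previously did inline. As an illustrative sketch only (not part of the commit; the function name reset_all_sequences is hypothetical), the same pattern as a standalone helper looks like this, where n_parallel is the number of slot sequences and p1 = -1 means "remove up to the end":

// Sketch only: mirrors the cleanKVCache() loop above as a free function.
// For each sequence id, drop every cached position from 0 to the end.
static void reset_all_sequences(llama_context * ctx, int n_parallel) {
    for (int seq_id = 0; seq_id < n_parallel; ++seq_id) {
        llama_kv_cache_seq_rm(ctx, seq_id, 0, -1);   // p0 = 0, p1 = -1: wipe the whole sequence
    }
}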
@@ -732,6 +738,7 @@ struct llama_server_context
             slot.last_n_tokens.erase(slot.last_n_tokens.begin());
             slot.last_n_tokens.push_back(result.tok);
             const std::string token_str = llama_token_to_piece(ctx, result.tok);
+            printf("%s", token_str.c_str());
             slot.sampled = result.tok;

             size_t stop_pos =
@@ -819,6 +826,9 @@ struct llama_server_context
         int kv_cache_free = (n_ctx - n_tokens_system);

         if(all_slots_are_idle) {
+            if(system_prompt.empty() && clean_kv_cache) {
+                cleanKVCache();
+            }
             // avoid 100% usage of cpu all time
             std::this_thread::sleep_for(std::chrono::milliseconds(5));
         }
@@ -865,6 +875,7 @@ struct llama_server_context
             // need process the prompt
             bool keep_gen = slot.state == SLEEPING; // remember generation
             if ((slot.state == IDLE || keep_gen) && slot.command == LOAD_PROMPT) {
+                LOG_TEE("processing prompt\n");
                 slot.state = PROCESSING;
                 slot.command = NONE;

@@ -881,8 +892,12 @@ struct llama_server_context
                     {"to_eval", tokens_to_str(ctx, slot.context_tokens.cbegin() + slot.n_past, slot.context_tokens.cend())},
                 });

-                std::fill(slot.last_n_tokens.begin(), slot.last_n_tokens.end(), 0);
+                if(system_prompt.empty()) {
+                    LOG_TEE("cleaning kv: %i\n", slot.n_past);
+                    llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+                }

+                std::fill(slot.last_n_tokens.begin(), slot.last_n_tokens.end(), 0);
                 for (size_t i = slot.n_past; i < slot.context_tokens.size(); ++i) {
                     batch.token [batch.n_tokens] = slot.context_tokens[i];
                     batch.pos   [batch.n_tokens] = i + n_tokens_system;
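
Note on the hunk above: when no system prompt is set, the slot's sequence is trimmed from slot.n_past onward, so the cached prompt prefix is reused and the batch only has to evaluate the new suffix. A minimal sketch of that idea (illustrative only; trim_slot_cache is a hypothetical name):

// Sketch only: keep cached positions [0, n_past) for this sequence and drop
// the rest, so only the new part of the prompt needs to be re-evaluated.
static void trim_slot_cache(llama_context * ctx, int seq_id, int n_past) {
    llama_kv_cache_seq_rm(ctx, seq_id, /*p0=*/n_past, /*p1=*/-1);
}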
@@ -912,7 +927,6 @@ struct llama_server_context

         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
             llama_batch batch_view = {
                 n_tokens,
                 batch.token + i,
@@ -1773,55 +1787,56 @@ int main(int argc, char **argv)
                 //     res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
                 //                     "application/json");
             } else {
-                auto chunked_content_provider = [&](size_t /*offset*/, DataSink &sink) {
+                printf("processing -> %s\n", slot->isProcessing() ? "true" : "false");
+                const auto chunked_content_provider = [slot](size_t, DataSink & sink) {
                     size_t sent_count = 0;
                     size_t sent_token_probs_index = 0;
                     while(slot->isProcessing()) {
                         if(slot->hasNewToken()) { // new token notification
-                            const completion_token_output token = slot->next();
-                            std::string token_str = llama_token_to_piece(llama.ctx, token.tok);
-
-                            std::vector<completion_token_output> probs_output = {};
-
-                            const json data = format_partial_response(llama, slot, token_str, probs_output);
-                            const std::string str =
-                                "data: " +
-                                data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                "\n\n";
-
-                            LOG_VERBOSE("data stream", {
-                                { "to_send", str }
-                            });
-                            if(!sink.write(str.c_str(), str.size())) {
-                                slot->release();
-                                return false;
-                            }
+                            // const completion_token_output token = slot->next();
+                            // std::string token_str = llama_token_to_piece(llama.ctx, token.tok);
+
+                            // std::vector<completion_token_output> probs_output = {};
+
+                            // const json data = format_partial_response(llama, slot, token_str, probs_output);
+                            // const std::string str =
+                            //     "data: " +
+                            //     data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            //     "\n\n";
+
+                            // LOG_VERBOSE("data stream", {
+                            //     { "to_send", str }
+                            // });
+                            // if(!sink.write(str.c_str(), str.size())) {
+                            //     slot->release();
+                            //     return false;
+                            // }
                         } else {
                             std::this_thread::sleep_for(std::chrono::milliseconds(5));
                         }
                     }
-                    const json data = format_final_response(
-                        llama, slot,
-                        "",
-                        std::vector<completion_token_output>(
-                            slot->generated_token_probs.begin(),
-                            slot->generated_token_probs.begin() + sent_token_probs_index)
-                    );
-
-                    const std::string str =
-                        "data: " +
-                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                        "\n\n";
-
-                    LOG_VERBOSE("data stream", {
-                        { "to_send", str }
-                    });
-
-                    if (!sink.write(str.data(), str.size())) {
-                        LOG_VERBOSE("stream closed", {});
-                        llama_print_timings(llama.ctx);
-                        return false;
-                    }
+                    // const json data = format_final_response(
+                    //     llama, slot,
+                    //     "",
+                    //     std::vector<completion_token_output>(
+                    //         slot->generated_token_probs.begin(),
+                    //         slot->generated_token_probs.begin() + sent_token_probs_index)
+                    // );
+
+                    // const std::string str =
+                    //     "data: " +
+                    //     data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                    //     "\n\n";
+
+                    // LOG_VERBOSE("data stream", {
+                    //     { "to_send", str }
+                    // });
+
+                    // if (!sink.write(str.data(), str.size())) {
+                    //     LOG_VERBOSE("stream closed", {});
+                    //     llama_print_timings(llama.ctx);
+                    //     return false;
+                    // }
                     sink.done();
                     return true;
                 };
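
Note on the hunk above: the rewritten lambda captures the slot by value, and the event serialization is commented out while the streaming path is reworked, so it currently only polls the slot and closes the sink. For reference, a chunked content provider of this general shape streams server-sent events through the DataSink used here (illustrative sketch only; the empty JSON payload is a placeholder, not the server's actual response format):

// Sketch only: the shape of an SSE-streaming chunked content provider.
// sink.write() pushes one "data: ...\n\n" frame; returning false aborts the
// stream, and sink.done() ends it cleanly once the slot stops processing.
const auto sse_provider = [slot](size_t /*offset*/, DataSink & sink) {
    while (slot->isProcessing()) {
        if (slot->hasNewToken()) {
            const completion_token_output token = slot->next();  // consume the queued token
            (void) token;  // a real provider would serialize this into the payload
            const std::string str = "data: {}\n\n";              // placeholder event payload
            if (!sink.write(str.c_str(), str.size())) {
                slot->release();                                 // client disconnected: stop generating
                return false;
            }
        } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(5));
        }
    }
    sink.done();   // terminate the chunked response
    return true;
};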
