
Commit 9760dc8

deps(llama.cpp): update
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 5e155fb

2 files changed: +106 -48 lines changed

Makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
 
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
 
-CPPLLAMA_VERSION?=f026f8120f97090d34a52b3dc023c82e0ede3f7d
+CPPLLAMA_VERSION?=9350a1cf21b1492c69b20175b73a419b897d6a3a
 
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all

backend/cpp/llama/grpc-server.cpp

Lines changed: 105 additions & 47 deletions
@@ -51,6 +51,7 @@ struct server_params
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
+    std::string chat_template = "chatml";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
@@ -349,6 +350,7 @@ struct llama_server_context
 
     // slots / clients
     std::vector<llama_client_slot> slots;
+    json default_generation_settings_for_props;
 
     llama_server_queue queue_tasks;
     llama_server_response queue_results;
@@ -445,6 +447,9 @@ struct llama_server_context
             slots.push_back(slot);
         }
 
+        default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props["seed"] = -1;
+
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
 
         // empty system prompt
@@ -527,27 +532,29 @@ struct llama_server_context
         slot_params default_params;
         llama_sampling_params default_sparams;
 
-        slot->params.stream = json_value(data, "stream", false);
-        slot->params.cache_prompt = json_value(data, "cache_prompt", false);
-        slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
-        slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
-        slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
-        slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
-        slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
-        slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
-        slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
-        slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
-        slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
-        slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed = json_value(data, "seed", default_params.seed);
-        slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
-        slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->params.stream = json_value(data, "stream", false);
+        slot->params.cache_prompt = json_value(data, "cache_prompt", false);
+        slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+        slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
+        slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
+        slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
+        slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
+        slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+        slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
+        slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+        slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+        slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+        slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+        slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
+        slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
+        slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+        slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+        slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+        slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
+        slot->params.seed = json_value(data, "seed", default_params.seed);
+        slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+        slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
 
         // infill
         if (data.count("input_prefix") != 0)
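
The two new dynatemp fields follow the same pattern as every other sampling parameter above: json_value() returns the request's value when the key is present and falls back to the default otherwise. Below is a minimal standalone sketch of that pattern; the json_value template here is a hypothetical stand-in for the server's helper, and the default values shown are illustrative.

    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::json;

    // Hypothetical stand-in for the server's json_value() helper:
    // return body[key] when present and non-null, otherwise the default.
    template <typename T>
    static T json_value(const json &body, const std::string &key, const T &def)
    {
        return (body.contains(key) && !body.at(key).is_null()) ? body.at(key).get<T>() : def;
    }

    int main()
    {
        // Illustrative request body: only some sampling fields are supplied.
        json data = {{"temperature", 0.25}, {"dynatemp_range", 0.5}};

        float temp              = json_value(data, "temperature", 0.8f);       // 0.25, from the request
        float dynatemp_range    = json_value(data, "dynatemp_range", 0.0f);    // 0.5, from the request
        float dynatemp_exponent = json_value(data, "dynatemp_exponent", 1.0f); // 1.0, the fallback default

        std::cout << temp << " " << dynatemp_range << " " << dynatemp_exponent << std::endl;
        return 0;
    }
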
@@ -626,18 +633,36 @@ struct llama_server_context
             const int n_vocab = llama_n_vocab(model);
             for (const auto &el : *logit_bias)
             {
-                if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+                if (el.is_array() && el.size() == 2)
                 {
-                    llama_token tok = el[0].get<llama_token>();
-                    if (tok >= 0 && tok < n_vocab)
+                    float bias;
+                    if (el[1].is_number())
+                    {
+                        bias = el[1].get<float>();
+                    }
+                    else if (el[1].is_boolean() && !el[1].get<bool>())
+                    {
+                        bias = -INFINITY;
+                    }
+                    else
                     {
-                        if (el[1].is_number())
+                        continue;
+                    }
+
+                    if (el[0].is_number_integer())
+                    {
+                        llama_token tok = el[0].get<llama_token>();
+                        if (tok >= 0 && tok < n_vocab)
                         {
-                            slot->sparams.logit_bias[tok] = el[1].get<float>();
+                            slot->sparams.logit_bias[tok] = bias;
                         }
-                        else if (el[1].is_boolean() && !el[1].get<bool>())
+                    }
+                    else if (el[0].is_string())
+                    {
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        for (auto tok : toks)
                         {
-                            slot->sparams.logit_bias[tok] = -INFINITY;
+                            slot->sparams.logit_bias[tok] = bias;
                         }
                     }
                 }
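
For reference, a minimal sketch of the payload shapes the reworked logit_bias loop accepts: a token id with a numeric bias, a string (tokenized, with the bias applied to every resulting token), and an entry paired with false (the bias becomes -INFINITY). It assumes nlohmann::json, the library behind the json alias in grpc-server.cpp; the token id and the string below are purely illustrative.

    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::json;

    int main()
    {
        json data;

        // [token id, number]: bias that token directly (15043 is an illustrative id).
        data["logit_bias"].push_back(json::array({15043, 2.0}));

        // [string, number]: the server tokenizes the string and applies the bias
        // to every token it produces.
        data["logit_bias"].push_back(json::array({"Hello", -1.5}));

        // [token id, false]: the bias becomes -INFINITY, effectively banning the token.
        data["logit_bias"].push_back(json::array({15043, false}));

        std::cout << data.dump(2) << std::endl;
        return 0;
    }
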
@@ -950,28 +975,44 @@ struct llama_server_context
             {
                 continue;
             }
-            clip_image_f32 * img_res = clip_image_f32_init();
-            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
+            clip_image_f32_batch img_res_v;
+            img_res_v.size = 0;
+            img_res_v.data = nullptr;
+            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
             {
                 LOG_TEE("Error processing the given image");
                 clip_free(clp_ctx);
+                clip_image_f32_batch_free(img_res_v);
+                return false;
+            }
+            if (img_res_v.size == 0)
+            {
+                LOG_TEE("Error processing the given image");
                 return false;
             }
+
+            // note: assumes only one image was returned by clip_image_preprocess
+            clip_image_f32 * img_res = img_res_v.data;
+
             img.image_tokens = clip_n_patches(clp_ctx);
             img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
             if (!img.image_embedding)
             {
                 LOG_TEE("Unable to allocate memory for image embeddings\n");
+                clip_image_f32_batch_free(img_res_v);
                 clip_free(clp_ctx);
                 return false;
             }
             LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
             if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
             {
                 LOG_TEE("Unable to encode image\n");
+                clip_image_f32_batch_free(img_res_v);
                 return false;
             }
-            clip_image_f32_free(img_res);
+
+            clip_image_f32_batch_free(img_res_v);
+
             img.request_encode_image = false;
         }
 
@@ -990,11 +1031,6 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    json get_model_props()
-    {
-        return get_formated_generation(slots[0]);
-    }
-
     json get_formated_generation(llama_client_slot &slot)
     {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
@@ -1005,6 +1041,8 @@ struct llama_server_context
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
+            {"dynatemp_range", slot.sparams.dynatemp_range},
+            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
             {"top_k", slot.sparams.top_k},
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
@@ -1166,13 +1204,30 @@ struct llama_server_context
         task.multitask_id = multitask_id;
 
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
-        {
-            split_multiprompt_task(task_id, task);
-        }
-
         // otherwise, it's a single-prompt task, we actually queue it
-        queue_tasks.post(task);
+        // if there's numbers in the prompt array it will be treated as an array of tokens
+        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
+            bool numbers = false;
+            for (const auto& e : task.data.at("prompt")) {
+                if (e.is_number()) {
+                    numbers = true;
+                    break;
+                }
+            }
+
+            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
+            // it will completely stall the server. I don't know where the bug for this is.
+            //
+            // if there are numbers, it needs to be treated like a single prompt,
+            // queue_tasks handles a mix of strings and numbers just fine.
+            if (numbers) {
+                queue_tasks.post(task);
+            } else {
+                split_multiprompt_task(task_id, task);
+            }
+        } else {
+            queue_tasks.post(task);
+        }
     }
 
     // for multiple images processing
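
The routing rule this hunk introduces can be summarized in a few lines: a prompt array containing any number is passed through as a single (token) prompt, while an all-string array is split into one subtask per prompt. Below is a standalone sketch of that decision, assuming nlohmann::json; the should_split() helper and the payloads are illustrative, not part of the server code.

    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::json;

    // Mirrors the branch above: split only multi-element prompt arrays
    // that contain no numbers (i.e. no raw token ids).
    static bool should_split(const json &data)
    {
        if (data.count("prompt") == 0 || data.at("prompt").size() <= 1)
        {
            return false; // single prompt (or none): queued directly
        }
        for (const auto &e : data.at("prompt"))
        {
            if (e.is_number())
            {
                return false; // token ids present: treated as one prompt
            }
        }
        return true; // multiple string prompts: one subtask per prompt
    }

    int main()
    {
        json strings = {{"prompt", {"first prompt", "second prompt"}}};
        json tokens  = {{"prompt", {15043, 3186, 29991}}};

        std::cout << should_split(strings) << std::endl; // 1 -> split_multiprompt_task()
        std::cout << should_split(tokens)  << std::endl; // 0 -> queue_tasks.post()
        return 0;
    }
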
@@ -1254,7 +1309,10 @@ struct llama_server_context
     void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
     {
         int prompt_count = multiprompt_task.data.at("prompt").size();
-        assert(prompt_count > 1);
+        if (prompt_count <= 1) {
+            send_error(multiprompt_task, "error while handling multiple prompts");
+            return;
+        }
 
         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
@@ -1566,10 +1624,6 @@ struct llama_server_context
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }
 
-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
                    slot.cache_tokens = prompt_tokens;
 
                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
@@ -1583,6 +1637,10 @@ struct llama_server_context
                         }
                     }
 
+                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
                     LOG_VERBOSE("prompt ingested", {
                                     {"n_past", slot.n_past},
                                     {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
