
Commit c277c4d

server : rename params struct to params_base
ggml-ci
1 parent 999ff0a commit c277c4d

File tree

1 file changed, +49 -43 lines changed


examples/server/server.cpp

Lines changed: 49 additions & 43 deletions
@@ -600,7 +600,7 @@ struct server_response {
 };

 struct server_context {
-    common_params params;
+    common_params params_base;

     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
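The member rename separates the server-wide base configuration (params_base) from the per-slot slot.params that individual requests can override, and it frees the name params for function arguments: load_model below previously had to call its argument params_ to avoid clashing with the member. A minimal stand-alone sketch of that naming pattern (the types here are stand-ins, not the real common_params):

    #include <cstdio>
    #include <string>

    // Stand-in for common_params; illustrative only.
    struct common_params_sketch {
        std::string model = "base.gguf";
        int n_parallel    = 4;
    };

    struct server_context_sketch {
        // Server-wide defaults, kept for the lifetime of the server.
        common_params_sketch params_base;

        // Because the member is now params_base, the argument can simply be
        // called params without shadowing it.
        bool load_model(const common_params_sketch & params) {
            params_base = params; // keep a copy as the base configuration
            std::printf("loading model '%s'\n", params_base.model.c_str());
            return true;
        }
    };

    int main() {
        server_context_sketch ctx;
        common_params_sketch cli; // e.g. parsed from the command line
        return ctx.load_model(cli) ? 0 : 1;
    }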
@@ -662,19 +662,19 @@ struct server_context {
         llama_batch_free(batch);
     }

-    bool load_model(const common_params & params_) {
-        SRV_INF("loading model '%s'\n", params_.model.c_str());
+    bool load_model(const common_params & params) {
+        SRV_INF("loading model '%s'\n", params.model.c_str());

-        params = params_;
+        params_base = params;

-        common_init_result llama_init = common_init_from_params(params);
+        common_init_result llama_init = common_init_from_params(params_base);

         model = llama_init.model;
         ctx = llama_init.context;
         loras = llama_init.lora_adapters;

         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
             return false;
         }

@@ -683,34 +683,34 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);

-        if (!params.speculative.model.empty()) {
-            SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str());
+        if (!params_base.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());

-            auto params_dft = params;
+            auto params_dft = params_base;

-            params_dft.model = params.speculative.model;
-            params_dft.n_ctx = params.speculative.n_ctx;
-            params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
+            params_dft.model = params_base.speculative.model;
+            params_dft.n_ctx = params_base.speculative.n_ctx;
+            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;

             common_init_result llama_init_dft = common_init_from_params(params_dft);

             model_dft = llama_init_dft.model;

             if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str());
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                 return false;
             }

             if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str());
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());

                 llama_free (llama_init_dft.context);
                 llama_free_model(llama_init_dft.model);

                 return false;
             }

-            cparams_dft = common_context_params_to_llama(params);
+            cparams_dft = common_context_params_to_llama(params_base);
             cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);

             // the context is not needed - we will create one for each slot
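The draft-model setup copies the base configuration and then overrides only the fields that differ for the draft model. A self-contained sketch of that derive-from-base pattern (field names mirror the diff, the types are stand-ins):

    #include <cstdio>
    #include <string>

    struct speculative_sketch {
        std::string model;        // path to the draft model
        int n_ctx        = 0;
        int n_gpu_layers = -1;
    };

    struct params_sketch {
        std::string model;        // path to the target model
        int n_ctx        = 4096;
        int n_gpu_layers = -1;
        speculative_sketch speculative;
    };

    int main() {
        params_sketch params_base;
        params_base.model             = "target.gguf";
        params_base.speculative.model = "draft.gguf";
        params_base.speculative.n_ctx = 1024;

        // Start from a full copy of the base so the draft model inherits every
        // setting that is not explicitly overridden below.
        params_sketch params_dft = params_base;
        params_dft.model        = params_base.speculative.model;
        params_dft.n_ctx        = params_base.speculative.n_ctx;
        params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;

        std::printf("draft model '%s', n_ctx = %d\n", params_dft.model.c_str(), params_dft.n_ctx);
        return 0;
    }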
@@ -734,19 +734,19 @@ struct server_context {
     }

     void init() {
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;

-        SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
+        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);

-        for (int i = 0; i < params.n_parallel; i++) {
+        for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;

             slot.id = i;
             slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
+            slot.n_predict = params_base.n_predict;

             if (model_dft) {
-                slot.batch_spec = llama_batch_init(params.speculative.n_max + 1, 0, 1);
+                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);

                 slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
                 if (slot.ctx_dft == nullptr) {
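The slot context size is simply the total context divided by the number of parallel slots, both taken from params_base: for example, a server started with an 8192-token context and 4 slots gives each slot n_ctx_slot = 8192 / 4 = 2048.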
@@ -763,7 +763,7 @@ struct server_context {

             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

-            slot.params.sampling = params.sampling;
+            slot.params.sampling = params_base.sampling;

             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -783,7 +783,7 @@ struct server_context {
             const int32_t n_batch = llama_n_batch(ctx);

             // only a single seq_id per token is needed
-            batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
+            batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }

         metrics.init();
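The shared batch is sized for std::max(n_batch, n_parallel) tokens, so there is room for at least one token per slot even if the logical batch size were configured smaller than the slot count; with, say, n_batch = 2048 and 4 slots the max() simply resolves to 2048.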
@@ -864,8 +864,8 @@ struct server_context {
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
         slot_params defaults;
-        defaults.sampling = params.sampling;
-        defaults.speculative = params.speculative;
+        defaults.sampling = params_base.sampling;
+        defaults.speculative = params_base.speculative;

         const auto & data = task.data;

@@ -915,6 +915,8 @@ struct server_context {
         slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
         slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

+        slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
+
         if (slot.params.sampling.dry_base < 1.0f) {
             slot.params.sampling.dry_base = defaults.sampling.dry_base;
         }
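json_value falls back to the server-wide default whenever a request omits a field, and the newly added clamp keeps n_min from exceeding a request-supplied n_max. A stand-alone sketch of that behaviour, using nlohmann::json (the JSON library the server already uses) directly instead of the server's json_value helper; the default values are illustrative:

    #include <algorithm>
    #include <cstdio>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        // Illustrative stand-ins for the server-wide defaults in params_base.speculative.
        const int n_min_default = 5;
        const int n_max_default = 16;

        // Request body that lowers n_max but says nothing about n_min.
        const json data = json::parse(R"({ "speculative.n_max": 2 })");

        int n_max = data.value("speculative.n_max", n_max_default); // 2  (overridden)
        int n_min = data.value("speculative.n_min", n_min_default); // 5  (default)

        // Same clamp as the added line: the minimum draft size can never
        // exceed the requested maximum.
        n_min = std::min(n_max, n_min);                             // 5 -> 2

        std::printf("n_max = %d, n_min = %d\n", n_max, n_min);
        return 0;
    }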
@@ -1066,7 +1068,7 @@ struct server_context {

     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
         slot.sampled = result.tok;

         // search stop word and delete it
@@ -1131,7 +1133,7 @@ struct server_context {
         }

         // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
+        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stopped_limit = true;
             slot.has_next_token = false;

@@ -1232,7 +1234,7 @@ struct server_context {
         return json {
             {"n_ctx", slot.n_ctx},
             {"n_predict", slot.n_predict}, // Server configured n_predict
-            {"model", params.model_alias},
+            {"model", params_base.model_alias},
             {"seed", slot.params.sampling.seed},
             {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
             {"temperature", slot.params.sampling.temp},
@@ -1268,6 +1270,10 @@ struct server_context {
             {"min_keep", slot.params.sampling.min_keep},
             {"grammar", slot.params.sampling.grammar},
             {"samplers", samplers},
+            {"speculative", slot.params.speculative.model.empty() ? false : true},
+            {"speculative.n_max", slot.params.speculative.n_max},
+            {"speculative.n_min", slot.params.speculative.n_min},
+            {"speculative.p_min", slot.params.speculative.p_min},
         };
     }

@@ -1337,7 +1343,7 @@ struct server_context {
             {"content", !slot.params.stream ? slot.generated_text : ""},
             {"id_slot", slot.id},
             {"stop", true},
-            {"model", params.model_alias},
+            {"model", params_base.model_alias},
             {"tokens_predicted", slot.n_decoded},
             {"tokens_evaluated", slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
@@ -1510,10 +1516,10 @@ struct server_context {
                 data.at("input_prefix"),
                 data.at("input_suffix"),
                 data.at("input_extra"),
-                params.n_batch,
-                params.n_predict,
+                params_base.n_batch,
+                params_base.n_predict,
                 slots[0].n_ctx, // TODO: there should be a better way
-                params.spm_infill,
+                params_base.spm_infill,
                 tokenized_prompts[i]
             );
             create_task(data, tokens);
@@ -1886,7 +1892,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
-                if (!params.ctx_shift) {
+                if (!params_base.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
                     slot.release();
@@ -1952,7 +1958,7 @@ struct server_context {
         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;

         // next, batch any pending prompts without exceeding n_batch
-        if (params.cont_batching || batch.n_tokens == 0) {
+        if (params_base.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
                 // this slot still has a prompt to be processed
                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
@@ -2005,7 +2011,7 @@ struct server_context {
                         continue;
                     }
                 } else {
-                    if (!params.ctx_shift) {
+                    if (!params_base.ctx_shift) {
                         // if context shift is disabled, we make sure prompt size is smaller than KV size
                         // TODO: there should be a separate parameter that control prompt truncation
                         // context shift should be applied only during the generation phase
@@ -2051,11 +2057,11 @@ struct server_context {
                     slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);

                     // reuse chunks from the cached prompt by shifting their KV cache in the new position
-                    if (params.n_cache_reuse > 0) {
+                    if (params_base.n_cache_reuse > 0) {
                         size_t head_c = slot.n_past; // cache
                         size_t head_p = slot.n_past; // current prompt

-                        SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+                        SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);

                         while (head_c < slot.cache_tokens.size() &&
                                head_p < prompt_tokens.size()) {
@@ -2068,7 +2074,7 @@ struct server_context {
                                 n_match++;
                             }

-                            if (n_match >= (size_t) params.n_cache_reuse) {
+                            if (n_match >= (size_t) params_base.n_cache_reuse) {
                                 SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
                                 //for (size_t i = head_p; i < head_p + n_match; i++) {
                                 //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@@ -2303,7 +2309,7 @@ struct server_context {
                 // TODO: configurable through requests
                 struct common_speculative_params params_spec;
                 params_spec.n_draft = slot.params.speculative.n_max;
-                params_spec.n_reuse = 256;
+                params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
                 params_spec.p_min = slot.params.speculative.p_min;

                 llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
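With this change the draft model's prompt-reuse window scales with its context size instead of being fixed at 256 tokens: for example, a draft context of 4096 tokens and speculative.n_max = 16 gives n_reuse = 4096 - 16 = 4080.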
@@ -2847,15 +2853,15 @@ int main(int argc, char ** argv) {
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
-            { "total_slots", ctx_server.params.n_parallel },
+            { "total_slots", ctx_server.params_base.n_parallel },
             { "chat_template", llama_get_chat_template(ctx_server.model) },
         };

         res_ok(res, data);
     };

     const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.endpoint_props) {
+        if (!ctx_server.params_base.endpoint_props) {
             res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2868,7 +2874,7 @@ int main(int argc, char ** argv) {
     };

     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2974,7 +2980,7 @@ int main(int argc, char ** argv) {

     // TODO: maybe merge this function with "handle_completions_generic"
     const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -3151,7 +3157,7 @@ int main(int argc, char ** argv) {
     };

     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.reranking || ctx_server.params.embedding) {
+        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
