
Commit 0bb47c8

server : adapt to new args
ggml-ci
1 parent 61de553 commit 0bb47c8

1 file changed (+13, -13)

examples/server/server.cpp

Lines changed: 13 additions & 13 deletions
@@ -681,25 +681,26 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);
 
-        if (!params.model_draft.empty()) {
-            SRV_INF("loading draft model '%s'\n", params.model_draft.c_str());
+        if (!params.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str());
 
             auto params_dft = params;
 
-            params_dft.model        = params.model_draft;
-            params_dft.n_gpu_layers = params.n_gpu_layers_draft;
+            params_dft.model        = params.speculative.model;
+            params_dft.n_ctx        = params.speculative.n_ctx;
+            params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
 
             common_init_result llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model;
 
             if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params.model_draft.c_str());
+                SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str());
                 return false;
             }
 
             if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.model_draft.c_str(), params.model.c_str());
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str());
 
                 llama_free     (llama_init_dft.context);
                 llama_free_model(llama_init_dft.model);
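
The fields read in this hunk outline the shape of the new grouped parameter struct. Below is a minimal sketch inferred from this diff alone, not copied from the actual header (the real definition lives in common/common.h and may carry more fields and defaults):

    #include <string>
    #include <cstdint>

    // Sketch of the speculative-decoding parameter group, reconstructed from
    // the accesses in this commit; field names match the diff, the rest is
    // assumption.
    struct common_params_speculative {
        std::string model;        // draft model path          (was common_params::model_draft)
        int32_t     n_ctx;        // draft-model context size  (newly wired up here)
        int32_t     n_max;        // max tokens drafted/step   (was common_params::n_draft)
        int32_t     n_min;        // min useful draft length   (was common_params::n_draft_min)
        int32_t     n_gpu_layers; // draft GPU offload         (was common_params::n_gpu_layers_draft)
        float       p_min;        // drafter confidence floor  (was a hard-coded 0.9f)
    };
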
@@ -755,7 +756,7 @@ struct server_context {
                     return;
                 }
 
-                slot.batch_spec = llama_batch_init(params.n_draft + 1, 0, 1);
+                slot.batch_spec = llama_batch_init(params.speculative.n_max + 1, 0, 1);
             }
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
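
Why the + 1: one verification batch evaluates up to n_max drafted tokens plus the single token already sampled from the target model. Restated with the arguments of llama_batch_init(n_tokens, embd, n_seq_max) named, as a reading of the change rather than text from the commit:

    // Equivalent sizing with the arguments spelled out: embd == 0 requests a
    // token-id batch, and each slot verifies one sequence at a time.
    const int32_t n_tokens_spec = params.speculative.n_max + 1; // draft + 1 target-sampled token
    slot.batch_spec = llama_batch_init(n_tokens_spec, /*embd*/ 0, /*n_seq_max*/ 1);
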
@@ -2287,13 +2288,14 @@ struct server_context {
 
                 // TODO: configurable through requests
                 struct common_speculative_params params_spec;
-                params_spec.n_draft = params.n_draft;
+                params_spec.n_draft = params.speculative.n_max;
                 params_spec.n_reuse = 256;
-                params_spec.p_min   = 0.9f;
+                params_spec.p_min   = params.speculative.p_min;
 
                 llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
 
-                if (params.n_draft_min > (int) draft.size()) {
+                // ignore small drafts
+                if (params.speculative.n_min > (int) draft.size()) {
                     continue;
                 }
 
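Taken together, the knobs drive one speculative step: the drafter proposes up to n_max tokens, p_min acts as its confidence floor, and drafts shorter than n_min are skipped because verifying them would cost more than it saves. A standalone, simplified sketch of that decision with illustrative values; the real logic sits in common_speculative_gen_draft() and the loop above:

    #include <cstdio>
    #include <vector>

    int main() {
        const int   n_max = 16;   // cap on drafted tokens  (params.speculative.n_max)
        const int   n_min = 5;    // minimum useful draft   (params.speculative.n_min)
        const float p_min = 0.9f; // drafter confidence cut (params.speculative.p_min)

        // pretend per-token confidences reported by a draft model
        const std::vector<float> conf = { 0.99f, 0.97f, 0.95f, 0.70f, 0.98f };

        int n_draft = 0;
        for (int i = 0; i < n_max && i < (int) conf.size(); ++i) {
            if (conf[i] < p_min) {
                break; // stop drafting once the drafter becomes unsure
            }
            n_draft++;
        }

        if (n_min > n_draft) {
            printf("draft too small (%d < %d) -> fall back to normal decoding\n", n_draft, n_min);
        } else {
            printf("verify %d drafted tokens in a single target batch\n", n_draft);
        }

        return 0;
    }
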
@@ -2321,9 +2323,7 @@ struct server_context {
             for (size_t i = 0; i < ids.size(); ++i) {
                 completion_token_output result;
 
-                id = ids[i];
-
-                result.tok = id;
+                result.tok = ids[i];
 
                 if (!process_token(result, slot)) {
                     // release slot because of stop condition
