@@ -681,25 +681,26 @@ struct server_context {
681
681
add_bos_token = llama_add_bos_token (model);
682
682
has_eos_token = !llama_add_eos_token (model);
683
683
684
- if (!params.model_draft .empty ()) {
685
- SRV_INF (" loading draft model '%s'\n " , params_. model_draft .c_str ());
684
+ if (!params.speculative . model .empty ()) {
685
+ SRV_INF (" loading draft model '%s'\n " , params. speculative . model .c_str ());
686
686
687
687
auto params_dft = params;
688
688
689
- params_dft.model = params.model_draft ;
690
- params_dft.n_gpu_layers = params.n_gpu_layers_draft ;
689
+ params_dft.model = params.speculative .model ;
690
+ params_dft.n_ctx = params.speculative .n_ctx ;
691
+ params_dft.n_gpu_layers = params.speculative .n_gpu_layers ;
691
692
692
693
common_init_result llama_init_dft = common_init_from_params (params_dft);
693
694
694
695
model_dft = llama_init_dft.model ;
695
696
696
697
if (model_dft == nullptr ) {
697
- SRV_ERR (" failed to load draft model, '%s'\n " , params.model_draft .c_str ());
698
+ SRV_ERR (" failed to load draft model, '%s'\n " , params.speculative . model .c_str ());
698
699
return false ;
699
700
}
700
701
701
702
if (!common_speculative_are_compatible (ctx, llama_init_dft.context )) {
702
- SRV_ERR (" the draft model '%s' is not compatible with the target model '%s'\n " , params.model_draft .c_str (), params.model .c_str ());
703
+ SRV_ERR (" the draft model '%s' is not compatible with the target model '%s'\n " , params.speculative . model .c_str (), params.model .c_str ());
703
704
704
705
llama_free (llama_init_dft.context );
705
706
llama_free_model (llama_init_dft.model );
@@ -755,7 +756,7 @@ struct server_context {
755
756
return ;
756
757
}
757
758
758
- slot.batch_spec = llama_batch_init (params.n_draft + 1 , 0 , 1 );
759
+ slot.batch_spec = llama_batch_init (params.speculative . n_max + 1 , 0 , 1 );
759
760
}
760
761
761
762
SLT_INF (slot, " new slot n_ctx_slot = %d\n " , slot.n_ctx );
@@ -2287,13 +2288,14 @@ struct server_context {
2287
2288
2288
2289
// TODO: configurable through requests
2289
2290
struct common_speculative_params params_spec;
2290
- params_spec.n_draft = params.n_draft ;
2291
+ params_spec.n_draft = params.speculative . n_max ;
2291
2292
params_spec.n_reuse = 256 ;
2292
- params_spec.p_min = 0 . 9f ;
2293
+ params_spec.p_min = params. speculative . p_min ;
2293
2294
2294
2295
llama_tokens draft = common_speculative_gen_draft (slot.spec , params_spec, slot.cache_tokens , id);
2295
2296
2296
- if (params.n_draft_min > (int ) draft.size ()) {
2297
+ // ignore small drafts
2298
+ if (params.speculative .n_min > (int ) draft.size ()) {
2297
2299
continue ;
2298
2300
}
2299
2301
@@ -2321,9 +2323,7 @@ struct server_context {
2321
2323
for (size_t i = 0 ; i < ids.size (); ++i) {
2322
2324
completion_token_output result;
2323
2325
2324
- id = ids[i];
2325
-
2326
- result.tok = id;
2326
+ result.tok = ids[i];
2327
2327
2328
2328
if (!process_token (result, slot)) {
2329
2329
// release slot because of stop condition
0 commit comments