
Commit b7417a3

ggerganov authored and arthw committed
server : fix warmup draft cache type (ggml-org#12446)
ggml-ci
1 parent 568e027 commit b7417a3

1 file changed: +4 additions, -4 deletions

examples/server/server.cpp

Lines changed: 4 additions & 4 deletions
@@ -1872,6 +1872,10 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
+            // force F16 KV cache for the draft model for extra performance
+            params_dft.cache_type_k = GGML_TYPE_F16;
+            params_dft.cache_type_v = GGML_TYPE_F16;
+
             llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model.get();
@@ -1892,10 +1896,6 @@ struct server_context {
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
-            // force F16 KV cache for the draft model for extra performance
-            cparams_dft.type_k = GGML_TYPE_F16;
-            cparams_dft.type_v = GGML_TYPE_F16;
-
             // the context is not needed - we will create one for each slot
             llama_init_dft.context.reset();
         }
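
Why the move matters: common_init_from_params() performs a warmup that allocates the draft model's KV cache from the params it receives, so forcing F16 on the derived cparams_dft afterwards never reached the warmup allocation. Below is a minimal, self-contained C++ sketch of that ordering pitfall; the enum, structs, and init_from_params() helper are hypothetical stand-ins for illustration, not the actual llama.cpp API.

#include <cstdio>

enum cache_type { CACHE_Q8_0, CACHE_F16 };

struct params     { cache_type cache_type_k = CACHE_Q8_0; };  // inherited from the base model's settings
struct ctx_params { cache_type type_k       = CACHE_Q8_0; };  // context params derived from `params`

// stand-in for common_init_from_params(): the warmup KV cache is allocated
// here, using whatever cache type the incoming params carry
static void init_from_params(const params & p) {
    std::printf("warmup KV cache type: %s\n",
                p.cache_type_k == CACHE_F16 ? "F16" : "Q8_0");
}

int main() {
    params p;

    // before the fix: the F16 override was applied to the derived context
    // params only, after the warmup had already run with the inherited type
    init_from_params(p);        // prints Q8_0: warmup used the wrong cache type
    ctx_params cp;
    cp.type_k = CACHE_F16;      // too late to affect the warmup cache

    // after the fix: override the source params first, then initialize
    p.cache_type_k = CACHE_F16;
    init_from_params(p);        // prints F16: warmup uses the intended type

    (void)cp;
    return 0;
}

The per-slot contexts are still created later from cparams_dft, and since cparams_dft is derived from params_dft via common_context_params_to_llama(), moving the override earlier keeps those contexts on F16 as well.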
