File tree 1 file changed +4
-4
lines changed 1 file changed +4
-4
lines changed Original file line number Diff line number Diff line change @@ -1872,6 +1872,10 @@ struct server_context {
1872
1872
params_dft.n_gpu_layers = params_base.speculative .n_gpu_layers ;
1873
1873
params_dft.n_parallel = 1 ;
1874
1874
1875
+ // force F16 KV cache for the draft model for extra performance
1876
+ params_dft.cache_type_k = GGML_TYPE_F16;
1877
+ params_dft.cache_type_v = GGML_TYPE_F16;
1878
+
1875
1879
llama_init_dft = common_init_from_params (params_dft);
1876
1880
1877
1881
model_dft = llama_init_dft.model .get ();
@@ -1892,10 +1896,6 @@ struct server_context {
1892
1896
cparams_dft = common_context_params_to_llama (params_dft);
1893
1897
cparams_dft.n_batch = n_ctx_dft;
1894
1898
1895
- // force F16 KV cache for the draft model for extra performance
1896
- cparams_dft.type_k = GGML_TYPE_F16;
1897
- cparams_dft.type_v = GGML_TYPE_F16;
1898
-
1899
1899
// the context is not needed - we will create one for each slot
1900
1900
llama_init_dft.context .reset ();
1901
1901
}
You can’t perform that action at this time.
0 commit comments