@@ -2083,35 +2083,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2131,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));

     return ctx_arg;
 }
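
Each `set_env` call added above registers an environment variable (e.g. `LLAMA_ARG_DRAFT_MAX`) that can supply the option's value when the corresponding flag is not passed on the command line. A minimal sketch of that fallback pattern, using hypothetical `arg_def`/`apply_env_fallback` names rather than the actual `common_arg` implementation from `common/arg.cpp`:

```cpp
// Sketch of a CLI option with an env-var fallback (hypothetical types;
// the real common_arg implementation lives in common/arg.cpp).
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <string>

struct arg_def {
    std::string flag;                      // e.g. "--draft-max"
    const char * env = nullptr;            // e.g. "LLAMA_ARG_DRAFT_MAX"
    std::function<void(int)> handler;      // stores the parsed value

    // Fluent setter, mirroring the .set_env(...) calls in the diff.
    arg_def & set_env(const char * name) {
        env = name;
        return *this;
    }

    // If the flag was not seen on the command line, read the env var.
    void apply_env_fallback() const {
        if (env == nullptr) {
            return;
        }
        if (const char * val = std::getenv(env)) {
            handler(std::atoi(val));
        }
    }
};

int main() {
    int n_max = 16; // default number of draft tokens

    arg_def draft_max;
    draft_max.flag    = "--draft-max";
    draft_max.handler = [&](int v) { n_max = v; };
    draft_max.set_env("LLAMA_ARG_DRAFT_MAX");

    // e.g. LLAMA_ARG_DRAFT_MAX=8 ./demo  ->  prints "draft max: 8"
    draft_max.apply_env_fallback();
    std::printf("draft max: %d\n", n_max);
    return 0;
}
```

The practical effect of the commit is that server deployments can configure speculative decoding (`--draft-max`, `--draft-min`, `--model-draft`, and the rest of the options touched here) through the environment instead of command-line flags.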