@@ -44,6 +44,7 @@ int32_t get_num_physical_cores();
44
44
45
45
struct gpt_params {
46
46
uint32_t seed = -1 ; // RNG seed
47
+
47
48
int32_t n_threads = get_num_physical_cores();
48
49
int32_t n_threads_batch = -1 ; // number of threads to use for batch processing (-1 = use n_threads)
49
50
int32_t n_predict = -1 ; // new tokens to predict
@@ -54,6 +55,8 @@ struct gpt_params {
54
55
int32_t n_chunks = -1 ; // max number of chunks to process (-1 = unlimited)
55
56
int32_t n_parallel = 1 ; // number of parallel sequences to decode
56
57
int32_t n_sequences = 1 ; // number of sequences to decode
58
+ float p_accept = 0.5f; // speculative decoding accept probability
59
+ float p_split = 0.1f; // speculative decoding split probability
57
60
int32_t n_gpu_layers = -1 ; // number of layers to store in VRAM (-1 - use default)
58
61
int32_t n_gpu_layers_draft = -1 ; // number of layers to store in VRAM for the draft model (-1 - use default)
59
62
int32_t main_gpu = 0 ; // the GPU that is used for scratch and small tensors
@@ -66,7 +69,8 @@ struct gpt_params {
66
69
float yarn_beta_fast = 32.0f; // YaRN low correction dim
67
70
float yarn_beta_slow = 1.0f; // YaRN high correction dim
68
71
int32_t yarn_orig_ctx = 0 ; // YaRN original context length
69
- int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
72
+ int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
73
+ // pinging @cebtenzzre
70
74
71
75
// // sampling parameters
72
76
struct llama_sampling_params sparams;
@@ -90,7 +94,7 @@ struct gpt_params {
90
94
int ppl_output_type = 0 ; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
91
95
// (which is more convenient to use for plotting)
92
96
//
93
- bool hellaswag = false ; // compute HellaSwag score over random tasks from datafile supplied in prompt
97
+ bool hellaswag = false ; // compute HellaSwag score over random tasks from datafile supplied in prompt
94
98
size_t hellaswag_tasks = 400 ; // number of tasks to use when computing the HellaSwag score
95
99
96
100
bool mul_mat_q = true ; // if true, use mul_mat_q kernels instead of cuBLAS
0 commit comments