Skip to content

Commit 5eaf996

Browse files
l3utterfly, ggerganov, cebtenzzre
authored
llama : dynamic temperature sampling (#4972)
* implemented dynamic temperature sampling from koboldcpp * removed trailing whitespace * removed unused temp parameter in llama_sample_entropy * exposed exponent_val in dynamic temp sampler * added debug check for printf statements * use nullptr in llama_sample_softmax call during llama_sample_entropy this avoids counting the time taken stats twice Co-authored-by: Georgi Gerganov <[email protected]> * return earlier if there is only 1 candidate (i.e. max_entropy == 0) * reformat 't' case in sampler_queue Co-authored-by: Jared Van Bortel <[email protected]> * check for one or zero candidates case in llama_sample_entropy --------- Co-authored-by: Georgi Gerganov <[email protected]> Co-authored-by: Jared Van Bortel <[email protected]>
1 parent d292f4f commit 5eaf996

File tree

4 files changed

+88
-1
lines changed

4 files changed

+88
-1
lines changed

common/sampling.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ static void sampler_queue(
129129
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
130130

131131
const float temp = params.temp;
132+
const float dynatemp_range = params.dynatemp_range;
133+
const float dynatemp_exponent = params.dynatemp_exponent;
132134
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
133135
const float top_p = params.top_p;
134136
const float min_p = params.min_p;
@@ -143,7 +145,15 @@ static void sampler_queue(
143145
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
144146
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
145147
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
146-
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
148+
case 't':
149+
if (dynatemp_range > 0) {
150+
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
151+
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
152+
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
153+
} else {
154+
llama_sample_temp(ctx_main, &cur_p, temp);
155+
}
156+
break;
147157
default : break;
148158
}
149159
}

common/sampling.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
1818
float tfs_z = 1.00f; // 1.0 = disabled
1919
float typical_p = 1.00f; // 1.0 = disabled
2020
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
21+
float dynatemp_range = 0.00f; // 0.0 = disabled
22+
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
2123
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
2224
float penalty_repeat = 1.10f; // 1.0 = disabled
2325
float penalty_freq = 0.00f; // 0.0 = disabled

llama.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8151,6 +8151,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
81518151
}
81528152
}
81538153

8154+
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
8155+
const int64_t t_start_sample_us = ggml_time_us();
8156+
8157+
// no need to do anything if there is only one (or zero) candidates
8158+
if(candidates_p->size <= 1) {
8159+
return;
8160+
}
8161+
8162+
// Calculate maximum possible entropy
8163+
float max_entropy = -logf(1.0f / candidates_p->size);
8164+
8165+
llama_sample_softmax(nullptr, candidates_p);
8166+
8167+
// Calculate entropy of the softmax probabilities
8168+
float entropy = 0.0f;
8169+
for (size_t i = 0; i < candidates_p->size; ++i) {
8170+
float prob = candidates_p->data[i].p;
8171+
if (prob > 0.0f) { // Ensure no log(0)
8172+
entropy -= prob * logf(prob);
8173+
}
8174+
}
8175+
8176+
// Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
8177+
float normalized_entropy = entropy / max_entropy;
8178+
8179+
// Map the normalized entropy to the desired temperature range using the power function
8180+
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
8181+
8182+
#ifdef DEBUG
8183+
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
8184+
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
8185+
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
8186+
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
8187+
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
8188+
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
8189+
#endif
8190+
8191+
// Apply the dynamically calculated temperature scaling
8192+
for (size_t i = 0; i < candidates_p->size; ++i) {
8193+
candidates_p->data[i].logit /= dyn_temp;
8194+
}
8195+
8196+
// Re-compute softmax probabilities after scaling logits with dynamic temperature
8197+
double max_l_double = candidates_p->data[0].logit;
8198+
double cum_sum_double = 0.0;
8199+
for (size_t i = 0; i < candidates_p->size; ++i) {
8200+
double p = exp(candidates_p->data[i].logit - max_l_double);
8201+
candidates_p->data[i].p = p; // Store the scaled probability
8202+
cum_sum_double += p;
8203+
}
8204+
for (size_t i = 0; i < candidates_p->size; ++i) {
8205+
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
8206+
}
8207+
8208+
#ifdef DEBUG
8209+
// Print the updated top 25 probabilities after temperature scaling
8210+
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
8211+
for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
8212+
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
8213+
}
8214+
#endif
8215+
8216+
if (ctx) {
8217+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
8218+
}
8219+
}
8220+
81548221
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
81558222
const int64_t t_start_sample_us = ggml_time_us();
81568223

llama.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,14 @@ extern "C" {
775775
float p,
776776
size_t min_keep);
777777

778+
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
/// The temperature applied to the candidates is interpolated between min_temp and max_temp
/// according to the normalized entropy of their softmax distribution raised to exponent_val.
/// @param ctx Optional context; when non-null the time spent is added to its sampling timer.
/// @param candidates_p Candidate tokens; their logits and probabilities are updated in place.
/// @param min_temp Temperature used when the normalized entropy is 0.
/// @param max_temp Temperature used when the normalized entropy is 1.
/// @param exponent_val Exponent applied to the normalized entropy before interpolation.
779+
LLAMA_API void llama_sample_entropy(
780+
struct llama_context * ctx,
781+
llama_token_data_array * candidates_p,
782+
float min_temp,
783+
float max_temp,
784+
float exponent_val);
785+
778786
LLAMA_API void llama_sample_temp(
779787
struct llama_context * ctx,
780788
llama_token_data_array * candidates,

0 commit comments

Comments
 (0)