ggml-org
diff --git a/common/common.cpp
+1-1 b/common/common.cpp
+1-1
diff --git a/common/sampling.cpp
+32-33 b/common/sampling.cpp
+32-33
diff --git a/common/sampling.h
+2-7 b/common/sampling.h
+2-7
diff --git a/examples/batched-bench/batched-bench.cpp
+1-1 b/examples/batched-bench/batched-bench.cpp
+1-1
diff --git a/examples/batched/batched.cpp
+8-6 b/examples/batched/batched.cpp
+8-6
diff --git a/examples/embedding/embedding.cpp
+1-1 b/examples/embedding/embedding.cpp
+1-1
diff --git a/examples/eval-callback/eval-callback.cpp
+1-1 b/examples/eval-callback/eval-callback.cpp
+1-1
diff --git a/examples/gritlm/gritlm.cpp
+15-14 b/examples/gritlm/gritlm.cpp
+15-14
@@ -2125,7 +2125,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_reset_timings(lctx);
+        llama_reset_timings(lctx, nullptr, nullptr);
     }
 
     return std::make_tuple(model, lctx);
 
@@ -2,12 +2,11 @@
 
 #include <random>
 
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_context * ctx, llama_seq_id seq_id) {
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
     result->params  = params;
-    result->seq_id  = seq_id;
-    result->ctx     = ctx;
+    result->smpl    = smpl;
     result->grammar = nullptr;
 
     // if there is a grammar, parse it
@@ -43,7 +42,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->n_valid = 0;
 
-    llama_sampling_set_rng_seed(result, params.seed);
+    llama_sampling_set_rng_seed(result->smpl, params.seed);
 
     return result;
 }
@@ -79,13 +78,6 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
     ctx->n_valid = 0;
 }
 
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = std::random_device{}();
-    }
-    llama_set_rng_seed_seq(ctx->ctx, seed, ctx->seq_id);
-}
-
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
     if (dst->grammar) {
         llama_grammar_free(dst->grammar);
@@ -230,10 +222,13 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
 
 // no reasons to expose this function in header
 static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
+          struct llama_sampling_context * ctx_sampling,
                  llama_token_data_array & cur_p,
                                  size_t   min_keep) {
+    llama_sampling * smpl = ctx_sampling->smpl;
+
+    const llama_sampling_params & params = ctx_sampling->params;
+
     const float         temp              = params.temp;
     const float         dynatemp_range    = params.dynatemp_range;
     const float         dynatemp_exponent = params.dynatemp_exponent;
@@ -246,18 +241,18 @@ static void sampler_queue(
 
     for (auto sampler_type : samplers_sequence) {
         switch (sampler_type) {
-            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case llama_sampler_type::TOP_K    : llama_sampling_top_k    (smpl, &cur_p, top_k,     min_keep); break;
+            case llama_sampler_type::TFS_Z    : llama_sampling_tail_free(smpl, &cur_p, tfs_z,     min_keep); break;
+            case llama_sampler_type::TYPICAL_P: llama_sampling_typical  (smpl, &cur_p, typical_p, min_keep); break;
+            case llama_sampler_type::TOP_P    : llama_sampling_top_p    (smpl, &cur_p, top_p,     min_keep); break;
+            case llama_sampler_type::MIN_P    : llama_sampling_min_p    (smpl, &cur_p, min_p,     min_keep); break;
             case llama_sampler_type::TEMPERATURE:
                 if (dynatemp_range > 0) {
                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
-                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                    llama_sampling_entropy(smpl, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
                 } else {
-                    llama_sample_temp(ctx_main, &cur_p, temp);
+                    llama_sampling_temp(smpl, &cur_p, temp);
                 }
                 break;
             default : break;
@@ -271,6 +266,8 @@ static llama_token llama_sampling_sample_impl(
                   struct llama_context * ctx_cfg,
                   const int idx,
                   bool is_resampling) {
+    llama_sampling * smpl = ctx_sampling->smpl;
+
     const llama_sampling_params & params = ctx_sampling->params;
 
     const float temp         = params.temp;
@@ -287,26 +284,26 @@ static llama_token llama_sampling_sample_impl(
 
     if (temp < 0.0) {
         // greedy sampling, with probs
-        llama_sample_softmax(ctx_main, &cur_p);
+        llama_sampling_softmax(smpl, &cur_p);
         id = cur_p.data[0].id;
     } else if (temp == 0.0) {
         // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx_main, &cur_p);
+        id = llama_sampling_sample_greedy(smpl, &cur_p);
     } else {
         if (mirostat == 1) {
             const int mirostat_m = 100;
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
+            llama_sampling_temp(smpl, &cur_p, temp);
+            id = llama_sampling_sample_mirostat(smpl, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
         } else if (mirostat == 2) {
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
+            llama_sampling_temp(smpl, &cur_p, temp);
+            id = llama_sampling_sample_mirostat_v2(smpl, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
         } else {
             // temperature sampling
             size_t min_keep = std::max(1, params.min_keep);
 
-            sampler_queue(ctx_main, params, cur_p, min_keep);
+            sampler_queue(ctx_sampling, cur_p, min_keep);
 
-            id = llama_sample_token_seq(ctx_main, &cur_p, ctx_sampling->seq_id);
+            id = llama_sampling_sample(smpl, &cur_p);
 
             //{
             //    const int n_top = 10;
@@ -315,11 +312,11 @@ static llama_token llama_sampling_sample_impl(
             //    for (int i = 0; i < n_top; i++) {
             //        const llama_token id = cur_p.data[i].id;
             //        (void)id; // To avoid a warning that id is unused when logging is disabled.
-            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
+            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(smpl, id).c_str(), cur_p.data[i].p);
             //    }
             //}
 
-            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(smpl, id).c_str());
         }
     }
 
@@ -360,6 +357,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
                   const int idx,
                   bool apply_grammar,
                   std::vector<float> * original_logits) {
+    llama_sampling * smpl = ctx_sampling->smpl;
+
     const llama_sampling_params & params = ctx_sampling->params;
 
     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@@ -390,7 +389,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
 
     if (ctx_cfg) {
         float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+        llama_sampling_apply_guidance(smpl, logits, logits_guidance, params.cfg_scale);
     }
 
     cur.resize(n_vocab);
@@ -407,7 +406,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (penalty_tokens_used_size) {
         const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
 
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
+        llama_sampling_repetition_penalties(smpl, &cur_p,
                 penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
                 penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
 
@@ -445,7 +444,7 @@ llama_token_data_array llama_sampling_prepare(
                   const int idx,
                   bool apply_grammar,
                   std::vector<float> * original_logits) {
-    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
+    return llama_sampling_prepare_impl(ctx_sampling, ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
 }
 
 void llama_sampling_accept(
 
@@ -70,12 +70,10 @@ struct llama_sampling_context {
     // parameters that will be used for sampling
     llama_sampling_params params;
 
-    llama_seq_id seq_id;
-
     // mirostat sampler state
     float mirostat_mu;
 
-    llama_context * ctx; // TMP
+    llama_sampling * smpl;
     llama_grammar * grammar;
 
     // internal
@@ -91,7 +89,7 @@ struct llama_sampling_context {
 #include "common.h"
 
 // Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_context * ctx, llama_seq_id seq_id);
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl);
 
 void llama_sampling_free(struct llama_sampling_context * ctx);
 
@@ -100,9 +98,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);
 
-// Set the sampler seed
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
-
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
 
 
@@ -200,7 +200,7 @@ int main(int argc, char ** argv) {
         }
     }
 
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);
 
     llama_batch_free(batch);
 
 
@@ -64,6 +64,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_batch = std::max(n_predict, n_parallel);
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_sampling * smpl = llama_get_sampling(ctx);
 
     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -180,13 +181,13 @@ int main(int argc, char ** argv) {
             const float top_p = 0.9f;
             const float temp  = 0.4f;
 
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
+            llama_sampling_top_k(smpl, &candidates_p, top_k, 1);
+            llama_sampling_top_p(smpl, &candidates_p, top_p, 1);
+            llama_sampling_temp (smpl, &candidates_p, temp);
 
-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample(smpl, &candidates_p);
 
-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            //const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);
 
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
@@ -244,12 +245,13 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, smpl, nullptr);
 
     fprintf(stderr, "\n");
 
     llama_batch_free(batch);
 
+    llama_sampling_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
 
 
@@ -258,7 +258,7 @@ int main(int argc, char ** argv) {
     }
 
     // clean up
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
 
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);
 
     llama_free(ctx);
     llama_free_model(model);
 
@@ -9,7 +9,7 @@
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
     std::vector<std::vector<float>> result;
 
-    const llama_model * mdl = llama_get_model(ctx);
+    const llama_model * model = llama_get_model(ctx);
 
     llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
@@ -18,16 +18,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
         const std::string input_string = instruction + sentences[i];
 
-        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
+        std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
 
         const int32_t n_toks = inputs.size();
 
         // GritLM seems to have EOS = ""
         // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(llama_token_eos(mdl));
+        // inputs.push_back(llama_token_eos(model));
 
         // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
+        const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
 
 #ifdef GRIT_DEBUG
         // debug tokens - should be matching as referenced in the GritLM sample
@@ -51,7 +51,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         llama_decode(ctx, batch);
 
         // get embedding dimensions
-        uint64_t n_embd = llama_n_embd(mdl);
+        uint64_t n_embd = llama_n_embd(model);
 
         // allocate embedding output
         std::vector<float> emb_unorm(n_embd, 0.0f);
@@ -95,16 +95,17 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
     std::string result;
 
-    const llama_model * mdl = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(mdl);
+    const llama_model * model = llama_get_model(ctx);
+    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_token eos_token = llama_token_eos(model);
 
     llama_kv_cache_clear(ctx);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
-    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
+    std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
     int32_t i_current_token = 0;
 
     while (true) {
@@ -118,14 +119,14 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
         llama_decode(ctx, bat);
         auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
 
-        auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
+        auto candidates = std::vector<llama_token_data>(llama_n_vocab(model));
         auto n_candidates = (int32_t)candidates.size();
         for (int32_t token = 0; token < n_candidates; token++) {
             candidates[token] = llama_token_data{ token, logits[token], 0.0f };
         }
         auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
 
-        llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
+        llama_token token = llama_sampling_sample_greedy(smpl, &candidates_p);
         if (token == eos_token) {
             break;
         }
@@ -167,10 +168,10 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
 
     // create generation context
-    llama_context * ctx = llama_new_context_with_model(mdl, cparams);
+    llama_context * ctx = llama_new_context_with_model(model, cparams);
 
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic
@@ -191,7 +192,7 @@ int main(int argc, char * argv[]) {
         const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
         const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));
 
-        const int n_embd = llama_n_embd(mdl);
+        const int n_embd = llama_n_embd(model);
 
         const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
         const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@@ -212,7 +213,7 @@ int main(int argc, char * argv[]) {
     }
 
     llama_free(ctx);
-    llama_free_model(mdl);
+    llama_free_model(model);
     llama_backend_free();
 
     return 0;
Original file line number	Diff line number	Diff line change
`@@ -2125,7 +2125,7 @@ std::tuple<struct llama_model , struct llama_context > llama_init_from_gpt_par`
`2125`	`2125`	`llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));`
`2126`	`2126`	`llama_kv_cache_clear(lctx);`
`2127`	`2127`	`llama_synchronize(lctx);`
`2128`		`- llama_reset_timings(lctx);`
	`2128`	`+ llama_reset_timings(lctx, nullptr, nullptr);`
`2129`	`2129`	`}`
`2130`	`2130`
`2131`	`2131`	`return std::make_tuple(model, lctx);`
Original file line number	Diff line number	Diff line change
`@@ -200,7 +200,7 @@ int main(int argc, char ** argv) {`
`200`	`200`	`}`
`201`	`201`	`}`
`202`	`202`
`203`		`- llama_print_timings(ctx);`
	`203`	`+ llama_print_timings(ctx, nullptr, nullptr);`
`204`	`204`
`205`	`205`	`llama_batch_free(batch);`
`206`	`206`
Original file line number	Diff line number	Diff line change
`@@ -258,7 +258,7 @@ int main(int argc, char ** argv) {`
`258`	`258`	`}`
`259`	`259`
`260`	`260`	`// clean up`
`261`		`- llama_print_timings(ctx);`
	`261`	`+ llama_print_timings(ctx, nullptr, nullptr);`
`262`	`262`	`llama_batch_free(batch);`
`263`	`263`	`llama_free(ctx);`
`264`	`264`	`llama_free_model(model);`
Original file line number	Diff line number	Diff line change
`@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {`
`182`	`182`	`return 1;`
`183`	`183`	`}`
`184`	`184`
`185`		`- llama_print_timings(ctx);`
	`185`	`+ llama_print_timings(ctx, nullptr, nullptr);`
`186`	`186`
`187`	`187`	`llama_free(ctx);`
`188`	`188`	`llama_free_model(model);`