
Commit f866cb9

llama : move sampling rngs from common to llama
ggml-ci
1 parent 938943c commit f866cb9

22 files changed, +342 -344 lines
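This commit moves the sampling RNG out of common and into llama: llama_sampling_context no longer owns a std::mt19937, and the llama_context instead keeps one RNG per sequence. Creating a sampling context now requires the llama_context and a llama_seq_id, and seeding and token drawing route through the new, explicitly temporary (deprecated) llama_set_rng_seed_seq and llama_sample_token_seq APIs. A minimal caller-side sketch, assuming ctx and sparams are an already-initialized llama_context and llama_sampling_params (not shown in this diff):

// Before this commit, the RNG lived inside the sampling context in common/:
//     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
// After this commit, the sampling context is bound to a llama_context and a
// sequence id, and the RNG for that sequence lives inside the llama_context:
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams, ctx, /* seq_id = */ 0);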

common/sampling.cpp

+10 -8

@@ -1,11 +1,13 @@
-#define LLAMA_API_INTERNAL
 #include "sampling.h"
+
 #include <random>
 
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_context * ctx, llama_seq_id seq_id) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
     result->params = params;
+    result->seq_id = seq_id;
+    result->ctx = ctx;
     result->grammar = nullptr;
 
     // if there is a grammar, parse it
@@ -81,7 +83,7 @@ void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t s
     if (seed == LLAMA_DEFAULT_SEED) {
         seed = std::random_device{}();
     }
-    ctx->rng.seed(seed);
+    llama_set_rng_seed_seq(ctx->ctx, seed, ctx->seq_id);
 }
 
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
@@ -271,10 +273,10 @@ static llama_token llama_sampling_sample_impl(
         bool is_resampling) {
     const llama_sampling_params & params = ctx_sampling->params;
 
-    const float   temp            = params.temp;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
+    const float temp         = params.temp;
+    const int   mirostat     = params.mirostat;
+    const float mirostat_tau = params.mirostat_tau;
+    const float mirostat_eta = params.mirostat_eta;
 
     std::vector<float> original_logits;
     auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
@@ -304,7 +306,7 @@ static llama_token llama_sampling_sample_impl(
 
     sampler_queue(ctx_main, params, cur_p, min_keep);
 
-    id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
+    id = llama_sample_token_seq(ctx_main, &cur_p, ctx_sampling->seq_id);
 
     //{
     //    const int n_top = 10;
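
The two changed call sites above are the behavioural core of the change on the common side: seeding and the final token draw now go to the per-sequence RNG owned by the llama_context instead of the context-local std::mt19937. A usage sketch, assuming ctx_sampling was created with llama_sampling_init(sparams, ctx, seq_id) and using the existing llama_sampling_sample helper from common/sampling.h:

// Seeds RNG[seq_id] inside ctx; previously this did ctx_sampling->rng.seed(seed).
llama_sampling_set_rng_seed(ctx_sampling, 1234);

// The usual sampling helper now draws from that per-sequence stream, because
// llama_sampling_sample_impl ends in llama_sample_token_seq(ctx, &cur_p, seq_id).
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, /* ctx_cfg = */ nullptr);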

common/sampling.h

+5 -3

@@ -70,9 +70,12 @@ struct llama_sampling_context {
     // parameters that will be used for sampling
     llama_sampling_params params;
 
+    llama_seq_id seq_id;
+
     // mirostat sampler state
     float mirostat_mu;
 
+    llama_context * ctx; // TMP
     llama_grammar * grammar;
 
     // internal
@@ -81,15 +84,14 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token>      prev;
    std::vector<llama_token_data> cur;
-    size_t n_valid; // Number of correct top tokens with correct probabilities.
 
-    std::mt19937 rng;
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
 };
 
 #include "common.h"
 
 // Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_context * ctx, llama_seq_id seq_id);
 
 void llama_sampling_free(struct llama_sampling_context * ctx);

examples/gbnf-validator/gbnf-validator.cpp

+1 -2

@@ -1,8 +1,7 @@
-#define LLAMA_API_INTERNAL
-
 #include "grammar-parser.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-impl.h"
 #include "unicode.h"
 
 #include <cstdio>

examples/infill/infill.cpp

+1 -1

@@ -346,7 +346,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams, ctx, 0);
 
     while (n_remain != 0 || params.interactive) {
         // predict

examples/llava/llava-cli.cpp

+1 -1

@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     LOG_TEE("\n");
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, ctx_llava->ctx_llama, 0);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);

examples/lookahead/lookahead.cpp

+1 -1

@@ -118,7 +118,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, ctx, 0);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);

examples/lookup/lookup.cpp

+1 -1

@@ -106,7 +106,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, ctx, 0);
 
     std::vector<llama_token> draft;

examples/main/main.cpp

+1 -1

@@ -527,7 +527,7 @@ int main(int argc, char ** argv) {
         antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
     }
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams, ctx, 0);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);

examples/parallel/parallel.cpp

+1 -1

@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
-        client.ctx_sampling = llama_sampling_init(params.sparams);
+        client.ctx_sampling = llama_sampling_init(params.sparams, ctx, i);
     }
 
     std::vector<llama_token> tokens_system;
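
Each parallel client now binds its sampling context to its own sequence id, so concurrent slots draw from independent RNG streams inside the shared llama_context instead of contending on a single generator. A hypothetical extension (not part of this diff; base_seed is an assumed variable) that would make each client's stream reproducible:

// Seed each client's per-sequence RNG from a common base seed.
for (size_t i = 0; i < clients.size(); ++i) {
    llama_set_rng_seed_seq(ctx, base_seed + (uint32_t) i, (llama_seq_id) i);
}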

examples/quantize-stats/quantize-stats.cpp

+1 -1

@@ -1,7 +1,7 @@
-#define LLAMA_API_INTERNAL
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-impl.h"
 
 #include <algorithm>
 #include <cassert>

examples/server/server.cpp

+1 -1

@@ -1090,7 +1090,7 @@ struct server_context {
         if (slot.ctx_sampling != nullptr) {
             llama_sampling_free(slot.ctx_sampling);
         }
-        slot.ctx_sampling = llama_sampling_init(slot.sparams);
+        slot.ctx_sampling = llama_sampling_init(slot.sparams, ctx, slot.id);
         if (slot.ctx_sampling == nullptr) {
             // for now, the only error that may happen here is invalid grammar
             send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);

examples/speculative/speculative.cpp

+2 -2

@@ -175,7 +175,7 @@ int main(int argc, char ** argv) {
     bool has_eos = false;
 
     // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, ctx_tgt, 0);
 
     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);
@@ -186,7 +186,7 @@ int main(int argc, char ** argv) {
     }
 
     for (int s = 0; s < n_seq_dft; ++s) {
-        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
+        drafts[s].ctx_sampling = llama_sampling_init(params.sparams, ctx_dft, s);
     }
 
     llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);

include/llama.h

+12 -57

@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 7
+#define LLAMA_SESSION_VERSION 8
 
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
@@ -1031,6 +1031,9 @@ extern "C" {
     // Sets the current rng seed.
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
+    LLAMA_API DEPRECATED(void llama_set_rng_seed_seq(struct llama_context * ctx, uint32_t seed, llama_seq_id),
+            "temporary API, until llama_sampling_context is implemented, do not use");
+
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_repetition_penalties(
@@ -1137,11 +1140,18 @@
             struct llama_context * ctx,
             llama_token_data_array * candidates);
 
-    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
+    /// @details Randomly selects a token from the candidates based on their probabilities using RNG[0] of ctx.
     LLAMA_API llama_token llama_sample_token(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
 
+    /// @details Same as llama_sample_token, but uses a sequence-specific RNG[seq_id].
+    LLAMA_API DEPRECATED(llama_token llama_sample_token_seq(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            llama_seq_id seq_id),
+        "temporary API, until llama_sampling_context is implemented, do not use");
+
     //
     // Model split
     //
@@ -1175,59 +1185,4 @@
 }
 #endif
 
-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef LLAMA_API_INTERNAL
-
-#include <random>
-#include <string>
-#include <vector>
-
-struct ggml_tensor;
-
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
-
-struct llama_partial_utf8 {
-    uint32_t value;    // bit value so far (unshifted)
-    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar_candidate {
-    size_t             index;
-    const uint32_t   * code_points;
-    llama_partial_utf8 partial_utf8;
-};
-
-using llama_grammar_rule  = std::vector<      llama_grammar_element>;
-using llama_grammar_stack = std::vector<const llama_grammar_element *>;
-
-using llama_grammar_rules      = std::vector<llama_grammar_rule>;
-using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
-using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
-
-const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
-      llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
-
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        const uint32_t chr,
-              llama_grammar_stacks & new_stacks);
-
-std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
-        const llama_grammar_rules      & rules,
-        const llama_grammar_stack      & stack,
-        const llama_grammar_candidates & candidates);
-
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const std::string & src,
-        llama_partial_utf8 partial_start);
-
-// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
-// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
-llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
-
 #endif // LLAMA_H
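
Both new entry points are deliberately wrapped in DEPRECATED: they are stop-gaps until llama_sampling_context itself is implemented inside llama, and they replace the internal llama_sample_token_with_rng workaround removed above (the session file version also bumps from 7 to 8 in this commit). A minimal sketch of direct use, e.g. from a test, assuming ctx is a valid llama_context and cur_p is an already-prepared llama_token_data_array of candidates:

// Seed and sample only sequence 2; other sequences' RNG streams are untouched.
llama_set_rng_seed_seq(ctx, 42, /* seq_id = */ 2);
const llama_token id = llama_sample_token_seq(ctx, &cur_p, 2);
// llama_sample_token(ctx, &cur_p) keeps its old behaviour and uses RNG[0].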

src/llama-grammar.cpp

+20 -31

@@ -445,15 +445,15 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
     delete grammar;
 }
 
-struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar & grammar) {
+    llama_grammar * result = new llama_grammar{ grammar.rules, grammar.stacks, grammar.partial_utf8 };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
         for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
-                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
-                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+            for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
+                    if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
                         result->stacks[is][ie] = &result->rules[ir0][ir1];
                     }
                 }
@@ -464,14 +464,9 @@ struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * gram
     return result;
 }
 
-void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    GGML_ASSERT(grammar);
-    GGML_ASSERT(vocab);
-
-    int64_t t_start_sample_us = ggml_time_us();
-
+void llama_grammar_sample_impl(const struct llama_grammar & grammar, const struct llama_vocab & vocab, llama_token_data_array * candidates) {
     bool allow_eog = false;
-    for (const auto & stack : grammar->stacks) {
+    for (const auto & stack : grammar.stacks) {
         if (stack.empty()) {
             allow_eog = true;
             break;
@@ -486,54 +481,48 @@ void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struc
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string & piece = vocab->cache_token_to_piece.at(id);
+        const std::string & piece = vocab.cache_token_to_piece.at(id);
 
-        if (llama_token_is_eog_impl(*vocab, id)) {
+        if (llama_token_is_eog_impl(vocab, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
 
-    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
     for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
-
-    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    if (llama_token_is_eog_impl(*vocab, token)) {
-        for (const auto & stack : grammar->stacks) {
+void llama_grammar_accept_token_impl(struct llama_grammar & grammar, const struct llama_vocab & vocab, llama_token token) {
+    if (llama_token_is_eog_impl(vocab, token)) {
+        for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
                 return;
             }
         }
         GGML_ASSERT(false);
     }
 
-    const std::string & piece = vocab->cache_token_to_piece.at(token);
+    const std::string & piece = vocab.cache_token_to_piece.at(token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
+    const auto decoded = decode_utf8(piece, grammar.partial_utf8);
     const auto & code_points = decoded.first;
 
     llama_grammar_stacks tmp_new_stacks;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
-        grammar->stacks = tmp_new_stacks;
+        llama_grammar_accept(grammar.rules, grammar.stacks, *it, tmp_new_stacks);
+        grammar.stacks = tmp_new_stacks;
    }
 
-    grammar->partial_utf8 = decoded.second;
-    GGML_ASSERT(!grammar->stacks.empty());
-
-    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    grammar.partial_utf8 = decoded.second;
+    GGML_ASSERT(!grammar.stacks.empty());
 }
