examples : simplify sampling using new API

ggerganov · ggerganov · commit b4fbb2aa67bd · 2024-08-29T16:22:24.000+03:00
ggml-ci
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
@@ -2,7 +2,6 @@
 #include "llama.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -66,6 +65,8 @@ int main(int argc, char ** argv) {
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
     auto sparams = llama_sampling_default_params();
+
+    sparams.seed  = params.sparams.seed;
     sparams.top_k = 40;
     sparams.top_p = 0.9f;
     sparams.temp  = 0.4f;
@@ -171,25 +172,17 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, i_batch[i]);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
+            const auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
 
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+            llama_sampling_set_logits(smpl, logits);
 
-            llama_sampling_top_k(smpl, &candidates_p);
-            llama_sampling_top_p(smpl, &candidates_p);
-            llama_sampling_temp (smpl, &candidates_p);
+            llama_sampling_top_k(smpl, nullptr);
+            llama_sampling_top_p(smpl, nullptr);
+            llama_sampling_temp (smpl, nullptr);
 
-            const llama_token new_token_id = llama_sampling_sample_dist(smpl, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample_dist(smpl, nullptr);
 
-            //const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);
+            //const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);
 
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
@@ -118,14 +118,9 @@ static std::string generate(llama_context * ctx, llama_sampling * smpl, const st
         llama_decode(ctx, bat);
         auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
 
-        auto candidates = std::vector<llama_token_data>(llama_n_vocab(model));
-        auto n_candidates = (int32_t)candidates.size();
-        for (int32_t token = 0; token < n_candidates; token++) {
-            candidates[token] = llama_token_data{ token, logits[token], 0.0f };
-        }
-        auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
+        llama_sampling_set_logits(smpl, logits);
 
-        llama_token token = llama_sampling_sample_greedy(smpl, &candidates_p);
+        llama_token token = llama_sampling_sample_greedy(smpl, nullptr);
         if (token == eos_token) {
             break;
         }
diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -396,17 +396,10 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     auto n_vocab = llama_n_vocab(model);
     auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
 
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_sampling_set_logits(sampling, logits);
 
     // sample the most likely token
-    const auto new_token_id = llama_sampling_sample_greedy(sampling, &candidates_p);
+    const auto new_token_id = llama_sampling_sample_greedy(sampling, nullptr);
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -149,17 +149,9 @@ actor LlamaContext {
         let n_vocab = llama_n_vocab(model)
         let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
 
-        var candidates = Array<llama_token_data>()
-        candidates.reserveCapacity(Int(n_vocab))
+        llama_sampling_set_logits(sampling, logits)
 
-        for token_id in 0..<n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-        candidates.withUnsafeMutableBufferPointer() { buffer in
-            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
-
-            new_token_id = llama_sampling_sample_greedy(sampling, &candidates_p)
-        }
+        new_token_id = llama_sampling_sample_greedy(sampling, nil)
 
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
@@ -216,20 +216,12 @@ int main(int argc, char ** argv) {
     while (n_cur <= n_len) {
         // sample the next token
         {
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
 
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+            llama_sampling_set_logits(smpl, logits);
 
             // sample the most likely token
-            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);
 
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
@@ -69,16 +69,11 @@ int main(int argc, char ** argv) {
     printf("\nfirst run: %s", params.prompt.c_str());
 
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(model);
+        const auto * logits = llama_get_logits(ctx);
 
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sampling_sample_dist(smpl, &candidates_p);
+        llama_sampling_set_logits(smpl, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl, nullptr);
         auto next_token_str = llama_token_to_piece(ctx, next_token);
 
         printf("%s", next_token_str.c_str());
@@ -131,15 +126,11 @@ int main(int argc, char ** argv) {
 
     // second run
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx2);
-        auto n_vocab = llama_n_vocab(model);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sampling_sample_dist(smpl2, &candidates_p);
+        const auto * logits = llama_get_logits(ctx2);
+
+        llama_sampling_set_logits(smpl2, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl2, nullptr);
         auto next_token_str = llama_token_to_piece(ctx2, next_token);
 
         printf("%s", next_token_str.c_str());
@@ -224,15 +215,11 @@ int main(int argc, char ** argv) {
 
     // third run with seq 1 instead of 0
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx3);
-        auto n_vocab = llama_n_vocab(model);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sampling_sample_dist(smpl3, &candidates_p);
+        const auto * logits = llama_get_logits(ctx3);
+
+        llama_sampling_set_logits(smpl3, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl3, nullptr);
         auto next_token_str = llama_token_to_piece(ctx3, next_token);
 
         printf("%s", next_token_str.c_str());
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
@@ -112,20 +112,12 @@ int main(int argc, char ** argv) {
     while (n_cur <= n_predict) {
         // sample the next token
         {
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
 
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+            llama_sampling_set_logits(smpl, logits);
 
             // sample the most likely token
-            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);
 
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {