
Commit d1031cf

sampling : refactor init to use llama_sampling_params (#3696)

* sampling : refactor init to use llama_sampling_params
* llama : combine repetition, frequency and presence penalties in 1 call
* examples : remove embd-input and gptneox-wip
* sampling : rename penalty params + reduce size of "prev" vector
* sampling : add llama_sampling_print helper
* sampling : hide prev behind API and apply #3661

ggml-ci

1 parent 8cf19d6 commit d1031cf

30 files changed, +364 -4501 lines
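The headline API change: sampling state is now initialized from llama_sampling_params alone instead of the whole gpt_params. A minimal caller sketch against the refactored common/sampling API (field values are illustrative, not defaults; error handling elided):

#include "common.h"
#include "sampling.h"

#include <cstdio>

int main() {
    gpt_params params;

    // penalty fields renamed by this commit (see common/common.cpp below)
    params.sparams.penalty_last_n = 256;  // was repeat_last_n
    params.sparams.penalty_repeat = 1.1f; // was repeat_penalty

    // init now takes the sampling params directly, not the whole gpt_params
    llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    // new helper added by this commit: human-readable settings summary
    printf("%s\n", llama_sampling_print(params.sparams).c_str());

    llama_sampling_free(ctx_sampling);
    return 0;
}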

Makefile

Lines changed: 1 addition & 8 deletions
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \
+	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \
 	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
@@ -608,13 +608,6 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

-$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
-
-
-embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
-
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)


README.md

Lines changed: 0 additions & 1 deletion
@@ -962,7 +962,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /

 - [main](./examples/main/README.md)
 - [server](./examples/server/README.md)
-- [embd-input](./examples/embd-input/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)

common/common.cpp

Lines changed: 35 additions & 34 deletions
@@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;

     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -241,25 +241,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            sparams.repeat_last_n = std::stoi(argv[i]);
+            sparams.penalty_last_n = std::stoi(argv[i]);
+            sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
         } else if (arg == "--repeat-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.repeat_penalty = std::stof(argv[i]);
+            sparams.penalty_repeat = std::stof(argv[i]);
         } else if (arg == "--frequency-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.frequency_penalty = std::stof(argv[i]);
+            sparams.penalty_freq = std::stof(argv[i]);
         } else if (arg == "--presence-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.presence_penalty = std::stof(argv[i]);
+            sparams.penalty_present = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -572,7 +573,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.grammar = argv[i];
+            sparams.grammar = argv[i];
         } else if (arg == "--grammar-file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -587,7 +588,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.grammar)
+                std::back_inserter(sparams.grammar)
             );
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
@@ -640,7 +641,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }

 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sampling_params;
+    const llama_sampling_params & sparams = params.sparams;

     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
@@ -678,10 +679,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
     printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
     printf("  --mirostat N          use Mirostat sampling.\n");
     printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
     printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@@ -878,7 +879,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }

     if (params.ignore_eos) {
-        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }

     {
@@ -1123,28 +1124,28 @@ std::string get_sortable_timestamp() {

 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sampling_params;
+    const llama_sampling_params & sparams = params.sparams;

     fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
     fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
+    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
+    fprintf(stream, "cpu_has_gpublas: %s\n",     ggml_cpu_has_gpublas()     ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");

 #ifdef NDEBUG
     fprintf(stream, "debug: false\n");
@@ -1178,8 +1179,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
-    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
+    dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
@@ -1238,14 +1239,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
     dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
     fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
     dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
     fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);

     fprintf(stream, "reverse_prompt:\n");
     for (std::string ap : params.antiprompt) {
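For downstream code catching up with this change: the CLI flags above are unchanged, only the struct fields were renamed or moved. A hypothetical migration snippet (values match the defaults shown in the usage strings, except penalty_last_n, which is an arbitrary example):

llama_sampling_params sp;
sp.penalty_last_n  = 64;                 // was: repeat_last_n
sp.penalty_repeat  = 1.1f;               // was: repeat_penalty
sp.penalty_freq    = 0.0f;               // was: frequency_penalty
sp.penalty_present = 0.0f;               // was: presence_penalty
sp.grammar         = "root ::= [a-z]+";  // moved here from gpt_params::grammar (example grammar)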

common/common.h

Lines changed: 1 addition & 2 deletions
@@ -56,7 +56,7 @@ struct gpt_params {
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor

     // // sampling parameters
-    struct llama_sampling_params sampling_params;
+    struct llama_sampling_params sparams;

     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@@ -66,7 +66,6 @@ struct gpt_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
-    std::string grammar = ""; // optional BNF-like grammar to constrain sampling
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
     std::string logdir = ""; // directory in which to save YAML log files

common/sampling.cpp

Lines changed: 51 additions & 22 deletions
@@ -1,9 +1,9 @@
 #include "sampling.h"

-struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();

-    result->params = params.sampling_params;
+    result->params = params;
     result->grammar = nullptr;

     // if there is a grammar, parse it
@@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa
             grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
     }

-    result->prev.resize(params.n_ctx);
+    result->prev.resize(params.n_prev);

     return result;
 }
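This resize is the "reduce size of 'prev' vector" item from the commit message: the token history now holds n_prev entries rather than a full context's worth. A hedged sketch of overriding it (128 is an arbitrary example value):

llama_sampling_params sp;
sp.n_prev = 128; // history kept for penalties/inspection; the CLI parser above raises it to at least penalty_last_n
llama_sampling_context * ctx_sampling = llama_sampling_init(sp);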
@@ -66,25 +66,56 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds
     dst->prev = src->prev;
 }

+llama_token llama_sampling_last(llama_sampling_context * ctx) {
+    return ctx->prev.back();
+}
+
+std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
+    const int size = ctx_sampling->prev.size();
+
+    n = std::min(n, size);
+
+    std::string result;
+
+    for (int i = size - n; i < size; i++) {
+        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
+    }
+
+    return result;
+}
+
+std::string llama_sampling_print(const llama_sampling_params & params) {
+    char result[1024];
+
+    snprintf(result, sizeof(result),
+            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
+            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
+            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+            params.mirostat, params.mirostat_eta, params.mirostat_tau);
+
+    return std::string(result);
+}
+
 llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         struct llama_context * ctx_cfg,
         const int idx) {
-    const int n_ctx = llama_n_ctx(ctx_main);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
     const llama_sampling_params & params = ctx_sampling->params;

+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
     const float temp = params.temp;
     const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
     const float top_p = params.top_p;
     const float tfs_z = params.tfs_z;
     const float typical_p = params.typical_p;
-    const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-    const float repeat_penalty = params.repeat_penalty;
-    const float alpha_presence = params.presence_penalty;
-    const float alpha_frequency = params.frequency_penalty;
+    const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
+    const float penalty_repeat = params.penalty_repeat;
+    const float penalty_freq = params.penalty_freq;
+    const float penalty_present = params.penalty_present;
     const int mirostat = params.mirostat;
     const float mirostat_tau = params.mirostat_tau;
     const float mirostat_eta = params.mirostat_eta;
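The helpers added above let callers inspect sampling state without touching prev directly. A small usage sketch, assuming ctx_sampling and a llama_context * ctx are already initialized as in the earlier example:

// summary of the active sampling settings (new llama_sampling_print helper)
printf("%s\n", llama_sampling_print(ctx_sampling->params).c_str());

// last sampled token and a readable rendering of the recent history
const llama_token last = llama_sampling_last(ctx_sampling);
const std::string tail = llama_sampling_prev_str(ctx_sampling, ctx, 32);
printf("last = %d, recent text: '%s'\n", last, tail.c_str());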
@@ -97,7 +128,7 @@ llama_token llama_sampling_sample(

     float * logits = llama_get_logits_ith(ctx_main, idx);

-    // Apply params.logit_bias map
+    // apply params.logit_bias map
     for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
         logits[it->first] += it->second;
     }
@@ -117,14 +148,10 @@ llama_token llama_sampling_sample(
     // apply penalties
     if (!prev.empty()) {
         const float nl_logit = logits[llama_token_nl(ctx_main)];
-        const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx);

-        llama_sample_repetition_penalty(ctx_main, &cur_p,
-                prev.data() + prev.size() - last_n_repeat,
-                last_n_repeat, repeat_penalty);
-        llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p,
-                prev.data() + prev.size() - last_n_repeat,
-                last_n_repeat, alpha_frequency, alpha_presence);
+        llama_sample_repetition_penalties(ctx_main, &cur_p,
+                prev.data() + prev.size() - penalty_last_n,
+                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);

         if (!penalize_nl) {
             for (size_t idx = 0; idx < cur_p.size; idx++) {
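Downstream code that called the two old penalty functions can collapse them into the single new call over the same token window. A hedged sketch (cur_p is an existing llama_token_data_array and prev the recent-token vector, both set up as in the function above):

// before: llama_sample_repetition_penalty + llama_sample_frequency_and_presence_penalties
// after:  one pass over the same window of prev
llama_sample_repetition_penalties(ctx, &cur_p,
        prev.data() + prev.size() - penalty_last_n,
        penalty_last_n, penalty_repeat, penalty_freq, penalty_present);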
@@ -141,7 +168,7 @@ llama_token llama_sampling_sample(
     }

     if (temp <= 0) {
-        // Greedy sampling
+        // greedy sampling
         id = llama_sample_token_greedy(ctx_main, &cur_p);
     } else {
         if (mirostat == 1) {
@@ -152,8 +179,9 @@ llama_token llama_sampling_sample(
             llama_sample_temp(ctx_main, &cur_p, temp);
             id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
         } else {
-            // Temperature sampling
+            // temperature sampling
             size_t min_keep = std::max(1, params.n_probs);
+
             llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
             llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
             llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
@@ -183,11 +211,12 @@ llama_token llama_sampling_sample(
 void llama_sampling_accept(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
-        llama_token id) {
+        llama_token id,
+        bool apply_grammar) {
     ctx_sampling->prev.erase(ctx_sampling->prev.begin());
     ctx_sampling->prev.push_back(id);

-    if (ctx_sampling->grammar != NULL) {
+    if (ctx_sampling->grammar != NULL && apply_grammar) {
         llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
     }
 }
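Putting the pieces together, a hedged sketch of a generation loop against the new sample/accept pair (model, context, and prompt evaluation elided; n_cur and n_len are assumed counters; idx 0 and a null CFG context are passed explicitly):

for (int i = 0; i < n_len; i++) {
    llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr, 0);

    // apply_grammar = true: the token came from our own sampler, so grammar
    // state should advance; pass false for externally forced tokens
    llama_sampling_accept(ctx_sampling, ctx, id, true);

    printf("%s", llama_token_to_piece(ctx, id).c_str());

    if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_cur++, 0))) {
        break; // decode failed
    }
}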
