
Commit 5243e3f

grammar : timing
ggml-ci
1 parent dccb789 commit 5243e3f

5 files changed: +44, -39 lines changed

include/llama.h

Lines changed: 3 additions & 2 deletions
@@ -401,12 +401,13 @@ extern "C" {
         double t_load_ms;
         double t_sampling_ms;
         double t_grammar_ms;
+        double t_accept_ms;
         double t_p_eval_ms;
         double t_eval_ms;

         int32_t n_sampling;
-        int32_t n_grammar_sample;
-        int32_t n_grammar_accept;
+        int32_t n_grammar;
+        int32_t n_accept;
         int32_t n_p_eval;
         int32_t n_eval;
     };
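The new `t_accept_ms` / `n_accept` pair, together with the renamed `n_grammar`, lets API users separate grammar-constraint cost from plain sampling cost. A minimal consumer-side sketch of using these fields (hedged: the enclosing struct is assumed from context, so a small stand-in type is used here instead of the real header):

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in with just the fields used below; in the real header these live in
// the timings struct edited in the hunk above.
struct timings_view {
    double  t_grammar_ms;
    double  t_accept_ms;
    int32_t n_grammar;
    int32_t n_accept;
};

static void report_sampler_overhead(const timings_view & t) {
    if (t.n_grammar > 0) {
        // average time spent applying the grammar constraint per run
        std::printf("grammar: %8.2f ms / %5d runs (%.3f ms per run)\n",
                    t.t_grammar_ms, t.n_grammar, t.t_grammar_ms / t.n_grammar);
    }
    if (t.n_accept > 0) {
        // average time spent accepting a token back into the sampler state
        std::printf("accept:  %8.2f ms / %5d runs (%.3f ms per run)\n",
                    t.t_accept_ms, t.n_accept, t.t_accept_ms / t.n_accept);
    }
}
```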

src/llama-grammar.cpp

Lines changed: 3 additions & 3 deletions
@@ -961,7 +961,7 @@ struct llama_grammar * llama_grammar_init_impl(
     // Important: vec_rules has to be moved here, not copied, because stacks contains
     // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
     // then the pointers would be invalidated when the local vec_rules goes out of scope.
-    return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, 0, 0, 0 };
+    return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
 }

 struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {

@@ -1039,15 +1039,15 @@ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab,
     // Important: vec_rules has to be moved here, not copied, because stacks contains
     // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
     // then the pointers would be invalidated when the local vec_rules goes out of scope.
-    return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, 0, 0, 0 };
+    return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
 }

 void llama_grammar_free_impl(struct llama_grammar * grammar) {
     delete grammar;
 }

 struct llama_grammar * llama_grammar_cp_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, 0, 0, 0 };
+    llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };

     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
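The switch from `{ ..., 0, 0, 0 }` to `{ ..., }` above follows from the header change in the next file: the three counter members are removed from `llama_grammar`, so there is nothing left to zero. As a small illustration with hypothetical types (not project code), trailing members omitted from an aggregate initializer are value-initialized anyway:

```cpp
#include <vector>

// Hypothetical stand-in mirroring the shape of llama_grammar after this commit:
// only the data members remain, no trailing mutable counters.
struct demo_grammar {
    std::vector<int> rules;
    std::vector<int> stacks;
    int              partial_utf8;
};

int main() {
    // Members not listed in the braced initializer are value-initialized,
    // so the old explicit ", 0, 0, 0" tail is simply no longer needed.
    demo_grammar g { {1, 2, 3}, {4, 5}, };
    return g.partial_utf8; // 0
}
```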

src/llama-grammar.h

Lines changed: 0 additions & 5 deletions
@@ -114,11 +114,6 @@ struct llama_grammar {

     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
-
-    mutable int64_t t_total_us;
-
-    mutable int32_t n_sample;
-    mutable int32_t n_accept;
 };

 //

src/llama-sampling.h

Lines changed: 6 additions & 2 deletions
@@ -34,9 +34,13 @@ struct llama_sampling {
     // mirostat sampler state
     float mirostat_mu;

-    mutable int64_t t_total_us = 0;
+    mutable int64_t t_sample_us  = 0;
+    mutable int64_t t_grammar_us = 0;
+    mutable int64_t t_accept_us  = 0;

-    mutable int32_t n_sample = 0;
+    mutable int32_t n_sample  = 0;
+    mutable int32_t n_grammar = 0;
+    mutable int32_t n_accept  = 0;
 };

 //
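The `time_meas tm(...)` statements in the llama.cpp hunks below route elapsed time into these three new accumulators. The helper itself is not touched by this commit; the following is only a sketch of how such a scope timer plausibly works, assuming `ggml_time_us()` as the clock (as used elsewhere in llama.cpp):

```cpp
#include <cstdint>
#include "ggml.h" // for ggml_time_us(), assumed available as in the rest of llama.cpp

// Sketch, not necessarily the exact definition used in this branch:
// an RAII guard that adds the wall-clock time of its enclosing scope
// to the accumulator it was handed at construction.
struct time_meas {
    const int64_t t_start_us; // timestamp captured when the scope is entered
    int64_t     & t_acc;      // counter to charge, e.g. smpl->t_grammar_us

    explicit time_meas(int64_t & acc) : t_start_us(ggml_time_us()), t_acc(acc) {}

    ~time_meas() {
        t_acc += ggml_time_us() - t_start_us;
    }
};
```

With that shape, `time_meas tm(smpl->t_grammar_us);` at the top of `llama_sampling_grammar()` charges the whole function body to the grammar counter, while the ordinary sampler entry points charge `t_sample_us` and `llama_sampling_accept()` charges `t_accept_us`.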

src/llama.cpp

Lines changed: 32 additions & 27 deletions
@@ -19547,43 +19547,43 @@ void llama_sampling_set_logit_bias(struct llama_sampling * smpl, int32_t n_logit
 }

 void llama_sampling_softmax(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     llama_sampling_softmax_impl(candidates);
 }

 void llama_sampling_top_k(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     llama_sampling_top_k_impl(candidates, smpl->params.top_k, smpl->params.min_keep);
 }

 void llama_sampling_top_p(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     llama_sampling_top_p_impl(candidates, smpl->params.top_p, smpl->params.min_keep);
 }

 void llama_sampling_min_p(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     llama_sampling_min_p_impl(candidates, smpl->params.min_p, smpl->params.min_keep);
 }

 void llama_sampling_tail_free(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     llama_sampling_tail_free_impl(candidates, smpl->params.tfs_z, smpl->params.min_keep);
 }

 void llama_sampling_typical(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     llama_sampling_typical_impl(candidates, smpl->params.typical_p, smpl->params.min_keep);
 }

 void llama_sampling_temp(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     if (smpl->params.dynatemp_range > 0) {
         const float dynatemp_min = std::max(0.0f, smpl->params.temp - smpl->params.dynatemp_range);
@@ -19596,17 +19596,19 @@ void llama_sampling_temp(struct llama_sampling * smpl, llama_token_data_array *
 }

 void llama_sampling_grammar(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us); // TODO: measure grammar time separately from sampling
+    time_meas tm(smpl->t_grammar_us);

     if (smpl->grammar) {
         llama_sampling_grammar_impl(candidates, *smpl->grammar);
     }
+
+    smpl->n_grammar++;
 }

 void llama_sampling_penalties(
         struct llama_sampling * smpl,
         llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     const size_t penalty_last_n = std::min<size_t>(smpl->params.penalty_last_n, smpl->prev.size());
@@ -19633,13 +19635,13 @@ void llama_sampling_cfg(
         struct llama_sampling * smpl,
         float * logits,
         float * logits_guidance) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     llama_sampling_cfg_impl(*smpl, logits, logits_guidance);
 }

 llama_token llama_sampling_sample_mirostat(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     const auto type = smpl->params.mirostat;
@@ -19669,7 +19671,7 @@ llama_token llama_sampling_sample_mirostat(struct llama_sampling * smpl, llama_t
 }

 llama_token llama_sampling_sample_greedy(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     auto res = llama_sampling_sample_greedy_impl(candidates);

@@ -19679,7 +19681,7 @@ llama_token llama_sampling_sample_greedy(struct llama_sampling * smpl, llama_tok
 }

 llama_token llama_sampling_sample(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);

     auto res = llama_sampling_sample_impl(candidates, smpl->rng);
@@ -19692,9 +19694,11 @@ void llama_sampling_accept(
         struct llama_sampling * smpl,
         llama_token token,
         bool apply_grammar) {
-    time_meas tm(smpl->t_total_us); // TODO: measure grammar time separately from sampling
+    time_meas tm(smpl->t_accept_us);

     llama_sampling_accept_impl(*smpl, token, apply_grammar);
+
+    smpl->n_accept++;
 }

 llama_token llama_sampling_prev(const struct llama_sampling * smpl, int32_t ith) {
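For context on when the new counters advance: in a typical decode loop the caller applies the grammar, samples, and then accepts the chosen token, so `n_grammar` and `n_accept` grow roughly one per generated token. A hedged caller-side sketch using only entry points visible in this diff (candidate construction and the model step are hypothetical placeholders, not real API):

```cpp
// Sketch of a caller-side generation loop; build_candidates() and
// decode_next() are hypothetical stand-ins for code outside this diff.
for (int i = 0; i < n_predict; ++i) {
    llama_token_data_array candidates = build_candidates();           // hypothetical

    llama_sampling_grammar(smpl, &candidates);                        // accumulates t_grammar_us, bumps n_grammar
    const llama_token tok = llama_sampling_sample(smpl, &candidates); // accumulates t_sample_us

    llama_sampling_accept(smpl, tok, /*apply_grammar=*/ true);        // accumulates t_accept_us, bumps n_accept

    decode_next(tok);                                                 // hypothetical: run the model on the new token
}
```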
@@ -19738,24 +19742,27 @@ void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smp
         /*.t_start_ms    =*/ 1e-3 * ctx->t_start_us,
         /*.t_end_ms      =*/ 1.00 * ggml_time_ms(),
         /*.t_load_ms     =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_total_us : 0.0),
-        /*.t_grammar_ms  =*/ 1e-3 * (smpl && smpl->grammar ? smpl->grammar->t_total_us : 0.0),
+        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_sample_us  : 0.0),
+        /*.t_grammar_ms  =*/ 1e-3 * (smpl ? smpl->t_grammar_us : 0.0),
+        /*.t_accept_ms   =*/ 1e-3 * (smpl ? smpl->t_accept_us  : 0.0),
         /*.t_p_eval_ms   =*/ 1e-3 * ctx->t_p_eval_us,
         /*.t_eval_ms     =*/ 1e-3 * ctx->t_eval_us,

-        /*.n_sampling       =*/ std::max(0, smpl ? smpl->n_sample : 0),
-        /*.n_grammar_sample =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_sample : 0),
-        /*.n_grammar_accept =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_accept : 0),
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+        /*.n_sampling =*/ std::max(0, smpl ? smpl->n_sample  : 0),
+        /*.n_grammar  =*/ std::max(0, smpl ? smpl->n_grammar : 0),
+        /*.n_accept   =*/ std::max(0, smpl ? smpl->n_accept  : 0),
+        /*.n_p_eval   =*/ std::max(0, ctx->n_p_eval),
+        /*.n_eval     =*/ std::max(1, ctx->n_eval),
     };

     LLAMA_LOG_INFO("\n");
     LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
     LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sampling_ms, timings.n_sampling, timings.t_sampling_ms / timings.n_sampling, 1e3 / timings.t_sampling_ms * timings.n_sampling);
     LLAMA_LOG_INFO("%s:     grammar time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_grammar_ms, timings.n_grammar_sample, timings.t_grammar_ms / timings.n_grammar_sample, 1e3 / timings.t_grammar_ms * timings.n_grammar_sample);
+            __func__, timings.t_grammar_ms, timings.n_grammar, timings.t_grammar_ms / timings.n_grammar, 1e3 / timings.t_grammar_ms * timings.n_grammar);
+    //LLAMA_LOG_INFO("%s:      accept time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    //        __func__, timings.t_accept_ms, timings.n_accept, timings.t_accept_ms / timings.n_accept, 1e3 / timings.t_accept_ms * timings.n_accept);
     LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
@@ -19769,11 +19776,9 @@ void llama_reset_timings(struct llama_context * ctx, struct llama_sampling * smp
     ctx->t_p_eval_us = ctx->n_p_eval = 0;

     if (smpl) {
-        smpl->t_total_us = smpl->n_sample = 0;
-
-        if (smpl->grammar) {
-            smpl->grammar->t_total_us = smpl->grammar->n_sample = smpl->grammar->n_accept = 0;
-        }
+        smpl->t_sample_us  = smpl->n_sample  = 0;
+        smpl->t_grammar_us = smpl->n_grammar = 0;
+        smpl->t_accept_us  = smpl->n_accept  = 0;
     }
 }