@@ -19547,43 +19547,43 @@ void llama_sampling_set_logit_bias(struct llama_sampling * smpl, int32_t n_logit
 }
 
 void llama_sampling_softmax(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     llama_sampling_softmax_impl(candidates);
 }
 
 void llama_sampling_top_k(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     llama_sampling_top_k_impl(candidates, smpl->params.top_k, smpl->params.min_keep);
 }
 
 void llama_sampling_top_p(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     llama_sampling_top_p_impl(candidates, smpl->params.top_p, smpl->params.min_keep);
 }
 
 void llama_sampling_min_p(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     llama_sampling_min_p_impl(candidates, smpl->params.min_p, smpl->params.min_keep);
 }
 
 void llama_sampling_tail_free(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     llama_sampling_tail_free_impl(candidates, smpl->params.tfs_z, smpl->params.min_keep);
 }
 
 void llama_sampling_typical(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     llama_sampling_typical_impl(candidates, smpl->params.typical_p, smpl->params.min_keep);
 }
 
 void llama_sampling_temp(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     if (smpl->params.dynatemp_range > 0) {
         const float dynatemp_min = std::max(0.0f, smpl->params.temp - smpl->params.dynatemp_range);
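Every sampler entry point in the hunk above now charges its wall-clock time to the dedicated t_sample_us accumulator through the time_meas RAII helper. For orientation, the helper behaves roughly as sketched below (a minimal reconstruction assuming the usual ggml_time_us()-based pattern; the exact definition lives elsewhere in llama.cpp and may differ in details such as an optional disable flag):

#include "ggml.h" // for ggml_time_us()

// Sketch of the assumed RAII timer: take a timestamp on construction and add the
// elapsed microseconds to the referenced accumulator (e.g. smpl->t_sample_us)
// when the object goes out of scope at the end of the sampler function.
struct time_meas {
    explicit time_meas(int64_t & t_acc) : t_start_us(ggml_time_us()), t_acc(t_acc) {}

    ~time_meas() {
        t_acc += ggml_time_us() - t_start_us;
    }

    const int64_t t_start_us; // timestamp taken at construction
    int64_t     & t_acc;      // accumulator owned by the sampling context
};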
@@ -19596,17 +19596,19 @@ void llama_sampling_temp(struct llama_sampling * smpl, llama_token_data_array *
 }
 
 void llama_sampling_grammar(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us); // TODO: measure grammar time separately from sampling
+    time_meas tm(smpl->t_grammar_us);
 
     if (smpl->grammar) {
         llama_sampling_grammar_impl(candidates, *smpl->grammar);
     }
+
+    smpl->n_grammar++;
 }
 
 void llama_sampling_penalties(
         struct llama_sampling * smpl,
         llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     const size_t penalty_last_n = std::min<size_t>(smpl->params.penalty_last_n, smpl->prev.size());
 
@@ -19633,13 +19635,13 @@ void llama_sampling_cfg(
         struct llama_sampling * smpl,
         float * logits,
         float * logits_guidance) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     llama_sampling_cfg_impl(*smpl, logits, logits_guidance);
 }
 
 llama_token llama_sampling_sample_mirostat(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     const auto type = smpl->params.mirostat;
 
@@ -19669,7 +19671,7 @@ llama_token llama_sampling_sample_mirostat(struct llama_sampling * smpl, llama_t
 }
 
 llama_token llama_sampling_sample_greedy(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     auto res = llama_sampling_sample_greedy_impl(candidates);
 
@@ -19679,7 +19681,7 @@ llama_token llama_sampling_sample_greedy(struct llama_sampling * smpl, llama_tok
 }
 
 llama_token llama_sampling_sample(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    time_meas tm(smpl->t_total_us);
+    time_meas tm(smpl->t_sample_us);
 
     auto res = llama_sampling_sample_impl(candidates, smpl->rng);
 
@@ -19692,9 +19694,11 @@ void llama_sampling_accept(
         struct llama_sampling * smpl,
         llama_token token,
         bool apply_grammar) {
-    time_meas tm(smpl->t_total_us); // TODO: measure grammar time separately from sampling
+    time_meas tm(smpl->t_accept_us);
 
     llama_sampling_accept_impl(*smpl, token, apply_grammar);
+
+    smpl->n_accept++;
 }
 
 llama_token llama_sampling_prev(const struct llama_sampling * smpl, int32_t ith) {
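The counters touched above (t_sample_us, t_grammar_us, t_accept_us, n_grammar, n_accept) are per-sampler bookkeeping fields that this change presumably adds to struct llama_sampling alongside the existing n_sample. A sketch of the assumed members (names taken from the diff; exact types, defaults, and placement within the struct are assumptions):

// Hypothetical excerpt of struct llama_sampling showing only the timing/counter
// members referenced by the functions above; the real struct also holds the
// sampling parameters, grammar state, previous tokens, RNG, etc.
struct llama_sampling {
    // ...

    int64_t t_sample_us  = 0; // time spent in the softmax/top-k/top-p/... samplers
    int64_t t_grammar_us = 0; // time spent applying grammar constraints
    int64_t t_accept_us  = 0; // time spent in llama_sampling_accept

    int32_t n_sample  = 0;    // number of sampled tokens
    int32_t n_grammar = 0;    // number of grammar applications
    int32_t n_accept  = 0;    // number of accepted tokens
};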
@@ -19738,24 +19742,27 @@ void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smp
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
         /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
         /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_total_us : 0.0),
-        /*.t_grammar_ms =*/ 1e-3 * (smpl && smpl->grammar ? smpl->grammar->t_total_us : 0.0),
+        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_sample_us : 0.0),
+        /*.t_grammar_ms =*/ 1e-3 * (smpl ? smpl->t_grammar_us : 0.0),
+        /*.t_accept_ms =*/ 1e-3 * (smpl ? smpl->t_accept_us : 0.0),
         /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
         /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
-        /*.n_sampling =*/ std::max(0, smpl ? smpl->n_sample : 0),
-        /*.n_grammar_sample =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_sample : 0),
-        /*.n_grammar_accept =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_accept : 0),
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval =*/ std::max(1, ctx->n_eval),
+        /*.n_sampling =*/ std::max(0, smpl ? smpl->n_sample : 0),
+        /*.n_grammar =*/ std::max(0, smpl ? smpl->n_grammar : 0),
+        /*.n_accept =*/ std::max(0, smpl ? smpl->n_accept : 0),
+        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
+        /*.n_eval =*/ std::max(1, ctx->n_eval),
     };
 
     LLAMA_LOG_INFO("\n");
     LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
     LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sampling_ms, timings.n_sampling, timings.t_sampling_ms / timings.n_sampling, 1e3 / timings.t_sampling_ms * timings.n_sampling);
     LLAMA_LOG_INFO("%s: grammar time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_grammar_ms, timings.n_grammar_sample, timings.t_grammar_ms / timings.n_grammar_sample, 1e3 / timings.t_grammar_ms * timings.n_grammar_sample);
+            __func__, timings.t_grammar_ms, timings.n_grammar, timings.t_grammar_ms / timings.n_grammar, 1e3 / timings.t_grammar_ms * timings.n_grammar);
+    //LLAMA_LOG_INFO("%s: accept time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+    //        __func__, timings.t_accept_ms, timings.n_accept, timings.t_accept_ms / timings.n_accept, 1e3 / timings.t_accept_ms * timings.n_accept);
     LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
@@ -19769,11 +19776,9 @@ void llama_reset_timings(struct llama_context * ctx, struct llama_sampling * smp
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 
     if (smpl) {
-        smpl->t_total_us = smpl->n_sample = 0;
-
-        if (smpl->grammar) {
-            smpl->grammar->t_total_us = smpl->grammar->n_sample = smpl->grammar->n_accept = 0;
-        }
+        smpl->t_sample_us = smpl->n_sample = 0;
+        smpl->t_grammar_us = smpl->n_grammar = 0;
+        smpl->t_accept_us = smpl->n_accept = 0;
     }
 }
 
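For context, a caller pairs the two entry points above around a generation run: reset the counters first, then print the sampling/grammar/eval breakdown when done. A usage sketch (the decode-and-sample loop body is elided; run_generation is a hypothetical wrapper, not part of the API):

// Hypothetical caller: zero the timing counters, generate, then report.
static void run_generation(struct llama_context * ctx, struct llama_sampling * smpl) {
    llama_reset_timings(ctx, smpl);

    // ... evaluate the prompt and repeatedly call llama_sampling_sample() /
    //     llama_sampling_accept() to produce tokens ...

    llama_print_timings(ctx, smpl); // sampling / grammar / prompt eval / eval times
}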