
Commit 29b7baa

Add timings for the prompt evaluation (#478)
1 parent 4a7129a · commit 29b7baa

File tree

1 file changed: +13 −4 lines changed

llama.cpp

Lines changed: 13 additions & 4 deletions
@@ -168,9 +168,11 @@ struct llama_context {
 
     int64_t t_sample_us = 0;
     int64_t t_eval_us   = 0;
+    int64_t t_p_eval_us = 0;
 
     int32_t n_sample = 0; // number of tokens sampled
     int32_t n_eval   = 0; // number of eval calls
+    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
     llama_model model;
     llama_vocab vocab;

@@ -1070,6 +1072,10 @@ static bool llama_eval_internal(
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
+    else if (N > 1) {
+        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
+        lctx.n_p_eval += N;
+    }
 
     return true;
 }

@@ -1811,19 +1817,22 @@ void llama_print_timings(struct llama_context * ctx) {
 
     const int32_t n_sample = std::max(1, ctx->n_sample);
     const int32_t n_eval   = std::max(1, ctx->n_eval);
+    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
-    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
-    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
+    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
 
     ctx->t_sample_us = ctx->n_sample = 0;
     ctx->t_eval_us   = ctx->n_eval   = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
 const char * llama_print_system_info(void) {
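For orientation, a minimal usage sketch (not part of this commit) of which llama_eval calls feed each timer, assuming the llama.h C API of this period (llama_context_default_params, llama_init_from_file, llama_tokenize, llama_eval, llama_print_timings, llama_free); the model path, prompt, and thread count below are placeholders:

    // Sketch only: exercise both timing paths reported by llama_print_timings.
    #include "llama.h"

    #include <string>
    #include <vector>

    int main() {
        llama_context_params params = llama_context_default_params();
        // Placeholder model path.
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == nullptr) {
            return 1;
        }

        // Tokenize a short prompt (leading space follows the usual LLaMA tokenizer convention).
        const std::string prompt = " Hello, world";
        std::vector<llama_token> tokens(prompt.size() + 1);
        const int n_prompt = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(), /*add_bos=*/true);
        if (n_prompt <= 0) {
            return 1;
        }
        tokens.resize(n_prompt);

        // One batched call with N > 1: accumulated into t_p_eval_us / n_p_eval ("prompt eval time").
        llama_eval(ctx, tokens.data(), n_prompt, /*n_past=*/0, /*n_threads=*/4);

        // Single-token calls with N == 1: accumulated into t_eval_us / n_eval ("eval time").
        // A real loop would sample the next token from the logits; a fixed id is reused here
        // only to show which timer each call feeds.
        llama_token id = tokens.back();
        for (int i = 0; i < 8; ++i) {
            llama_eval(ctx, &id, 1, n_prompt + i, /*n_threads=*/4);
        }

        // Prints load / sample / prompt eval / eval / total, including the new "prompt eval time" line.
        llama_print_timings(ctx);
        llama_free(ctx);
        return 0;
    }

Reporting the prompt phase separately is useful because batched prompt ingestion is typically much faster per token than single-token generation, so folding both into the old "eval time" average would blur the per-token generation speed.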
