@@ -168,9 +168,11 @@ struct llama_context {
 
     int64_t t_sample_us = 0;
     int64_t t_eval_us   = 0;
+    int64_t t_p_eval_us = 0;
 
     int32_t n_sample = 0; // number of tokens sampled
     int32_t n_eval   = 0; // number of eval calls
+    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
     llama_model model;
     llama_vocab vocab;
@@ -1070,6 +1072,10 @@ static bool llama_eval_internal(
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
+    else if (N > 1) {
+        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
+        lctx.n_p_eval += N;
+    }
 
     return true;
 }
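With this branch, a batched prompt eval (N > 1) accrues its wall time into t_p_eval_us and its token count into n_p_eval, while single-token evals keep accruing into t_eval_us / n_eval as before. Note the asymmetry: n_eval counts calls (++), but n_p_eval counts tokens (+= N). A small worked example of the resulting arithmetic, using hypothetical figures (the 1.5 s / 100-token numbers are made up for illustration):

// Hypothetical: a 100-token prompt evaluated in one llama_eval call
// that took 1.5 s of wall time.
int64_t t_p_eval_us = 1500000; // accrued by the (N > 1) branch above
int32_t n_p_eval    = 100;     // accrued as += N, i.e. tokens, not calls

// Mirrors the arithmetic in llama_print_timings below:
float ms_total     = 1e-3f * t_p_eval_us;            // 1500.00 ms
float ms_per_token = 1e-3f * t_p_eval_us / n_p_eval; //   15.00 ms per token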
@@ -1811,19 +1817,22 @@ void llama_print_timings(struct llama_context * ctx) {
 
     const int32_t n_sample = std::max(1, ctx->n_sample);
     const int32_t n_eval   = std::max(1, ctx->n_eval);
+    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
-    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
-    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3f * ctx->t_eval_us,   n_eval,   1e-3f * ctx->t_eval_us   / n_eval);
+    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
 
     ctx->t_sample_us = ctx->n_sample = 0;
     ctx->t_eval_us   = ctx->n_eval   = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
 const char * llama_print_system_info(void) {
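A minimal sketch of how these counters get exercised end to end, assuming the llama.h C API of this era (llama_context_default_params, llama_init_from_file, llama_eval, llama_print_timings, llama_free); the model path and token ids are placeholders:

#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params); // placeholder path
    if (ctx == NULL) {
        return 1;
    }

    // Feeding the whole prompt in one call (N > 1) now accrues into
    // t_p_eval_us / n_p_eval instead of being lumped in with eval time.
    llama_token prompt[4] = { 1, 2, 3, 4 }; // placeholder token ids
    llama_eval(ctx, prompt, 4, 0, 4);

    // A single-token eval (N == 1) accrues into t_eval_us / n_eval as before.
    llama_token tok = 5; // placeholder token id
    llama_eval(ctx, &tok, 1, 4, 4);

    // Prints the new "prompt eval time" line alongside the existing timings.
    llama_print_timings(ctx);

    llama_free(ctx);
    return 0;
}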