@@ -335,8 +335,12 @@ struct server_slot {
 };
 
 struct server_metrics {
+    const int64_t t_start = ggml_time_us();
+
     uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total       = 0;
     uint64_t n_tokens_predicted_total        = 0;
+    uint64_t t_tokens_generation_total       = 0;
 
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing       = 0;
@@ -348,12 +352,14 @@ struct server_metrics {
         n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
         n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
         t_prompt_processing             += slot.t_prompt_processing;
+        t_prompt_processing_total       += slot.t_prompt_processing;
     }
 
     void on_prediction(const server_slot &slot) {
-        n_tokens_predicted_total += slot.n_decoded;
-        n_tokens_predicted       += slot.n_decoded;
-        t_tokens_generation      += slot.t_token_generation;
+        n_tokens_predicted_total  += slot.n_decoded;
+        n_tokens_predicted        += slot.n_decoded;
+        t_tokens_generation       += slot.t_token_generation;
+        t_tokens_generation_total += slot.t_token_generation;
     }
 
     void reset_bucket() {
@@ -1502,9 +1508,12 @@ struct server_context {
                     { "idle",                            n_idle_slots },
                     { "processing",                      n_processing_slots },
                     { "deferred",                        queue_tasks.queue_tasks_deferred.size() },
+                    { "t_start",                         metrics.t_start },
 
                     { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total },
+                    { "t_tokens_generation_total",       metrics.t_tokens_generation_total },
                     { "n_tokens_predicted_total",        metrics.n_tokens_predicted_total },
+                    { "t_prompt_processing_total",       metrics.t_prompt_processing_total },
 
                     { "n_prompt_tokens_processed",       metrics.n_prompt_tokens_processed },
                     { "t_prompt_processing",             metrics.t_prompt_processing },
@@ -1517,7 +1526,9 @@ struct server_context {
                     { "slots",                           slots_data },
                 };
 
-                metrics.reset_bucket();
+                if (json_value(task.data, "reset_bucket", false)) {
+                    metrics.reset_bucket();
+                }
                 queue_results.send(res);
             } break;
     }
@@ -2709,6 +2720,7 @@ int main(int argc, char ** argv) {
         task.id_multi  = -1;
         task.id_target = -1;
         task.type = SERVER_TASK_TYPE_METRICS;
+        task.data.push_back({{"reset_bucket", true}});
 
         ctx_server.queue_results.add_waiting_task_id(task.id);
         ctx_server.queue_tasks.post(task);
@@ -2732,36 +2744,44 @@ int main(int argc, char ** argv) {
             {"counter", {{
                     {"name",  "prompt_tokens_total"},
                     {"help",  "Number of prompt tokens processed."},
-                    {"value", data["n_prompt_tokens_processed_total"]}
+                    {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+            }, {
+                    {"name",  "prompt_seconds_total"},
+                    {"help",  "Prompt process time"},
+                    {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
             }, {
                     {"name",  "tokens_predicted_total"},
                     {"help",  "Number of generation tokens processed."},
-                    {"value", data["n_tokens_predicted_total"]}
+                    {"value", (uint64_t) data["n_tokens_predicted_total"]}
+            }, {
+                    {"name",  "tokens_predicted_seconds_total"},
+                    {"help",  "Predict process time"},
+                    {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},
                     {"help",  "Average prompt throughput in tokens/s."},
-                    {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
+                    {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.}
            },{
                     {"name",  "predicted_tokens_seconds"},
                     {"help",  "Average generation throughput in tokens/s."},
-                    {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
+                    {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.}
            },{
                     {"name",  "kv_cache_usage_ratio"},
                     {"help",  "KV-cache usage. 1 means 100 percent usage."},
                     {"value", 1. * kv_cache_used_cells / params.n_ctx}
            },{
                     {"name",  "kv_cache_tokens"},
                     {"help",  "KV-cache tokens."},
-                    {"value", data["kv_cache_tokens_count"]}
+                    {"value", (uint64_t) data["kv_cache_tokens_count"]}
            },{
                     {"name",  "requests_processing"},
                     {"help",  "Number of request processing."},
-                    {"value", data["processing"]}
+                    {"value", (uint64_t) data["processing"]}
            },{
                     {"name",  "requests_deferred"},
                     {"help",  "Number of request deferred."},
-                    {"value", data["deferred"]}
+                    {"value", (uint64_t) data["deferred"]}
            }}}
         };
 
@@ -2775,13 +2795,16 @@ int main(int argc, char ** argv) {
                 const std::string name = metric_def["name"];
                 const std::string help = metric_def["help"];
 
-                auto value = json_value(metric_def, "value", 0);
+                auto value = json_value(metric_def, "value", 0.);
                 prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
                            << "# TYPE llamacpp:" << name << " " << type << "\n"
                            << "llamacpp:" << name << " " << value << "\n";
             }
         }
 
+        const int64_t t_start = data["t_start"];
+        res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
+
         res.set_content(prometheus.str(), "text/plain; version=0.0.4");
         res.status = 200; // HTTP OK
     });
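Side note on the throughput gauges above: the accumulated slot timings appear to be kept in milliseconds (hence the 1.e3 factor), so the "prompt_tokens_seconds" expression converts a time bucket plus a token count into tokens per second, falling back to 0 when the bucket is empty. A minimal standalone sketch of that arithmetic, using hypothetical bucket values rather than real server_metrics state:

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical bucket values; in server.cpp these come from server_metrics.
    uint64_t n_prompt_tokens_processed = 512;  // tokens processed since the last reset_bucket()
    uint64_t t_prompt_processing       = 2000; // accumulated processing time, assumed to be milliseconds

    // Same expression as the "prompt_tokens_seconds" gauge, guarded against an empty bucket.
    const double tokens_per_second = n_prompt_tokens_processed
        ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed
        : 0.;

    printf("average prompt throughput: %.2f tokens/s\n", tokens_per_second); // prints 256.00
    return 0;
}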