
Commit 76e8688

server: metrics: add llamacpp:prompt_seconds_total and llamacpp:tokens_predicted_seconds_total, reset bucket only on /metrics. Fix values cast to int. Add Process-Start-Time-Unix header. (#5937)
Closes #5850
1 parent e457fb3 commit 76e8688

File tree

3 files changed (+46, -13 lines)


examples/server/server.cpp

Lines changed: 35 additions & 12 deletions
@@ -335,8 +335,12 @@ struct server_slot {
 };
 
 struct server_metrics {
+    const int64_t t_start = ggml_time_us();
+
     uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total = 0;
     uint64_t n_tokens_predicted_total = 0;
+    uint64_t t_tokens_generation_total = 0;
 
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
@@ -348,12 +352,14 @@ struct server_metrics
         n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
         n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
         t_prompt_processing += slot.t_prompt_processing;
+        t_prompt_processing_total += slot.t_prompt_processing;
     }
 
     void on_prediction(const server_slot &slot) {
-        n_tokens_predicted_total += slot.n_decoded;
-        n_tokens_predicted += slot.n_decoded;
-        t_tokens_generation += slot.t_token_generation;
+        n_tokens_predicted_total += slot.n_decoded;
+        n_tokens_predicted += slot.n_decoded;
+        t_tokens_generation += slot.t_token_generation;
+        t_tokens_generation_total += slot.t_token_generation;
     }
 
     void reset_bucket() {
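The two hunks above split the metrics into lifetime *_total counters and the existing per-scrape fields. Below is a minimal sketch of the resulting layout; the reset_bucket() body is assumed (the diff only shows its opening line) and the struct name is changed to flag that this is an illustration, not a copy of server.cpp.

// Sketch of the counter split introduced by this commit (assumed reset_bucket body).
// The *_total fields are never cleared, matching Prometheus counter semantics; the
// unsuffixed fields form the per-scrape bucket behind the throughput gauges.
#include <cstdint>

struct server_metrics_sketch {
    // cumulative since server start
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t t_prompt_processing_total       = 0; // milliseconds
    uint64_t n_tokens_predicted_total        = 0;
    uint64_t t_tokens_generation_total       = 0; // milliseconds

    // per-scrape bucket
    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing       = 0; // milliseconds
    uint64_t n_tokens_predicted        = 0;
    uint64_t t_tokens_generation       = 0; // milliseconds

    void reset_bucket() {
        // clear only the bucket; the totals keep growing
        n_prompt_tokens_processed = 0;
        t_prompt_processing       = 0;
        n_tokens_predicted        = 0;
        t_tokens_generation       = 0;
    }
};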
@@ -1502,9 +1508,12 @@ struct server_context {
                 { "idle", n_idle_slots },
                 { "processing", n_processing_slots },
                 { "deferred", queue_tasks.queue_tasks_deferred.size() },
+                { "t_start", metrics.t_start},
 
                 { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
+                { "t_tokens_generation_total", metrics.t_tokens_generation_total},
                 { "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
+                { "t_prompt_processing_total", metrics.t_prompt_processing_total},
 
                 { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
                 { "t_prompt_processing", metrics.t_prompt_processing},
@@ -1517,7 +1526,9 @@ struct server_context {
                 { "slots", slots_data },
             };
 
-            metrics.reset_bucket();
+            if (json_value(task.data, "reset_bucket", false)) {
+                metrics.reset_bucket();
+            }
             queue_results.send(res);
         } break;
     }
@@ -2709,6 +2720,7 @@ int main(int argc, char ** argv) {
         task.id_multi = -1;
         task.id_target = -1;
         task.type = SERVER_TASK_TYPE_METRICS;
+        task.data.push_back({{"reset_bucket", true}});
 
         ctx_server.queue_results.add_waiting_task_id(task.id);
         ctx_server.queue_tasks.post(task);
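Taken together with the previous hunk, only the /metrics route posts a metrics task carrying reset_bucket = true, so other consumers of SERVER_TASK_TYPE_METRICS can read the bucket without clearing it. A self-contained toy illustrating that opt-in lookup; json_value here is a local stand-in modeled on the server's helper of the same name, not the real implementation:

// Hypothetical illustration of the opt-in reset flag, not part of the commit.
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

// stand-in for the server's json_value(): typed lookup with a default
template <typename T>
static T json_value(const json & body, const std::string & key, const T & def) {
    return body.contains(key) && !body.at(key).is_null() ? body.at(key).get<T>() : def;
}

int main() {
    json from_metrics_route = {{"reset_bucket", true}}; // what the /metrics handler sends
    json from_other_caller  = json::object();           // no flag set

    // only the first case would trigger metrics.reset_bucket()
    std::cout << json_value(from_metrics_route, "reset_bucket", false) << "\n"; // prints 1
    std::cout << json_value(from_other_caller,  "reset_bucket", false) << "\n"; // prints 0
}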
@@ -2732,36 +2744,44 @@ int main(int argc, char ** argv) {
             {"counter", {{
                     {"name", "prompt_tokens_total"},
                     {"help", "Number of prompt tokens processed."},
-                    {"value", data["n_prompt_tokens_processed_total"]}
+                    {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+            }, {
+                    {"name", "prompt_seconds_total"},
+                    {"help", "Prompt process time"},
+                    {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
             }, {
                     {"name", "tokens_predicted_total"},
                     {"help", "Number of generation tokens processed."},
-                    {"value", data["n_tokens_predicted_total"]}
+                    {"value", (uint64_t) data["n_tokens_predicted_total"]}
+            }, {
+                    {"name", "tokens_predicted_seconds_total"},
+                    {"help", "Predict process time"},
+                    {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
             }}},
             {"gauge", {{
                     {"name", "prompt_tokens_seconds"},
                     {"help", "Average prompt throughput in tokens/s."},
-                    {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
+                    {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.}
             },{
                     {"name", "predicted_tokens_seconds"},
                     {"help", "Average generation throughput in tokens/s."},
-                    {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
+                    {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.}
             },{
                     {"name", "kv_cache_usage_ratio"},
                     {"help", "KV-cache usage. 1 means 100 percent usage."},
                     {"value", 1. * kv_cache_used_cells / params.n_ctx}
             },{
                     {"name", "kv_cache_tokens"},
                     {"help", "KV-cache tokens."},
-                    {"value", data["kv_cache_tokens_count"]}
+                    {"value", (uint64_t) data["kv_cache_tokens_count"]}
             },{
                     {"name", "requests_processing"},
                     {"help", "Number of request processing."},
-                    {"value", data["processing"]}
+                    {"value", (uint64_t) data["processing"]}
             },{
                     {"name", "requests_deferred"},
                     {"help", "Number of request deferred."},
-                    {"value", data["deferred"]}
+                    {"value", (uint64_t) data["deferred"]}
             }}}
         };
 
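Two details in the hunk above are worth spelling out: the json values are cast to uint64_t so the counters are emitted as plain integers (the "values cast to int" fix from the commit title), and the millisecond totals are divided by 1.e3 so the new *_seconds_total counters are reported in seconds, as their names promise. A hypothetical standalone check of that arithmetic, using made-up numbers:

// Hypothetical check (not part of the commit) of the cast and the ms -> s conversion.
#include <nlohmann/json.hpp>
#include <cstdint>
#include <iostream>

int main() {
    nlohmann::json data = {
        {"n_prompt_tokens_processed_total", 1024},  // tokens
        {"t_prompt_processing_total",       12345}  // milliseconds
    };

    uint64_t prompt_tokens  = (uint64_t) data["n_prompt_tokens_processed_total"];
    double   prompt_seconds = (uint64_t) data["t_prompt_processing_total"] / 1.e3;

    std::cout << prompt_tokens  << "\n"; // 1024
    std::cout << prompt_seconds << "\n"; // 12.345
}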

@@ -2775,13 +2795,16 @@ int main(int argc, char ** argv) {
                 const std::string name = metric_def["name"];
                 const std::string help = metric_def["help"];
 
-                auto value = json_value(metric_def, "value", 0);
+                auto value = json_value(metric_def, "value", 0.);
                 prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
                            << "# TYPE llamacpp:" << name << " " << type << "\n"
                            << "llamacpp:" << name << " " << value << "\n";
             }
         }
 
+        const int64_t t_start = data["t_start"];
+        res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
+
         res.set_content(prometheus.str(), "text/plain; version=0.0.4");
         res.status = 200; // HTTP OK
     });
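The final hunk switches the json_value default from 0 to 0. (so fractional gauge values are no longer truncated to int) and attaches the new Process-Start-Time-Unix header carrying the t_start value recorded when server_metrics was constructed. Below is a condensed, self-contained sketch of the emission loop, reduced to a single metric and with nlohmann's .value() standing in for the server's json_value helper; it is an approximation of the handler's structure, not a verbatim extract.

// Condensed sketch of the Prometheus exposition loop; metric names and output
// format taken from the hunks above, everything else is illustrative.
#include <nlohmann/json.hpp>
#include <iostream>
#include <sstream>

int main() {
    // one counter kept from the full definition; the real handler builds several
    nlohmann::json all_metrics_def = {
        {"counter", nlohmann::json::array({
            {
                {"name",  "prompt_tokens_total"},
                {"help",  "Number of prompt tokens processed."},
                {"value", 1024}
            }
        })}
    };

    std::stringstream prometheus;
    for (const auto & el : all_metrics_def.items()) {
        const auto & type        = el.key();   // "counter" or "gauge"
        const auto & metrics_def = el.value();
        for (const auto & metric_def : metrics_def) {
            const std::string name  = metric_def["name"];
            const std::string help  = metric_def["help"];
            const double      value = metric_def.value("value", 0.); // double default keeps fractions
            prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
                       << "# TYPE llamacpp:" << name << " " << type << "\n"
                       << "llamacpp:"        << name << " " << value << "\n";
        }
    }

    // Prints:
    //   # HELP llamacpp:prompt_tokens_total Number of prompt tokens processed.
    //   # TYPE llamacpp:prompt_tokens_total counter
    //   llamacpp:prompt_tokens_total 1024
    std::cout << prometheus.str();
}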

examples/server/tests/features/server.feature

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ Feature: llama.cpp server
     And a completion request with no api error
     Then <n_predicted> tokens are predicted matching <re_content>
     And prometheus metrics are exposed
+    And metric llamacpp:tokens_predicted is <n_predicted>
 
     Examples: Prompts
       | prompt | n_predict | re_content | n_predicted |

examples/server/tests/features/steps/steps.py

Lines changed: 10 additions & 1 deletion
@@ -586,14 +586,24 @@ async def step_prometheus_metrics_exported(context):
     metric_exported = False
     if context.debug:
         print(f"/metrics answer:\n{metrics_raw}\n")
+    context.metrics = {}
     for metric in parser.text_string_to_metric_families(metrics_raw):
         match metric.name:
             case "llamacpp:kv_cache_usage_ratio":
                 assert len(metric.samples) > 0
                 metric_exported = True
+        context.metrics[metric.name] = metric
+    assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
     assert metric_exported, "No metrics exported"
 
 
+@step(u'metric {metric_name} is {metric_value:d}')
+def step_assert_metric_value(context, metric_name, metric_value):
+    if metric_name not in context.metrics:
+        assert False, f"no metric {metric_name} in {context.metrics.keys()}"
+    assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
+
+
 @step(u'available models')
 def step_available_models(context):
     # openai client always expects an api_key
@@ -879,7 +889,6 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
                                  f' {n_predicted} <> {expected_predicted_n}')
 
 
-
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
     if context.debug:
