Commit d52d781

server: concurrency fix + monitoring - add /metrics prometheus compatible endpoint (#5708)
* server: monitoring - add /metrics prometheus compatible endpoint
* server: concurrency issue - when two tasks are waiting for results, only one caller thread was being notified
* server: metrics - move to a dedicated struct
1 parent 1289408 commit d52d781

File tree

7 files changed: +191 -8 lines

examples/server/README.md

Lines changed: 13 additions & 0 deletions

@@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
+- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)

 ## Build

@@ -457,6 +458,18 @@ Notice that each `probs` is an array of length `n_probs`.
 ]
 ```

+- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
+
+  Available metrics:
+  - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
+  - `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
+  - `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
+  - `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
+  - `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
+  - `llamacpp:kv_cache_tokens`: KV-cache tokens.
+  - `llamacpp:requests_processing`: Number of requests processing.
+  - `llamacpp:requests_deferred`: Number of requests deferred.
+
 ## More examples

 ### Change system prompt on runtime
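For a quick check that the endpoint is wired up, the following is a minimal sketch using cpp-httplib, the HTTP library bundled with the server example. It assumes the server was started with `--metrics` and listens on localhost:8080; host, port, and the include path for `httplib.h` are assumptions to adjust for your setup.

// Minimal scrape of the /metrics endpoint with cpp-httplib.
// Host and port are assumptions; adjust as needed.
#include <iostream>
#include "httplib.h"

int main() {
    httplib::Client cli("localhost", 8080);   // assumed server address

    auto res = cli.Get("/metrics");
    if (!res || res->status != 200) {
        std::cerr << "scrape failed\n";
        return 1;
    }

    // Body is Prometheus text exposition format (text/plain; version=0.0.4).
    std::cout << res->get_header_value("Content-Type") << "\n";
    std::cout << res->body;
    return 0;
}

A Prometheus server would scrape the same URL on its normal schedule.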

examples/server/server.cpp

Lines changed: 144 additions & 6 deletions

@@ -43,6 +43,7 @@ struct server_params
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
     bool slots_endpoint = true;
+    bool metrics_endpoint = false;
 };

 bool server_verbose = false;

@@ -310,6 +311,39 @@ struct llama_client_slot
     }
 };

+struct llama_metrics {
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t n_tokens_predicted_total = 0;
+
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing = 0;
+
+    uint64_t n_tokens_predicted = 0;
+    uint64_t t_tokens_generation = 0;
+
+
+    void on_prompt_eval(const llama_client_slot &slot) {
+        n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
+
+        n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
+        t_prompt_processing += slot.t_prompt_processing;
+    }
+
+    void on_prediction(const llama_client_slot &slot) {
+        n_tokens_predicted_total += slot.n_decoded;
+
+        n_tokens_predicted += slot.n_decoded;
+        t_tokens_generation += slot.t_token_generation;
+    }
+
+    void reset_bucket() {
+        n_prompt_tokens_processed = 0;
+        t_prompt_processing = 0;
+        n_tokens_predicted = 0;
+        t_tokens_generation = 0;
+    }
+};
+
 struct llama_server_context
 {
     llama_model *model = nullptr;
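The struct above keeps two kinds of fields: the `_total` counters only ever grow (Prometheus counter semantics), while the remaining fields form a bucket that accumulates between scrapes and is cleared by `reset_bucket()`, so the derived tokens/s gauges describe activity since the previous scrape. The standalone sketch below, which uses its own simplified types and hypothetical numbers rather than the server's, illustrates that behaviour; the timing fields are accumulated in milliseconds.

// Standalone sketch of the counter-vs-bucket split used by llama_metrics:
// *_total fields only grow, bucket fields reset after each scrape.
#include <cstdint>
#include <cstdio>

struct metrics_sketch {
    uint64_t n_prompt_tokens_processed_total = 0; // Prometheus counter
    uint64_t n_prompt_tokens_processed       = 0; // per-scrape bucket
    uint64_t t_prompt_processing             = 0; // per-scrape bucket, in ms

    void on_prompt_eval(uint64_t n_tokens, uint64_t t_ms) {
        n_prompt_tokens_processed_total += n_tokens;
        n_prompt_tokens_processed       += n_tokens;
        t_prompt_processing             += t_ms;
    }

    void scrape() {
        // average throughput over the bucket; t is in ms, hence the 1e3
        const double tps = n_prompt_tokens_processed
            ? 1e3 / t_prompt_processing * n_prompt_tokens_processed
            : 0.0;
        std::printf("prompt_tokens_total=%llu prompt_tokens_seconds=%.1f\n",
                    (unsigned long long) n_prompt_tokens_processed_total, tps);
        // reset the bucket, keep the total
        n_prompt_tokens_processed = 0;
        t_prompt_processing       = 0;
    }
};

int main() {
    metrics_sketch m;
    m.on_prompt_eval(512, 250);  // 512 prompt tokens in 250 ms
    m.scrape();                  // total=512, ~2048 tokens/s
    m.on_prompt_eval(128, 100);  // next batch
    m.scrape();                  // total=640, ~1280 tokens/s
    return 0;
}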
@@ -344,6 +378,8 @@ struct llama_server_context
     llama_server_queue    queue_tasks;
     llama_server_response queue_results;

+    llama_metrics metrics;
+
     ~llama_server_context()
     {
         if (ctx)

@@ -1404,7 +1440,7 @@ struct llama_server_context
            case TASK_TYPE_NEXT_RESPONSE: {
                // do nothing
            } break;
-           case TASK_TYPE_SLOTS_DATA: {
+           case TASK_TYPE_METRICS: {
                json slots_data = json::array();
                int n_idle_slots = 0;
                int n_processing_slots = 0;

@@ -1438,10 +1474,24 @@ struct llama_server_context
                res.stop = true;
                res.error = false;
                res.result_json = {
-                   { "idle", n_idle_slots },
-                   { "processing", n_processing_slots },
-                   { "slots", slots_data }
+                   { "idle", n_idle_slots },
+                   { "processing", n_processing_slots },
+                   { "deferred", queue_tasks.queue_tasks_deferred.size() },
+
+                   { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
+                   { "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
+
+                   { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
+                   { "t_prompt_processing", metrics.t_prompt_processing},
+                   { "n_tokens_predicted", metrics.n_tokens_predicted},
+                   { "t_tokens_generation", metrics.t_tokens_generation},
+
+                   { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
+                   { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
+
+                   { "slots", slots_data },
                };
+               metrics.reset_bucket();
                queue_results.send(res);
            } break;
        }
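For illustration only, the payload a TASK_TYPE_METRICS task sends back looks roughly like the object below (hypothetical values, `slots` left empty), built with nlohmann::json as the server does; both the `/slots` and `/metrics` handlers read from this one result. The include path assumes the single-header library is available as <nlohmann/json.hpp>.

// Illustration of the aggregated metrics/slots payload (hypothetical values).
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json data = {
        { "idle",                            3    },
        { "processing",                      1    },
        { "deferred",                        0    },
        { "n_prompt_tokens_processed_total", 4096 },
        { "n_tokens_predicted_total",        1024 },
        { "n_prompt_tokens_processed",       512  },  // bucket since last scrape
        { "t_prompt_processing",             250  },  // ms
        { "n_tokens_predicted",              128  },
        { "t_tokens_generation",             1600 },  // ms
        { "kv_cache_tokens_count",           640  },
        { "kv_cache_used_cells",             640  },
        { "slots",                           json::array() }
    };
    std::cout << data.dump(2) << std::endl;
    return 0;
}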
@@ -1849,6 +1899,7 @@ struct llama_server_context
            {
                slot.t_start_genereration = ggml_time_us();
                slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
+               metrics.on_prompt_eval(slot);
            }

            llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };

@@ -1871,6 +1922,7 @@ struct llama_server_context
                slot.release();
                slot.print_timings();
                send_final_response(slot);
+               metrics.on_prediction(slot);
            }

            slot.i_batch = -1;

@@ -1955,6 +2007,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
    printf("  --log-disable             disables logging to a file.\n");
    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
+   printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
    printf("\n");
    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
    printf("  --override-kv KEY=TYPE:VALUE\n");

@@ -2414,6 +2467,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        {
            sparams.slots_endpoint = false;
        }
+       else if (arg == "--metrics")
+       {
+           sparams.metrics_endpoint = true;
+       }
        else if (arg == "--chat-template")
        {
            if (++i >= argc)

@@ -2621,7 +2678,7 @@ int main(int argc, char **argv)
            // request slots data using task queue
            task_server task;
            task.id = llama.queue_tasks.get_new_id();
-           task.type = TASK_TYPE_SLOTS_DATA;
+           task.type = TASK_TYPE_METRICS;
            task.target_id = -1;

            llama.queue_results.add_waiting_task_id(task.id);

@@ -2668,7 +2725,7 @@ int main(int argc, char **argv)
            // request slots data using task queue
            task_server task;
            task.id = llama.queue_tasks.get_new_id();
-           task.type = TASK_TYPE_SLOTS_DATA;
+           task.type = TASK_TYPE_METRICS;
            task.target_id = -1;

            llama.queue_results.add_waiting_task_id(task.id);

@@ -2683,6 +2740,87 @@ int main(int argc, char **argv)
        });
    }

+   if (sparams.metrics_endpoint) {
+       svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
+           // request slots data using task queue
+           task_server task;
+           task.id = llama.queue_tasks.get_new_id();
+           task.type = TASK_TYPE_METRICS;
+           task.target_id = -1;
+
+           llama.queue_results.add_waiting_task_id(task.id);
+           llama.queue_tasks.post(task);
+
+           // get the result
+           task_result result = llama.queue_results.recv(task.id);
+           llama.queue_results.remove_waiting_task_id(task.id);
+
+           json data = result.result_json;
+
+           uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
+           uint64_t t_prompt_processing = data["t_prompt_processing"];
+
+           uint64_t n_tokens_predicted = data["n_tokens_predicted"];
+           uint64_t t_tokens_generation = data["t_tokens_generation"];
+
+           int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+
+           // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
+           json all_metrics_def = json {
+               {"counter", {{
+                   {"name",  "prompt_tokens_total"},
+                   {"help",  "Number of prompt tokens processed."},
+                   {"value", data["n_prompt_tokens_processed_total"]}
+               }, {
+                   {"name",  "tokens_predicted_total"},
+                   {"help",  "Number of generation tokens processed."},
+                   {"value", data["n_tokens_predicted_total"]}
+               }}},
+               {"gauge", {{
+                   {"name",  "prompt_tokens_seconds"},
+                   {"help",  "Average prompt throughput in tokens/s."},
+                   {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
+               },{
+                   {"name",  "predicted_tokens_seconds"},
+                   {"help",  "Average generation throughput in tokens/s."},
+                   {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
+               },{
+                   {"name",  "kv_cache_usage_ratio"},
+                   {"help",  "KV-cache usage. 1 means 100 percent usage."},
+                   {"value", 1. * kv_cache_used_cells / params.n_ctx}
+               },{
+                   {"name",  "kv_cache_tokens"},
+                   {"help",  "KV-cache tokens."},
+                   {"value", data["kv_cache_tokens_count"]}
+               },{
+                   {"name",  "requests_processing"},
+                   {"help",  "Number of request processing."},
+                   {"value", data["processing"]}
+               },{
+                   {"name",  "requests_deferred"},
+                   {"help",  "Number of request deferred."},
+                   {"value", data["deferred"]}
+               }}}
+           };
+
+           std::stringstream prometheus;
+           for (const auto& el : all_metrics_def.items()) {
+               const auto& type = el.key();
+               const auto& metrics_def = el.value();
+               for (const auto& metric_def : metrics_def) {
+                   std::string name = metric_def["name"];
+                   std::string help = metric_def["help"];
+                   prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
+                              << "# TYPE llamacpp:" << name << " " << type << "\n"
+                              << "llamacpp:" << name << " " << metric_def["value"] << "\n";
+               }
+           }
+
+           res.set_content(prometheus.str(), "text/plain; version=0.0.4");
+           res.status = 200; // HTTP OK
+       });
+   }
+
    svr.set_logger(log_server_request);

    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
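The gauges are derived from the bucket values: the timing fields arrive in milliseconds (ggml_time_us output divided by 1e3 earlier in the diff), so `1e3 / t * n` yields tokens per second. The snippet below is an illustration with made-up numbers of that conversion and of the `# HELP` / `# TYPE` / value triplet the handler emits for each metric.

// Illustration with hypothetical numbers: converting a per-scrape bucket into
// the tokens/s gauge and emitting one metric in Prometheus text exposition format.
#include <cstdint>
#include <iostream>
#include <sstream>

int main() {
    const uint64_t n_prompt_tokens_processed = 512;   // tokens in the bucket
    const uint64_t t_prompt_processing       = 250;   // milliseconds

    const double prompt_tokens_seconds = n_prompt_tokens_processed
        ? 1e3 / t_prompt_processing * n_prompt_tokens_processed   // 2048 tokens/s
        : 0.0;

    std::stringstream prometheus;
    prometheus << "# HELP llamacpp:prompt_tokens_seconds Average prompt throughput in tokens/s.\n"
               << "# TYPE llamacpp:prompt_tokens_seconds gauge\n"
               << "llamacpp:prompt_tokens_seconds " << prompt_tokens_seconds << "\n";

    std::cout << prometheus.str();
    return 0;
}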

examples/server/tests/features/environment.py

Lines changed: 2 additions & 0 deletions

@@ -16,6 +16,8 @@ def before_scenario(context, scenario):


 def after_scenario(context, scenario):
+    if context.server_process is None:
+        return
     if scenario.status == "failed":
         if 'GITHUB_ACTIONS' in os.environ:
             print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")

examples/server/tests/features/server.feature

Lines changed: 2 additions & 0 deletions

@@ -13,6 +13,7 @@ Feature: llama.cpp server
     And 1 slots
     And embeddings extraction
     And 32 server max tokens to predict
+    And prometheus compatible metrics exposed
     Then the server is starting
     Then the server is healthy

@@ -25,6 +26,7 @@ Feature: llama.cpp server
     And <n_predict> max tokens to predict
     And a completion request with no api error
     Then <n_predicted> tokens are predicted matching <re_content>
+    And prometheus metrics are exposed

     Examples: Prompts
       | prompt | n_predict | re_content | n_predicted |

examples/server/tests/features/steps/steps.py

Lines changed: 27 additions & 0 deletions

@@ -13,6 +13,7 @@
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
+from prometheus_client import parser


 @step(u"a server listening on {server_fqdn}:{server_port}")

@@ -34,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port):
     context.server_api_key = None
     context.server_continuous_batching = False
     context.server_embeddings = False
+    context.server_metrics = False
+    context.server_process = None
     context.server_seed = None
     context.user_api_key = None

@@ -82,6 +85,11 @@ def step_server_embeddings(context):
     context.server_embeddings = True


+@step(u'prometheus compatible metrics exposed')
+def step_server_metrics(context):
+    context.server_metrics = True
+
+
 @step(u"the server is starting")
 def step_start_server(context):
     start_server_background(context)

@@ -424,6 +432,23 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
     assert context.options_response.headers[cors_header] == cors_header_value


+@step(u'prometheus metrics are exposed')
+@async_run_until_complete
+async def step_prometheus_metrics_exported(context):
+    async with aiohttp.ClientSession() as session:
+        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
+            assert metrics_response.status == 200
+            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
+            metrics_raw = await metrics_response.text()
+            metric_exported = False
+            for metric in parser.text_string_to_metric_families(metrics_raw):
+                match metric.name:
+                    case "llamacpp:kv_cache_usage_ratio":
+                        assert len(metric.samples) > 0
+                        metric_exported = True
+            assert metric_exported, "No metrics exported"
+
+
 async def concurrent_requests(context, f_completion, *args, **kwargs):
     n_prompts = len(context.prompts)
     if context.debug:

@@ -753,6 +778,8 @@ def start_server_background(context):
         server_args.append('--cont-batching')
     if context.server_embeddings:
         server_args.append('--embedding')
+    if context.server_metrics:
+        server_args.append('--metrics')
     if context.model_alias is not None:
         server_args.extend(['--alias', context.model_alias])
     if context.n_ctx is not None:

examples/server/tests/requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1,3 +1,4 @@
 aiohttp~=3.9.3
 behave~=1.2.6
 openai~=0.25.0
+prometheus-client~=0.20.0

examples/server/utils.hpp

Lines changed: 2 additions & 2 deletions

@@ -50,7 +50,7 @@ enum task_type {
     TASK_TYPE_COMPLETION,
     TASK_TYPE_CANCEL,
     TASK_TYPE_NEXT_RESPONSE,
-    TASK_TYPE_SLOTS_DATA
+    TASK_TYPE_METRICS
 };

 struct task_server {

@@ -441,7 +441,7 @@ struct llama_server_response {
         {
             LOG_VERBOSE("queue_results.push_back", {});
             queue_results.push_back(result);
-            condition_results.notify_one();
+            condition_results.notify_all();
             return;
         }
     }
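The notify_one to notify_all change is the concurrency fix described in the commit message: several handler threads can block on the same condition variable, each waiting for its own task id (for example a /metrics scrape and a completion request at the same time). With notify_one, pushing a result can wake a thread whose result has not arrived yet; that thread goes back to sleep while the intended receiver is never woken. The sketch below is a standalone analogue, not the server's actual code, showing the pattern.

// Standalone analogue of the fix: two waiters on one condition variable, each
// waiting for its own task id. notify_all lets every waiter re-check its
// predicate; notify_one could wake only the "wrong" waiter and stall the other.
#include <condition_variable>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>

struct result { int task_id; };

std::mutex              mutex_results;
std::condition_variable condition_results;
std::deque<result>      queue_results;

result recv(int task_id) {
    std::unique_lock<std::mutex> lock(mutex_results);
    result res{};
    condition_results.wait(lock, [&] {
        // wake up only when a result for *this* task id is present
        for (auto it = queue_results.begin(); it != queue_results.end(); ++it) {
            if (it->task_id == task_id) {
                res = *it;
                queue_results.erase(it);
                return true;
            }
        }
        return false;
    });
    return res;
}

void send(result res) {
    std::lock_guard<std::mutex> lock(mutex_results);
    queue_results.push_back(res);
    condition_results.notify_all();   // notify_one() may pick a waiter expecting a different id
}

int main() {
    std::thread t1([] { std::cout << "task 1 done: " << recv(1).task_id << "\n"; });
    std::thread t2([] { std::cout << "task 2 done: " << recv(2).task_id << "\n"; });
    send({2});
    send({1});
    t1.join();
    t2.join();
    return 0;
}

In the server the same situation arises around llama_server_response: each waiting handler calls recv() with its own task id and scans queue_results after being notified, which is why send() now notifies all waiters.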
