// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h"
+ // Change JSON_ASSERT from assert() to GGML_ASSERT:
+ #define JSON_ASSERT GGML_ASSERT
#include "json.hpp"

// auto generated files (update with ./deps.sh)
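
// Sketch (illustration, not part of the patch): what the JSON_ASSERT override
// above buys. nlohmann::json routes its internal assertions through the
// JSON_ASSERT macro when it is defined before json.hpp is included. The
// default is assert(), which compiles out under NDEBUG, so a bad access in a
// release build becomes undefined behavior; GGML_ASSERT (from ggml.h) aborts
// with a message in every build type.

#include "ggml.h"               // provides GGML_ASSERT
#define JSON_ASSERT GGML_ASSERT // must precede the include, as in the patch
#include "json.hpp"

static void json_assert_demo(const nlohmann::json & data) {
    // const operator[] with a missing key trips JSON_ASSERT inside the
    // library, so this aborts loudly instead of dereferencing end()
    int n = data["missing_key"];
    (void) n;
}
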
@@ -859,7 +861,7 @@ struct server_context {
    slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);

    // process "json_schema" and "grammar"
-   if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+   if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
        send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
        return false;
    } else if (data.contains("json_schema") && !data.contains("grammar")) {
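
// Sketch (illustration, not part of the patch): the operator[] vs .at()
// difference the change above leans on. Non-const operator[] on
// nlohmann::json inserts a null for a missing key, so a mere read can
// silently mutate the request object; .at() throws json::out_of_range,
// which can be caught and turned into a clean invalid-request error.

#include "json.hpp"
using json = nlohmann::json;

int main() {
    json data = { {"grammar", "root ::= \"x\""} };

    (void) data["json_schema"];    // operator[]: quietly inserts null; the key now exists
    try {
        (void) data.at("missing"); // .at(): throws instead of inserting
    } catch (const json::out_of_range & e) {
        (void) e; // e.what() identifies the missing key
    }
    return 0;
}
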
@@ -1512,7 +1514,7 @@ struct server_context {
    // add subtasks
    for (int i = 0; i < prompt_count; i++) {
        json subtask_data = multiprompt_task.data;
-       subtask_data["prompt"] = subtask_data["prompt"][i];
+       subtask_data["prompt"] = subtask_data.at("prompt")[i];

        // subtasks inherit everything else (infill mode, embedding mode, etc.)
        request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1532,7 +1534,7 @@ struct server_context {
    }

    if (task.data.contains("system_prompt")) {
-       system_prompt_set(task.data["system_prompt"]);
+       system_prompt_set(task.data.at("system_prompt"));

        for (server_slot & slot : slots) {
            slot.n_past = 0;
@@ -1644,7 +1646,7 @@ struct server_context {
        } break;
    case SERVER_TASK_TYPE_SLOT_SAVE:
        {
-           int id_slot = task.data["id_slot"];
+           int id_slot = task.data.at("id_slot");
            server_slot * slot = get_slot(id_slot);
            if (slot == nullptr) {
                send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1654,8 +1656,8 @@ struct server_context {
            const size_t token_count = slot->cache_tokens.size();
            const int64_t t_start = ggml_time_us();

-           std::string filename = task.data["filename"];
-           std::string filepath = task.data["filepath"];
+           std::string filename = task.data.at("filename");
+           std::string filepath = task.data.at("filepath");

            const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
@@ -1679,7 +1681,7 @@ struct server_context {
        } break;
    case SERVER_TASK_TYPE_SLOT_RESTORE:
        {
-           int id_slot = task.data["id_slot"];
+           int id_slot = task.data.at("id_slot");
            server_slot * slot = get_slot(id_slot);
            if (slot == nullptr) {
                send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1688,8 +1690,8 @@ struct server_context {

            const int64_t t_start = ggml_time_us();

-           std::string filename = task.data["filename"];
-           std::string filepath = task.data["filepath"];
+           std::string filename = task.data.at("filename");
+           std::string filepath = task.data.at("filepath");

            slot->cache_tokens.resize(slot->n_ctx);
            size_t token_count = 0;
@@ -1721,7 +1723,7 @@ struct server_context {
        } break;
    case SERVER_TASK_TYPE_SLOT_ERASE:
        {
-           int id_slot = task.data["id_slot"];
+           int id_slot = task.data.at("id_slot");
            server_slot * slot = get_slot(id_slot);
            if (slot == nullptr) {
                send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -3136,8 +3138,8 @@ int main(int argc, char ** argv) {
        server_task_result result = ctx_server.queue_results.recv(task.id);
        ctx_server.queue_results.remove_waiting_task_id(task.id);

-       const int n_idle_slots       = result.data["idle"];
-       const int n_processing_slots = result.data["processing"];
+       const int n_idle_slots       = result.data.at("idle");
+       const int n_processing_slots = result.data.at("processing");

        json health = {
            {"status", "ok"},
@@ -3147,7 +3149,7 @@ int main(int argc, char ** argv) {

        res.status = 200; // HTTP OK
        if (sparams.slots_endpoint && req.has_param("include_slots")) {
-           health["slots"] = result.data["slots"];
+           health["slots"] = result.data.at("slots");
        }

        if (n_idle_slots == 0) {
@@ -3191,7 +3193,7 @@ int main(int argc, char ** argv) {
        server_task_result result = ctx_server.queue_results.recv(task.id);
        ctx_server.queue_results.remove_waiting_task_id(task.id);

-       res.set_content(result.data["slots"].dump(), "application/json");
+       res.set_content(result.data.at("slots").dump(), "application/json");
        res.status = 200; // HTTP OK
    };
@@ -3218,32 +3220,32 @@ int main(int argc, char ** argv) {

        json data = result.data;

-       const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-       const uint64_t t_prompt_processing       = data["t_prompt_processing"];
+       const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+       const uint64_t t_prompt_processing       = data.at("t_prompt_processing");

-       const uint64_t n_tokens_predicted  = data["n_tokens_predicted"];
-       const uint64_t t_tokens_generation = data["t_tokens_generation"];
+       const uint64_t n_tokens_predicted  = data.at("n_tokens_predicted");
+       const uint64_t t_tokens_generation = data.at("t_tokens_generation");

-       const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+       const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");

        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
        json all_metrics_def = json {
            {"counter", {{
                {"name",  "prompt_tokens_total"},
                {"help",  "Number of prompt tokens processed."},
-               {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+               {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
            }, {
                {"name",  "prompt_seconds_total"},
                {"help",  "Prompt process time"},
-               {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+               {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
            }, {
                {"name",  "tokens_predicted_total"},
                {"help",  "Number of generation tokens processed."},
-               {"value", (uint64_t) data["n_tokens_predicted_total"]}
+               {"value", (uint64_t) data.at("n_tokens_predicted_total")}
            }, {
                {"name",  "tokens_predicted_seconds_total"},
                {"help",  "Predict process time"},
-               {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+               {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
            }}},
            {"gauge", {{
                {"name",  "prompt_tokens_seconds"},
@@ -3260,15 +3262,15 @@ int main(int argc, char ** argv) {
            },{
                {"name",  "kv_cache_tokens"},
                {"help",  "KV-cache tokens."},
-               {"value", (uint64_t) data["kv_cache_tokens_count"]}
+               {"value", (uint64_t) data.at("kv_cache_tokens_count")}
            },{
                {"name",  "requests_processing"},
                {"help",  "Number of request processing."},
-               {"value", (uint64_t) data["processing"]}
+               {"value", (uint64_t) data.at("processing")}
            },{
                {"name",  "requests_deferred"},
                {"help",  "Number of request deferred."},
-               {"value", (uint64_t) data["deferred"]}
+               {"value", (uint64_t) data.at("deferred")}
            }}}
        };
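
// Note (inference, not stated in the patch): the "/ 1.e3" on the two
// *_seconds_total counters above divides what appear to be millisecond
// totals down to seconds, per the Prometheus convention that *_seconds_*
// metrics are reported in seconds, e.g.:
//
//     t_prompt_processing_total = 12500 (ms)  ->  12500 / 1.e3 = 12.5 s
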
@@ -3279,8 +3281,8 @@ int main(int argc, char ** argv) {
        const auto & metrics_def = el.value();

        for (const auto & metric_def : metrics_def) {
-           const std::string name = metric_def["name"];
-           const std::string help = metric_def["help"];
+           const std::string name = metric_def.at("name");
+           const std::string help = metric_def.at("help");

            auto value = json_value(metric_def, "value", 0.);
            prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
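
// Sketch (assumption, not part of the patch): the access pattern the hunk
// above settles into. Required fields are read with .at() and throw when
// absent; optional fields go through the repo's json_value(obj, key, default)
// helper, whose call shape is visible above. A minimal helper along those
// lines might look like this; the actual implementation may differ.

#include <string>
#include "json.hpp"

template <typename T>
static T json_value(const nlohmann::json & body, const std::string & key, const T & default_value) {
    // missing or null -> fall back to the default instead of throwing
    if (body.contains(key) && !body.at(key).is_null()) {
        return body.at(key).get<T>();
    }
    return default_value;
}
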
@@ -3289,7 +3291,7 @@ int main(int argc, char ** argv) {
        }
    }

-   const int64_t t_start = data["t_start"];
+   const int64_t t_start = data.at("t_start");
    res.set_header("Process-Start-Time-Unix", std::to_string(t_start));

    res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3298,7 +3300,7 @@ int main(int argc, char ** argv) {

    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-       std::string filename = request_data["filename"];
+       std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
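
// Usage sketch (the route shape is an assumption; this diff shows only the
// handler body). In upstream llama.cpp the save handler is reached via
// POST /slots/<id_slot>?action=save; with .at(), a request body missing
// "filename" now throws json::out_of_range rather than yielding a null:
//
//     curl -X POST 'http://localhost:8080/slots/0?action=save' \
//          -H 'Content-Type: application/json' \
//          -d '{"filename": "slot0.bin"}'
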
@@ -3328,7 +3330,7 @@ int main(int argc, char ** argv) {

    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-       std::string filename = request_data["filename"];
+       std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3648,7 +3650,7 @@ int main(int argc, char ** argv) {
        std::vector<llama_token> tokens;
        if (body.count("content") != 0) {
            const bool add_special = json_value(body, "add_special", false);
-           tokens = ctx_server.tokenize(body["content"], add_special);
+           tokens = ctx_server.tokenize(body.at("content"), add_special);
        }
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json; charset=utf-8");
@@ -3660,7 +3662,7 @@ int main(int argc, char ** argv) {

        std::string content;
        if (body.count("tokens") != 0) {
-           const std::vector<llama_token> tokens = body["tokens"];
+           const std::vector<llama_token> tokens = body.at("tokens");
            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
        }
@@ -3683,10 +3685,10 @@ int main(int argc, char ** argv) {
        json prompt;
        if (body.count("input") != 0) {
            is_openai = true;
-           prompt = body["input"];
+           prompt = body.at("input");
        } else if (body.count("content") != 0) {
            // with "content", we only support single prompt
-           prompt = std::vector<std::string>{body["content"]};
+           prompt = std::vector<std::string>{body.at("content")};
        } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
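
// Request shapes accepted by the embeddings handler above (grounded in the
// two branches shown; the endpoint path itself is not part of this hunk):
//
//     {"input": ["first prompt", "second prompt"]}   // OpenAI-compatible; is_openai = true
//     {"content": "a single prompt"}                 // native form; wrapped in a one-element vector
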
@@ -3705,7 +3707,7 @@ int main(int argc, char ** argv) {
        if (!result.error) {
            if (result.data.count("results")) {
                // result for multi-task
-               responses = result.data["results"];
+               responses = result.data.at("results");
            } else {
                // result for single task
                responses = std::vector<json>{result.data};