@@ -12,6 +12,8 @@
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
 // auto generated files (update with ./deps.sh)
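Note on the hunk above: nlohmann::json expands JSON_ASSERT(x) to assert(x) unless the macro is already defined when json.hpp is included, and assert() compiles away under NDEBUG, so a failed internal check in a release build would silently fall through into undefined behavior. Defining JSON_ASSERT as GGML_ASSERT before the include keeps the check active in every build type and aborts with a diagnostic. A minimal standalone sketch of the mechanism (the GGML_ASSERT below is a simplified stand-in for illustration; llama.cpp gets the real macro from ggml.h):

    // sketch.cpp -- illustrative only, not part of the patch
    #include <cstdio>
    #include <cstdlib>

    // Simplified stand-in for ggml.h's GGML_ASSERT: print and abort,
    // regardless of NDEBUG.
    #define GGML_ASSERT(x)                                    \
        do {                                                  \
            if (!(x)) {                                       \
                fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n",   \
                        __FILE__, __LINE__, #x);              \
                abort();                                      \
            }                                                 \
        } while (0)

    // The override must come before json.hpp, exactly as in the diff:
    #define JSON_ASSERT GGML_ASSERT
    #include "json.hpp"

    int main() {
        const nlohmann::json j = {{"known", 1}};
        int v = j["missing"]; // const operator[] on a missing key trips
        (void) v;             // JSON_ASSERT -> aborts even with -DNDEBUG
        return 0;
    }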
@@ -859,7 +861,7 @@ struct server_context {
         slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
 
         // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
             send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
             return false;
         } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1512,7 +1514,7 @@ struct server_context {
         // add subtasks
         for (int i = 0; i < prompt_count; i++) {
             json subtask_data = multiprompt_task.data;
-            subtask_data["prompt"] = subtask_data["prompt"][i];
+            subtask_data["prompt"] = subtask_data.at("prompt")[i];
 
             // subtasks inherit everything else (infill mode, embedding mode, etc.)
             request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1532,7 +1534,7 @@ struct server_context {
         }
 
         if (task.data.contains("system_prompt")) {
-            system_prompt_set(task.data["system_prompt"]);
+            system_prompt_set(task.data.at("system_prompt"));
 
             for (server_slot & slot : slots) {
                 slot.n_past = 0;
@@ -1644,7 +1646,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1654,8 +1656,8 @@ struct server_context {
                     const size_t token_count = slot->cache_tokens.size();
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
 
@@ -1679,7 +1681,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1688,8 +1690,8 @@ struct server_context {
 
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     slot->cache_tokens.resize(slot->n_ctx);
                     size_t token_count = 0;
@@ -1721,7 +1723,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -3136,8 +3138,8 @@ int main(int argc, char ** argv) {
                     server_task_result result = ctx_server.queue_results.recv(task.id);
                     ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-                    const int n_idle_slots = result.data["idle"];
-                    const int n_processing_slots = result.data["processing"];
+                    const int n_idle_slots = result.data.at("idle");
+                    const int n_processing_slots = result.data.at("processing");
 
                     json health = {
                         {"status", "ok"},
@@ -3147,7 +3149,7 @@ int main(int argc, char ** argv) {
 
                     res.status = 200; // HTTP OK
                     if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                        health["slots"] = result.data["slots"];
+                        health["slots"] = result.data.at("slots");
                     }
 
                     if (n_idle_slots == 0) {
@@ -3191,7 +3193,7 @@ int main(int argc, char ** argv) {
         server_task_result result = ctx_server.queue_results.recv(task.id);
         ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-        res.set_content(result.data["slots"].dump(), "application/json");
+        res.set_content(result.data.at("slots").dump(), "application/json");
         res.status = 200; // HTTP OK
     };
 
@@ -3218,32 +3220,32 @@ int main(int argc, char ** argv) {
 
             json data = result.data;
 
-            const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-            const uint64_t t_prompt_processing = data["t_prompt_processing"];
+            const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+            const uint64_t t_prompt_processing = data.at("t_prompt_processing");
 
-            const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
-            const uint64_t t_tokens_generation = data["t_tokens_generation"];
+            const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
+            const uint64_t t_tokens_generation = data.at("t_tokens_generation");
 
-            const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+            const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
 
             // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
             json all_metrics_def = json {
                 {"counter", {{
                         {"name", "prompt_tokens_total"},
                         {"help", "Number of prompt tokens processed."},
-                        {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+                        {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
                 }, {
                         {"name", "prompt_seconds_total"},
                         {"help", "Prompt process time"},
-                        {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+                        {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
                 }, {
                         {"name", "tokens_predicted_total"},
                         {"help", "Number of generation tokens processed."},
-                        {"value", (uint64_t) data["n_tokens_predicted_total"]}
+                        {"value", (uint64_t) data.at("n_tokens_predicted_total")}
                 }, {
                         {"name", "tokens_predicted_seconds_total"},
                         {"help", "Predict process time"},
-                        {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+                        {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
                 }}},
                 {"gauge", {{
                         {"name", "prompt_tokens_seconds"},
@@ -3260,15 +3262,15 @@ int main(int argc, char ** argv) {
                 },{
                         {"name", "kv_cache_tokens"},
                         {"help", "KV-cache tokens."},
-                        {"value", (uint64_t) data["kv_cache_tokens_count"]}
+                        {"value", (uint64_t) data.at("kv_cache_tokens_count")}
                 },{
                         {"name", "requests_processing"},
                         {"help", "Number of request processing."},
-                        {"value", (uint64_t) data["processing"]}
+                        {"value", (uint64_t) data.at("processing")}
                 },{
                         {"name", "requests_deferred"},
                         {"help", "Number of request deferred."},
-                        {"value", (uint64_t) data["deferred"]}
+                        {"value", (uint64_t) data.at("deferred")}
                 }}}
             };
 
@@ -3279,8 +3281,8 @@ int main(int argc, char ** argv) {
                 const auto & metrics_def = el.value();
 
                 for (const auto & metric_def : metrics_def) {
-                    const std::string name = metric_def["name"];
-                    const std::string help = metric_def["help"];
+                    const std::string name = metric_def.at("name");
+                    const std::string help = metric_def.at("help");
 
                     auto value = json_value(metric_def, "value", 0.);
                     prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
@@ -3289,7 +3291,7 @@ int main(int argc, char ** argv) {
                 }
             }
 
-            const int64_t t_start = data["t_start"];
+            const int64_t t_start = data.at("t_start");
             res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
 
             res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3298,7 +3300,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
         json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
         if (!validate_file_name(filename)) {
             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
             return;
@@ -3328,7 +3330,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
         if (!validate_file_name(filename)) {
             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
             return;
@@ -3648,7 +3650,7 @@ int main(int argc, char ** argv) {
         std::vector<llama_token> tokens;
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body["content"], add_special);
+            tokens = ctx_server.tokenize(body.at("content"), add_special);
         }
         const json data = format_tokenizer_response(tokens);
         return res.set_content(data.dump(), "application/json; charset=utf-8");
@@ -3660,7 +3662,7 @@ int main(int argc, char ** argv) {
 
         std::string content;
         if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body["tokens"];
+            const std::vector<llama_token> tokens = body.at("tokens");
             content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
         }
 
@@ -3683,10 +3685,10 @@ int main(int argc, char ** argv) {
         json prompt;
         if (body.count("input") != 0) {
             is_openai = true;
-            prompt = body["input"];
+            prompt = body.at("input");
         } else if (body.count("content") != 0) {
             // with "content", we only support single prompt
-            prompt = std::vector<std::string>{body["content"]};
+            prompt = std::vector<std::string>{body.at("content")};
         } else {
             res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
             return;
@@ -3705,7 +3707,7 @@ int main(int argc, char ** argv) {
             if (!result.error) {
                 if (result.data.count("results")) {
                     // result for multi-task
-                    responses = result.data["results"];
+                    responses = result.data.at("results");
                 } else {
                     // result for single task
                     responses = std::vector<json>{result.data};
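The remaining hunks all make one substitution: read-only JSON lookups move from operator[] to at(). On a non-const nlohmann::json, operator[] inserts a null value as a side effect when the key is missing, and on a const object a missing key is undefined behavior guarded only by JSON_ASSERT; at() is a checked lookup that never mutates and throws json::out_of_range, which existing exception handling can turn into a clean error response instead of silently corrupting request data. A small standalone sketch of the difference (illustrative only, not server code):

    // at_vs_subscript.cpp -- compile against nlohmann/json
    #include <cstdio>
    #include "json.hpp"

    using json = nlohmann::json;

    int main() {
        json body = {{"prompt", "hello"}};

        // operator[] on a missing key inserts a null value as a side effect;
        // the conversion to int then throws json::type_error (error 302):
        try {
            int n = body["n_predict"];
            (void) n;
        } catch (const json::type_error & e) {
            fprintf(stderr, "%s\n", e.what());
        }
        printf("keys after operator[]: %zu\n", body.size()); // 2 -- mutated!

        // at() never mutates; a missing key throws json::out_of_range (403):
        try {
            int m = body.at("missing");
            (void) m;
        } catch (const json::out_of_range & e) {
            fprintf(stderr, "%s\n", e.what());
        }
        return 0;
    }

Note that write accesses in the diff (e.g. subtask_data["prompt"] = ..., health["slots"] = ...) deliberately keep operator[], since insertion is exactly the behavior wanted there.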