 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
 // auto generated files (update with ./deps.sh)
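
Note: nlohmann::json routes its internal sanity checks through a JSON_ASSERT(x) macro and only falls back to assert(x) when that macro is not already defined, which is why the #define has to appear before json.hpp is included. A minimal standalone sketch of the mechanism (the GGML_ASSERT below is a stand-in for illustration, not the real definition from ggml.h):

#include <cstdio>
#include <cstdlib>

// Stand-in for GGML_ASSERT from ggml.h: report and abort even in release
// builds, unlike plain assert(), which is compiled out under NDEBUG.
#define GGML_ASSERT(x) \
    do { if (!(x)) { std::fprintf(stderr, "assert failed: %s\n", #x); std::abort(); } } while (0)

// Must precede the single-header include so json.hpp picks it up.
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
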
@@ -745,7 +747,7 @@ struct server_context {
         }
 
         default_generation_settings_for_props = get_formated_generation(slots.front());
-        default_generation_settings_for_props["seed"] = -1;
+        default_generation_settings_for_props.at("seed") = -1;
 
         // the update_slots() logic will always submit a maximum of n_batch tokens
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
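
Note: the behavioural difference the []-to-at() changes rely on, shown as a minimal standalone sketch (not part of server.cpp): on a non-const nlohmann::json, operator[] silently inserts a null value for a missing key, while at() leaves the object untouched and throws json::out_of_range.

#include <cassert>
#include "json.hpp"

using json = nlohmann::json;

int main() {
    json data = {{"seed", 42}};

    // at() on an existing key behaves like operator[]: the value is overwritten.
    data.at("seed") = -1;

    // operator[] on a missing key inserts a null value as a side effect ...
    json a = data;
    a["grammar"];
    assert(a.contains("grammar") && a["grammar"].is_null());

    // ... while at() throws and leaves the object unchanged.
    json b = data;
    bool threw = false;
    try {
        b.at("grammar");
    } catch (const json::out_of_range &) {
        threw = true;
    }
    assert(threw && !b.contains("grammar"));

    return 0;
}
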
@@ -859,7 +861,7 @@ struct server_context {
         slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
 
         // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
             send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
             return false;
         } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1343,12 +1345,12 @@ struct server_context {
             }
             slot.n_sent_token_probs = probs_stop_pos;
 
-            res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
+            res.data.at("completion_probabilities") = probs_vector_to_json(ctx, probs_output);
         }
 
         if (slot.oaicompat) {
-            res.data["oaicompat_token_ctr"] = slot.n_decoded;
-            res.data["model"] = slot.oaicompat_model;
+            res.data.at("oaicompat_token_ctr") = slot.n_decoded;
+            res.data.at("model") = slot.oaicompat_model;
         }
 
         queue_results.send(res);
@@ -1393,12 +1395,12 @@ struct server_context {
                     slot.generated_token_probs.end());
             }
 
-            res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
+            res.data.at("completion_probabilities") = probs_vector_to_json(ctx, probs);
         }
 
         if (slot.oaicompat) {
-            res.data["oaicompat_token_ctr"] = slot.n_decoded;
-            res.data["model"] = slot.oaicompat_model;
+            res.data.at("oaicompat_token_ctr") = slot.n_decoded;
+            res.data.at("model") = slot.oaicompat_model;
         }
 
         queue_results.send(res);
@@ -1512,7 +1514,7 @@ struct server_context {
         // add subtasks
         for (int i = 0; i < prompt_count; i++) {
             json subtask_data = multiprompt_task.data;
-            subtask_data["prompt"] = subtask_data["prompt"][i];
+            subtask_data.at("prompt") = subtask_data.at("prompt")[i];
 
             // subtasks inherit everything else (infill mode, embedding mode, etc.)
             request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1532,7 +1534,7 @@ struct server_context {
         }
 
         if (task.data.contains("system_prompt")) {
-            system_prompt_set(task.data["system_prompt"]);
+            system_prompt_set(task.data.at("system_prompt"));
 
             for (server_slot & slot : slots) {
                 slot.n_past = 0;
@@ -1575,11 +1577,11 @@ struct server_context {
 
                     for (server_slot & slot : slots) {
                         json slot_data = get_formated_generation(slot);
-                        slot_data["id"] = slot.id;
-                        slot_data["id_task"] = slot.id_task;
-                        slot_data["state"] = slot.state;
-                        slot_data["prompt"] = slot.prompt;
-                        slot_data["next_token"] = {
+                        slot_data.at("id") = slot.id;
+                        slot_data.at("id_task") = slot.id_task;
+                        slot_data.at("state") = slot.state;
+                        slot_data.at("prompt") = slot.prompt;
+                        slot_data.at("next_token") = {
                             {"has_next_token", slot.has_next_token},
                             {"n_remain", slot.n_remaining},
                             {"n_decoded", slot.n_decoded},
@@ -1589,7 +1591,7 @@ struct server_context {
                             {"stopping_word", slot.stopping_word},
                         };
 
-                        if (slot_data["state"] == SLOT_STATE_IDLE) {
+                        if (slot_data.at("state") == SLOT_STATE_IDLE) {
                             n_idle_slots++;
                         } else {
                             n_processing_slots++;
@@ -1644,7 +1646,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1654,8 +1656,8 @@ struct server_context {
                     const size_t token_count = slot->cache_tokens.size();
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
 
@@ -1679,7 +1681,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1688,8 +1690,8 @@ struct server_context {
 
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     slot->cache_tokens.resize(slot->n_ctx);
                     size_t token_count = 0;
@@ -1721,7 +1723,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -3136,8 +3138,8 @@ int main(int argc, char ** argv) {
                 server_task_result result = ctx_server.queue_results.recv(task.id);
                 ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-                const int n_idle_slots = result.data["idle"];
-                const int n_processing_slots = result.data["processing"];
+                const int n_idle_slots = result.data.at("idle");
+                const int n_processing_slots = result.data.at("processing");
 
                 json health = {
                     {"status", "ok"},
@@ -3147,11 +3149,11 @@ int main(int argc, char ** argv) {
 
                 res.status = 200; // HTTP OK
                 if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                    health["slots"] = result.data["slots"];
+                    health.at("slots") = result.data.at("slots");
                 }
 
                 if (n_idle_slots == 0) {
-                    health["status"] = "no slot available";
+                    health.at("status") = "no slot available";
                     if (req.has_param("fail_on_no_slot")) {
                         res.status = 503; // HTTP Service Unavailable
                     }
@@ -3191,7 +3193,7 @@ int main(int argc, char ** argv) {
         server_task_result result = ctx_server.queue_results.recv(task.id);
         ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-        res.set_content(result.data["slots"].dump(), "application/json");
+        res.set_content(result.data.at("slots").dump(), "application/json");
         res.status = 200; // HTTP OK
     };
 
@@ -3218,32 +3220,32 @@ int main(int argc, char ** argv) {
 
         json data = result.data;
 
-        const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-        const uint64_t t_prompt_processing = data["t_prompt_processing"];
+        const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+        const uint64_t t_prompt_processing = data.at("t_prompt_processing");
 
-        const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
-        const uint64_t t_tokens_generation = data["t_tokens_generation"];
+        const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
+        const uint64_t t_tokens_generation = data.at("t_tokens_generation");
 
-        const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+        const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
 
         // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
         json all_metrics_def = json {
            {"counter", {{
                    {"name", "prompt_tokens_total"},
                    {"help", "Number of prompt tokens processed."},
-                   {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+                   {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
            }, {
                    {"name", "prompt_seconds_total"},
                    {"help", "Prompt process time"},
-                   {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+                   {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
            }, {
                    {"name", "tokens_predicted_total"},
                    {"help", "Number of generation tokens processed."},
-                   {"value", (uint64_t) data["n_tokens_predicted_total"]}
+                   {"value", (uint64_t) data.at("n_tokens_predicted_total")}
            }, {
                    {"name", "tokens_predicted_seconds_total"},
                    {"help", "Predict process time"},
-                   {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+                   {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
            }}},
            {"gauge", {{
                    {"name", "prompt_tokens_seconds"},
@@ -3260,15 +3262,15 @@ int main(int argc, char ** argv) {
            },{
                    {"name", "kv_cache_tokens"},
                    {"help", "KV-cache tokens."},
-                   {"value", (uint64_t) data["kv_cache_tokens_count"]}
+                   {"value", (uint64_t) data.at("kv_cache_tokens_count")}
            },{
                    {"name", "requests_processing"},
                    {"help", "Number of request processing."},
-                   {"value", (uint64_t) data["processing"]}
+                   {"value", (uint64_t) data.at("processing")}
            },{
                    {"name", "requests_deferred"},
                    {"help", "Number of request deferred."},
-                   {"value", (uint64_t) data["deferred"]}
+                   {"value", (uint64_t) data.at("deferred")}
            }}}
        };
 
@@ -3279,8 +3281,8 @@ int main(int argc, char ** argv) {
             const auto & metrics_def = el.value();
 
             for (const auto & metric_def : metrics_def) {
-                const std::string name = metric_def["name"];
-                const std::string help = metric_def["help"];
+                const std::string name = metric_def.at("name");
+                const std::string help = metric_def.at("help");
 
                 auto value = json_value(metric_def, "value", 0.);
                 prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
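
Note: the << chain continues past this hunk, but the loop emits the standard Prometheus text exposition format; for the prompt_tokens_total counter defined above, the rendered block would plausibly look as follows (the TYPE line and the numeric value are illustrative, not taken from this diff):

# HELP llamacpp:prompt_tokens_total Number of prompt tokens processed.
# TYPE llamacpp:prompt_tokens_total counter
llamacpp:prompt_tokens_total 1024
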
@@ -3289,7 +3291,7 @@ int main(int argc, char ** argv) {
             }
         }
 
-        const int64_t t_start = data["t_start"];
+        const int64_t t_start = data.at("t_start");
         res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
 
         res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3298,7 +3300,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
         json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
         if (!validate_file_name(filename)) {
             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
             return;
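
Note: with at() on client-supplied JSON such as a parsed req.body, a request that omits "filename" now throws json::out_of_range instead of quietly producing a null. A standalone sketch of catching that and treating it as a bad request (this wrapper is illustrative only; how server.cpp actually surfaces such exceptions is outside this excerpt):

#include <iostream>
#include <string>
#include "json.hpp"

using json = nlohmann::json;

int main() {
    // Simulated request body that is valid JSON but lacks the "filename" field.
    const std::string body = R"({"other_field": 123})";

    try {
        json request_data = json::parse(body);
        std::string filename = request_data.at("filename"); // throws json::out_of_range
        std::cout << "filename: " << filename << "\n";
    } catch (const json::exception & e) {
        // In the server, this is where an ERROR_TYPE_INVALID_REQUEST style
        // response would be produced instead of letting the handler die.
        std::cerr << "bad request: " << e.what() << "\n";
    }
    return 0;
}
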
@@ -3328,7 +3330,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
         json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
         if (!validate_file_name(filename)) {
             res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
             return;
@@ -3647,7 +3649,7 @@ int main(int argc, char ** argv) {
 
         std::vector<llama_token> tokens;
         if (body.count("content") != 0) {
-            tokens = ctx_server.tokenize(body["content"], false);
+            tokens = ctx_server.tokenize(body.at("content"), false);
         }
         const json data = format_tokenizer_response(tokens);
         return res.set_content(data.dump(), "application/json; charset=utf-8");
@@ -3659,7 +3661,7 @@ int main(int argc, char ** argv) {
 
         std::string content;
         if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body["tokens"];
+            const std::vector<llama_token> tokens = body.at("tokens");
             content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
         }
 
@@ -3682,10 +3684,10 @@ int main(int argc, char ** argv) {
         json prompt;
         if (body.count("input") != 0) {
             is_openai = true;
-            prompt = body["input"];
+            prompt = body.at("input");
         } else if (body.count("content") != 0) {
             // with "content", we only support single prompt
-            prompt = std::vector<std::string>{body["content"]};
+            prompt = std::vector<std::string>{body.at("content")};
         } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3704,7 +3706,7 @@ int main(int argc, char ** argv) {
             if (!result.error) {
                 if (result.data.count("results")) {
                     // result for multi-task
-                    responses = result.data["results"];
+                    responses = result.data.at("results");
                 } else {
                     // result for single task
                     responses = std::vector<json>{result.data};