@@ -51,6 +51,7 @@ struct server_params
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
+    std::string chat_template = "chatml";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
@@ -349,6 +350,7 @@ struct llama_server_context
 
     // slots / clients
     std::vector<llama_client_slot> slots;
+    json default_generation_settings_for_props;
 
     llama_server_queue queue_tasks;
     llama_server_response queue_results;
@@ -445,6 +447,9 @@ struct llama_server_context
445
447
slots.push_back (slot);
446
448
}
447
449
450
+ default_generation_settings_for_props = get_formated_generation (slots.front ());
451
+ default_generation_settings_for_props[" seed" ] = -1 ;
452
+
448
453
batch = llama_batch_init (n_ctx, 0 , params.n_parallel );
449
454
450
455
// empty system prompt
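
As an aside (not part of the patch): the snapshot taken from slots.front() above replaces the get_model_props() helper removed further down, so the default generation settings are captured once at startup instead of being read from a live slot per request. A handler consuming the new member might look roughly like the sketch below; the endpoint path and the surrounding JSON fields are assumptions, only the default_generation_settings_for_props member comes from this diff.

    // Hypothetical properties handler (sketch only; everything except the
    // member added above is an illustrative assumption).
    svr.Get("/props", [&llama](const httplib::Request &, httplib::Response & res)
    {
        json data = {
            { "default_generation_settings", llama.default_generation_settings_for_props },
        };
        res.set_content(data.dump(), "application/json");
    });
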
@@ -527,27 +532,29 @@ struct llama_server_context
         slot_params default_params;
         llama_sampling_params default_sparams;
 
-        slot->params.stream           = json_value(data, "stream", false);
-        slot->params.cache_prompt     = json_value(data, "cache_prompt", false);
-        slot->params.n_predict        = json_value(data, "n_predict", default_params.n_predict);
-        slot->sparams.top_k           = json_value(data, "top_k", default_sparams.top_k);
-        slot->sparams.top_p           = json_value(data, "top_p", default_sparams.top_p);
-        slot->sparams.min_p           = json_value(data, "min_p", default_sparams.min_p);
-        slot->sparams.tfs_z           = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p       = json_value(data, "typical_p", default_sparams.typical_p);
-        slot->sparams.temp            = json_value(data, "temperature", default_sparams.temp);
-        slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat  = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq    = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
-        slot->sparams.mirostat        = json_value(data, "mirostat", default_sparams.mirostat);
-        slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl     = json_value(data, "penalize_nl", default_sparams.penalize_nl);
-        slot->params.n_keep           = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed             = json_value(data, "seed", default_params.seed);
-        slot->sparams.grammar         = json_value(data, "grammar", default_sparams.grammar);
-        slot->sparams.n_probs         = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->params.stream             = json_value(data, "stream", false);
+        slot->params.cache_prompt       = json_value(data, "cache_prompt", false);
+        slot->params.n_predict          = json_value(data, "n_predict", default_params.n_predict);
+        slot->sparams.top_k             = json_value(data, "top_k", default_sparams.top_k);
+        slot->sparams.top_p             = json_value(data, "top_p", default_sparams.top_p);
+        slot->sparams.min_p             = json_value(data, "min_p", default_sparams.min_p);
+        slot->sparams.tfs_z             = json_value(data, "tfs_z", default_sparams.tfs_z);
+        slot->sparams.typical_p         = json_value(data, "typical_p", default_sparams.typical_p);
+        slot->sparams.temp              = json_value(data, "temperature", default_sparams.temp);
+        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+        slot->sparams.penalty_last_n    = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+        slot->sparams.penalty_repeat    = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+        slot->sparams.penalty_freq      = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+        slot->sparams.penalty_present   = json_value(data, "presence_penalty", default_sparams.penalty_present);
+        slot->sparams.mirostat          = json_value(data, "mirostat", default_sparams.mirostat);
+        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+        slot->sparams.penalize_nl       = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+        slot->params.n_keep             = json_value(data, "n_keep", slot->params.n_keep);
+        slot->params.seed               = json_value(data, "seed", default_params.seed);
+        slot->sparams.grammar           = json_value(data, "grammar", default_sparams.grammar);
+        slot->sparams.n_probs           = json_value(data, "n_probs", default_sparams.n_probs);
 
         // infill
         if (data.count("input_prefix") != 0)
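
As an aside: every assignment in the block above goes through the server's json_value() helper, which falls back to the supplied default whenever a field is missing from the request body, so a request that only sends, say, temperature keeps every other sampling parameter, including the new dynatemp_range and dynatemp_exponent, at its default. A minimal sketch of that fallback pattern, assuming nlohmann::json (which the server already uses); the real helper lives elsewhere in the server sources and may differ in detail:

    // Sketch of a json_value-style fallback helper (not the exact implementation).
    template <typename T>
    static T json_value(const nlohmann::json & body, const std::string & key, const T & default_value)
    {
        // take the request's value when present and non-null, otherwise the default
        return body.contains(key) && !body.at(key).is_null()
             ? body.value(key, default_value)
             : default_value;
    }
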
@@ -626,18 +633,36 @@ struct llama_server_context
             const int n_vocab = llama_n_vocab(model);
             for (const auto &el : *logit_bias)
             {
-                if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+                if (el.is_array() && el.size() == 2)
                 {
-                    llama_token tok = el[0].get<llama_token>();
-                    if (tok >= 0 && tok < n_vocab)
+                    float bias;
+                    if (el[1].is_number())
+                    {
+                        bias = el[1].get<float>();
+                    }
+                    else if (el[1].is_boolean() && !el[1].get<bool>())
+                    {
+                        bias = -INFINITY;
+                    }
+                    else
                     {
-                        if (el[1].is_number())
+                        continue;
+                    }
+
+                    if (el[0].is_number_integer())
+                    {
+                        llama_token tok = el[0].get<llama_token>();
+                        if (tok >= 0 && tok < n_vocab)
                         {
-                            slot->sparams.logit_bias[tok] = el[1].get<float>();
+                            slot->sparams.logit_bias[tok] = bias;
                         }
-                        else if (el[1].is_boolean() && !el[1].get<bool>())
+                    }
+                    else if (el[0].is_string())
+                    {
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        for (auto tok : toks)
                         {
-                            slot->sparams.logit_bias[tok] = -INFINITY;
+                            slot->sparams.logit_bias[tok] = bias;
                         }
                     }
                 }
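
As an aside: with this change a logit_bias entry may pair either a token id or a literal string with either a numeric bias or false, where false maps to -INFINITY (the token is never sampled); string keys are tokenized and the bias is applied to every resulting token. A request body exercising both forms could be built like the sketch below; the token id is a made-up example.

    // Illustrative request payload for the extended "logit_bias" format,
    // built with nlohmann::json (which the server already uses).
    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    static json make_example_body()
    {
        json body;
        body["prompt"]     = "Once upon a time";
        body["logit_bias"] = json::array({
            json::array({ 29871, -1.5 }),    // numeric bias for a specific (made-up) token id
            json::array({ "Hello", false })  // false bans every token produced by tokenizing "Hello"
        });
        return body;
    }
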
@@ -950,28 +975,44 @@ struct llama_server_context
             {
                 continue;
             }
-            clip_image_f32 * img_res = clip_image_f32_init();
-            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
+            clip_image_f32_batch img_res_v;
+            img_res_v.size = 0;
+            img_res_v.data = nullptr;
+            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
             {
                 LOG_TEE("Error processing the given image");
                 clip_free(clp_ctx);
+                clip_image_f32_batch_free(img_res_v);
+                return false;
+            }
+            if (img_res_v.size == 0)
+            {
+                LOG_TEE("Error processing the given image");
                 return false;
             }
+
+            // note: assumes only one image was returned by clip_image_preprocess
+            clip_image_f32 * img_res = img_res_v.data;
+
             img.image_tokens = clip_n_patches(clp_ctx);
             img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
             if (!img.image_embedding)
             {
                 LOG_TEE("Unable to allocate memory for image embeddings\n");
+                clip_image_f32_batch_free(img_res_v);
                 clip_free(clp_ctx);
                 return false;
             }
             LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
             if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
             {
                 LOG_TEE("Unable to encode image\n");
+                clip_image_f32_batch_free(img_res_v);
                 return false;
             }
-            clip_image_f32_free(img_res);
+
+            clip_image_f32_batch_free(img_res_v);
+
             img.request_encode_image = false;
         }
 
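
As an aside: every early return in the block above now has to remember to call clip_image_f32_batch_free() before bailing out. One way to avoid repeating that on each path would be a small RAII guard around the batch; this is only a sketch of the idea, not something the patch introduces, and it uses only the types and functions visible in the diff.

    // Hypothetical RAII wrapper so the batch free cannot be forgotten on an early return.
    struct clip_batch_guard
    {
        clip_image_f32_batch batch{};   // zero-initialized, like the manual size/data reset above
        clip_batch_guard() = default;
        ~clip_batch_guard() { clip_image_f32_batch_free(batch); }
        clip_batch_guard(const clip_batch_guard &) = delete;
        clip_batch_guard & operator=(const clip_batch_guard &) = delete;
    };

With such a guard the code would declare it once, pass guard.batch to clip_image_preprocess(), and drop the explicit free calls on every return path.
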
@@ -990,11 +1031,6 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    json get_model_props()
-    {
-        return get_formated_generation(slots[0]);
-    }
-
     json get_formated_generation(llama_client_slot &slot)
     {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
@@ -1005,6 +1041,8 @@ struct llama_server_context
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
+            {"dynatemp_range", slot.sparams.dynatemp_range},
+            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
             {"top_k", slot.sparams.top_k},
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
@@ -1166,13 +1204,30 @@ struct llama_server_context
         task.multitask_id = multitask_id;
 
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
-        {
-            split_multiprompt_task(task_id, task);
-        }
-
         // otherwise, it's a single-prompt task, we actually queue it
-        queue_tasks.post(task);
+        // if there's numbers in the prompt array it will be treated as an array of tokens
+        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
+            bool numbers = false;
+            for (const auto & e : task.data.at("prompt")) {
+                if (e.is_number()) {
+                    numbers = true;
+                    break;
+                }
+            }
+
+            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
+            // it will completely stall the server. I don't know where the bug for this is.
+            //
+            // if there are numbers, it needs to be treated like a single prompt,
+            // queue_tasks handles a mix of strings and numbers just fine.
+            if (numbers) {
+                queue_tasks.post(task);
+            } else {
+                split_multiprompt_task(task_id, task);
+            }
+        } else {
+            queue_tasks.post(task);
+        }
     }
 
     // for multiple images processing
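
As an aside: the new branch distinguishes a prompt array made of strings, which is fanned out into one subtask per string, from a prompt array containing numbers, which is treated as a single pre-tokenized prompt and posted directly. Roughly, the two request shapes look like the sketch below; the values are made up and only illustrate the JSON layout.

    // Two request shapes the branch above distinguishes (illustrative values only).
    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    // All strings: split_multiprompt_task() turns this into two subtasks.
    static const json multi_prompt = {
        { "prompt", json::array({ "first prompt", "second prompt" }) }
    };

    // Contains numbers: treated as one pre-tokenized prompt and posted to
    // queue_tasks directly, since the splitter cannot handle token arrays.
    static const json token_prompt = {
        { "prompt", json::array({ 1, 15043, 3186 }) }
    };
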
@@ -1254,7 +1309,10 @@ struct llama_server_context
     void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
     {
         int prompt_count = multiprompt_task.data.at("prompt").size();
-        assert(prompt_count > 1);
+        if (prompt_count <= 1) {
+            send_error(multiprompt_task, "error while handling multiple prompts");
+            return;
+        }
 
         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
@@ -1566,10 +1624,6 @@ struct llama_server_context
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }
 
-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
                     slot.cache_tokens = prompt_tokens;
 
                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
@@ -1583,6 +1637,10 @@ struct llama_server_context
                         }
                     }
 
+                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
                     LOG_VERBOSE("prompt ingested", {
                                                        {"n_past", slot.n_past},
                                                        {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},