@@ -1704,19 +1704,6 @@ struct server_context {
1704
1704
// next, batch any pending prompts without exceeding n_batch
1705
1705
if (params.cont_batching || batch.n_tokens == 0 ) {
1706
1706
for (auto & slot : slots) {
1707
- const bool has_prompt = slot.prompt .is_array () || (slot.prompt .is_string () && !slot.prompt .get <std::string>().empty ());
1708
-
1709
- // empty prompt passed -> release the slot and send empty response
1710
- // note: infill mode allows empty prompt
1711
- if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT && !has_prompt && !slot.infill ) {
1712
- slot.state = SLOT_STATE_PROCESSING;
1713
- slot.command = SLOT_COMMAND_NONE;
1714
- slot.release ();
1715
- slot.print_timings ();
1716
- send_final_response (slot);
1717
- continue ;
1718
- }
1719
-
1720
1707
// this slot still has a prompt to be processed
1721
1708
if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
1722
1709
auto & prompt_tokens = slot.prompt_tokens ;
@@ -1768,6 +1755,21 @@ struct server_context {
1768
1755
{" prompt_tokens" , tokens_to_str (ctx, prompt_tokens.cbegin (), prompt_tokens.cend ())},
1769
1756
});
1770
1757
1758
+ // empty prompt passed -> release the slot and send empty response
1759
+ if (prompt_tokens.empty ()) {
1760
+ LOG_INFO (" empty prompt - releasing slot" , {
1761
+ {" id_slot" , slot.id },
1762
+ {" id_task" , slot.id_task }
1763
+ });
1764
+
1765
+ slot.state = SLOT_STATE_PROCESSING;
1766
+ slot.command = SLOT_COMMAND_NONE;
1767
+ slot.release ();
1768
+ slot.print_timings ();
1769
+ send_final_response (slot);
1770
+ continue ;
1771
+ }
1772
+
1771
1773
if (slot.embedding ) {
1772
1774
// this prompt is too large to process - discard it
1773
1775
if (slot.n_prompt_tokens > n_batch) {
0 commit comments