@@ -185,7 +185,7 @@ struct llama_client_slot
     llama_sampling_context *ctx_sampling = nullptr;

     int32_t ga_i = 0;   // group-attention state
-    int32_t ga_n = 1;// group-attention factor
+    int32_t ga_n = 1;   // group-attention factor
     int32_t ga_w = 512; // group-attention width

     // multimodal
@@ -1293,6 +1293,7 @@ struct llama_server_context
         for (llama_client_slot &slot : slots)
         {
             slot.cache_tokens.clear();
+            slot.n_past = 0;
         }
     }

@@ -1429,7 +1430,6 @@ struct llama_server_context
             // TODO: we always have to take into account the "system_tokens"
             // this is not great and needs to be improved somehow
             llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);
-
             slot.n_past += 1;
         }

@@ -1540,25 +1540,6 @@ struct llama_server_context
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
                     slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;

-                    if (slot.ga_n != 1)
-                    {
-                        int ga_i = 0;
-                        int32_t ga_n = slot.ga_n;
-                        int32_t ga_w = slot.ga_w;
-                        int32_t slot_npast = 0;
-                        for (int k = 0; k < slot.n_past; ++k)
-                        {
-                            while (slot_npast >= ga_i + ga_w) {
-                                const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                slot_npast -= bd;
-                                ga_i += ga_w/ga_n;
-                            }
-                            slot_npast++;
-                        }
-                        slot.n_past = slot_npast;
-                        slot.ga_i = ga_i;
-                    }
-
                     LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                 }

@@ -1573,25 +1554,44 @@ struct llama_server_context
                     // we have to evaluate at least 1 token to generate logits.
                     LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
                     slot.n_past--;
-                    if (slot.ga_i > 0)
-                    {
-                        slot.n_past--;
-                    }
                 }

                 LOG_VERBOSE("prompt ingested", {
-                    {"n_past", slot.n_past},
-                    {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
+                    {"n_past", slot.n_past},
+                    {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
                     {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
                 });

+                if (slot.ga_n != 1)
+                {
+                    int ga_i = 0;
+                    int32_t ga_n = slot.ga_n;
+                    int32_t ga_w = slot.ga_w;
+                    int32_t slot_npast = 0;
+                    for (int k = 0; k < slot.n_past; ++k)
+                    {
+                        while (slot_npast >= ga_i + ga_w) {
+                            const int bd = (ga_w/ga_n)*(ga_n - 1);
+                            slot_npast -= bd;
+                            ga_i += ga_w/ga_n;
+                        }
+                        slot_npast++;
+                    }
+                    slot.n_past = slot_npast;
+                    slot.ga_i = ga_i;
+
+                    LOG_TEE("slot %d : applied self-extend to prompt: %i tokens\n", slot.id, slot.n_past);
+                }
+
                 const bool has_images = process_images(slot);

                 // process the prefix of first image
                 std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
-                int ga_i = slot.ga_i;
+
+                int32_t ga_i = slot.ga_i;
                 int32_t ga_n = slot.ga_n;
                 int32_t ga_w = slot.ga_w;
+
                 for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
                 {
                     if (slot.ga_n != 1)
@@ -1603,7 +1603,6 @@ struct llama_server_context
                         }
                     }
                     llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, {slot.id}, false);
-                    slot.n_past += 1;
                 }

                 if (has_images && !ingest_images(slot, n_batch))
@@ -1660,7 +1659,6 @@ struct llama_server_context

                         LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past + bd, slot.n_past, slot.ga_i);
                     }
-                    slot.n_past += n_tokens;
                 }
             }
             llama_batch batch_view =
@@ -1779,51 +1777,51 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
-        printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
+        printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported())
     {
-        printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    printf(" --numa attempt optimizations that help on some NUMA systems\n");
+    printf(" --numa attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     printf(" -ngl N, --n-gpu-layers N\n");
-    printf(" number of layers to store in VRAM\n");
+    printf(" number of layers to store in VRAM\n");
     printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf(" how to split the model across multiple GPUs, one of:\n");
-    printf(" - none: use one GPU only\n");
-    printf(" - layer (default): split layers and KV across GPUs\n");
-    printf(" - row: split rows across GPUs\n");
+    printf(" how to split the model across multiple GPUs, one of:\n");
+    printf(" - none: use one GPU only\n");
+    printf(" - layer (default): split layers and KV across GPUs\n");
+    printf(" - row: split rows across GPUs\n");
     printf(" -ts SPLIT --tensor-split SPLIT\n");
-    printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-    printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
-    printf(" or for intermediate results and KV (with split-mode = row)\n");
+    printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+    printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
+    printf(" or for intermediate results and KV (with split-mode = row)\n");
 #endif
     printf(" -m FNAME, --model FNAME\n");
-    printf(" model path (default: %s)\n", params.model.c_str());
+    printf(" model path (default: %s)\n", params.model.c_str());
     printf(" -a ALIAS, --alias ALIAS\n");
-    printf(" set an alias for the model, will be added as `model` field in completion response\n");
-    printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
-    printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
-    printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
-    printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
-    printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
-    printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
-    printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
-    printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
-    printf(" -spf FNAME, --system-prompt-file FNAME\n");
-    printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
-    printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
-    printf(" --log-disable disables logging to a file.\n");
+    printf(" set an alias for the model, will be added as `model` field in completion response\n");
+    printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
+    printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
+    printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
+    printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
+    printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
+    printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf(" -spf FNAME, --system-prompt-file FNAME\n");
+    printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
+    printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
+    printf(" --log-disable disables logging to a file.\n");
     printf("\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
-    printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
-    printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
+    printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
+    printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
+    printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
     printf("\n");
 }

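Note on the change above: the block this commit moves to run after the `LOG_VERBOSE("prompt ingested", ...)` call is the self-extend (group-attention) position remapping. It walks the `slot.n_past` cached positions and folds every full window of `ga_w` positions down by a factor of `ga_n`, then stores the grouped position and window offset back into `slot.n_past` and `slot.ga_i`. A minimal standalone sketch of that remapping follows; the helper name and the example values are hypothetical, chosen only to illustrate the arithmetic, and are not part of server.cpp.

#include <cstdint>
#include <cstdio>

// Same loop as in the diff above, lifted into a free function for clarity.
// Each time the grouped position reaches the end of the current window
// (ga_i + ga_w), the window is compressed by ga_n, saving
// bd = (ga_w/ga_n)*(ga_n - 1) positions and advancing ga_i by ga_w/ga_n.
static void remap_self_extend(int32_t n_past, int32_t ga_n, int32_t ga_w,
                              int32_t & slot_npast, int32_t & ga_i) {
    slot_npast = 0;
    ga_i       = 0;
    for (int32_t k = 0; k < n_past; ++k) {
        while (slot_npast >= ga_i + ga_w) {
            const int32_t bd = (ga_w/ga_n)*(ga_n - 1);
            slot_npast -= bd;
            ga_i       += ga_w/ga_n;
        }
        slot_npast++;
    }
}

int main() {
    int32_t slot_npast = 0;
    int32_t ga_i       = 0;
    // Example values: 2048 cached tokens, group-attention factor 4, width 512.
    remap_self_extend(2048, 4, 512, slot_npast, ga_i);
    // Prints: slot_npast = 896, ga_i = 384
    printf("slot_npast = %d, ga_i = %d\n", slot_npast, ga_i);
    return 0;
}

With these example values, three windows get compressed at 384 positions saved each, so 2048 cached tokens map to grouped position 896 with ga_i = 384; the server then builds the batch from that remapped position.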