Skip to content

Commit da840a3

Browse files
committed
Merge branch 'master' into Nexes_CQ_10
2 parents 056c47d + 116efee commit da840a3

File tree

10 files changed

+265
-44
lines changed

10 files changed

+265
-44
lines changed

common/arg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
691691
[](gpt_params & params) {
692692
params.ctx_shift = false;
693693
}
694-
).set_examples({LLAMA_EXAMPLE_MAIN}));
694+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
695695
add_opt(llama_arg(
696696
{"--chunks"}, "N",
697697
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),

examples/server/README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@ The project is under active development, and we are [looking for feedback and co
2121
| -------- | ----------- |
2222
| `-h, --help, --usage` | print usage and exit |
2323
| `--version` | show version and build info |
24-
| `-v, --verbose` | print verbose information |
25-
| `--verbosity N` | set specific verbosity level (default: 0) |
2624
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
2725
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
2826
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
@@ -40,15 +38,18 @@ The project is under active development, and we are [looking for feedback and co
4038
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
4139
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
4240
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
41+
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled) |
4342
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
4443
| `-p, --prompt PROMPT` | prompt to start generation with |
44+
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
4545
| `-f, --file FNAME` | a file containing the prompt (default: none) |
4646
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
4747
| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
4848
| `--no-escape` | do not process escape sequences |
49+
| `-sp, --special` | special tokens output enabled (default: false) |
4950
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
5051
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
51-
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
52+
| `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
5253
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
5354
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
5455
| `--penalize-nl` | penalize newline tokens (default: false) |
@@ -87,7 +88,7 @@ The project is under active development, and we are [looking for feedback and co
8788
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
8889
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
8990
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
90-
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
91+
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
9192
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
9293
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
9394
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
@@ -128,12 +129,13 @@ The project is under active development, and we are [looking for feedback and co
128129
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
129130
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
130131
| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
131-
| `--log-test` | Log test |
132132
| `--log-disable` | Log disable |
133-
| `--log-enable` | Log enable |
134-
| `--log-new` | Log new |
135-
| `--log-append` | Log append |
136-
| `--log-file FNAME` | Log file |
133+
| `--log-file FNAME` | Log to file |
134+
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
135+
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
136+
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
137+
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
138+
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
137139

138140
Note: If both a command line argument and an environment variable are set for the same param, the argument will take precedence over the env var.
139141

examples/server/server.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1180,6 +1180,15 @@ struct server_context {
11801180
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
11811181
}
11821182

1183+
// if context shift is disabled, we stop when it reaches the context limit
1184+
if (slot.n_decoded >= slot.n_ctx) {
1185+
slot.truncated = true;
1186+
slot.stopped_limit = true;
1187+
slot.has_next_token = false;
1188+
1189+
SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
1190+
}
1191+
11831192
if (llama_token_is_eog(model, result.tok)) {
11841193
slot.stopped_eos = true;
11851194
slot.has_next_token = false;
@@ -1480,7 +1489,7 @@ struct server_context {
14801489
if (result.error) {
14811490
error_handler(result.data);
14821491
cancel_tasks(id_tasks);
1483-
break;
1492+
return;
14841493
}
14851494

14861495
size_t idx = result.data["index"];
@@ -1827,6 +1836,14 @@ struct server_context {
18271836
for (server_slot & slot : slots) {
18281837
if (slot.ga_n == 1) {
18291838
if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
1839+
if (!params.ctx_shift) {
1840+
// this check is redundant, but it is kept as a defensive safety net
1841+
// we should never get here, because generation should have already been stopped in process_token()
1842+
slot.release();
1843+
send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
1844+
continue;
1845+
}
1846+
18301847
// Shift context
18311848
const int n_keep = slot.params.n_keep + add_bos_token;
18321849
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
@@ -1961,6 +1978,14 @@ struct server_context {
19611978
continue;
19621979
}
19631980
} else {
1981+
if (!params.ctx_shift) {
1982+
// if context shift is disabled, we make sure prompt size is smaller than KV size
1983+
if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
1984+
slot.release();
1985+
send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
1986+
continue;
1987+
}
1988+
}
19641989
if (slot.params.n_keep < 0) {
19651990
slot.params.n_keep = slot.n_prompt_tokens;
19661991
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
@llama.cpp
2+
@ctx_shift
3+
Feature: llama.cpp server
4+
5+
Background: Server startup
6+
Given a server listening on localhost:8080
7+
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
8+
And a model file test-model.gguf
9+
And a model alias tinyllama-2
10+
And BOS token is 1
11+
And 42 as server seed
12+
And 256 KV cache size
13+
And 32 as batch size
14+
And 2 slots
15+
16+
Scenario: Inference with context shift
17+
And 64 server max tokens to predict
18+
Then the server is starting
19+
Then the server is healthy
20+
Given a prompt:
21+
"""
22+
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
23+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
24+
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
25+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
26+
"""
27+
And a completion request with no api error
28+
Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
29+
And the completion is truncated
30+
And 109 prompt tokens are processed
31+
32+
Scenario Outline: Inference without context shift
33+
And <n_predict> server max tokens to predict
34+
And disable context shifting
35+
Then the server is starting
36+
Then the server is healthy
37+
Given a prompt:
38+
"""
39+
Hi how are you
40+
"""
41+
And a completion request with no api error
42+
Then <n_token_output> tokens are predicted matching twind|Anna
43+
And the completion is <truncated> truncated
44+
And 8 prompt tokens are processed
45+
Examples:
46+
| n_predict | n_token_output | truncated |
47+
| 64 | 64 | not |
48+
| -1 | 120 | |
49+
50+
Scenario: Inference without context shift (expected error: prompt too long)
51+
And disable context shifting
52+
Then the server is starting
53+
Then the server is healthy
54+
Given a prompt:
55+
"""
56+
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
57+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
58+
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
59+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
60+
"""
61+
And a completion request with 400 api error
62+

examples/server/tests/features/embeddings.feature

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ Feature: llama.cpp server
1010
And 42 as server seed
1111
And 2 slots
1212
# the bert-bge-small model has context size of 512
13-
# since the generated prompts are as big as the batch size, we need to set the batch size to 512
13+
# since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
1414
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
15-
And 512 as batch size
16-
And 512 as ubatch size
17-
And 2048 KV cache size
15+
And 128 as batch size
16+
And 128 as ubatch size
17+
And 512 KV cache size
1818
And embeddings extraction
1919
Then the server is starting
2020
Then the server is healthy
@@ -26,6 +26,20 @@ Feature: llama.cpp server
2626
"""
2727
Then embeddings are generated
2828

29+
Scenario: Embedding (error: prompt too long)
30+
When embeddings are computed for:
31+
"""
32+
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
33+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
34+
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
35+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
36+
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
37+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
38+
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
39+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
40+
"""
41+
And embeddings request with 500 api error
42+
2943
Scenario: OAI Embeddings compatibility
3044
Given a model bert-bge-small
3145
When an OAI compatible embeddings computation request for:

examples/server/tests/features/steps/steps.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
7777
context.response_format = None
7878
context.temperature = None
7979
context.lora_file = None
80+
context.disable_ctx_shift = False
8081

8182
context.tasks_result = []
8283
context.concurrent_tasks = []
@@ -148,7 +149,7 @@ def step_n_slots(context, n_slots: int):
148149

149150
@step('{n_predict:d} server max tokens to predict')
150151
def step_server_n_predict(context, n_predict: int):
151-
context.n_server_predict = n_predict
152+
context.n_server_predict = n_predict if n_predict > 0 else None
152153

153154

154155
@step('{slot_save_path} as slot save path')
@@ -180,6 +181,9 @@ def step_server_embeddings(context):
180181
def step_server_metrics(context):
181182
context.server_metrics = True
182183

184+
@step('disable context shifting')
185+
def step_server_disable_ctx_shift(context):
186+
context.disable_ctx_shift = True
183187

184188
@step("the server is starting")
185189
def step_start_server(context):
@@ -257,7 +261,7 @@ async def step_all_slots_status(context, expected_slot_status_string: Literal['i
257261
@step('a completion request with {api_error} api error')
258262
@async_run_until_complete
259263
async def step_request_completion(context, api_error: Literal['raised'] | str):
260-
expect_api_error = api_error == 'raised'
264+
expect_api_error = api_error == 'raised' or api_error != 'no'
261265
seeds = await completions_seed(context, num_seeds=1)
262266
completion = await request_completion(context.prompts.pop(),
263267
seeds[0] if seeds is not None else seeds,
@@ -272,8 +276,11 @@ async def step_request_completion(context, api_error: Literal['raised'] | str):
272276
context.tasks_result.append(completion)
273277
if context.debug:
274278
print(f"Completion response: {completion}")
275-
if expect_api_error:
279+
if api_error == 'raised':
276280
assert completion == 401, f"completion must be an 401 status code: {completion}"
281+
elif api_error.isdigit():
282+
api_error_code = int(api_error)
283+
assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"
277284

278285

279286
@step('{predicted_n:d} tokens are predicted matching {re_content}')
@@ -645,6 +652,9 @@ def step_assert_embeddings(context):
645652
for embedding in context.embeddings:
646653
assert_embeddings(embedding)
647654

655+
@step('embeddings request with {api_error_code:d} api error')
656+
def step_assert_embeddings(context, api_error_code: int):
657+
assert context.embeddings == api_error_code, f"embeddings request must return code {api_error_code}, but got {context.embeddings}"
648658

649659
@step('an OAI compatible embeddings computation request for')
650660
@async_run_until_complete
@@ -1089,15 +1099,17 @@ async def oai_chat_completions(user_prompt,
10891099
return completion_response
10901100

10911101

1092-
async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
1102+
async def request_embedding(content, seed, base_url=None) -> list[list[float]] | int:
10931103
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
10941104
async with session.post(f'{base_url}/embedding',
10951105
json={
10961106
"content": content,
10971107
}) as response:
1098-
assert response.status == 200
1099-
response_json = await response.json()
1100-
return [response_json['embedding']]
1108+
if response.status == 200:
1109+
response_json = await response.json()
1110+
return [response_json['embedding']]
1111+
else:
1112+
return response.status
11011113

11021114

11031115
async def request_oai_embeddings(input, seed,
@@ -1372,6 +1384,8 @@ def start_server_background(context):
13721384
server_args.append('--verbose')
13731385
if context.lora_file:
13741386
server_args.extend(['--lora', context.lora_file])
1387+
if context.disable_ctx_shift:
1388+
server_args.extend(['--no-context-shift'])
13751389

13761390
args = [str(arg) for arg in [context.server_path, *server_args]]
13771391
print(f"bench: starting server with: {' '.join(args)}")

ggml/src/ggml-cuda.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2890,6 +2890,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
28902890
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
28912891
return true;
28922892
}
2893+
if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
2894+
return true;
2895+
}
28932896
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
28942897
return true;
28952898
}

0 commit comments

Comments
 (0)