Commit 298990a

Merge branch 'master' into Nexes_CQ_10
2 parents f1814f1 + dca1d4b commit 298990a

22 files changed (+205 / -194 lines)

common/arg.cpp

Lines changed: 15 additions & 1 deletion
@@ -1838,9 +1838,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+    add_opt(llama_arg(
+        {"--slots"},
+        format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(llama_arg(
+        {"--props"},
+        format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_props = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(llama_arg(
         {"--no-slots"},
-        format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        "disables slots monitoring endpoint",
         [](gpt_params & params) {
            params.endpoint_slots = false;
        }
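
For context, every server flag in `common/arg.cpp` is registered with the same pattern shown above: a `llama_arg` built from the flag name(s), a help string (usually via `format(...)` so the current default is printed), and a callback that mutates `gpt_params`, chained with `.set_examples(...)` and `.set_env(...)`. A minimal sketch of that pattern, assuming it sits inside `gpt_params_parser_init` where `add_opt` is in scope; the flag, field, and environment variable names below are illustrative only and not part of this commit:

```cpp
// Hypothetical flag following the --slots / --props pattern above.
// `endpoint_example` stands in for whatever boolean the flag should toggle.
add_opt(llama_arg(
    {"--example-endpoint"},
    format("enable the example endpoint (default: %s)", params.endpoint_example ? "enabled" : "disabled"),
    [](gpt_params & params) {
        params.endpoint_example = true;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EXAMPLE_ENDPOINT"));
```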

common/common.h

Lines changed: 4 additions & 1 deletion
@@ -290,7 +290,10 @@ struct gpt_params {
     std::string ssl_file_key = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT
 
-    bool endpoint_slots = true;
+    // "advanced" endpoints are disabled by default for better security
+    bool webui = true;
+    bool endpoint_slots = false;
+    bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
     bool log_json = false;

examples/export-lora/export-lora.cpp

Lines changed: 2 additions & 2 deletions
@@ -314,9 +314,9 @@ struct lora_merge_ctx {
                 // optionally dequantize it
                 printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
                 auto nels = ggml_nelements(inp_base);
-                ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+                const auto * qtype = ggml_get_type_traits(base->type);
                 std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-                qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+                qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
                 ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
             } else {
                 ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
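
The change above reflects the ggml API migration from the by-value `ggml_type_traits_t` returned by `ggml_internal_get_type_traits()` to the `const ggml_type_traits *` returned by `ggml_get_type_traits()`. A minimal sketch of the new dequantization pattern, under the assumption that the source buffer holds `nels` valid quantized elements and that the type provides `to_float`; the helper name is illustrative, not from this commit:

```cpp
#include <cstdint>
#include <vector>

#include "ggml.h"

// Dequantize `nels` elements of quantized data in `src` into 32-bit floats.
// Type traits are now accessed through a pointer, as in the export-lora change.
static std::vector<float> dequantize_to_f32(ggml_type type, const void * src, int64_t nels) {
    const auto * traits = ggml_get_type_traits(type); // was: ggml_internal_get_type_traits(type)
    std::vector<float> out(nels);
    if (traits->to_float) {
        traits->to_float(src, out.data(), nels);      // was: traits.to_float(...)
    }
    return out;
}
```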

examples/quantize-stats/quantize-stats.cpp

Lines changed: 5 additions & 5 deletions
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }
 
 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
 
 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        if (qfns.from_float && qfns.to_float) {
+        const auto * qfns = ggml_get_type_traits(type);
+        if (qfns->from_float && qfns->to_float) {
            if (params.verbose) {
                printf("testing %s ...\n", ggml_type_name(type));
            }
@@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
            test_roundtrip_on_layer(
                layer_name,
                params.per_layer_stats,
-               qfns,
+               *qfns,
                params.reference,
                kv_tensor.second,
                input_scratch,
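
The same migration applies throughout this file: the traits are obtained as a pointer, the `from_float`/`to_float` members are checked through it, and the struct is dereferenced (`*qfns`) where a `const ggml_type_traits &` is expected. A rough, self-contained sketch of the quantize/dequantize round trip this tool measures, assuming `n` is a multiple of the type's block size and that `from_float`/`to_float` keep their usual `(src, dst, count)` signatures; the function below is illustrative, not part of the patch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

#include "ggml.h"

// Quantize a float buffer and dequantize it back, returning the max absolute error.
static float roundtrip_max_error(ggml_type type, const std::vector<float> & input) {
    const auto * qfns = ggml_get_type_traits(type);
    if (!qfns->from_float || !qfns->to_float) {
        return -1.0f; // this type cannot be round-tripped
    }
    const int64_t n = (int64_t) input.size();
    std::vector<uint8_t> quantized(ggml_row_size(type, n)); // bytes needed to store n elements
    std::vector<float>   output(n);

    qfns->from_float(input.data(), quantized.data(), n);
    qfns->to_float(quantized.data(), output.data(), n);

    float max_err = 0.0f;
    for (int64_t i = 0; i < n; ++i) {
        max_err = std::max(max_err, std::fabs(output[i] - input[i]));
    }
    return max_err;
}
```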

examples/server/README.md

Lines changed: 21 additions & 34 deletions
@@ -18,6 +18,8 @@ The project is under active development, and we are [looking for feedback and co
 
 ## Usage
 
+<!-- Note for contributors: The list below is generated by llama-gen-docs -->
+
 **Common params**
 
 | Argument | Explanation |
@@ -149,7 +151,9 @@ The project is under active development, and we are [looking for feedback and co
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
@@ -380,8 +384,6 @@ node index.js
 
 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
 
-`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
-
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
 
 **Response format**
@@ -519,34 +521,41 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B
 
 Takes a prefix and a suffix and returns the predicted completion as stream.
 
-*Options:*
+*Options:*
 
-`input_prefix`: Set the prefix of the code to infill.
+- `input_prefix`: Set the prefix of the code to infill.
+- `input_suffix`: Set the suffix of the code to infill.
 
-`input_suffix`: Set the suffix of the code to infill.
+It also accepts all the options of `/completion` except `stream` and `prompt`.
 
-It also accepts all the options of `/completion` except `stream` and `prompt`.
+### **GET** `/props`: Get server global properties.
 
-- **GET** `/props`: Return current server settings.
+This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`
 
 **Response format**
 
 ```json
 {
-  "assistant_name": "",
-  "user_name": "",
+  "system_prompt": "",
   "default_generation_settings": { ... },
   "total_slots": 1,
  "chat_template": ""
 }
 ```
 
-- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
-- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 - `chat_template` - the model's original Jinja2 prompt template
 
+### POST `/props`: Change server global properties.
+
+To use this endpoint with POST method, you need to start server with `--props`
+
+*Options:*
+
+- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
+
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
 
 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
@@ -813,28 +822,6 @@ To know the `id` of the adapter, use GET `/lora-adapters`
 
 ## More examples
 
-### Change system prompt on runtime
-
-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
-
-`prompt`: Specify a context that you want all connecting clients to respect.
-
-`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
-
-`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
-
-```json
-{
-    "system_prompt": {
-        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
-        "anti_prompt": "User:",
-        "assistant_name": "Assistant:"
-    }
-}
-```
-
-**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
-
 ### Interactive mode
 
 Check the sample in [chat.mjs](chat.mjs).
