
Commit e529aea

run_with_preset.py: server, llama-bench support
1 parent: 487f622

3 files changed: +30, -5 lines

common/common.cpp

Lines changed: 1 addition & 1 deletion
@@ -941,6 +941,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "###############\n");
     fprintf(stream, "\n");
 
+    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
     dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str(), false);
     fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
@@ -987,7 +988,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
-    fprintf(stream, "model_alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");

examples/server/server.cpp

Lines changed: 1 addition & 1 deletion (whitespace-only alignment of the help text)
@@ -720,7 +720,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
     fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
     fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
     fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");

run_with_preset.py

Lines changed: 28 additions & 3 deletions
@@ -7,7 +7,7 @@
 
 import yaml
 
-CLI_ARGS_MAIN = [
+CLI_ARGS_MAIN_PERPLEXITY = [
     "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
     "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
@@ -21,8 +21,19 @@
     "temp", "tfs", "top-k", "top-p", "typical", "verbose-prompt"
 ]
 
+CLI_ARGS_LLAMA_BENCH = [
+    "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
+    "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
+]
+
+CLI_ARGS_SERVER = [
+    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
+    "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "no-mmap", "no-mul-mat-q", "numa",
+    "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", "threads", "verbose"
+]
+
 description = """Run llama.cpp binaries with presets from YAML file(s).
-To specify which binary should be run, specify the "binary" property.
+To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported).
 To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
 
 Formatting considerations:
@@ -40,6 +51,7 @@
 
 parser = argparse.ArgumentParser(
     description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument("-bin", "--binary", help="The binary to run.")
 parser.add_argument("yaml_files", nargs="*",
                     help="Arbitrary number of YAML files from which to read preset values. "
                     "If two files specify the same values the later one will be used.")
@@ -59,12 +71,25 @@
 props = {prop.replace("_", "-"): val for prop, val in props.items()}
 
 binary = props.pop("binary", "main")
+if known_args.binary:
+    binary = known_args.binary
+
 if os.path.exists(f"./{binary}"):
     binary = f"./{binary}"
 
+if binary.endswith("main") or binary.endswith("perplexity"):
+    cli_args = CLI_ARGS_MAIN_PERPLEXITY
+elif binary.endswith("llama-bench"):
+    cli_args = CLI_ARGS_LLAMA_BENCH
+elif binary.endswith("server"):
+    cli_args = CLI_ARGS_SERVER
+else:
+    print(f"Unknown binary: {binary}")
+    sys.exit(1)
+
 command_list = [binary]
 
-for cli_arg in CLI_ARGS_MAIN:
+for cli_arg in cli_args:
     value = props.get(cli_arg, None)
 
     if not value or value == -1:
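
Putting it together: a preset can now target server or llama-bench, either through the "binary" property inside the YAML file or through the new -bin/--binary flag, which takes precedence. A minimal, hypothetical server preset for illustration; the file name and all values are placeholders, and the property names are taken from CLI_ARGS_SERVER above:

    # server-preset.yml (hypothetical example)
    binary: server
    model: models/7B/ggml-model.bin
    alias: my-7b
    host: 0.0.0.0
    port: 8080
    ctx-size: 2048
    n-gpu-layers: 35

It would then be run along the lines of:

    ./run_with_preset.py server-preset.yml
    # or, overriding whatever "binary" the preset specifies:
    ./run_with_preset.py --binary server server-preset.yml

Note that the binary name is matched with endswith(), so paths such as ./server or build/bin/server select the same argument list.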
