
Commit e529aea

run_with_preset.py: server, llama-bench support
1 parent: 487f622

3 files changed: +30, -5 lines

common/common.cpp

Lines changed: 1 addition & 1 deletion
@@ -941,6 +941,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "###############\n");
     fprintf(stream, "\n");
 
+    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
     dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str(), false);
     fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
@@ -987,7 +988,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
-    fprintf(stream, "model_alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");

examples/server/server.cpp

Lines changed: 1 addition & 1 deletion (whitespace-only alignment of the help text)
@@ -720,7 +720,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
     fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
     fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
     fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");

run_with_preset.py

Lines changed: 28 additions & 3 deletions
@@ -7,7 +7,7 @@
 
 import yaml
 
-CLI_ARGS_MAIN = [
+CLI_ARGS_MAIN_PERPLEXITY = [
     "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
     "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
@@ -21,8 +21,19 @@
     "temp", "tfs", "top-k", "top-p", "typical", "verbose-prompt"
 ]
 
+CLI_ARGS_LLAMA_BENCH = [
+    "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
+    "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
+]
+
+CLI_ARGS_SERVER = [
+    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
+    "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "no-mmap", "no-mul-mat-q", "numa",
+    "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", "threads", "verbose"
+]
+
 description = """Run llama.cpp binaries with presets from YAML file(s).
-To specify which binary should be run, specify the "binary" property.
+To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported).
 To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
 
 Formatting considerations:
@@ -40,6 +51,7 @@
 
 parser = argparse.ArgumentParser(
     description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument("-bin", "--binary", help="The binary to run.")
 parser.add_argument("yaml_files", nargs="*",
                     help="Arbitrary number of YAML files from which to read preset values. "
                     "If two files specify the same values the later one will be used.")
@@ -59,12 +71,25 @@
 props = {prop.replace("_", "-"): val for prop, val in props.items()}
 
 binary = props.pop("binary", "main")
+if known_args.binary:
+    binary = known_args.binary
+
 if os.path.exists(f"./{binary}"):
     binary = f"./{binary}"
 
+if binary.endswith("main") or binary.endswith("perplexity"):
+    cli_args = CLI_ARGS_MAIN_PERPLEXITY
+elif binary.endswith("llama-bench"):
+    cli_args = CLI_ARGS_LLAMA_BENCH
+elif binary.endswith("server"):
+    cli_args = CLI_ARGS_SERVER
+else:
+    print(f"Unknown binary: {binary}")
+    sys.exit(1)
+
 command_list = [binary]
 
-for cli_arg in CLI_ARGS_MAIN:
+for cli_arg in cli_args:
     value = props.get(cli_arg, None)
 
     if not value or value == -1:
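
Putting it together: a preset can now target server or llama-bench, either through the "binary" property inside the YAML file or through the new -bin/--binary flag, which takes precedence. A minimal, hypothetical server preset for illustration; the file name and all values are placeholders, and the property names are taken from CLI_ARGS_SERVER above:

    # server-preset.yml (hypothetical example)
    binary: server
    model: models/7B/ggml-model.bin
    alias: my-7b
    host: 0.0.0.0
    port: 8080
    ctx-size: 2048
    n-gpu-layers: 35

It would then be run along the lines of:

    ./run_with_preset.py server-preset.yml
    # or, overriding whatever "binary" the preset specifies:
    ./run_with_preset.py --binary server server-preset.yml

Note that the binary name is matched with endswith(), so paths such as ./server or build/bin/server select the same argument list.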
