From 6a3a2fcc5b50431dc133eee4a86d512506d0550e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 4 Sep 2024 13:37:09 +0200
Subject: [PATCH 01/20] (wip) argparser v3

---
 common/common.cpp      | 73 ++++++++++++++++++++++++++++++++++++++++++
 common/common.h        | 63 +++++++++++++++++++++++++++++++++++-
 examples/main/main.cpp |  3 ++
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 9fa18472512ab..34d0eff78f312 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1714,6 +1714,79 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif
 
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+static std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+void gpt_params_print_usage(std::vector<llama_arg> & options) {
+    constexpr static int n_leading_spaces = 40;
+    std::string leading_spaces(n_leading_spaces, ' ');
+    for (const auto & opt : options) {
+        std::ostringstream ss;
+        for (const auto & arg : opt.args) {
+            if (&arg == &opt.args.front()) {
+                ss << format("%-7s", (arg + ",").c_str());
+            } else {
+                ss << arg << (&arg != &opt.args.back() ? ", " : "");
+            }
+        }
+        if (!opt.value_ex.empty()) ss << " " << opt.value_ex;
+        if (ss.tellp() > n_leading_spaces - 3) {
+            // current line is too long, add new line
+            ss << "\n" << leading_spaces;
+        } else {
+            // padding between arg and help, same line
+            ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
+        }
+        const auto help_lines = llama_arg::break_str_into_lines(opt.help, 50);
+        for (const auto & line : help_lines) {
+            ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
"" : leading_spaces) << line << "\n"; + } + printf("%s", ss.str().c_str()); + } +} + +std::vector gpt_params_parser_register(gpt_params & params) { + std::vector options; + options.push_back(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [¶ms, &options]() { + gpt_params_print_usage(options); + exit(0); + return true; + } + )); + options.push_back(llama_arg( + {"-m", "--model"}, + format("model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)", params.model.c_str()), + [¶ms](std::string value) { + params.model = value; + return true; + } + ).set_value_ex("FNAME")); + return options; +} + +bool gpt_params_parser_run(int argc, char ** argv, std::vector & options) { + for (const auto & opt : options) { + if (opt.handler_void) opt.handler_void(); + } + return true; +} + void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; diff --git a/common/common.h b/common/common.h index cb5e7f6df10c5..04f4476f039de 100644 --- a/common/common.h +++ b/common/common.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -123,7 +124,7 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = ""; // model path + std::string model = "model.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string model_url = ""; // model url to download @@ -277,6 +278,66 @@ struct gpt_params { std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; +enum llama_example { + LLAMA_EXAMPLE_ALL, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_MAIN, +}; + +struct llama_arg { + std::set examples = {LLAMA_EXAMPLE_ALL}; + std::vector args; + std::string value_ex; + std::string env; + std::string help; + std::function handler_void = nullptr; + std::function handler_string = nullptr; + std::function handler_bool = nullptr; + std::function handler_int = nullptr; + std::function handler_float = nullptr; + + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_string(handler) {} + + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_bool(handler) {} + + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} + + llama_arg & set_examples(std::set _examples) { + examples = std::move(_examples); + return *this; + } + + llama_arg & set_value_ex(std::string _value_ex) { + value_ex = std::move(_value_ex); + return *this; + } + + llama_arg & set_env(std::string _env) { + env = _env; + return *this; + } + + // utility function + static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string word, line; + while (iss >> word) { + if (line.length() + !line.empty() + word.length() > max_char_per_line) { + if (!line.empty()) result.push_back(line); + line = word; + } else { + line += (!line.empty() ? 
" " : "") + word; + } + } + if (!line.empty()) result.push_back(line); + return result; + } +}; + +std::vector gpt_params_parser_register(gpt_params & params); +bool gpt_params_parser_run(int argc, char ** argv, std::vector & options); + void gpt_params_parse_from_env(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c55efbb66d7c1..6a025ed512217 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -131,6 +131,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector Date: Thu, 5 Sep 2024 15:55:44 +0200 Subject: [PATCH 02/20] migrated --- common/common.cpp | 3326 +++++++++-------- common/common.h | 81 +- examples/batched-bench/batched-bench.cpp | 8 +- examples/batched/batched.cpp | 8 +- .../cvector-generator/cvector-generator.cpp | 8 +- examples/embedding/embedding.cpp | 4 +- examples/eval-callback/eval-callback.cpp | 4 +- examples/export-lora/export-lora.cpp | 8 +- examples/gritlm/gritlm.cpp | 4 +- examples/imatrix/imatrix.cpp | 8 +- examples/infill/infill.cpp | 4 +- examples/llava/llava-cli.cpp | 10 +- examples/llava/minicpmv-cli.cpp | 5 +- examples/lookahead/lookahead.cpp | 4 +- examples/lookup/lookup-create.cpp | 4 +- examples/lookup/lookup-stats.cpp | 4 +- examples/lookup/lookup.cpp | 4 +- examples/main/main.cpp | 7 +- examples/parallel/parallel.cpp | 4 +- examples/passkey/passkey.cpp | 8 +- examples/perplexity/perplexity.cpp | 4 +- examples/retrieval/retrieval.cpp | 8 +- examples/save-load-state/save-load-state.cpp | 4 +- examples/server/server.cpp | 7 +- examples/simple/simple.cpp | 8 +- examples/speculative/speculative.cpp | 4 +- 26 files changed, 1803 insertions(+), 1745 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 34d0eff78f312..09e3a992c6a06 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -306,6 +306,32 @@ bool set_process_priority(enum ggml_sched_priority prio) { // CLI argument parsing // +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) 
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
@@ -352,22 +378,60 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
     }
 }
 
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
-    bool invalid_param = false;
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
     std::string arg;
     const std::string arg_prefix = "--";
     llama_sampling_params & sparams = params.sparams;
 
+    std::unordered_map<std::string, const llama_arg *> arg_to_options;
+    for (const auto & opt : options) {
+        for (const auto & arg : opt.args) {
+            arg_to_options[arg] = &opt;
+        }
+    }
+
+    auto check_arg = [&](int i) {
+        if (i+1 >= argc) {
+            throw std::invalid_argument("expected value for argument");
+        }
+    };
+
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
-        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
-            throw std::invalid_argument("error: unknown argument: " + arg);
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
        }
-        if (invalid_param) {
-            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        try {
+            auto opt = *arg_to_options[arg];
+            if (opt.handler_void) {
+                opt.handler_void();
+                continue;
+            }
+
+            // arg with single value
+            check_arg(i);
+            std::string val = argv[++i];
+            if (opt.handler_int) {
+                opt.handler_int(std::stoi(val));
+                continue;
+            }
+            if (opt.handler_string) {
+                opt.handler_string(val);
+                continue;
+            }
+
+            // arg with 2 values
+            check_arg(i);
+            std::string val2 = argv[++i];
+            if (opt.handler_str_str) {
+                opt.handler_str_str(val, val2);
+                continue;
+            }
+        } catch (std::exception & e) {
+            throw std::invalid_argument(format("error: %s", e.what()));
        }
     }
 
@@ -404,41 +468,21 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
-void gpt_params_parse_from_env(gpt_params & params) {
-    // we only care about server-related params for now
-    get_env("LLAMA_ARG_MODEL", params.model);
-    get_env("LLAMA_ARG_MODEL_URL", params.model_url);
-    get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
-    get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
-    get_env("LLAMA_ARG_HF_FILE", params.hf_file);
-    get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
-    get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
-    get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
-    get_env("LLAMA_ARG_BATCH", params.n_batch);
-    get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
-    get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
-    get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
-    get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
-    get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
-    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
-    get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
-    get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
-    get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
-    get_env("LLAMA_ARG_DEFRAG_THOLD",
params.defrag_thold); - get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); - get_env("LLAMA_ARG_HOST", params.hostname); - get_env("LLAMA_ARG_PORT", params.port); -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { const auto params_org = params; // the example can modify the default params try { - if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { + if (!gpt_params_parse_ex(argc, argv, params, options)) { params = params_org; - params.usage = true; return false; } + if (params.usage) { + gpt_params_print_usage(options); + if (params.print_usage) { + params.print_usage(argc, argv); + } + exit(0); + } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); params = params_org; @@ -521,1646 +565,1690 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); + } else { + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? " " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); + } + }; + while (std::getline(iss, line)) { + add_line(line); + } + return result; +} + +void gpt_params_print_usage(std::vector & options) { + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + + auto print_options = [](std::vector & options) { + std::string leading_spaces(n_leading_spaces, ' '); + for (const auto & opt : options) { + std::ostringstream ss; + for (const auto & arg : opt->args) { + if (&arg == &opt->args.front()) { + ss << (opt->args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); + } else { + ss << arg << (&arg != &opt->args.back() ? ", " : ""); + } + } + if (!opt->value_hint.empty()) ss << " " << opt->value_hint; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(opt->help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? 
"" : leading_spaces) << line << "\n"; + } + printf("%s", ss.str().c_str()); + } + }; -bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { - const char split_delim = ','; + std::vector common_options; + std::vector specific_options; + for (auto & opt : options) { + if (opt.in_example(LLAMA_EXAMPLE_COMMON)) { + common_options.push_back(&opt); + } else { + specific_options.push_back(&opt); + } + } + printf("----- common options -----\n\n"); + print_options(common_options); + printf("\n\n----- example-specific options -----\n\n"); + print_options(specific_options); +} +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex) { + return gpt_params_parser_init(params, ex, nullptr); +} + +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { + std::vector options; + params.print_usage = print_usage; llama_sampling_params & sparams = params.sparams; - if (arg == "-s" || arg == "--seed") { - CHECK_ARG - // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. - params.seed = std::stoul(argv[i]); - sparams.seed = std::stoul(argv[i]); - return true; + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } - if (arg == "-t" || arg == "--threads") { - CHECK_ARG - params.cpuparams.n_threads = std::stoi(argv[i]); - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); + sampler_type_names.pop_back(); + const char split_delim = ','; + + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + std::unordered_set seen_args; + auto add_opt = [&](llama_arg arg) { + if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + // make sure there is no argument duplications + for (const auto & a : arg.args) { + if (seen_args.find(a) == seen_args.end()) { + seen_args.insert(a); + } else { + throw std::runtime_error(format("found duplicated argument in source code: %s", a.c_str())); + } + } + options.push_back(std::move(arg)); } - return true; - } - if (arg == "-C" || arg == "--cpu-mask") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); - return true; - } - if (arg == "-Cr" || arg == "--cpu-range") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); - return true; - } - if (arg == "--prio") { - CHECK_ARG - params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict") { - CHECK_ARG - params.cpuparams.strict_cpu = std::stoul(argv[i]); - return true; - } - if (arg == "--poll") { - CHECK_ARG - params.cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tb" || arg == "--threads-batch") { - CHECK_ARG - params.cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.cpuparams_batch.n_threads <= 0) { - 
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + }; + + + add_opt(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [¶ms]() { + params.usage = true; } - return true; - } - if (arg == "-Cb" || arg == "--cpu-mask-batch") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "-Crb" || arg == "--cpu-range_batch") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch") { - CHECK_ARG - params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch") { - params.cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch") { - CHECK_ARG - params.cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-td" || arg == "--threads-draft") { - CHECK_ARG - params.draft_cpuparams.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + )); + add_opt(llama_arg( + {"--version"}, + "show version and build info", + []() { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); } - return true; - } - if (arg == "-Cd" || arg == "--cpu-mask-draft") { - CHECK_ARG - std::string mask = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "-Crd" || arg == "--cpu-range-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "--prio-draft") { - CHECK_ARG - params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-draft") { - params.draft_cpuparams.strict_cpu = true; - return true; - } - if (arg == "--poll-draft") { - CHECK_ARG - params.draft_cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tbd" || arg == "--threads-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + )); + add_opt(llama_arg( + {"-v", "--verbose"}, + "print verbose information", + [¶ms]() { + params.verbosity = 1; } - return true; - } - if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch-draft") { - params.draft_cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-p" || arg == "--prompt") { - CHECK_ARG - params.prompt = argv[i]; - return true; - } - if (arg == "-e" || arg 
== "--escape") { - params.escape = true; - return true; - } - if (arg == "--no-escape") { - params.escape = false; - return true; - } - if (arg == "--prompt-cache") { - CHECK_ARG - params.path_prompt_cache = argv[i]; - return true; - } - if (arg == "--prompt-cache-all") { - params.prompt_cache_all = true; - return true; - } - if (arg == "--prompt-cache-ro") { - params.prompt_cache_ro = true; - return true; - } - if (arg == "-bf" || arg == "--binary-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - return true; - } - if (arg == "-f" || arg == "--file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--verbosity"}, "N", + format("set specific verbosity level (default: %d)", params.verbosity), + [¶ms](int value) { + params.verbosity = value; } - // store the external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); + )); + add_opt(llama_arg( + {"--verbose-prompt"}, + format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [¶ms]() { + params.verbose_prompt = true; } - return true; - } - if (arg == "--in-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--no-display-prompt"}, + format("don't print prompt at generation (default: %s)", !params.display_prompt ? 
"true" : "false"), + [¶ms]() { + params.display_prompt = false; } - params.in_files.push_back(argv[i]); - return true; - } - if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { - CHECK_ARG - params.n_predict = std::stoi(argv[i]); - return true; - } - if (arg == "--top-k") { - CHECK_ARG - sparams.top_k = std::stoi(argv[i]); - return true; - } - if (arg == "-c" || arg == "--ctx-size") { - CHECK_ARG - params.n_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-n" || arg == "-gan") { - CHECK_ARG - params.grp_attn_n = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-w" || arg == "-gaw") { - CHECK_ARG - params.grp_attn_w = std::stoi(argv[i]); - return true; - } - if (arg == "--rope-freq-base") { - CHECK_ARG - params.rope_freq_base = std::stof(argv[i]); - return true; - } - if (arg == "--rope-freq-scale") { - CHECK_ARG - params.rope_freq_scale = std::stof(argv[i]); - return true; - } - if (arg == "--rope-scaling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; } - return true; - } - if (arg == "--rope-scale") { - CHECK_ARG - params.rope_freq_scale = 1.0f / std::stof(argv[i]); - return true; - } - if (arg == "--yarn-orig-ctx") { - CHECK_ARG - params.yarn_orig_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--yarn-ext-factor") { - CHECK_ARG - params.yarn_ext_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-attn-factor") { - CHECK_ARG - params.yarn_attn_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-fast") { - CHECK_ARG - params.yarn_beta_fast = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-slow") { - CHECK_ARG - params.yarn_beta_slow = std::stof(argv[i]); - return true; - } - if (arg == "--pooling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else { invalid_param = true; } - return true; - } - if (arg == "--attention") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { invalid_param = true; } - return true; - } - if (arg == "--defrag-thold" || arg == "-dt") { - CHECK_ARG - params.defrag_thold = std::stof(argv[i]); - return true; - } - if (arg == "--samplers") { - CHECK_ARG - const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); - return true; - } - if (arg == "--sampling-seq") { - CHECK_ARG - sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); - return true; - } - if (arg == "--top-p") { - CHECK_ARG - sparams.top_p = std::stof(argv[i]); - return true; - } - if (arg == "--min-p") { - CHECK_ARG - sparams.min_p = std::stof(argv[i]); - return true; - } - if (arg == "--temp") { - CHECK_ARG - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); - return true; - } - if 
(arg == "--tfs") { - CHECK_ARG - sparams.tfs_z = std::stof(argv[i]); - return true; - } - if (arg == "--typical") { - CHECK_ARG - sparams.typical_p = std::stof(argv[i]); - return true; - } - if (arg == "--repeat-last-n") { - CHECK_ARG - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - return true; - } - if (arg == "--repeat-penalty") { - CHECK_ARG - sparams.penalty_repeat = std::stof(argv[i]); - return true; - } - if (arg == "--frequency-penalty") { - CHECK_ARG - sparams.penalty_freq = std::stof(argv[i]); - return true; - } - if (arg == "--presence-penalty") { - CHECK_ARG - sparams.penalty_present = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-range") { - CHECK_ARG - sparams.dynatemp_range = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-exp") { - CHECK_ARG - sparams.dynatemp_exponent = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat") { - CHECK_ARG - sparams.mirostat = std::stoi(argv[i]); - return true; - } - if (arg == "--mirostat-lr") { - CHECK_ARG - sparams.mirostat_eta = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat-ent") { - CHECK_ARG - sparams.mirostat_tau = std::stof(argv[i]); - return true; - } - if (arg == "--cfg-negative-prompt") { - CHECK_ARG - sparams.cfg_negative_prompt = argv[i]; - return true; - } - if (arg == "--cfg-negative-prompt-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"-co", "--color"}, + format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [¶ms]() { + params.use_color = true; } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); + )); + add_opt(llama_arg( + {"-s", "--seed"}, "SEED", + format("RNG seed (default: %d, use random seed for < 0)", params.seed), + [&sparams, ¶ms](std::string value) { + // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. 
+ params.seed = std::stoul(value); + sparams.seed = std::stoul(value); } - return true; - } - if (arg == "--cfg-scale") { - CHECK_ARG - sparams.cfg_scale = std::stof(argv[i]); - return true; - } - if (arg == "-b" || arg == "--batch-size") { - CHECK_ARG - params.n_batch = std::stoi(argv[i]); - return true; - } - if (arg == "-ub" || arg == "--ubatch-size") { - CHECK_ARG - params.n_ubatch = std::stoi(argv[i]); - return true; - } - if (arg == "--keep") { - CHECK_ARG - params.n_keep = std::stoi(argv[i]); - return true; - } - if (arg == "--draft") { - CHECK_ARG - params.n_draft = std::stoi(argv[i]); - return true; - } - if (arg == "--chunks") { - CHECK_ARG - params.n_chunks = std::stoi(argv[i]); - return true; - } - if (arg == "-np" || arg == "--parallel") { - CHECK_ARG - params.n_parallel = std::stoi(argv[i]); - return true; - } - if (arg == "-ns" || arg == "--sequences") { - CHECK_ARG - params.n_sequences = std::stoi(argv[i]); - return true; - } - if (arg == "--p-split" || arg == "-ps") { - CHECK_ARG - params.p_split = std::stof(argv[i]); - return true; - } - if (arg == "-m" || arg == "--model") { - CHECK_ARG - params.model = argv[i]; - return true; - } - if (arg == "-md" || arg == "--model-draft") { - CHECK_ARG - params.model_draft = argv[i]; - return true; - } - if (arg == "-a" || arg == "--alias") { - CHECK_ARG - params.model_alias = argv[i]; - return true; - } - if (arg == "-mu" || arg == "--model-url") { - CHECK_ARG - params.model_url = argv[i]; - return true; - } - if (arg == "-hft" || arg == "--hf-token") { - if (++i >= argc) { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"-t", "--threads"}, "N", + format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [¶ms](int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } } - params.hf_token = argv[i]; - return true; - } - if (arg == "-hfr" || arg == "--hf-repo") { - CHECK_ARG - params.hf_repo = argv[i]; - return true; - } - if (arg == "-hff" || arg == "--hf-file") { - CHECK_ARG - params.hf_file = argv[i]; - return true; - } - if (arg == "--lora") { - CHECK_ARG - params.lora_adapters.push_back({ - std::string(argv[i]), - 1.0, - }); - return true; - } - if (arg == "--lora-scaled") { - CHECK_ARG - std::string lora_adapter = argv[i]; - CHECK_ARG - params.lora_adapters.push_back({ - lora_adapter, - std::stof(argv[i]), - }); - return true; - } - if (arg == "--lora-init-without-apply") { - params.lora_init_without_apply = true; - return true; - } - if (arg == "--control-vector") { - CHECK_ARG - params.control_vectors.push_back({ 1.0f, argv[i], }); - return true; - } - if (arg == "--control-vector-scaled") { - CHECK_ARG - const char* fname = argv[i]; - CHECK_ARG - params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - return true; - } - if (arg == "--control-vector-layer-range") { - CHECK_ARG - params.control_vector_layer_start = std::stoi(argv[i]); - CHECK_ARG - params.control_vector_layer_end = std::stoi(argv[i]); - return true; - } - if (arg == "--mmproj") { - CHECK_ARG - params.mmproj = argv[i]; - return true; - } - if (arg == "--image") { - CHECK_ARG - params.image.emplace_back(argv[i]); - return true; - } - if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - return true; - } - if (arg == "-sp" || arg == "--special") { - params.special = true; - return true; - } - if (arg == "--embedding" || arg == "--embeddings") { - params.embedding = 
true; - return true; - } - if (arg == "--embd-normalize") { - CHECK_ARG - params.embd_normalize = std::stoi(argv[i]); - return true; - } - if (arg == "--embd-output-format") { - CHECK_ARG - params.embd_out = argv[i]; - return true; - } - if (arg == "--embd-separator") { - CHECK_ARG - params.embd_sep = argv[i]; - return true; - } - if (arg == "-if" || arg == "--interactive-first") { - params.interactive_first = true; - return true; - } - if (arg == "-cnv" || arg == "--conversation") { - params.conversation = true; - return true; - } - if (arg == "--infill") { - params.infill = true; - return true; - } - if (arg == "-dkvc" || arg == "--dump-kv-cache") { - params.dump_kv_cache = true; - return true; - } - if (arg == "-nkvo" || arg == "--no-kv-offload") { - params.no_kv_offload = true; - return true; - } - if (arg == "-ctk" || arg == "--cache-type-k") { - params.cache_type_k = argv[++i]; - return true; - } - if (arg == "-ctv" || arg == "--cache-type-v") { - params.cache_type_v = argv[++i]; - return true; - } - if (arg == "-mli" || arg == "--multiline-input") { - params.multiline_input = true; - return true; - } - if (arg == "--simple-io") { - params.simple_io = true; - return true; - } - if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - return true; - } - if (arg == "-nocb" || arg == "--no-cont-batching") { - params.cont_batching = false; - return true; - } - if (arg == "-fa" || arg == "--flash-attn") { - params.flash_attn = true; - return true; - } - if (arg == "-co" || arg == "--color") { - params.use_color = true; - return true; - } - if (arg == "--mlock") { - params.use_mlock = true; - return true; - } - if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - CHECK_ARG - params.n_gpu_layers = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + )); + add_opt(llama_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [¶ms](int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } } - return true; - } - if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") { - CHECK_ARG - params.n_gpu_layers_draft = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + )); + add_opt(llama_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [¶ms](int value) { + params.draft_cpuparams.n_threads = value; + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + } } - return true; - } - if (arg == "--main-gpu" || arg == "-mg") { - CHECK_ARG - params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--split-mode" || arg == "-sm") { - CHECK_ARG - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [¶ms](int value) { + params.draft_cpuparams_batch.n_threads = value; + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } } - else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", + [¶ms](std::string value) { + std::string mask = value; + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } } - else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; + )); + add_opt(llama_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. Complements --cpu-mask", + [¶ms](std::string value) { + std::string range = value; + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } } - else { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--cpu-strict"}, "<0|1>", + format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [¶ms](std::string value) { + params.cpuparams.strict_cpu = std::stoul(value); } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--tensor-split" || arg == "-ts") { - CHECK_ARG - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--poll"}, "<0...100>", + format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [¶ms](std::string value) { + params.cpuparams.poll = std::stoul(value); } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); + )); + add_opt(llama_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [¶ms](std::string value) { + std::string mask = value; + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); } - else { - params.tensor_split[i] = 0.0f; + } + )); + add_opt(llama_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. 
Complements --cpu-mask-batch", + [¶ms](std::string value) { + std::string range = value; + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [¶ms](int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(llama_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll", + [¶ms](int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(llama_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [¶ms](std::string value) { + std::string mask = value; + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft", + [¶ms](std::string value) { + std::string range = value; + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [¶ms](int value) { + params.draft_cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [¶ms](int value) { + params.draft_cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", + [¶ms](std::string value) { + std::string range = value; + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [¶ms](int value) { + params.draft_cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [¶ms](int value) { + params.draft_cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--draft"}, "N", + format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [¶ms](int value) { + params.n_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-ps", "--p-split"}, "N", + format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [¶ms](std::string value) { + params.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-lcs", "--lookup-cache-static"}, "FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)", + [¶ms](std::string value) { + params.lookup_cache_static = value; + } + )); + add_opt(llama_arg( + {"-lcd", "--lookup-cache-dynamic"}, "FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)", + [¶ms](std::string value) { + params.lookup_cache_dynamic = value; + } + )); + add_opt(llama_arg( + {"-c", "--ctx-size"}, "N", + format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [¶ms](int value) { + params.n_ctx = value; + } + )); + add_opt(llama_arg( + {"-n", "--predict"}, "N", + format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [¶ms](int value) { + params.n_predict = value; + } + )); + add_opt(llama_arg( + {"-b", "--batch-size"}, "N", + format("logical maximum batch size (default: %d)", params.n_batch), + [¶ms](int value) { + params.n_batch = value; + } + )); + add_opt(llama_arg( + {"-ub", "--ubatch-size"}, "N", + format("physical maximum batch size (default: %d)", params.n_ubatch), + [¶ms](int value) { + params.n_ubatch = value; + } + )); + add_opt(llama_arg( + {"--keep"}, "N", + format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + [¶ms](int value) { + params.n_keep = value; + } + )); + add_opt(llama_arg( + {"--chunks"}, "N", + format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [¶ms](int value) { + params.n_chunks = value; + } + )); + add_opt(llama_arg( + {"-fa", "--flash-attn"}, + format("enable Flash Attention (default: %s)", params.flash_attn ? 
"enabled" : "disabled"), + [¶ms]() { + params.flash_attn = true; + } + )); + add_opt(llama_arg( + {"-p", "--prompt"}, "PROMPT", + "prompt to start generation with\n", + [¶ms](std::string value) { + params.prompt = value; + } + )); + add_opt(llama_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [¶ms](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + )); + add_opt(llama_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [¶ms](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + )); + add_opt(llama_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [¶ms](std::string value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + )); + add_opt(llama_arg( + {"-e", "--escape"}, + format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [¶ms]() { + params.escape = true; + } + )); + add_opt(llama_arg( + {"--no-escape"}, + "do not process escape sequences", + [¶ms]() { + params.escape = false; + } + )); + add_opt(llama_arg( + {"-ptc", "--print-token-count"}, "N", + format("print token count every N tokens (default: %d)", params.n_print), + [¶ms](int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [¶ms](std::string value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [¶ms]() { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [¶ms]() { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [¶ms](std::string value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-sp", "--special"}, + format("special tokens output enabled (default: %s)", params.special ? 
"true" : "false"), + [¶ms]() { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-cnv", "--conversation"}, + "run in conversation mode, does not print special tokens and suffix/prefix\n", + [¶ms]() { + params.conversation = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-i", "--interactive"}, + format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [¶ms]() { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-if", "--interactive-first"}, + format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + [¶ms]() { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [¶ms]() { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [¶ms]() { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [¶ms](std::string value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [¶ms](std::string value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [¶ms]() { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--spm-infill"}, + format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [¶ms]() { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--samplers"}, "SAMPLERS", + format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [&sparams](std::string value) { + const auto sampler_names = string_split(value, ';'); + sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); + } + )); + add_opt(llama_arg( + {"--sampling-seq"}, "SEQUENCE", + format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [&sparams](std::string value) { + sparams.samplers_sequence = llama_sampling_types_from_chars(value); + } + )); + add_opt(llama_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [¶ms]() { + params.ignore_eos = true; + } + )); + add_opt(llama_arg( + {"--penalize-nl"}, + format("penalize newline tokens (default: %s)", sparams.penalize_nl ? 
"true" : "false"), + [&sparams]() { + sparams.penalize_nl = true; + } + )); + add_opt(llama_arg( + {"--temp"}, "N", + format("temperature (default: %.1f)", (double)sparams.temp), + [&sparams](std::string value) { + sparams.temp = std::stof(value); + sparams.temp = std::max(sparams.temp, 0.0f); + } + )); + add_opt(llama_arg( + {"--top-k"}, "N", + format("top-k sampling (default: %d, 0 = disabled)", sparams.top_k), + [&sparams](int value) { + sparams.top_k = value; + } + )); + add_opt(llama_arg( + {"--top-p"}, "N", + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p), + [&sparams](std::string value) { + sparams.top_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--min-p"}, "N", + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p), + [&sparams](std::string value) { + sparams.min_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--tfs"}, "N", + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z), + [&sparams](std::string value) { + sparams.tfs_z = std::stof(value); + } + )); + add_opt(llama_arg( + {"--typical"}, "N", + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p), + [&sparams](std::string value) { + sparams.typical_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--repeat-last-n"}, "N", + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n), + [&sparams](int value) { + sparams.penalty_last_n = value; + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + } + )); + add_opt(llama_arg( + {"--repeat-penalty"}, "N", + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat), + [&sparams](std::string value) { + sparams.penalty_repeat = std::stof(value); + } + )); + add_opt(llama_arg( + {"--presence-penalty"}, "N", + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present), + [&sparams](std::string value) { + sparams.penalty_present = std::stof(value); + } + )); + add_opt(llama_arg( + {"--frequency-penalty"}, "N", + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq), + [&sparams](std::string value) { + sparams.penalty_freq = std::stof(value); + } + )); + add_opt(llama_arg( + {"--dynatemp-range"}, "N", + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range), + [&sparams](std::string value) { + sparams.dynatemp_range = std::stof(value); + } + )); + add_opt(llama_arg( + {"--dynatemp-exp"}, "N", + format("dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent), + [&sparams](std::string value) { + sparams.dynatemp_exponent = std::stof(value); + } + )); + add_opt(llama_arg( + {"--mirostat"}, "N", + format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat), + [&sparams](int value) { + sparams.mirostat = value; + } + )); + add_opt(llama_arg( + {"--mirostat-lr"}, "N", + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta), + [&sparams](std::string value) { + sparams.mirostat_eta = std::stof(value); + } + )); + add_opt(llama_arg( + {"--mirostat-ent"}, "N", + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau), + 
[&sparams](std::string value) { + sparams.mirostat_tau = std::stof(value); + } + )); + add_opt(llama_arg( + {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", + "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", + [&sparams](std::string value) { + std::stringstream ss(value); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } else { + throw std::invalid_argument("invalid input format"); + } + } catch (const std::exception&) { + throw std::invalid_argument("invalid input format"); + } + } + )); + add_opt(llama_arg( + {"--cfg-negative-prompt"}, "PROMPT", + format("negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str()), + [&sparams](std::string value) { + sparams.cfg_negative_prompt = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--cfg-negative-prompt-file"}, "FNAME", + "negative prompt file to use for guidance", + [&sparams](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); + } + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--cfg-scale"}, "N", + format("strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale), + [&sparams](std::string value) { + sparams.cfg_scale = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--grammar"}, "GRAMMAR", + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str()), + [&sparams](std::string value) { + sparams.grammar = value; + } + )); + add_opt(llama_arg( + {"--grammar-file"}, "FNAME", + "file to read grammar from", + [&sparams](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(sparams.grammar) + ); + } + )); + add_opt(llama_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [&sparams](std::string value) { + sparams.grammar = json_schema_to_grammar(json::parse(value)); + } + )); + add_opt(llama_arg( + {"--pooling"}, "{none,mean,cls,last}", + "pooling type for embeddings, use model default if unspecified", + [¶ms](std::string value) { + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--attention"}, "{causal,non,causal}", + "attention type for embeddings, use model default if unspecified", + [¶ms](std::string value) { + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--rope-scaling"}, "{none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model", + [¶ms](std::string value) { + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"--rope-scale"}, "N", + "RoPE context scaling factor, expands context by a factor of N", + [¶ms](std::string value) { + params.rope_freq_scale = 1.0f / std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-base"}, "N", + "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", + [¶ms](std::string value) { + params.rope_freq_base = std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-scale"}, "N", + "RoPE frequency scaling factor, expands context by a factor of 1/N", + [¶ms](std::string value) { + params.rope_freq_scale = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-orig-ctx"}, "N", + format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [¶ms](int value) { + params.yarn_orig_ctx = value; + } + )); + add_opt(llama_arg( + {"--yarn-ext-factor"}, "N", + format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [¶ms](std::string value) { + params.yarn_ext_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-attn-factor"}, "N", + format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [¶ms](std::string value) { + params.yarn_attn_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-slow"}, "N", + format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [¶ms](std::string value) { + params.yarn_beta_slow = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-fast"}, "N", + format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [¶ms](std::string value) { + params.yarn_beta_fast = 
std::stof(value); + } + )); + add_opt(llama_arg( + {"-gan", "--grp-attn-n"}, "N", + format("group-attention factor (default: %d)", params.grp_attn_n), + [¶ms](int value) { + params.grp_attn_n = value; + } + )); + add_opt(llama_arg( + {"-gaw", "--grp-attn-w"}, "N", + format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + [¶ms](int value) { + params.grp_attn_w = value; + } + )); + add_opt(llama_arg( + {"-dkvc", "--dump-kv-cache"}, + "verbose print of the KV cache", + [¶ms]() { + params.dump_kv_cache = true; + } + )); + add_opt(llama_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [¶ms]() { + params.no_kv_offload = true; + } + )); + add_opt(llama_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [¶ms](std::string value) { + // TODO: get the type right here + params.cache_type_k = value; + } + )); + add_opt(llama_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [¶ms](std::string value) { + // TODO: get the type right here + params.cache_type_v = value; + } + )); + add_opt(llama_arg( + {"--all-logits"}, + format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [¶ms]() { + params.logits_all = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [¶ms]() { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag-tasks"}, "N", + format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [¶ms](int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [¶ms]() { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande-tasks"}, "N", + format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [¶ms](int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [¶ms]() { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice-tasks"}, "N", + format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [¶ms](int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via --kl-divergence-base", + [¶ms]() { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-stride"}, "N", + format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [¶ms](int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-output-type"}, "<0|1>", + format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [¶ms](int value) { + params.ppl_output_type = value; + } + 
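// --------------------------------------------------------------------------
// Illustrative sketch only, not part of the patch: one way the
// `.set_examples(...)` tags used in the hunks above could be consumed, so
// that e.g. `--ppl-stride` is only exposed by the perplexity tool while
// untagged options stay global. The parse/usage side is still WIP here, so
// this filter is an assumption; `demo_example`, `demo_opt` and
// `is_available` are hypothetical names.
#include <set>
#include <string>
#include <vector>

enum demo_example { DEMO_EXAMPLE_ALL, DEMO_EXAMPLE_MAIN, DEMO_EXAMPLE_PERPLEXITY };

struct demo_opt {
    std::vector<std::string> args;
    std::set<demo_example>   examples = {DEMO_EXAMPLE_ALL}; // default: global
};

// an option is visible to a tool if it is global or explicitly tagged for it
static bool is_available(const demo_opt & opt, demo_example ex) {
    return opt.examples.count(DEMO_EXAMPLE_ALL) > 0 || opt.examples.count(ex) > 0;
}

int main() {
    demo_opt ppl_stride;
    ppl_stride.args     = {"--ppl-stride"};
    ppl_stride.examples = {DEMO_EXAMPLE_PERPLEXITY}; // example-specific option

    demo_opt temp;
    temp.args = {"--temp"};                          // keeps the ALL default

    // perplexity sees both options; main only sees the global one
    const bool ok = is_available(ppl_stride, DEMO_EXAMPLE_PERPLEXITY)
                 && !is_available(ppl_stride, DEMO_EXAMPLE_MAIN)
                 && is_available(temp, DEMO_EXAMPLE_MAIN);
    return ok ? 0 : 1;
}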
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"-dt", "--defrag-thold"}, "N", + format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [¶ms](std::string value) { + params.defrag_thold = std::stof(value); + } + )); + add_opt(llama_arg( + {"-np", "--parallel"}, "N", + format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [¶ms](int value) { + params.n_parallel = value; + } + )); + add_opt(llama_arg( + {"-ns", "--sequences"}, "N", + format("number of sequences to decode (default: %d)", params.n_sequences), + [¶ms](int value) { + params.n_sequences = value; + } + )); + add_opt(llama_arg( + {"-cb", "--cont-batching"}, + format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [¶ms]() { + params.cont_batching = true; + } + )); + add_opt(llama_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [¶ms]() { + params.cont_batching = false; + } + )); + add_opt(llama_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + [¶ms](std::string value) { + params.mmproj = value; + } + )); + add_opt(llama_arg( + {"--image"}, "FILE", + "path to an image file. use with multimodal models. Specify multiple times for batching", + [¶ms](std::string value) { + params.image.emplace_back(value); + } + )); + add_opt(llama_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [¶ms](std::string value) { + params.rpc_servers = value; + } + )); + add_opt(llama_arg( + {"--mlock"}, + "force system to keep model in RAM rather than swapping or compressing", + [¶ms]() { + params.use_mlock = true; + } + )); + add_opt(llama_arg( + {"--no-mmap"}, + "do not memory-map model (slower load but may reduce pageouts if not using mlock)", + [¶ms]() { + params.use_mmap = false; + } + )); + add_opt(llama_arg( + {"--numa"}, "TYPE", + "attempt optimizations that help on some NUMA systems\n" + "- distribute: spread execution evenly over all nodes\n" + "- isolate: only spawn threads on CPUs on the node that execution started on\n" + "- numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437", + [¶ms](std::string value) { + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"-ngl", "--gpu-layers"}, "N", + "number of layers to store in VRAM", + [¶ms](int value) { + params.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + )); + add_opt(llama_arg( + {"-ngld", "--gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [¶ms](int value) { + params.n_gpu_layers_draft = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on 
enabling GPU BLAS support\n"); } } + )); + add_opt(llama_arg( + {"-sm", "--split-mode"}, "SPLIT_MODE", + "how to split the model across multiple GPUs, one of:\n" + "- none: use one GPU only\n" + "- layer (default): split layers and KV across GPUs\n" + "- row: split rows across GPUs", + [¶ms](std::string value) { + std::string arg_next = value; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + throw std::invalid_argument("invalid value"); + } #ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); #endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--rpc") { - CHECK_ARG - params.rpc_servers = argv[i]; - return true; - } - if (arg == "--no-mmap") { - params.use_mmap = false; - return true; - } - if (arg == "--numa") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; } - return true; - } - if (arg == "-v" || arg == "--verbose") { - params.verbosity = 1; - return true; - } - if (arg == "--verbosity") { - CHECK_ARG - params.verbosity = std::stoi(argv[i]); - return true; - } - if (arg == "--verbose-prompt") { - params.verbose_prompt = true; - return true; - } - if (arg == "--no-display-prompt") { - params.display_prompt = false; - return true; - } - if (arg == "-r" || arg == "--reverse-prompt") { - CHECK_ARG - params.antiprompt.emplace_back(argv[i]); - return true; - } - if (arg == "-ld" || arg == "--logdir") { - CHECK_ARG - params.logdir = argv[i]; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; } - return true; - } - if (arg == "-lcs" || arg == "--lookup-cache-static") { - CHECK_ARG - params.lookup_cache_static = argv[i]; - return true; - } - if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { - CHECK_ARG - params.lookup_cache_dynamic = argv[i]; - return true; - } - if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - CHECK_ARG - params.logits_file = argv[i]; - return true; - } - if (arg == "--perplexity" || arg == "--all-logits") { - params.logits_all = true; - return true; - } - if (arg == "--ppl-stride") { - CHECK_ARG - params.ppl_stride = std::stoi(argv[i]); - return true; - } - if (arg == "--ppl-output-type") { - CHECK_ARG - params.ppl_output_type = std::stoi(argv[i]); - return true; - } - if (arg == "-ptc" || arg == "--print-token-count") { - CHECK_ARG - params.n_print = std::stoi(argv[i]); - return true; - } - if (arg == "--check-tensors") { - params.check_tensors = true; - return true; - } - if (arg == "--hellaswag") { - params.hellaswag = true; - return true; - } - if (arg == "--hellaswag-tasks") { - CHECK_ARG - params.hellaswag_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--winogrande") { - params.winogrande = true; - return 
true; - } - if (arg == "--winogrande-tasks") { - CHECK_ARG - params.winogrande_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--multiple-choice") { - params.multiple_choice = true; - return true; - } - if (arg == "--multiple-choice-tasks") { - CHECK_ARG - params.multiple_choice_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--kl-divergence") { - params.kl_divergence = true; - return true; - } - if (arg == "--ignore-eos") { - params.ignore_eos = true; - return true; - } - if (arg == "--penalize-nl") { - sparams.penalize_nl = true; - return true; - } - if (arg == "-l" || arg == "--logit-bias") { - CHECK_ARG - std::stringstream ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + )); + add_opt(llama_arg( + {"-ts", "--tensor-split"}, "N0,N1,N2,...", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", + [¶ms](std::string value) { + std::string arg_next = value; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + throw std::invalid_argument( + format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + ); } - else { - throw std::exception(); + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN } - catch (const std::exception&) { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"-mg", "--main-gpu"}, "INDEX", + format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [¶ms](int value) { + params.main_gpu = value; +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN } - return true; - } - if (arg == "-h" || arg == "--help" || arg == "--usage" ) { - params.usage = true; - return true; - } - if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - if (arg == "--in-prefix-bos") { - params.input_prefix_bos = true; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-prefix") { - CHECK_ARG - params.input_prefix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-suffix") { - CHECK_ARG - params.input_suffix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--spm-infill") { - params.spm_infill = true; - return true; - } - if (arg == "--grammar") { - CHECK_ARG - sparams.grammar = argv[i]; - return true; - } - if (arg == "--grammar-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) - ); - return true; - } - if (arg == "-j" || arg == "--json-schema") { - CHECK_ARG - sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); - return true; - } - if (arg == "--override-kv") { - CHECK_ARG - if (!string_parse_kv_override(argv[i], params.kv_overrides)) { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--check-tensors"}, + format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [¶ms]() { + params.check_tensors = true; } - return true; - } - if (arg == "--host") { - CHECK_ARG - params.hostname = argv[i]; - return true; - } - if (arg == "--port") { - CHECK_ARG - params.port = std::stoi(argv[i]); - return true; - } - if (arg == "--path") { - CHECK_ARG - params.public_path = argv[i]; - return true; - } - if (arg == "--api-key") { - CHECK_ARG - params.api_keys.push_back(argv[i]); - return true; - } - if (arg == "--api-key-file") { - CHECK_ARG - std::ifstream key_file(argv[i]); - if (!key_file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--override-kv"}, "KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [¶ms](std::string value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(llama_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [¶ms](std::string value) { + params.lora_adapters.push_back({ std::string(value), 1.0 }); + } + )); + add_opt(llama_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [¶ms](std::string fname, std::string scale) { + params.lora_adapters.push_back({ fname, std::stof(scale) }); + } + )); + add_opt(llama_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [¶ms](std::string value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(llama_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [¶ms](std::string fname, std::string scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(llama_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [¶ms](std::string start, std::string end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(llama_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? std::string("model path from which to load base model") + : format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [¶ms](std::string value) { + params.model = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [¶ms](std::string value) { + params.model_draft = value; + } + )); + add_opt(llama_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [¶ms](std::string value) { + params.model_url = value; + } + )); + add_opt(llama_arg( + {"-hfr", "--hf-repo"}, "REPO", + "Hugging Face model repository (default: unused)", + [¶ms](std::string value) { + params.hf_repo = value; } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); + )); + add_opt(llama_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file (default: unused)", + [¶ms](std::string value) { + params.hf_file = value; + } + )); + add_opt(llama_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [¶ms](std::string value) { + params.hf_token = value; + } + )); + add_opt(llama_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [¶ms](std::string value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-size"}, "N", + format("minimum length 
of embedded text chunks (default: %d)", params.chunk_size), + [¶ms](int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-separator"}, "STRING", + format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [¶ms](std::string value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--junk"}, "N", + format("number of times to repeat the junk text (default: %d)", params.n_junk), + [¶ms](int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"--pos"}, "N", + format("position of the passkey in the junk text (default: %d)", params.i_pos), + [¶ms](int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"-o", "--output"}, "FNAME", + format("output file (default: '%s')", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? params.lora_outfile.c_str() + : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + ? params.cvector_outfile.c_str() + : params.out_file.c_str()), + [¶ms](std::string value) { + params.out_file = value; + params.cvector_outfile = value; + params.lora_outfile = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-ofreq", "--output-frequency"}, "N", + format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [¶ms](int value) { + params.n_out_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--save-frequency"}, "N", + format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [¶ms](int value) { + params.n_save_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--process-output"}, + format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [¶ms]() { + params.process_output = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--no-ppl"}, + format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [¶ms]() { + params.compute_ppl = false; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--chunk"}, "N", + format("start processing the input from chunk N (default: %d)", params.i_chunk), + [¶ms](int value) { + params.i_chunk = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-pps"}, + format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), + [¶ms]() { + params.is_pp_shared = true; + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npp"}, "n0,n1,...", + "number of prompt tokens", + [¶ms](std::string value) { + auto p = string_split(value, split_delim); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } - key_file.close(); - return true; - } - if (arg == "--ssl-key-file") { - CHECK_ARG - params.ssl_file_key = argv[i]; - return true; - } - if (arg == "--ssl-cert-file") { - CHECK_ARG - params.ssl_file_cert = argv[i]; - return true; - } - if (arg == "--timeout" || arg == "-to") { - CHECK_ARG - params.timeout_read = std::stoi(argv[i]); - params.timeout_write = std::stoi(argv[i]); - return true; - } - if (arg == "--threads-http") { - CHECK_ARG - params.n_threads_http = std::stoi(argv[i]); - return true; - } - if (arg == "-spf" || arg == "--system-prompt-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-ntg"}, "n0,n1,...", + "number of text generation tokens", + [¶ms](std::string value) { + auto p = string_split(value, split_delim); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; - return true; - } - if (arg == "--log-format") { - CHECK_ARG - if (std::strcmp(argv[i], "json") == 0) { - params.log_json = true; - } else if (std::strcmp(argv[i], "text") == 0) { - params.log_json = false; - } else { - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npl"}, "n0,n1,...", + "number of parallel prompts", + [¶ms](std::string value) { + auto p = string_split(value, split_delim); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"--embd-normalize"}, "N", + format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [¶ms](int value) { + params.embd_normalize = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-output-format"}, "FORMAT", + "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", + [¶ms](std::string value) { + params.embd_out = value; } - return true; - } - if (arg == "--no-slots") { - params.endpoint_slots = false; - return true; - } - if (arg == "--metrics") { - params.endpoint_metrics = true; - return true; - } - if (arg == "--slot-save-path") { - CHECK_ARG - params.slot_save_path = argv[i]; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-separator"}, "STRING", + "separator of embendings (default \\n) for example \"<#sep#>\"", + [¶ms](std::string value) { + params.embd_sep = value; } - return true; - } - if (arg == "--chat-template") { - CHECK_ARG - if (!llama_chat_verify_template(argv[i])) { - fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); - fprintf(stderr, "note: llama.cpp does not use 
jinja parser, we only support commonly used templates\n"); - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--host"}, "HOST", + format("ip address to listen (default: %s)", params.hostname.c_str()), + [¶ms](std::string value) { + params.hostname = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--port"}, "PORT", + format("port to listen (default: %d)", params.port), + [¶ms](int value) { + params.port = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--path"}, "PATH", + format("path to serve static files from (default: %s)", params.public_path.c_str()), + [¶ms](std::string value) { + params.public_path = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--embedding(s)"}, + format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [¶ms]() { + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--api-key"}, "KEY", + "API key to use for authentication (default: none)", + [¶ms](std::string value) { + params.api_keys.push_back(value); } - params.chat_template = argv[i]; - return true; - } - if (arg == "--slot-prompt-similarity" || arg == "-sps") { - CHECK_ARG - params.slot_prompt_similarity = std::stof(argv[i]); - return true; - } - if (arg == "-pps") { - params.is_pp_shared = true; - return true; - } - if (arg == "-npp") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - return true; - } - if (arg == "-ntg") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - return true; - } - if (arg == "-npl") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - return true; - } - if (arg == "--context-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--api-key-file"}, "FNAME", + "path to file containing API keys (default: none)", + [¶ms](std::string value) { + std::ifstream key_file(value); + if (!key_file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); } - params.context_files.push_back(argv[i]); - return true; - } - if (arg == "--chunk-size") { - CHECK_ARG - params.chunk_size = std::stoi(argv[i]); - return true; - } - if (arg == "--chunk-separator") { - CHECK_ARG - params.chunk_separator = argv[i]; - return true; - } - if (arg == "--junk") { - CHECK_ARG - params.n_junk = std::stoi(argv[i]); - return true; - } - if (arg == "--pos") { - CHECK_ARG - params.i_pos = std::stoi(argv[i]); - return true; - } - if (arg == "-o" || arg == "--output" || arg == "--output-file") { - CHECK_ARG - params.out_file = argv[i]; - params.cvector_outfile = argv[i]; - params.lora_outfile = argv[i]; - return true; - } - if (arg == "-ofreq" || arg == "--output-frequency") { - CHECK_ARG - params.n_out_freq = std::stoi(argv[i]); - return true; - } - if (arg == "--save-frequency") { - CHECK_ARG - params.n_save_freq = std::stoi(argv[i]); - 
return true; - } - if (arg == "--process-output") { - params.process_output = true; - return true; - } - if (arg == "--no-ppl") { - params.compute_ppl = false; - return true; - } - if (arg == "--chunk" || arg == "--from-chunk") { - CHECK_ARG - params.i_chunk = std::stoi(argv[i]); - return true; - } - // cvector params - if (arg == "--positive-file") { - CHECK_ARG - params.cvector_positive_file = argv[i]; - return true; - } - if (arg == "--negative-file") { - CHECK_ARG - params.cvector_negative_file = argv[i]; - return true; - } - if (arg == "--pca-batch") { - CHECK_ARG - params.n_pca_batch = std::stoi(argv[i]); - return true; - } - if (arg == "--pca-iter") { - CHECK_ARG - params.n_pca_iterations = std::stoi(argv[i]); - return true; - } - if (arg == "--method") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { invalid_param = true; } - return true; - } - if (arg == "--no-warmup") { - params.warmup = false; - return true; - } -#ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - if (log_param_single_parse(argv[i])) { - // Do nothing, log_param_single_parse automatically does it's thing - // and returns if a match was found and parsed. - return true; - } - if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. - CHECK_ARG - if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-key-file"}, "FNAME", + "path to file a PEM-encoded SSL private key", + [¶ms](std::string value) { + params.ssl_file_key = value; } - return true; - } - // End of Parse args for logging parameters -#endif // LOG_DISABLE_LOGS - - return false; -} - -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) 
{ - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -void gpt_params_print_usage(std::vector & options) { - constexpr static int n_leading_spaces = 40; - std::string leading_spaces(n_leading_spaces, ' '); - for (const auto & opt : options) { - std::ostringstream ss; - for (const auto & arg : opt.args) { - if (&arg == &opt.args.front()) { - ss << format("%-7s", (arg + ",").c_str()); + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-cert-file"}, "FNAME", + "path to file a PEM-encoded SSL certificate", + [¶ms](std::string value) { + params.ssl_file_cert = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--timeout"}, "N", + format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [¶ms](int value) { + params.timeout_read = value; + params.timeout_write = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--threads-http"}, "N", + format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [¶ms](int value) { + params.n_threads_http = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-spf", "--system-prompt-file"}, "FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", + [¶ms](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--log-format"}, "{text, json}", + "log output format: json or text (default: json)", + [¶ms](std::string value) { + if (value == "json") { + params.log_json = true; + } else if (value == "text") { + params.log_json = false; } else { - ss << arg << (&arg != &opt.args.back() ? ", " : ""); + throw std::invalid_argument("invalid value"); } } - if (!opt.value_ex.empty()) ss << " " << opt.value_ex; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--metrics"}, + format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [¶ms]() { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--no-slots"}, + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled"), + [¶ms]() { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [¶ms](std::string value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + [¶ms](std::string value) { + if (!llama_chat_verify_template(value)) { + throw std::runtime_error(format( + "error: the supplied chat template is not supported: %s\n" + "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + value.c_str() + )); + } + params.chat_template = value; } - const auto help_lines = llama_arg::break_str_into_lines(opt.help, 50); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [¶ms](std::string value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--lora-init-without-apply"}, + format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), + [¶ms]() { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [¶ms]() { + params.simple_io = true; } - printf("%s", ss.str().c_str()); - } -} + )); + add_opt(llama_arg( + {"-ld", "--logdir"}, "LOGDIR", + "path under which to save YAML logs (no logging if unset)", + [¶ms](std::string value) { + params.logdir = value; -std::vector gpt_params_parser_register(gpt_params & params) { - std::vector options; - options.push_back(llama_arg( - {"-h", "--help", "--usage"}, - "print usage and exit", - [¶ms, &options]() { - gpt_params_print_usage(options); - exit(0); - return true; + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } } )); - options.push_back(llama_arg( - {"-m", "--model"}, - format("model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)", params.model.c_str()), + add_opt(llama_arg( + {"--positive-file"}, "FNAME", + format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), [¶ms](std::string value) { - params.model = value; - return true; + params.cvector_positive_file = value; } - ).set_value_ex("FNAME")); - return options; -} - -bool gpt_params_parser_run(int argc, char ** argv, std::vector & options) { - for (const auto & opt : options) { - if (opt.handler_void) opt.handler_void(); - } - return true; -} - -void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const llama_sampling_params & sparams = params.sparams; - - std::string sampler_type_chars; - std::string sampler_type_names; - for (const auto sampler_type : sparams.samplers_sequence) { - sampler_type_chars += static_cast(sampler_type); - sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; - } - sampler_type_names.pop_back(); - - struct option_info { - LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) - option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) { - va_list args_list; - va_start(args_list, desc); - char buffer[1024]; - vsnprintf(buffer, sizeof(buffer), desc, args_list); - va_end(args_list); - this->desc = buffer; + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--negative-file"}, "FNAME", + format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [¶ms](std::string value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-batch"}, "N", + format("batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [¶ms](int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-iter"}, "N", + format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [¶ms](int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [¶ms](std::string value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } } - - option_info(const std::string & grp) : grp(grp) {} - - std::string tags; - std::string args; - std::string desc; - std::string grp; - }; - - std::vector options; - - // TODO: filter by tags - - options.push_back({ "general" }); - options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); - options.push_back({ "*", " --version", "show version and build info" }); - options.push_back({ "*", "-v, --verbose", "print verbose information" }); - options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); - options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); - options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); - options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); - options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); - options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); - options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); - options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); - -#ifndef GGML_USE_OPENMP - // these options are available only with the internal threadpool - options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); - options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); - options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); - options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); - - options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. 
Complements cpu-range-batch (default: same as --cpu-mask)"}); - options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); - options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"}); - options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); - options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); - - options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); - options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"}); - options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"}); - options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); - options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); - - options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); - options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); - options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); - options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); - options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"}); -#endif // GGML_USE_OPENMP - - options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); - options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); - options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)" }); - options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); - - options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); - options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); - options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); - options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); - options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); - options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); - options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: 
%s)", params.flash_attn ? "enabled" : "disabled" }); - options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" - "in conversation mode, this will be used as system prompt\n" - "(default: '%s')", params.prompt.c_str() }); - options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); - options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); - options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); - options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); - options.push_back({ "*", " --no-escape", "do not process escape sequences" }); - options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); - options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); - options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" - "not supported with --interactive or other interactive options" }); - options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); - options.push_back({ "main", "-r, --reverse-prompt PROMPT", - "halt generation at PROMPT, return control in interactive mode\n" - "can be specified more than once for multiple prompts" }); - options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n" - "if suffix/prefix are not specified, default chat template will be used\n" - "(default: %s)", params.conversation ? "true" : "false" }); - options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); - options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); - options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); - options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); - options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); - options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); - options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); - options.push_back({ "server infill", - " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" }); - - options.push_back({ "sampling" }); - options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" - "(default: %s)", sampler_type_names.c_str() }); - options.push_back({ "*", " --sampling-seq SEQUENCE", - "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); - options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); - options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" }); - options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); - options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); - options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); - options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); - options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); - options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); - options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); - options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); - options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); - options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); - options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); - options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); - options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" - "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); - options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); - options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); - options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" - "i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); - options.push_back({ "main", " --cfg-negative-prompt PROMPT", - "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); - options.push_back({ "main", " --cfg-negative-prompt-file FNAME", - "negative prompt file to use for guidance" }); - options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); - options.push_back({ "main", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "grammar" }); - options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); - options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); - options.push_back({ "*", "-j, --json-schema SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n" - "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --pooling {none,mean,cls,last}", - "pooling type for embeddings, use model default if unspecified" }); - options.push_back({ "embedding", " --attention {causal,non-causal}", - "attention type for embeddings, use model default if unspecified" }); - - options.push_back({ "context hacking" }); - options.push_back({ "*", " --rope-scaling {none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model" }); - options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); - options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); - options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); - options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); - options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); - options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); - options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); - options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); - options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); - options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); - options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); - options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); - options.push_back({ "*", "-ctk, --cache-type-k 
TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() }); - options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); - - options.push_back({ "perplexity" }); - options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" }); - options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); - options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); - options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --multiple-choice-tasks N", - "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); - options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); - options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); - options.push_back({ "perplexity", " --ppl-output-type {0,1}", - "output type for perplexity calculation (default: %d)", params.ppl_output_type }); - - options.push_back({ "parallel" }); - options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); - options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); - options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); - options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); - options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); - - options.push_back({ "multi-modality" }); - options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); - options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching" }); - - options.push_back({ "backend" }); - options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); - - if (llama_supports_mlock()) { - options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); - } - if (llama_supports_mmap()) { - options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); - } - options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" - " - distribute: spread execution evenly over all nodes\n" - " - isolate: only spawn threads on CPUs on the node that execution started on\n" - " - numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437" }); - - if (llama_supports_gpu_offload()) { - options.push_back({ "*", "-ngl, --gpu-layers N", - "number of layers to store in VRAM" }); - options.push_back({ "*", "-ngld, --gpu-layers-draft N", - "number of layers to store in VRAM for the draft model" }); - options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", - "how to split the model across multiple GPUs, one of:\n" - " - none: use one GPU only\n" - " - layer (default): split layers and KV across GPUs\n" - " - row: split rows across GPUs" }); - options.push_back({ "*", "-ts, --tensor-split SPLIT", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); - options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" - "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); - } - - options.push_back({ "model" }); - options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); - options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" - "note: this argument can be repeated to add multiple control vectors" }); - options.push_back({ "*", " --control-vector-scaled FNAME SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors" }); - options.push_back({ "*", " --control-vector-layer-range START END", - "layer range to apply the control vector(s) to, start and end inclusive" }); - options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" - "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); - options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); - options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); - options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); - options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); - options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" }); - - options.push_back({ "retrieval" }); - options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); - options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); - options.push_back({ "retrieval", " --chunk-separator STRING", - "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); - - options.push_back({ "passkey" }); - options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); - options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); - - options.push_back({ "imatrix" }); - options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); - options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); - options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); - options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); - options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); - options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); - - options.push_back({ "bench" }); - options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false" }); - options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); - options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); - options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); - options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); - options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); - - options.push_back({ "server" }); - options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); - options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); - options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); - options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); - options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); - options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); - options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); - options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); - options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); - options.push_back({ "server", " --system-prompt-file FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); - options.push_back({ "server", " --log-format {text,json}", - "log output format: json or text (default: json)" }); - options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); - options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" }); - options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); - options.push_back({ "server", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", - "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); - options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"}); - + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); #ifndef LOG_DISABLE_LOGS - options.push_back({ "logging" }); - options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); - options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); - options.push_back({ "logging", " --log-test", "Run simple logging test" }); - options.push_back({ "logging", " --log-disable", "Disable trace logs" }); - options.push_back({ "logging", " --log-enable", "Enable trace logs" }); - options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); - options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " - "Each log file will have unique name: \"..log\"" }); - options.push_back({ "logging", " --log-append", "Don't truncate the old log file." }); + // TODO: make this looks less weird + add_opt(llama_arg( + {"--log-test"}, + "Log test", + []() { log_param_single_parse("--log-test"); } + )); + add_opt(llama_arg( + {"--log-disable"}, + "Log disable", + []() { log_param_single_parse("--log-disable"); } + )); + add_opt(llama_arg( + {"--log-enable"}, + "Log enable", + []() { log_param_single_parse("--log-enable"); } + )); + add_opt(llama_arg( + {"--log-new"}, + "Log new", + []() { log_param_single_parse("--log-new"); } + )); + add_opt(llama_arg( + {"--log-append"}, + "Log append", + []() { log_param_single_parse("--log-append"); } + )); + add_opt(llama_arg( + {"--log-file"}, "FNAME", + "Log file", + [](std::string value) { log_param_pair_parse(false, "--log-file", value); } + )); #endif // LOG_DISABLE_LOGS - options.push_back({ "cvector" }); - options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); - options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); - options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); - options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); - - options.push_back({ "export-lora" }); - options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); - options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); - - printf("usage: %s [options]\n", argv[0]); - - for (const auto & o : options) { - if (!o.grp.empty()) { - printf("\n%s:\n\n", o.grp.c_str()); - continue; - } - printf(" %-32s", o.args.c_str()); - if (o.args.length() > 30) { - printf("\n%34s", ""); - } - - const auto desc = o.desc; - size_t start = 0; - size_t end = desc.find('\n'); - while (end != std::string::npos) { - printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); - start = end + 1; - end = desc.find('\n', start); - } - - printf("%s\n", desc.substr(start).c_str()); - } - printf("\n"); + return options; } std::string gpt_params_get_system_info(const gpt_params & params) { diff --git a/common/common.h b/common/common.h index 04f4476f039de..27e908d7f22d6 100644 --- a/common/common.h +++ b/common/common.h @@ -170,6 +170,7 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence + std::function print_usage = nullptr; // print example-specific usage and example bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs bool special = false; // enable special token output @@ -279,73 +280,67 @@ struct gpt_params { }; enum llama_example { - LLAMA_EXAMPLE_ALL, - LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_COMMON, + LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_INFILL, + LLAMA_EXAMPLE_EMBEDDING, + LLAMA_EXAMPLE_PERPLEXITY, + LLAMA_EXAMPLE_RETRIEVAL, + LLAMA_EXAMPLE_PASSKEY, + LLAMA_EXAMPLE_IMATRIX, + LLAMA_EXAMPLE_BENCH, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_CVECTOR_GENERATOR, + LLAMA_EXAMPLE_EXPORT_LORA, + + LLAMA_EXAMPLE_COUNT, }; struct llama_arg { - std::set examples = {LLAMA_EXAMPLE_ALL}; + std::set examples = {LLAMA_EXAMPLE_COMMON}; std::vector args; - std::string value_ex; + std::string value_hint; // help text or example for arg value + std::string value_hint_2; // for second arg value std::string env; std::string help; - std::function handler_void = nullptr; - std::function handler_string = nullptr; - std::function handler_bool = nullptr; - std::function handler_int = nullptr; - std::function handler_float = nullptr; + std::function handler_void = nullptr; + std::function handler_string = nullptr; + std::function handler_str_str = nullptr; + std::function handler_int = nullptr; - llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_string(handler) {} + llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - llama_arg(std::vector args, std::string help, std::function 
handler) : args(args), help(help), handler_bool(handler) {} + llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} - llama_arg & set_examples(std::set _examples) { - examples = std::move(_examples); - return *this; - } + // support 2 values for arg + llama_arg(std::vector args, std::string value_hint, std::string value_hint_2, std::string help, std::function handler) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - llama_arg & set_value_ex(std::string _value_ex) { - value_ex = std::move(_value_ex); + llama_arg & set_examples(std::set examples) { + this->examples = std::move(examples); return *this; } - llama_arg & set_env(std::string _env) { - env = _env; + llama_arg & set_env(std::string env) { + this->env = std::move(env); return *this; } - // utility function - static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { - std::vector result; - std::istringstream iss(input); - std::string word, line; - while (iss >> word) { - if (line.length() + !line.empty() + word.length() > max_char_per_line) { - if (!line.empty()) result.push_back(line); - line = word; - } else { - line += (!line.empty() ? " " : "") + word; - } - } - if (!line.empty()) result.push_back(line); - return result; + bool in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); } }; -std::vector gpt_params_parser_register(gpt_params & params); -bool gpt_params_parser_run(int argc, char ** argv, std::vector & options); +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); +bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); +bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params, std::vector & options); +void gpt_params_print_usage(std::vector & options); -void gpt_params_parse_from_env(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params); -bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); -bool gpt_params_parse (int argc, char ** argv, gpt_params & params); -bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); -void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); - std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 25e7c775a0095..0d6076108eac8 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -28,9 +28,7 @@ static std::vector parse_list(char * p) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); 
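
Every example in the tree is migrated to the same two-step pattern: ask the common library for the option list that matches the example's llama_example id, then hand argv to gpt_params_parse. A minimal sketch of that pattern, using only names introduced by this patch (the print_usage body and the example id are placeholders, and error handling is trimmed):

    #include "common.h"

    #include <cstdio>

    // optional usage callback; gpt_params gains a print_usage field to carry it
    static void print_usage(int, char ** argv) {
        printf("\nexample usage:\n\n  %s -m model.gguf\n\n", argv[0]);
    }

    int main(int argc, char ** argv) {
        gpt_params params;

        // register the options that apply to this example (plus the common ones)
        auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);

        // the examples no longer print usage themselves when parsing fails
        if (!gpt_params_parse(argc, argv, params, options)) {
            return 1;
        }

        // params is now populated from argv (and, after the later commits, from LLAMA_ARG_* env vars)
        return 0;
    }
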
LOG_TEE("\n"); @@ -39,8 +37,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 53fbfb0a8cf2a..55c7a09d1edd9 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -7,9 +7,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); LOG_TEE("\n"); @@ -21,8 +19,8 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index a68268388389d..0795175a12a73 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -35,9 +35,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); @@ -390,8 +388,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index b05aa006e7da5..74151d24f32d3 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -79,8 +79,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 5e89988e2beda..de11d86ba2712 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -144,8 +144,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/export-lora/export-lora.cpp 
b/examples/export-lora/export-lora.cpp index 8df457e219493..544e7fff6fbcc 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -391,9 +391,7 @@ struct lora_merge_ctx { } }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); printf("\nNOTE: output model is F16\n"); @@ -403,8 +401,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 2c61c2e1eb3bc..73ad8c11b2a98 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -157,8 +157,8 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 83b85d72b043a..2a4f230740b76 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -17,9 +17,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s \\\n" " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" @@ -579,8 +577,8 @@ int main(int argc, char ** argv) { params.logits_all = true; params.verbosity = 1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 05700c1d591d9..4f5f7d028d5f8 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -106,8 +106,8 @@ int main(int argc, char ** argv) { llama_sampling_params & sparams = params.sparams; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 86b39f20eea6e..4dd17cf68ab1c 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -112,9 +112,7 @@ struct llava_context { struct llama_model * model = NULL; }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\n example usage:\n"); LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n note: a 
lower temperature value like 0.1 is recommended for better quality.\n"); @@ -280,8 +278,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } @@ -293,7 +291,7 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv, {}); + print_usage(argc, argv); return 1; } auto model = llava_init(¶ms); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index f500ea5b944f4..18a9ad09f7de1 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -253,8 +253,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } @@ -266,7 +266,6 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty())) { - gpt_params_print_usage(argc, argv, params); show_additional_info(argc, argv); return 1; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 81cf1629c5b6a..6b2a131da2c1b 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -37,8 +37,8 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 5f04709f50231..795b06c8894f0 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -13,8 +13,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 400f3e0b08957..93299ef8b738a 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -15,8 +15,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index d53a9828c2ea2..5a7b773faa2e8 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -14,8 +14,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6a025ed512217..058a6da142b0d 100644 --- 
a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -131,12 +131,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); LOG_TEE("\n"); @@ -21,8 +19,8 @@ int main(int argc, char ** argv) { params.n_keep = 32; params.i_pos = -1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 484dd589109c7..2855dd0ab68ca 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1967,8 +1967,8 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index aab9d81058af9..c3e835c864048 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -4,9 +4,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); LOG_TEE("\n"); @@ -113,8 +111,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3ea7c790d2bf7..ef82e81521e15 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -10,8 +10,8 @@ int main(int argc, char ** argv) { params.prompt = "The quick brown fox"; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 109dbc023efe0..099b224cf9c5f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2491,14 +2491,11 @@ int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } - // parse arguments from environment variables - 
gpt_params_parse_from_env(params); - // TODO: not great to use extern vars server_log_json = params.log_json; server_verbose = params.verbosity > 0; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 69a92cf7dc0c0..57ce71c1c4ba5 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,9 +6,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); LOG_TEE("\n"); @@ -20,8 +18,8 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 1616edecbbef6..849f5d9983c16 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -27,8 +27,8 @@ struct seq_draft { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } From 753782ae350f3bb00a2e5d19f7dac9a210fb8518 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 16:46:31 +0200 Subject: [PATCH 03/20] add test --- Makefile | 6 +++ common/common.cpp | 79 +++++++++++++++++++++++---------------- common/common.h | 2 + tests/CMakeLists.txt | 1 + tests/test-arg-parser.cpp | 67 +++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 33 deletions(-) create mode 100644 tests/test-arg-parser.cpp diff --git a/Makefile b/Makefile index 332496cfc39c1..9c61d3ec02b24 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,7 @@ BUILD_TARGETS = \ # Binaries only useful for tests TEST_TARGETS = \ + tests/test-arg-parser \ tests/test-autorelease \ tests/test-backend-ops \ tests/test-chat-template \ @@ -1505,6 +1506,11 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift +tests/test-arg-parser: tests/test-arg-parser.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-llama-grammar: tests/test-llama-grammar.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/common/common.cpp b/common/common.cpp index 09e3a992c6a06..ce9199c844254 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -383,8 +383,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto const std::string arg_prefix = "--"; llama_sampling_params & sparams = params.sparams; - std::unordered_map arg_to_options; - for (const auto & opt : options) { + std::unordered_map arg_to_options; + for (auto & opt : options) { for (const auto & arg : opt.args) { arg_to_options[arg] = &opt; } @@ -404,8 +404,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); } + auto opt = *arg_to_options[arg]; try { - auto 
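
Which llama_arg constructor an option is built with decides which handler_* member gets filled in, and that is what the lookup-and-dispatch code in gpt_params_parse_ex keys on: a void handler is called immediately, while a string or int handler makes the parser consume the next argv entry first. A rough sketch with a made-up option that does not exist in this patch:

    // hypothetical "-r N / --retries N" option, shown only to illustrate the handler_int path
    static void register_retries_option(std::vector<llama_arg> & options, int & n_retries) {
        options.push_back(llama_arg(
            {"-r", "--retries"}, "N",                          // value hint shown in the help column
            "number of retries (default: 3)",
            [&n_retries](int value) { n_retries = value; }     // stored in handler_int
        ).set_examples({LLAMA_EXAMPLE_SERVER}));
        // because handler_int is set, the parser reads the value after the flag and converts it
        // to an int, which is why the new test expects "-ngl hello" to fail
    }
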
opt = *arg_to_options[arg]; if (opt.handler_void) { opt.handler_void(); continue; @@ -431,7 +431,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto continue; } } catch (std::exception & e) { - throw std::invalid_argument(format("error: %s", e.what())); + throw std::invalid_argument(format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), arg_to_options[arg]->to_string(false).c_str())); } } @@ -592,39 +595,49 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -void gpt_params_print_usage(std::vector & options) { +std::string llama_arg::to_string(bool markdown) { + // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console - - auto print_options = [](std::vector & options) { - std::string leading_spaces(n_leading_spaces, ' '); - for (const auto & opt : options) { - std::ostringstream ss; - for (const auto & arg : opt->args) { - if (&arg == &opt->args.front()) { - ss << (opt->args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); - } else { - ss << arg << (&arg != &opt->args.back() ? ", " : ""); - } - } - if (!opt->value_hint.empty()) ss << " " << opt->value_hint; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(opt->help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; - } - printf("%s", ss.str().c_str()); + std::string leading_spaces(n_leading_spaces, ' '); + + std::ostringstream ss; + if (markdown) ss << "| `"; + for (const auto & arg : args) { + if (arg == args.front()) { + ss << (args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (!value_hint.empty()) ss << " " << value_hint; + if (!markdown) { + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? 
"" : leading_spaces) << line << "\n"; + } + } else { + ss << "` | " << help << " |"; + } + return ss.str(); +} + +void gpt_params_print_usage(std::vector & options) { + auto print_options = [](std::vector & options) { + for (llama_arg * opt : options) { + printf("%s", opt->to_string(false).c_str()); } }; - std::vector common_options; - std::vector specific_options; + std::vector common_options; + std::vector specific_options; for (auto & opt : options) { if (opt.in_example(LLAMA_EXAMPLE_COMMON)) { common_options.push_back(&opt); @@ -1688,7 +1701,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } )); add_opt(llama_arg( - {"-sm", "--split-mode"}, "SPLIT_MODE", + {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" diff --git a/common/common.h b/common/common.h index 27e908d7f22d6..05211bf972764 100644 --- a/common/common.h +++ b/common/common.h @@ -331,6 +331,8 @@ struct llama_arg { bool in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } + + std::string to_string(bool markdown); }; std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0207e3a5943c9..30e71cfd44c51 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -108,6 +108,7 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) # llama_target_and_test(test-double-float.cpp) # SLOW +llama_target_and_test(test-arg-parser.cpp) llama_target_and_test(test-quantize-fns.cpp) llama_target_and_test(test-quantize-perf.cpp) llama_target_and_test(test-sampling.cpp) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp new file mode 100644 index 0000000000000..8b95a59d39c86 --- /dev/null +++ b/tests/test-arg-parser.cpp @@ -0,0 +1,67 @@ +#include +#include +#include + +#undef NDEBUG +#include + +#include "common.h" + +int main(void) { + gpt_params params; + + printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); + for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { + try { + gpt_params_parser_init(params, (enum llama_example)ex); + } catch (std::exception & e) { + printf("%s\n", e.what()); + assert(false); + } + } + + auto list_str_to_char = [](std::vector & argv) -> std::vector { + std::vector res; + for (auto & arg : argv) { + res.push_back(const_cast(arg.data())); + } + return res; + }; + + std::vector argv; + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + + printf("test-arg-parser: test invalid usage\n\n"); + + argv = {"binary_name", "-m"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + argv = {"binary_name", "-ngl", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + argv = {"binary_name", "-sm", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + + printf("test-arg-parser: test valid usage\n\n"); + + argv = {"binary_name", "-m", "model_file.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "model_file.gguf"); + + argv = {"binary_name", "-t", "1234"}; + assert(true == 
gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.cpuparams.n_threads == 1234); + + argv = {"binary_name", "--verbose"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.verbosity == 1); + + argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "abc.gguf"); + assert(params.n_predict == 6789); + assert(params.n_batch == 9090); + + printf("test-arg-parser: all tests OK\n\n"); +} From 60ae92bd5430640609e45afa62931fcaec08dae1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 19:26:21 +0200 Subject: [PATCH 04/20] handle env --- common/common.cpp | 113 +++++++++++++++++--------------------- common/common.h | 16 ++++++ tests/test-arg-parser.cpp | 24 ++++++++ 3 files changed, 90 insertions(+), 63 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ce9199c844254..49db551ae6339 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -77,41 +77,6 @@ using json = nlohmann::ordered_json; -// -// Environment variable utils -// - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::string(value) : target; -} - -template -static typename std::enable_if::value && std::is_integral::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::stoi(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? 
std::stof(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - if (value) { - std::string val(value); - target = val == "1" || val == "true"; - } -} - // // CPU utils // @@ -390,6 +355,29 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } } + // handle environment variables + for (auto & opt : options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(); + } + if (opt.handler_int) { + opt.handler_int(std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling environment variable \"%s\": %s\n\n", opt.env.c_str(), e.what())); + } + } + } + + // handle command line arguments auto check_arg = [&](int i) { if (i+1 >= argc) { throw std::invalid_argument("expected value for argument"); @@ -405,6 +393,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); } auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env.c_str(), arg.c_str()); + } try { if (opt.handler_void) { opt.handler_void(); @@ -449,10 +440,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto gpt_params_handle_model_default(params); - if (params.hf_token.empty()) { - get_env("HF_TOKEN", params.hf_token); - } - if (params.escape) { string_process_escapes(params.prompt); string_process_escapes(params.input_prefix); @@ -762,7 +749,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example params.cpuparams.n_threads = std::thread::hardware_concurrency(); } } - )); + ).set_env("LLAMA_ARG_THREADS")); add_opt(llama_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", @@ -960,28 +947,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](int value) { params.n_ctx = value; } - )); + ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( {"-n", "--predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [¶ms](int value) { params.n_predict = value; } - )); + ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(llama_arg( {"-b", "--batch-size"}, "N", format("logical maximum batch size (default: %d)", params.n_batch), [¶ms](int value) { params.n_batch = value; } - )); + ).set_env("LLAMA_ARG_BATCH")); add_opt(llama_arg( {"-ub", "--ubatch-size"}, "N", format("physical maximum batch size (default: %d)", params.n_ubatch), [¶ms](int value) { params.n_ubatch = value; } - )); + ).set_env("LLAMA_ARG_UBATCH")); add_opt(llama_arg( {"--keep"}, "N", format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), @@ -1002,7 +989,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.flash_attn = true; } - )); + ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(llama_arg( {"-p", "--prompt"}, "PROMPT", "prompt to start generation with\n", @@ -1599,7 +1586,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { 
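
From this commit on, an option can also be bound to an environment variable: the same handler runs whether the value arrives via argv or via the environment, and an explicit command-line argument wins over the variable (the parser only prints a warning). The -n/--predict registration below is taken from this commit, followed by the kind of invocation it enables:

    // registration as it appears in this commit: bind -n/--predict to LLAMA_ARG_N_PREDICT
    add_opt(llama_arg(
        {"-n", "--predict"}, "N",
        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
        [&params](int value) { params.n_predict = value; }
    ).set_env("LLAMA_ARG_N_PREDICT"));   // set_env() also appends "(env: ...)" to the help text

    // which allows configuration without touching the command line, for example:
    //
    //   LLAMA_ARG_MODEL=model.gguf LLAMA_ARG_N_PREDICT=128 ./llama-server
    //
    // while ./llama-server -n 256 would still override LLAMA_ARG_N_PREDICT, with a warning
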
params.defrag_thold = std::stof(value); } - )); + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(llama_arg( {"-np", "--parallel"}, "N", format("number of parallel sequences to decode (default: %d)", params.n_parallel), @@ -1620,14 +1607,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.cont_batching = true; } - )); + ).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(llama_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", [¶ms]() { params.cont_batching = false; } - )); + ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(llama_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", @@ -1688,7 +1675,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - )); + ).set_env("LLAMA_ARG_N_GPU_LAYERS")); add_opt(llama_arg( {"-ngld", "--gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", @@ -1830,7 +1817,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.model = value; } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", @@ -1844,28 +1831,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.model_url = value; } - )); + ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(llama_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", [¶ms](std::string value) { params.hf_repo = value; } - )); + ).set_env("LLAMA_ARG_HF_REPO")); add_opt(llama_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", [¶ms](std::string value) { params.hf_file = value; } - )); + ).set_env("LLAMA_ARG_HF_FILE")); add_opt(llama_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", [¶ms](std::string value) { params.hf_token = value; } - )); + ).set_env("HF_TOKEN")); add_opt(llama_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", @@ -2012,14 +1999,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.hostname = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(llama_arg( {"--port"}, "PORT", format("port to listen (default: %d)", params.port), [¶ms](int value) { params.port = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(llama_arg( {"--path"}, "PATH", format("path to serve static files from (default: %s)", params.public_path.c_str()), @@ -2028,19 +2015,19 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( - {"--embedding(s)"}, + {"--embedding", "--embeddings"}, format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? 
"enabled" : "disabled"), [¶ms]() { params.embedding = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(llama_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", [¶ms](std::string value) { params.api_keys.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(llama_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", @@ -2086,7 +2073,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](int value) { params.n_threads_http = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(llama_arg( {"-spf", "--system-prompt-file"}, "FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", @@ -2123,14 +2110,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.endpoint_metrics = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(llama_arg( {"--no-slots"}, format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), [¶ms]() { params.endpoint_slots = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(llama_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", @@ -2157,7 +2144,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(llama_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), diff --git a/common/common.h b/common/common.h index 05211bf972764..c6f476ec34586 100644 --- a/common/common.h +++ b/common/common.h @@ -316,6 +316,7 @@ struct llama_arg { llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} // support 2 values for arg + // note: env variable is not yet support for 2 values llama_arg(std::vector args, std::string value_hint, std::string value_hint_2, std::string help, std::function handler) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} llama_arg & set_examples(std::set examples) { @@ -324,6 +325,7 @@ struct llama_arg { } llama_arg & set_env(std::string env) { + help = help + "\n(env: " + env + ")"; this->env = std::move(env); return *this; } @@ -332,6 +334,20 @@ struct llama_arg { return examples.find(ex) != examples.end(); } + bool get_value_from_env(std::string & output) { + if (env.empty()) return false; + char * value = std::getenv(env.c_str()); + if (value) { + output = value; + return true; + } + return false; + } + + bool has_value_from_env() { + return std::getenv(env.c_str()); + } + std::string to_string(bool markdown); }; diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 8b95a59d39c86..ff1a626c39761 100644 --- a/tests/test-arg-parser.cpp +++ 
b/tests/test-arg-parser.cpp @@ -63,5 +63,29 @@ int main(void) { assert(params.n_predict == 6789); assert(params.n_batch == 9090); + printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n"); + + setenv("LLAMA_ARG_THREADS", "blah", true); + argv = {"binary_name"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "blah.gguf"); + assert(params.cpuparams.n_threads == 1010); + + + printf("test-arg-parser: test environment variables being overwritten\n\n"); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name", "-m", "overwritten.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "overwritten.gguf"); + assert(params.cpuparams.n_threads == 1010); + + printf("test-arg-parser: all tests OK\n\n"); } From 286dcc9dbef0485435dd34142f781605afe5f1b2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 19:28:06 +0200 Subject: [PATCH 05/20] fix linux build --- common/common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common/common.h b/common/common.h index c6f476ec34586..f849483307d0a 100644 --- a/common/common.h +++ b/common/common.h @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' From 75d0869ef5e79fb8d31b5f120ffe1cacd1fdabf9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 19:59:55 +0200 Subject: [PATCH 06/20] add export-docs example --- .gitignore | 1 + Makefile | 7 +++++ common/common.cpp | 29 +++++++---------- common/common.h | 2 +- examples/export-docs/CMakeLists.txt | 5 +++ examples/export-docs/export-docs.cpp | 47 ++++++++++++++++++++++++++++ 6 files changed, 73 insertions(+), 18 deletions(-) create mode 100644 examples/export-docs/CMakeLists.txt create mode 100644 examples/export-docs/export-docs.cpp diff --git a/.gitignore b/.gitignore index 9986ac6b19d4e..1092d097a7542 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ llama-batched-swift /rpc-server out/ tmp/ +autogen-*.md # Deprecated diff --git a/Makefile b/Makefile index 9c61d3ec02b24..ba3f11c5352bf 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,7 @@ BUILD_TARGETS = \ llama-tokenize \ llama-vdot \ llama-cvector-generator \ + llama-export-docs \ tests/test-c.o # Binaries only useful for tests @@ -1449,6 +1450,12 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ +llama-export-docs: examples/export-docs/export-docs.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + ./llama-export-docs + libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ diff --git a/common/common.cpp b/common/common.cpp index 49db551ae6339..2d99bfc255252 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -425,7 +425,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto throw std::invalid_argument(format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), 
arg_to_options[arg]->to_string(false).c_str())); + arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); } } @@ -582,14 +582,13 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string llama_arg::to_string(bool markdown) { +std::string llama_arg::to_string() { // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - if (markdown) ss << "| `"; for (const auto & arg : args) { if (arg == args.front()) { ss << (args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); @@ -598,20 +597,16 @@ std::string llama_arg::to_string(bool markdown) { } } if (!value_hint.empty()) ss << " " << value_hint; - if (!markdown) { - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; - } + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; } else { - ss << "` | " << help << " |"; + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; } return ss.str(); } @@ -619,7 +614,7 @@ std::string llama_arg::to_string(bool markdown) { void gpt_params_print_usage(std::vector & options) { auto print_options = [](std::vector & options) { for (llama_arg * opt : options) { - printf("%s", opt->to_string(false).c_str()); + printf("%s", opt->to_string().c_str()); } }; diff --git a/common/common.h b/common/common.h index f849483307d0a..7536120fc1588 100644 --- a/common/common.h +++ b/common/common.h @@ -349,7 +349,7 @@ struct llama_arg { return std::getenv(env.c_str()); } - std::string to_string(bool markdown); + std::string to_string(); }; std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); diff --git a/examples/export-docs/CMakeLists.txt b/examples/export-docs/CMakeLists.txt new file mode 100644 index 0000000000000..0e953167ed653 --- /dev/null +++ b/examples/export-docs/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-export-docs) +add_executable(${TARGET} export-docs.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-docs/export-docs.cpp b/examples/export-docs/export-docs.cpp new file mode 100644 index 0000000000000..e21c4b89d53eb --- /dev/null +++ b/examples/export-docs/export-docs.cpp @@ -0,0 +1,47 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include + +// Export usage message (-h) to markdown format + +static void export_md(std::string fname, llama_example ex) { + std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); + + gpt_params params; + auto options = gpt_params_parser_init(params, ex); + + file << "| Argument | Explanation |\n"; + file << "| -------- | ----------- 
|\n"; + for (auto & opt : options) { + file << "| `"; + // args + for (const auto & arg : opt.args) { + if (arg == opt.args.front()) { + file << (opt.args.size() == 1 ? arg : (arg + ", ")); + } else { + file << arg << (arg != opt.args.back() ? ", " : ""); + } + } + // value hint + std::string md_value_hint(opt.value_hint); + string_replace_all(md_value_hint, "|", "\\|"); + file << " " << md_value_hint; + // help text + std::string md_help(opt.help); + string_replace_all(md_help, "\n", "
"); + string_replace_all(md_help, "|", "\\|"); + file << "` | " << md_help << " |\n"; + } +} + +int main(int, char **) { + export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); + export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + + return 0; +} From f5e6a80c3f941d08d9cd8a8c5e3ab9a46c5ffa8a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:00:52 +0200 Subject: [PATCH 07/20] fix build (2) --- common/common.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/common.cpp b/common/common.cpp index 2d99bfc255252..838f59f4e1bfa 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include From 88e3a4f3bc4a4f4f93a9e17c186eac4822579e7c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:20:46 +0200 Subject: [PATCH 08/20] skip build test-arg-parser on windows --- tests/test-arg-parser.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index ff1a626c39761..f3e24e9d8ec10 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -8,6 +8,9 @@ #include "common.h" int main(void) { +#ifdef _WIN32 + printf("test-arg-parser: skip on windows build\n"); +#else gpt_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); @@ -88,4 +91,5 @@ int main(void) { printf("test-arg-parser: all tests OK\n\n"); +#endif // __MINGW32__ } From fe6df473a355acaedec38d34255f79309d3d8b24 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:26:26 +0200 Subject: [PATCH 09/20] update server docs --- examples/server/README.md | 389 +++++++++++++------------------------- 1 file changed, 132 insertions(+), 257 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 805e05b4a5114..6570c64f93093 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -17,262 +17,137 @@ The project is under active development, and we are [looking for feedback and co ## Usage -``` -usage: ./llama-server [options] - -general: - - -h, --help, --usage print usage and exit - --version show version and build info - -v, --verbose print verbose information - --verbosity N set specific verbosity level (default: 0) - --verbose-prompt print a verbose prompt before generation (default: false) - --no-display-prompt don't print prompt at generation (default: false) - -co, --color colorise output to distinguish prompt and user input from generations (default: false) - -s, --seed SEED RNG seed (default: -1, use random seed for < 0) - -t, --threads N number of threads to use during generation (default: 8) - -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) - -td, --threads-draft N number of threads to use during generation (default: same as --threads) - -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) - --draft N number of tokens to draft for speculative decoding (default: 5) - -ps, --p-split N speculative decoding split probability (default: 0.1) - -lcs, --lookup-cache-static FNAME - path to static lookup cache to use for lookup decoding (not updated by generation) - -lcd, --lookup-cache-dynamic FNAME - path to dynamic lookup cache to use for lookup decoding (updated by generation) - -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) - -n, --predict N number of tokens 
to predict (default: -1, -1 = infinity, -2 = until context filled) - -b, --batch-size N logical maximum batch size (default: 2048) - -ub, --ubatch-size N physical maximum batch size (default: 512) - --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all) - --chunks N max number of chunks to process (default: -1, -1 = all) - -fa, --flash-attn enable Flash Attention (default: disabled) - -p, --prompt PROMPT prompt to start generation with - in conversation mode, this will be used as system prompt - (default: '') - -f, --file FNAME a file containing the prompt (default: none) - --in-file FNAME an input file (repeat to specify multiple files) - -bf, --binary-file FNAME binary file containing the prompt (default: none) - -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) - --no-escape do not process escape sequences - -ptc, --print-token-count N print token count every N tokens (default: -1) - --prompt-cache FNAME file to cache prompt state for faster startup (default: none) - --prompt-cache-all if specified, saves user input and generations to cache as well - not supported with --interactive or other interactive options - --prompt-cache-ro if specified, uses the prompt cache but does not update it - -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode - can be specified more than once for multiple prompts - -sp, --special special tokens output enabled (default: false) - -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix - if suffix/prefix are not specified, default chat template will be used - (default: false) - -i, --interactive run in interactive mode (default: false) - -if, --interactive-first run in interactive mode and wait for input right away (default: false) - -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\' - --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string - --in-prefix STRING string to prefix user inputs with (default: empty) - --in-suffix STRING string to suffix after user inputs with (default: empty) - --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. 
(default: disabled) - -sampling: - - --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';' - (default: top_k;tfs_z;typical_p;top_p;min_p;temperature) - --sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt) - --ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf) - --penalize-nl penalize newline tokens (default: false) - --temp N temperature (default: 0.8) - --top-k N top-k sampling (default: 40, 0 = disabled) - --top-p N top-p sampling (default: 0.9, 1.0 = disabled) - --min-p N min-p sampling (default: 0.1, 0.0 = disabled) - --tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled) - --typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) - --repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) - --repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) - --presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) - --frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) - --dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) - --dynatemp-exp N dynamic temperature exponent (default: 1.0) - --mirostat N use Mirostat sampling. - Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. - (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) - --mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) - --mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) - -l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, - i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', - or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' - --cfg-negative-prompt PROMPT - negative prompt to use for guidance (default: '') - --cfg-negative-prompt-file FNAME - negative prompt file to use for guidance - --cfg-scale N strength of guidance (default: 1.0, 1.0 = disable) - --chat-template JINJA_TEMPLATE - set custom jinja chat template (default: template taken from model's metadata) - if suffix/prefix are specified, template will be disabled - only commonly used templates are accepted: - https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template - -grammar: - - --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') - --grammar-file FNAME file to read grammar from - -j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object - For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead - -embedding: - - --pooling {none,mean,cls,last} - pooling type for embeddings, use model default if unspecified - --attention {causal,non-causal} - attention type for embeddings, use model default if unspecified - -context hacking: - - --rope-scaling {none,linear,yarn} - RoPE frequency scaling method, defaults to linear unless specified by the model - --rope-scale N RoPE context scaling factor, expands context by a factor of N - --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model) - --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N - --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size) - --yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) - --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0) - --yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0) - --yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0) - -gan, --grp-attn-n N group-attention factor (default: 1) - -gaw, --grp-attn-w N group-attention width (default: 512.0) - -dkvc, --dump-kv-cache verbose print of the KV cache - -nkvo, --no-kv-offload disable KV offload - -ctk, --cache-type-k TYPE KV cache data type for K (default: f16) - -ctv, --cache-type-v TYPE KV cache data type for V (default: f16) - -perplexity: - - --all-logits return logits for all tokens in the batch (default: false) - --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f - --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400) - --winogrande compute Winogrande score over random tasks from datafile supplied with -f - --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0) - --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f - --multiple-choice-tasks N - number of tasks to use when computing the multiple choice score (default: 0) - --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base - --ppl-stride N stride for perplexity calculation (default: 0) - --ppl-output-type {0,1} output type for perplexity calculation (default: 0) - -parallel: - - -dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled) - -np, --parallel N number of parallel sequences to decode (default: 1) - -ns, --sequences N number of sequences to decode (default: 1) - -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) - -multi-modality: - - --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md - --image FILE path to an image file. use with multimodal models. 
Specify multiple times for batching - -backend: - - --rpc SERVERS comma separated list of RPC servers - --mlock force system to keep model in RAM rather than swapping or compressing - --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock) - --numa TYPE attempt optimizations that help on some NUMA systems - - distribute: spread execution evenly over all nodes - - isolate: only spawn threads on CPUs on the node that execution started on - - numactl: use the CPU map provided by numactl - if run without this previously, it is recommended to drop the system page cache before using this - see https://github.com/ggerganov/llama.cpp/issues/1437 - -model: - - --check-tensors check model tensor data for invalid values (default: false) - --override-kv KEY=TYPE:VALUE - advanced option to override model metadata by key. may be specified multiple times. - types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false - --lora FNAME apply LoRA adapter (implies --no-mmap) - --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap) - --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter - --control-vector FNAME add a control vector - note: this argument can be repeated to add multiple control vectors - --control-vector-scaled FNAME SCALE - add a control vector with user defined scaling SCALE - note: this argument can be repeated to add multiple scaled control vectors - --control-vector-layer-range START END - layer range to apply the control vector(s) to, start and end inclusive - -m, --model FNAME model path (default: models/$filename with filename from --hf-file - or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) - -md, --model-draft FNAME draft model for speculative decoding (default: unused) - -mu, --model-url MODEL_URL model download url (default: unused) - -hfr, --hf-repo REPO Hugging Face model repository (default: unused) - -hff, --hf-file FILE Hugging Face model file (default: unused) - -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) - -server: - - --host HOST ip address to listen (default: 127.0.0.1) - --port PORT port to listen (default: 8080) - --path PATH path to serve static files from (default: ) - --embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled) - --api-key KEY API key to use for authentication (default: none) - --api-key-file FNAME path to file containing API keys (default: none) - --ssl-key-file FNAME path to file a PEM-encoded SSL private key - --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate - --timeout N server read/write timeout in seconds (default: 600) - --threads-http N number of threads used to process HTTP requests (default: -1) - --system-prompt-file FNAME - set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications - --log-format {text,json} - log output format: json or text (default: json) - --metrics enable prometheus compatible metrics endpoint (default: disabled) - --no-slots disables slots monitoring endpoint (default: enabled) - --slot-save-path PATH path to save slot kv cache (default: disabled) - --chat-template JINJA_TEMPLATE - set custom jinja chat template (default: template taken from model's metadata) - only commonly used templates are accepted: - 
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template - -sps, --slot-prompt-similarity SIMILARITY - how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) - --lora-init-without-apply - load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) - -logging: - - --simple-io use basic IO for better compatibility in subprocesses and limited consoles - -ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset) - --log-test Run simple logging test - --log-disable Disable trace logs - --log-enable Enable trace logs - --log-file FNAME Specify a log filename (without extension) - --log-new Create a separate new log file on start. Each log file will have unique name: "..log" - --log-append Don't truncate the old log file. -``` - -Available environment variables (if specified, these variables will override parameters specified in arguments): - -- `LLAMA_CACHE`: cache directory, used by `--hf-repo` -- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo` -- `LLAMA_ARG_MODEL`: equivalent to `-m` -- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu` -- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a` -- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo` -- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file` -- `LLAMA_ARG_THREADS`: equivalent to `-t` -- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c` -- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np` -- `LLAMA_ARG_BATCH`: equivalent to `-b` -- `LLAMA_ARG_UBATCH`: equivalent to `-ub` -- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl` -- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http` -- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template` -- `LLAMA_ARG_N_PREDICT`: equivalent to `-n` -- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`) -- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default. -- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`) -- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`) -- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default. -- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt` -- `LLAMA_ARG_HOST`: equivalent to `--host` -- `LLAMA_ARG_PORT`: equivalent to `--port` +| Argument | Explanation | +| -------- | ----------- | +| `-h, --help, --usage ` | print usage and exit | +| `--version ` | show version and build info | +| `-v, --verbose ` | print verbose information | +| `--verbosity N` | set specific verbosity level (default: 0) | +| `--verbose-prompt ` | print a verbose prompt before generation (default: false) | +| `--no-display-prompt ` | don't print prompt at generation (default: false) | +| `-co, --color ` | colorise output to distinguish prompt and user input from generations (default: false) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | +| `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | +| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | +| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | +| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)
| +| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)
| +| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | +| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | +| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll | +| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | +| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) |
+| `-n, --predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
+| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
+| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
+| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
+| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
+| `-fa, --flash-attn ` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
+| `-p, --prompt PROMPT` | prompt to start generation with
| +| `-f, --file FNAME` | a file containing the prompt (default: none) | +| `--in-file FNAME` | an input file (repeat to specify multiple files) | +| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-e, --escape ` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--no-escape ` | do not process escape sequences | +| `--spm-infill ` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) | +| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | +| `--ignore-eos ` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--penalize-nl ` | penalize newline tokens (default: false) | +| `--temp N` | temperature (default: 0.8) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | +| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
+| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
+| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
+| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
+| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
+| `--grammar-file FNAME` | file to read grammar from |
+| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model | +| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N | +| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) | +| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N | +| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) | +| `-gan, --grp-attn-n N` | group-attention factor (default: 1) | +| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) | +| `-dkvc, --dump-kv-cache ` | verbose print of the KV cache | +| `-nkvo, --no-kv-offload ` | disable KV offload | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | +| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1) | +| `-ns, --sequences N` | number of sequences to decode (default: 1) | +| `-cb, --cont-batching ` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-nocb, --no-cont-batching ` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | +| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md | +| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching | +| `--rpc SERVERS` | comma separated list of RPC servers | +| `--mlock ` | force system to keep model in RAM rather than swapping or compressing | +| `--no-mmap ` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
+| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
+| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | +| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | +| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | +| `--check-tensors ` | check model tensor data for invalid values (default: false) | +| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | +| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | +| `--lora-scaled FNAME` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | +| `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | +| `--control-vector-scaled FNAME` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | +| `--control-vector-layer-range START` | layer range to apply the control vector(s) to, start and end inclusive | +| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | +| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) |
+| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
+| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
+| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
+| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
+| `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | +| `--path PATH` | path to serve static files from (default: ) | +| `--embedding, --embeddings ` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | +| `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | +| `--api-key-file FNAME` | path to file containing API keys (default: none) | +| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key | +| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate | +| `--timeout N` | server read/write timeout in seconds (default: 600) | +| `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | +| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | +| `--log-format {text, json}` | log output format: json or text (default: json) | +| `--metrics ` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) |
+| `--no-slots ` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| +| `--lora-init-without-apply ` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | +| `--simple-io ` | use basic IO for better compatibility in subprocesses and limited consoles | +| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | +| `--log-test ` | Log test | +| `--log-disable ` | Log disable | +| `--log-enable ` | Log enable | +| `--log-new ` | Log new | +| `--log-append ` | Log append | +| `--log-file FNAME` | Log file | + +Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Example usage of docker compose with environment variables: @@ -289,7 +164,7 @@ services: LLAMA_ARG_MODEL: /models/my_model.gguf LLAMA_ARG_CTX_SIZE: 4096 LLAMA_ARG_N_PARALLEL: 2 - LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0 + LLAMA_ARG_ENDPOINT_METRICS: 1 LLAMA_ARG_PORT: 8080 ``` From b1657cb934d95ee28c3bd5667e48d46ce1c7da91 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:58:10 +0200 Subject: [PATCH 10/20] bring back missing --alias --- common/common.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 534cbe35ce4f8..9e959b02bb037 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -650,7 +650,6 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } sampler_type_names.pop_back(); - const char split_delim = ','; /** @@ -1804,6 +1803,13 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example params.control_vector_layer_end = std::stoi(end); } )); + add_opt(llama_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [¶ms](std::string value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA @@ -1950,7 +1956,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"-npp"}, "n0,n1,...", "number of prompt tokens", [¶ms](std::string value) { - auto p = string_split(value, split_delim); + auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); @@ -1958,7 +1964,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"-ntg"}, "n0,n1,...", "number of text generation tokens", [¶ms](std::string value) { - auto p = string_split(value, split_delim); + auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); @@ -1966,7 +1972,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"-npl"}, "n0,n1,...", "number of parallel prompts", [¶ms](std::string value) { - auto p = string_split(value, split_delim); + auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); From 509ec08e5751bc9c51160f5b731ea94ddd4906ee Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 21:03:50 +0200 Subject: [PATCH 11/20] bring back --n-predict --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 9e959b02bb037..d04fc5f7ff5ca 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -944,7 
+944,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( - {"-n", "--predict"}, "N", + {"-n", "--predict", "--n-predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [¶ms](int value) { params.n_predict = value; From d545ffcb6deac469a0b4af0be69bf9b47997181e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 09:39:08 +0200 Subject: [PATCH 12/20] clarify test-arg-parser --- tests/test-arg-parser.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index f3e24e9d8ec10..2f3cf815d2a2c 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -8,9 +8,6 @@ #include "common.h" int main(void) { -#ifdef _WIN32 - printf("test-arg-parser: skip on windows build\n"); -#else gpt_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); @@ -66,6 +63,10 @@ int main(void) { assert(params.n_predict == 6789); assert(params.n_batch == 9090); +// skip this part on windows, because setenv is not supported +#ifdef _WIN32 + printf("test-arg-parser: skip on windows build\n"); +#else printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n"); setenv("LLAMA_ARG_THREADS", "blah", true); @@ -91,5 +92,5 @@ int main(void) { printf("test-arg-parser: all tests OK\n\n"); -#endif // __MINGW32__ +#endif // _WIN32 } From 79ce128d2a51e847f502204b519de14e3840a9ee Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 09:41:04 +0200 Subject: [PATCH 13/20] small correction --- tests/test-arg-parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 2f3cf815d2a2c..8852bfc7e63b6 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -89,8 +89,8 @@ int main(void) { assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); assert(params.model == "overwritten.gguf"); assert(params.cpuparams.n_threads == 1010); +#endif // _WIN32 printf("test-arg-parser: all tests OK\n\n"); -#endif // _WIN32 } From 961bd19da102c8dec63a23acc01976bb84ed2565 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 13:42:20 +0200 Subject: [PATCH 14/20] add comments --- common/common.cpp | 22 +++++++++++++------- common/common.h | 52 +++++++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d04fc5f7ff5ca..d8d2caac31ec3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -456,6 +456,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto params.kv_overrides.back().key[0] = 0; } + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + sparams.seed = params.seed; + } + return true; } @@ -468,7 +473,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { +void gpt_params_print_usage(gpt_params & params, std::vector & options) { auto print_options = [](std::vector & options) { for (llama_arg * opt : options) { printf("%s", opt->to_string().c_str()); @@ -622,14 +627,16 @@ void gpt_params_print_usage(std::vector & options) { std::vector common_options; std::vector specific_options; for (auto & opt : options) { - if (opt.in_example(LLAMA_EXAMPLE_COMMON)) { - common_options.push_back(&opt); 
- } else { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.in_example(params.curr_ex)) { specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); } } printf("----- common options -----\n\n"); print_options(common_options); + // TODO: maybe convert enum llama_example to string printf("\n\n----- example-specific options -----\n\n"); print_options(specific_options); } @@ -641,6 +648,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { std::vector options; params.print_usage = print_usage; + params.curr_ex = ex; llama_sampling_params & sparams = params.sparams; std::string sampler_type_chars; @@ -1772,14 +1780,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } - )); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", [¶ms](std::string fname, std::string scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } - )); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", diff --git a/common/common.h b/common/common.h index 7536120fc1588..8f5e3a96ab06d 100644 --- a/common/common.h +++ b/common/common.h @@ -63,6 +63,24 @@ int32_t cpu_get_num_math(); // CLI argument parsing // +enum llama_example { + LLAMA_EXAMPLE_COMMON, + LLAMA_EXAMPLE_SPECULATIVE, + LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_INFILL, + LLAMA_EXAMPLE_EMBEDDING, + LLAMA_EXAMPLE_PERPLEXITY, + LLAMA_EXAMPLE_RETRIEVAL, + LLAMA_EXAMPLE_PASSKEY, + LLAMA_EXAMPLE_IMATRIX, + LLAMA_EXAMPLE_BENCH, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_CVECTOR_GENERATOR, + LLAMA_EXAMPLE_EXPORT_LORA, + + LLAMA_EXAMPLE_COUNT, +}; + // dimensionality reduction methods, used by cvector-generator enum dimre_method { DIMRE_METHOD_PCA, @@ -79,6 +97,7 @@ struct cpu_params { }; struct gpt_params { + enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON; uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed int32_t n_predict = -1; // new tokens to predict @@ -125,7 +144,7 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = "model.gguf"; // model path + std::string model = ""; // model path std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string model_url = ""; // model url to download @@ -280,24 +299,6 @@ struct gpt_params { std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; -enum llama_example { - LLAMA_EXAMPLE_COMMON, - LLAMA_EXAMPLE_SPECULATIVE, - LLAMA_EXAMPLE_MAIN, - LLAMA_EXAMPLE_INFILL, - LLAMA_EXAMPLE_EMBEDDING, - LLAMA_EXAMPLE_PERPLEXITY, - LLAMA_EXAMPLE_RETRIEVAL, - LLAMA_EXAMPLE_PASSKEY, - LLAMA_EXAMPLE_IMATRIX, - LLAMA_EXAMPLE_BENCH, - LLAMA_EXAMPLE_SERVER, - LLAMA_EXAMPLE_CVECTOR_GENERATOR, - LLAMA_EXAMPLE_EXPORT_LORA, - - LLAMA_EXAMPLE_COUNT, -}; - struct llama_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; std::vector args; @@ -352,11 +353,18 @@ struct llama_arg { std::string to_string(); }; +// initialize list of options (arguments) that can be used by the current 
example std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); +// optionally, we can provide "print_usage" to print example usage std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); -bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); -bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params, std::vector & options); -void gpt_params_print_usage(std::vector & options); + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options); + +// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set +void gpt_params_print_usage(gpt_params & params, std::vector & options); void gpt_params_handle_model_default(gpt_params & params); From 53244f9c58883b78534ce867ab2fdee8a52fd641 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 13:47:10 +0200 Subject: [PATCH 15/20] fix args with 2 values --- common/common.cpp | 3 +- examples/export-docs/export-docs.cpp | 13 ++++-- examples/server/README.md | 70 ++++++++++++++-------------- 3 files changed, 47 insertions(+), 39 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d8d2caac31ec3..526fff05782bf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -603,6 +603,7 @@ std::string llama_arg::to_string() { } } if (!value_hint.empty()) ss << " " << value_hint; + if (!value_hint_2.empty()) ss << " " << value_hint_2; if (ss.tellp() > n_leading_spaces - 3) { // current line is too long, add new line ss << "\n" << leading_spaces; @@ -850,7 +851,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example )); add_opt(llama_arg( {"--poll-batch"}, "<0|1>", - "use polling to wait for work (default: same as --poll", + "use polling to wait for work (default: same as --poll)", [¶ms](int value) { params.cpuparams_batch.poll = value; } diff --git a/examples/export-docs/export-docs.cpp b/examples/export-docs/export-docs.cpp index e21c4b89d53eb..86c041a811d12 100644 --- a/examples/export-docs/export-docs.cpp +++ b/examples/export-docs/export-docs.cpp @@ -28,9 +28,16 @@ static void export_md(std::string fname, llama_example ex) { } } // value hint - std::string md_value_hint(opt.value_hint); - string_replace_all(md_value_hint, "|", "\\|"); - file << " " << md_value_hint; + if (!opt.value_hint.empty()) { + std::string md_value_hint(opt.value_hint); + string_replace_all(md_value_hint, "|", "\\|"); + file << " " << md_value_hint; + } + if (!opt.value_hint_2.empty()) { + std::string md_value_hint_2(opt.value_hint_2); + string_replace_all(md_value_hint_2, "|", "\\|"); + file << " " << md_value_hint_2; + } // help text std::string md_help(opt.help); string_replace_all(md_help, "\n", "
"); diff --git a/examples/server/README.md b/examples/server/README.md index 6570c64f93093..62250fd8df672 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -19,13 +19,13 @@ The project is under active development, and we are [looking for feedback and co | Argument | Explanation | | -------- | ----------- | -| `-h, --help, --usage ` | print usage and exit | -| `--version ` | show version and build info | -| `-v, --verbose ` | print verbose information | +| `-h, --help, --usage` | print usage and exit | +| `--version` | show version and build info | +| `-v, --verbose` | print verbose information | | `--verbosity N` | set specific verbosity level (default: 0) | -| `--verbose-prompt ` | print a verbose prompt before generation (default: false) | -| `--no-display-prompt ` | don't print prompt at generation (default: false) | -| `-co, --color ` | colorise output to distinguish prompt and user input from generations (default: false) | +| `--verbose-prompt` | print a verbose prompt before generation (default: false) | +| `--no-display-prompt` | don't print prompt at generation (default: false) | +| `-co, --color` | colorise output to distinguish prompt and user input from generations (default: false) | | `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | | `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | @@ -36,27 +36,27 @@ The project is under active development, and we are [looking for feedback and co | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | -| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | -| `-n, --predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | +| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | | `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--chunks N` | max number of chunks to process (default: -1, -1 = all) | -| `-fa, --flash-attn ` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | +| `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | | `-p, --prompt PROMPT` | prompt to start generation with
| | `-f, --file FNAME` | a file containing the prompt (default: none) | | `--in-file FNAME` | an input file (repeat to specify multiple files) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | -| `-e, --escape ` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | -| `--no-escape ` | do not process escape sequences | -| `--spm-infill ` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | +| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--no-escape` | do not process escape sequences | +| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | -| `--ignore-eos ` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--penalize-nl ` | penalize newline tokens (default: false) | +| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--penalize-nl` | penalize newline tokens (default: false) | | `--temp N` | temperature (default: 0.8) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | @@ -87,33 +87,33 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) | | `-gan, --grp-attn-n N` | group-attention factor (default: 1) | | `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) | -| `-dkvc, --dump-kv-cache ` | verbose print of the KV cache | -| `-nkvo, --no-kv-offload ` | disable KV offload | +| `-dkvc, --dump-kv-cache` | verbose print of the KV cache | +| `-nkvo, --no-kv-offload` | disable KV offload | | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1) | | `-ns, --sequences N` | number of sequences to decode (default: 1) | -| `-cb, --cont-batching ` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | -| `-nocb, --no-cont-batching ` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | +| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | | `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md | | `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching | -| `--rpc SERVERS` | comma separated list of RPC servers | -| `--mlock ` | force system to keep model in RAM rather than swapping or compressing | -| `--no-mmap ` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | +| `--mlock` | force system to keep model in RAM rather than swapping or compressing | +| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437 | | `-ngl, --gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | | `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | -| `--check-tensors ` | check model tensor data for invalid values (default: false) | +| `--check-tensors` | check model tensor data for invalid values (default: false) | | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | -| `--lora-scaled FNAME` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | +| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | -| `--control-vector-scaled FNAME` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | -| `--control-vector-layer-range START` | layer range to apply the control vector(s) to, start and end inclusive | +| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | +| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | +| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_MODEL) | | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | @@ -123,7 +123,7 @@ The project is under active development, and we are [looking for feedback and co | `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: ) | -| `--embedding, --embeddings ` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | +| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | | `--api-key-file FNAME` | path to file containing API keys (default: none) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key | @@ -132,19 +132,19 @@ The project is under active development, and we are [looking for feedback and co | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | | `--log-format {text, json}` | log output format: json or text (default: json) | -| `--metrics ` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | -| `--no-slots ` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | +| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | +| `--no-slots` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| -| `--lora-init-without-apply ` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--simple-io ` | use basic IO for better compatibility in subprocesses and limited consoles | +| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | +| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | -| `--log-test ` | Log test | -| `--log-disable ` | Log disable | -| `--log-enable ` | Log enable | -| `--log-new ` | Log new | -| `--log-append ` | Log append | +| `--log-test` | Log test | +| `--log-disable` | Log disable | +| `--log-enable` | Log enable | +| `--log-new` | Log new | +| `--log-append` | Log append | | `--log-file FNAME` | Log file | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. From e1281d0d7ae4736fc6d6ec2964f885eebb39a452 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 14:05:51 +0200 Subject: [PATCH 16/20] refine example-specific args --- common/common.cpp | 28 ++++++++++++++++++---------- common/common.h | 1 + examples/llava/llava-cli.cpp | 2 +- examples/main/main.cpp | 9 ++++++++- examples/server/README.md | 8 +------- 5 files changed, 29 insertions(+), 19 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 526fff05782bf..d28f918ef6dda 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -720,21 +720,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.verbose_prompt = true; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--no-display-prompt"}, format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), [¶ms]() { params.display_prompt = false; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-co", "--color"}, format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [¶ms]() { params.use_color = true; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-s", "--seed"}, "SEED", format("RNG seed (default: %d, use random seed for < 0)", params.seed), @@ -996,7 +996,9 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(llama_arg( {"-p", "--prompt"}, "PROMPT", - "prompt to start generation with\n", + ex == LLAMA_EXAMPLE_MAIN + ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" + : "prompt to start generation with", [¶ms](std::string value) { params.prompt = value; } @@ -1102,7 +1104,13 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-cnv", "--conversation"}, - "run in conversation mode, does not print special tokens and suffix/prefix\n", + format( + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: %s)", + params.conversation ? 
"true" : "false" + ), [¶ms]() { params.conversation = true; } @@ -1625,14 +1633,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.mmproj = value; } - )); + ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(llama_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", [¶ms](std::string value) { params.image.emplace_back(value); } - )); + ).set_examples({LLAMA_EXAMPLE_LLAVA})); #ifdef GGML_USE_RPC add_opt(llama_arg( {"--rpc"}, "SERVERS", @@ -1692,7 +1700,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - )); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" @@ -1837,7 +1845,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.model_draft = value; } - )); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", @@ -2178,7 +2186,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.simple_io = true; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", diff --git a/common/common.h b/common/common.h index 8f5e3a96ab06d..a8aa6fe144a30 100644 --- a/common/common.h +++ b/common/common.h @@ -77,6 +77,7 @@ enum llama_example { LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, + LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_COUNT, }; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 4dd17cf68ab1c..8a64fe1bbdc8b 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -278,7 +278,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage); if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 058a6da142b0d..c434ff608b06b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -40,6 +40,13 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + printf("\n"); +} + static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); @@ -131,7 +138,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | @@ -46,7 +45,7 @@ The project is under active development, and we are [looking for feedback and co | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--chunks N` | max number of chunks to process (default: -1, -1 = all) | | `-fa, 
--flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | -| `-p, --prompt PROMPT` | prompt to start generation with
| +| `-p, --prompt PROMPT` | prompt to start generation with | | `-f, --file FNAME` | a file containing the prompt (default: none) | | `--in-file FNAME` | an input file (repeat to specify multiple files) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | @@ -96,13 +95,10 @@ The project is under active development, and we are [looking for feedback and co | `-ns, --sequences N` | number of sequences to decode (default: 1) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | -| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md | -| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching | | `--mlock` | force system to keep model in RAM rather than swapping or compressing | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437 | | `-ngl, --gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | -| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | @@ -115,7 +111,6 @@ The project is under active development, and we are [looking for feedback and co | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | | `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_MODEL) | | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | -| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)
(env: LLAMA_ARG_HF_FILE) | @@ -138,7 +133,6 @@ The project is under active development, and we are [looking for feedback and co | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | | `--log-test` | Log test | | `--log-disable` | Log disable | From ceddafa0e152d6213413773550d27a51ff7caabd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 18:19:41 +0200 Subject: [PATCH 17/20] no more lamba capture Co-authored-by: slaren@users.noreply.github.com --- common/common.cpp | 398 +++++++++++++++++++++++----------------------- common/common.h | 47 ++++-- 2 files changed, 232 insertions(+), 213 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3694c12762a0c..012dd1adc98ef 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -362,13 +362,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto if (opt.get_value_from_env(value)) { try { if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(); + opt.handler_void(params, sparams); } if (opt.handler_int) { - opt.handler_int(std::stoi(value)); + opt.handler_int(params, sparams, std::stoi(value)); } if (opt.handler_string) { - opt.handler_string(value); + opt.handler_string(params, sparams, value); continue; } } catch (std::exception & e) { @@ -399,7 +399,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } try { if (opt.handler_void) { - opt.handler_void(); + opt.handler_void(params, sparams); continue; } @@ -407,11 +407,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val = argv[++i]; if (opt.handler_int) { - opt.handler_int(std::stoi(val)); + opt.handler_int(params, sparams, std::stoi(val)); continue; } if (opt.handler_string) { - opt.handler_string(val); + opt.handler_string(params, sparams, val); continue; } @@ -419,7 +419,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val2 = argv[++i]; if (opt.handler_str_str) { - opt.handler_str_str(val, val2); + opt.handler_str_str(params, sparams, val, val2); continue; } } catch (std::exception & e) { @@ -687,14 +687,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.usage = true; } )); add_opt(llama_arg( {"--version"}, "show version and build info", - []() { + [](gpt_params & params, llama_sampling_params & sparams) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); @@ -703,42 +703,42 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-v", "--verbose"}, "print verbose information", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.verbosity = 1; } )); add_opt(llama_arg( {"--verbosity"}, "N", format("set specific verbosity level (default: %d)", params.verbosity), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.verbosity = value; } )); add_opt(llama_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.verbose_prompt = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--no-display-prompt"}, format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-co", "--color"}, format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-s", "--seed"}, "SEED", format("RNG seed (default: %d, use random seed for < 0)", params.seed), - [&sparams, ¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. params.seed = std::stoul(value); sparams.seed = std::stoul(value); @@ -747,7 +747,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { params.cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -757,7 +757,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -767,7 +767,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams.n_threads = value; if (params.draft_cpuparams.n_threads <= 0) { params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -777,7 +777,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams_batch.n_threads = value; if (params.draft_cpuparams_batch.n_threads <= 0) { params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -787,7 +787,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. 
Complements cpu-range (default: \"\")", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string mask = value; params.cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { @@ -798,7 +798,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. Complements --cpu-mask", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams.cpumask)) { @@ -809,21 +809,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict"}, "<0|1>", format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(llama_arg( {"--poll"}, "<0...100>", format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cpuparams.poll = std::stoul(value); } )); add_opt(llama_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string mask = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { @@ -834,7 +834,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { @@ -845,21 +845,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams_batch.strict_cpu = value; } )); add_opt(llama_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams_batch.poll = value; } )); add_opt(llama_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string mask = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { @@ -870,7 +870,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { @@ -881,21 +881,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { @@ -906,91 +906,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--draft"}, "N", format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-ps", "--p-split"}, "N", format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.lookup_cache_static = value; } )); add_opt(llama_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.lookup_cache_dynamic = value; } )); add_opt(llama_arg( {"-c", "--ctx-size"}, "N", format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, 
int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( {"-n", "--predict", "--n-predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(llama_arg( {"-b", "--batch-size"}, "N", format("logical maximum batch size (default: %d)", params.n_batch), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(llama_arg( {"-ub", "--ubatch-size"}, "N", format("physical maximum batch size (default: %d)", params.n_ubatch), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(llama_arg( {"--keep"}, "N", format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_keep = value; } )); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_chunks = value; } )); add_opt(llama_arg( {"-fa", "--flash-attn"}, format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); @@ -999,14 +999,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ex == LLAMA_EXAMPLE_MAIN ? 
"prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.prompt = value; } )); add_opt(llama_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1022,7 +1022,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1033,7 +1033,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1049,56 +1049,56 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-e", "--escape"}, format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.escape = true; } )); add_opt(llama_arg( {"--no-escape"}, "do not process escape sequences", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.escape = false; } )); add_opt(llama_arg( {"-ptc", "--print-token-count"}, "N", format("print token count every N tokens (default: %d)", params.n_print), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-sp", "--special"}, format("special tokens output enabled (default: %s)", params.special ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1111,35 +1111,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "(default: %s)", params.conversation ? "true" : "false" ), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-i", "--interactive"}, format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-if", "--interactive-first"}, format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.input_prefix_bos = true; params.enable_chat_template = false; } @@ -1147,7 +1147,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.input_prefix = value; params.enable_chat_template = false; } @@ -1155,7 +1155,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.input_suffix = value; params.enable_chat_template = false; } @@ -1163,7 +1163,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1173,14 +1173,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--samplers"}, "SAMPLERS", format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { const auto sampler_names = string_split(value, ';'); sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); } @@ -1188,28 +1188,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--sampling-seq"}, "SEQUENCE", format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.samplers_sequence = llama_sampling_types_from_chars(value); } )); add_opt(llama_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.ignore_eos = true; } )); add_opt(llama_arg( {"--penalize-nl"}, format("penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false"), - [&sparams]() { + [](gpt_params & params, llama_sampling_params & sparams) { sparams.penalize_nl = true; } )); add_opt(llama_arg( {"--temp"}, "N", format("temperature (default: %.1f)", (double)sparams.temp), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.temp = std::stof(value); sparams.temp = std::max(sparams.temp, 0.0f); } @@ -1217,42 +1217,42 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--top-k"}, "N", format("top-k sampling (default: %d, 0 = disabled)", sparams.top_k), - [&sparams](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { sparams.top_k = value; } )); add_opt(llama_arg( {"--top-p"}, "N", format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.top_p = std::stof(value); } )); add_opt(llama_arg( {"--min-p"}, "N", format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.min_p = std::stof(value); } )); add_opt(llama_arg( {"--tfs"}, "N", format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.tfs_z = std::stof(value); } )); add_opt(llama_arg( {"--typical"}, "N", format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.typical_p = std::stof(value); } )); add_opt(llama_arg( {"--repeat-last-n"}, "N", format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n), - [&sparams](int value) { + [](gpt_params & params, 
llama_sampling_params & sparams, int value) { sparams.penalty_last_n = value; sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); } @@ -1260,35 +1260,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--repeat-penalty"}, "N", format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.penalty_repeat = std::stof(value); } )); add_opt(llama_arg( {"--presence-penalty"}, "N", format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.penalty_present = std::stof(value); } )); add_opt(llama_arg( {"--frequency-penalty"}, "N", format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.penalty_freq = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-range"}, "N", format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.dynatemp_range = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-exp"}, "N", format("dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.dynatemp_exponent = std::stof(value); } )); @@ -1296,21 +1296,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--mirostat"}, "N", format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat), - [&sparams](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { sparams.mirostat = value; } )); add_opt(llama_arg( {"--mirostat-lr"}, "N", format("Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.mirostat_eta = std::stof(value); } )); add_opt(llama_arg( {"--mirostat-ent"}, "N", format("Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.mirostat_tau = std::stof(value); } )); @@ -1319,7 +1319,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "modifies the likelihood of token appearing in the completion,\n" "i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; @@ -1338,14 +1338,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cfg-negative-prompt"}, "PROMPT", format("negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.cfg_negative_prompt = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--cfg-negative-prompt-file"}, "FNAME", "negative prompt file to use for guidance", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1359,21 +1359,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cfg-scale"}, "N", format("strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.cfg_scale = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--grammar"}, "GRAMMAR", format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.grammar = value; } )); add_opt(llama_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1388,14 +1388,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.grammar = json_schema_to_grammar(json::parse(value)); } )); add_opt(llama_arg( {"--pooling"}, "{none,mean,cls,last}", "pooling type for embeddings, use model default if unspecified", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } @@ -1406,7 +1406,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--attention"}, "{causal,non,causal}", "attention type for embeddings, use model default if unspecified", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } @@ -1415,7 +1415,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } @@ -1425,91 +1425,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rope_freq_base = std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rope_freq_scale = std::stof(value); } )); add_opt(llama_arg( {"--yarn-orig-ctx"}, "N", format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.yarn_orig_ctx = value; } )); add_opt(llama_arg( {"--yarn-ext-factor"}, "N", format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_ext_factor = std::stof(value); } )); add_opt(llama_arg( 
{"--yarn-attn-factor"}, "N", format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_attn_factor = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-slow"}, "N", format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_beta_slow = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-fast"}, "N", format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_beta_fast = std::stof(value); } )); add_opt(llama_arg( {"-gan", "--grp-attn-n"}, "N", format("group-attention factor (default: %d)", params.grp_attn_n), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.grp_attn_n = value; } )); add_opt(llama_arg( {"-gaw", "--grp-attn-w"}, "N", format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.grp_attn_w = value; } )); add_opt(llama_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.dump_kv_cache = true; } )); add_opt(llama_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.no_kv_offload = true; } )); add_opt(llama_arg( {"-ctk", "--cache-type-k"}, "TYPE", format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; } @@ -1517,7 +1517,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ctv", "--cache-type-v"}, "TYPE", format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; } @@ -1525,119 +1525,119 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--all-logits"}, format("return logits for all tokens in the batch (default: %s)", params.logits_all ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag-tasks"}, "N", format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande-tasks"}, "N", format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice-tasks"}, "N", format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-stride"}, "N", format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-output-type"}, "<0|1>", format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"-dt", "--defrag-thold"}, "N", format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(llama_arg( {"-np", "--parallel"}, "N", format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_parallel = value; } )); add_opt(llama_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", 
params.n_sequences), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_sequences = value; } )); add_opt(llama_arg( {"-cb", "--cont-batching"}, format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.cont_batching = true; } ).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(llama_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.cont_batching = false; } ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(llama_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(llama_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); @@ -1645,7 +1645,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rpc_servers = value; } )); @@ -1653,14 +1653,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.use_mlock = true; } )); add_opt(llama_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.use_mmap = false; } )); @@ -1672,7 +1672,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } @@ -1682,7 +1682,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngl", "--gpu-layers"}, "N", "number of layers to store in VRAM", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -1693,7 +1693,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngld", "--gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params 
& sparams, int value) { params.n_gpu_layers_draft = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1707,7 +1707,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1732,7 +1732,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1759,7 +1759,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-mg", "--main-gpu"}, "INDEX", format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.main_gpu = value; #ifndef GGML_USE_CUDA_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); @@ -1769,7 +1769,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--check-tensors"}, format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.check_tensors = true; } )); @@ -1777,7 +1777,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); } @@ -1786,21 +1786,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [¶ms](std::string fname, std::string scale) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.control_vectors.push_back({ 1.0f, value, }); } )); @@ -1808,14 +1808,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [¶ms](std::string fname, std::string scale) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { params.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(llama_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [¶ms](std::string start, std::string end) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & start, const std::string & end) { params.control_vector_layer_start = std::stoi(start); params.control_vector_layer_end = std::stoi(end); } @@ -1823,7 +1823,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL")); @@ -1835,49 +1835,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const 
std::string & value) { params.model_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(llama_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(llama_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(llama_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hf_token = value; } ).set_env("HF_TOKEN")); add_opt(llama_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1888,28 +1888,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--chunk-size"}, "N", format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--chunk-separator"}, "STRING", format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--junk"}, "N", format("number of times to repeat the junk text (default: %d)", params.n_junk), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(llama_arg( {"--pos"}, "N", format("position of the passkey in the junk text (default: %d)", params.i_pos), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); @@ -1921,7 +1921,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? 
params.cvector_outfile.c_str() : params.out_file.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.out_file = value; params.cvector_outfile = value; params.lora_outfile = value; @@ -1930,49 +1930,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ofreq", "--output-frequency"}, "N", format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--save-frequency"}, "N", format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--process-output"}, format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--no-ppl"}, format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--chunk"}, "N", format("start processing the input from chunk N (default: %d)", params.i_chunk), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"-pps"}, format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(llama_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } @@ -1980,7 +1980,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } @@ -1988,7 +1988,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } @@ -1996,63 +1996,63 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--embd-normalize"}, "N", format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-separator"}, "STRING", "separator of embendings (default \\n) for example \"<#sep#>\"", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--host"}, "HOST", format("ip address to listen (default: %s)", params.hostname.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(llama_arg( {"--port"}, "PORT", format("port to listen (default: %d)", params.port), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(llama_arg( {"--path"}, "PATH", format("path to serve static files from (default: %s)", params.public_path.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--embedding", "--embeddings"}, format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? 
"enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(llama_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(llama_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2069,21 +2069,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--timeout"}, "N", format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.timeout_read = value; params.timeout_write = value; } @@ -2091,14 +2091,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--threads-http"}, "N", format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(llama_arg( {"-spf", "--system-prompt-file"}, "FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2115,7 +2115,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-format"}, "{text, json}", "log output format: json or text (default: json)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { if (value == "json") { params.log_json = true; } else if (value == "text") { @@ -2128,21 +2128,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? 
"enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(llama_arg( {"--no-slots"}, format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(llama_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -2155,7 +2155,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { if (!llama_chat_verify_template(value)) { throw std::runtime_error(format( "error: the supplied chat template is not supported: %s\n" @@ -2169,28 +2169,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--lora-init-without-apply"}, format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.logdir = value; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -2201,35 +2201,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--positive-file"}, "FNAME", format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--negative-file"}, "FNAME", format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-batch"}, "N", format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-iter"}, "N", format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } @@ -2238,7 +2238,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } @@ -2249,32 +2249,32 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-test"}, "Log test", - []() { log_param_single_parse("--log-test"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-test"); } )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - []() { 
log_param_single_parse("--log-disable"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-disable"); } )); add_opt(llama_arg( {"--log-enable"}, "Log enable", - []() { log_param_single_parse("--log-enable"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-enable"); } )); add_opt(llama_arg( {"--log-new"}, "Log new", - []() { log_param_single_parse("--log-new"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-new"); } )); add_opt(llama_arg( {"--log-append"}, "Log append", - []() { log_param_single_parse("--log-append"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-append"); } )); add_opt(llama_arg( {"--log-file"}, "FNAME", "Log file", - [](std::string value) { log_param_pair_parse(false, "--log-file", value); } + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } )); #endif // LOG_DISABLE_LOGS diff --git a/common/common.h b/common/common.h index e8dd040e9d994..c4893d17481ca 100644 --- a/common/common.h +++ b/common/common.h @@ -310,20 +310,39 @@ struct llama_arg { std::string value_hint_2; // for second arg value std::string env; std::string help; - std::function handler_void = nullptr; - std::function handler_string = nullptr; - std::function handler_str_str = nullptr; - std::function handler_int = nullptr; - - llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - - llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - - llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} + void (*handler_void) (gpt_params & params, llama_sampling_params & sparams) = nullptr; + void (*handler_string) (gpt_params & params, llama_sampling_params & sparams, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, llama_sampling_params & sparams, int) = nullptr; + + llama_arg( + const std::initializer_list & args, + const std::string & value_hint, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & value_hint, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams) + ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg - // note: env variable is not yet support for 2 values - llama_arg(std::vector args, std::string value_hint, std::string value_hint_2, std::string help, std::function handler) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + llama_arg( + const std::initializer_list & args, + const std::string & value_hint, + 
const std::string & value_hint_2, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} llama_arg & set_examples(std::set examples) { this->examples = std::move(examples); @@ -340,7 +359,7 @@ struct llama_arg { return examples.find(ex) != examples.end(); } - bool get_value_from_env(std::string & output) { + bool get_value_from_env(std::string & output) const { if (env.empty()) return false; char * value = std::getenv(env.c_str()); if (value) { @@ -350,7 +369,7 @@ struct llama_arg { return false; } - bool has_value_from_env() { + bool has_value_from_env() const { return std::getenv(env.c_str()); } From eb7d8f85a27a24e65a68c5727c4a0e3a2ea86805 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 18:24:44 +0200 Subject: [PATCH 18/20] params.sparams --- common/common.cpp | 499 +++++++++++++++++++++++----------------------- common/common.h | 16 +- 2 files changed, 257 insertions(+), 258 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 012dd1adc98ef..804af1d943e0f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -362,13 +362,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto if (opt.get_value_from_env(value)) { try { if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(params, sparams); + opt.handler_void(params); } if (opt.handler_int) { - opt.handler_int(params, sparams, std::stoi(value)); + opt.handler_int(params, std::stoi(value)); } if (opt.handler_string) { - opt.handler_string(params, sparams, value); + opt.handler_string(params, value); continue; } } catch (std::exception & e) { @@ -399,7 +399,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } try { if (opt.handler_void) { - opt.handler_void(params, sparams); + opt.handler_void(params); continue; } @@ -407,11 +407,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val = argv[++i]; if (opt.handler_int) { - opt.handler_int(params, sparams, std::stoi(val)); + opt.handler_int(params, std::stoi(val)); continue; } if (opt.handler_string) { - opt.handler_string(params, sparams, val); + opt.handler_string(params, val); continue; } @@ -419,7 +419,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val2 = argv[++i]; if (opt.handler_str_str) { - opt.handler_str_str(params, sparams, val, val2); + opt.handler_str_str(params, val, val2); continue; } } catch (std::exception & e) { @@ -650,11 +650,10 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example std::vector options; params.print_usage = print_usage; params.curr_ex = ex; - llama_sampling_params & sparams = params.sparams; std::string sampler_type_chars; std::string sampler_type_names; - for (const auto sampler_type : sparams.samplers_sequence) { + for (const auto sampler_type : params.sparams.samplers_sequence) { sampler_type_chars += static_cast(sampler_type); sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } @@ -687,14 +686,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.usage = true; } )); 
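As a minimal, self-contained sketch of the pattern these hunks converge on (the struct definitions and option names below are simplified stand-ins, not the real common.h declarations): every handler becomes a capture-less lambda that takes only gpt_params by reference and reaches sampling settings through params.sparams, which is what allows llama_arg to store plain function pointers instead of std::function objects.

    #include <cstdio>
    #include <string>
    #include <vector>

    // simplified stand-ins for the real llama.cpp structs (illustration only)
    struct llama_sampling_params { float temp = 0.8f; };
    struct gpt_params { int n_ctx = 0; llama_sampling_params sparams; };

    // capture-less lambdas convert to these plain function pointers
    struct llama_arg_sketch {
        std::vector<std::string> args;
        std::string help;
        void (*handler_int)(gpt_params &, int) = nullptr;
        void (*handler_string)(gpt_params &, const std::string &) = nullptr;
    };

    int main() {
        std::vector<llama_arg_sketch> options;

        options.push_back({{"-c", "--ctx-size"}, "size of the prompt context",
            [](gpt_params & params, int value) { params.n_ctx = value; },
            nullptr});
        options.push_back({{"--temp"}, "sampling temperature",
            nullptr,
            [](gpt_params & params, const std::string & value) {
                // sampling state is reached through params.sparams; no separate sparams argument
                params.sparams.temp = std::stof(value);
            }});

        // dispatch the way gpt_params_parse_ex does: pass the target params explicitly
        gpt_params params;
        options[0].handler_int(params, 4096);
        options[1].handler_string(params, "0.7");
        printf("n_ctx=%d temp=%.1f\n", params.n_ctx, params.sparams.temp);
        return 0;
    }

The visible effect in the diff is that the option table no longer closes over one particular params/sparams pair; gpt_params_parse_ex hands the target gpt_params to each handler at dispatch time, and since get_value_from_env/has_value_from_env are now const, the same handlers can also be driven from environment variables registered via set_env (e.g. LLAMA_ARG_CTX_SIZE).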
add_opt(llama_arg( {"--version"}, "show version and build info", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); @@ -703,51 +702,51 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-v", "--verbose"}, "print verbose information", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.verbosity = 1; } )); add_opt(llama_arg( {"--verbosity"}, "N", format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.verbosity = value; } )); add_opt(llama_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.verbose_prompt = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--no-display-prompt"}, format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-co", "--color"}, format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-s", "--seed"}, "SEED", format("RNG seed (default: %d, use random seed for < 0)", params.seed), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. 
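            // note (descriptive, not in the original hunk): both copies of the seed are updated
            // below -- params.seed and params.sparams.seed -- so they stay in sync until the
            // sampling state is consolidated into llama_sampling_context as the TODO describes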
params.seed = std::stoul(value); - sparams.seed = std::stoul(value); + params.sparams.seed = std::stoul(value); } )); add_opt(llama_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { params.cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -757,7 +756,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -767,7 +766,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams.n_threads = value; if (params.draft_cpuparams.n_threads <= 0) { params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -777,7 +776,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams_batch.n_threads = value; if (params.draft_cpuparams_batch.n_threads <= 0) { params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -787,7 +786,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string mask = value; params.cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { @@ -798,7 +797,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. 
Complements --cpu-mask", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams.cpumask)) { @@ -809,21 +808,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict"}, "<0|1>", format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(llama_arg( {"--poll"}, "<0...100>", format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cpuparams.poll = std::stoul(value); } )); add_opt(llama_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string mask = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { @@ -834,7 +833,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { @@ -845,21 +844,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams_batch.strict_cpu = value; } )); add_opt(llama_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams_batch.poll = value; } )); add_opt(llama_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string mask = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { @@ -870,7 +869,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { @@ -881,21 +880,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { @@ -906,91 +905,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--draft"}, "N", format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-ps", "--p-split"}, "N", format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.lookup_cache_static = value; } )); add_opt(llama_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.lookup_cache_dynamic = value; } 
)); add_opt(llama_arg( {"-c", "--ctx-size"}, "N", format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( {"-n", "--predict", "--n-predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(llama_arg( {"-b", "--batch-size"}, "N", format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(llama_arg( {"-ub", "--ubatch-size"}, "N", format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(llama_arg( {"--keep"}, "N", format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_keep = value; } )); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_chunks = value; } )); add_opt(llama_arg( {"-fa", "--flash-attn"}, format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); @@ -999,14 +998,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ex == LLAMA_EXAMPLE_MAIN ? 
"prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.prompt = value; } )); add_opt(llama_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1022,7 +1021,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1033,7 +1032,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1049,56 +1048,56 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-e", "--escape"}, format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.escape = true; } )); add_opt(llama_arg( {"--no-escape"}, "do not process escape sequences", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.escape = false; } )); add_opt(llama_arg( {"-ptc", "--print-token-count"}, "N", format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-sp", "--special"}, format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1111,35 +1110,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "(default: %s)", params.conversation ? "true" : "false" ), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-i", "--interactive"}, format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-if", "--interactive-first"}, format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.input_prefix_bos = true; params.enable_chat_template = false; } @@ -1147,7 +1146,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.input_prefix = value; params.enable_chat_template = false; } @@ -1155,7 +1154,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.input_suffix = value; params.enable_chat_template = false; } @@ -1163,7 +1162,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1173,145 +1172,145 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--samplers"}, "SAMPLERS", format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); + params.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); } )); add_opt(llama_arg( {"--sampling-seq"}, "SEQUENCE", format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.samplers_sequence = llama_sampling_types_from_chars(value); + [](gpt_params & params, const std::string & value) { + params.sparams.samplers_sequence = llama_sampling_types_from_chars(value); } )); add_opt(llama_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.ignore_eos = true; } )); add_opt(llama_arg( {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { - sparams.penalize_nl = true; + format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + [](gpt_params & params) { + params.sparams.penalize_nl = true; } )); add_opt(llama_arg( {"--temp"}, "N", - format("temperature (default: %.1f)", (double)sparams.temp), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.temp = std::stof(value); - sparams.temp = std::max(sparams.temp, 0.0f); + format("temperature (default: %.1f)", (double)params.sparams.temp), + [](gpt_params & params, const std::string & value) { + params.sparams.temp = std::stof(value); + params.sparams.temp = std::max(params.sparams.temp, 0.0f); } )); add_opt(llama_arg( {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", sparams.top_k), - [](gpt_params & params, llama_sampling_params & sparams, int value) { - sparams.top_k = value; + format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](gpt_params & params, int value) { + params.sparams.top_k = value; } )); add_opt(llama_arg( {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.top_p = std::stof(value); + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](gpt_params & params, const std::string & value) { + params.sparams.top_p = std::stof(value); } )); add_opt(llama_arg( {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.min_p = std::stof(value); + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](gpt_params & params, const std::string & value) { + params.sparams.min_p 
= std::stof(value); } )); add_opt(llama_arg( {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.tfs_z = std::stof(value); + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](gpt_params & params, const std::string & value) { + params.sparams.tfs_z = std::stof(value); } )); add_opt(llama_arg( {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.typical_p = std::stof(value); + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typical_p), + [](gpt_params & params, const std::string & value) { + params.sparams.typical_p = std::stof(value); } )); add_opt(llama_arg( {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n), - [](gpt_params & params, llama_sampling_params & sparams, int value) { - sparams.penalty_last_n = value; - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](gpt_params & params, int value) { + params.sparams.penalty_last_n = value; + params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); } )); add_opt(llama_arg( {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.penalty_repeat = std::stof(value); + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_repeat = std::stof(value); } )); add_opt(llama_arg( {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.penalty_present = std::stof(value); + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_present = std::stof(value); } )); add_opt(llama_arg( {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.penalty_freq = std::stof(value); + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_freq = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.dynatemp_range = std::stof(value); + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", 
(double)params.sparams.dynatemp_range), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_range = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.dynatemp_exponent = std::stof(value); + format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_exponent = std::stof(value); } )); add_opt(llama_arg( {"--mirostat"}, "N", format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat), - [](gpt_params & params, llama_sampling_params & sparams, int value) { - sparams.mirostat = value; + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + [](gpt_params & params, int value) { + params.sparams.mirostat = value; } )); add_opt(llama_arg( {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.mirostat_eta = std::stof(value); + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_eta = std::stof(value); } )); add_opt(llama_arg( {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.mirostat_tau = std::stof(value); + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_tau = std::stof(value); } )); add_opt(llama_arg( @@ -1319,14 +1318,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; std::string value_str; try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + params.sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? 
-1.0f : 1.0f); } else { throw std::invalid_argument("invalid input format"); } @@ -1337,43 +1336,43 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example )); add_opt(llama_arg( {"--cfg-negative-prompt"}, "PROMPT", - format("negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.cfg_negative_prompt = value; + format("negative prompt to use for guidance (default: '%s')", params.sparams.cfg_negative_prompt.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.cfg_negative_prompt = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--cfg-negative-prompt-file"}, "FNAME", "negative prompt file to use for guidance", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.sparams.cfg_negative_prompt)); + if (!params.sparams.cfg_negative_prompt.empty() && params.sparams.cfg_negative_prompt.back() == '\n') { + params.sparams.cfg_negative_prompt.pop_back(); } } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--cfg-scale"}, "N", - format("strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.cfg_scale = std::stof(value); + format("strength of guidance (default: %.1f, 1.0 = disable)", (double)params.sparams.cfg_scale), + [](gpt_params & params, const std::string & value) { + params.sparams.cfg_scale = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.grammar = value; + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = value; } )); add_opt(llama_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1381,21 +1380,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) + std::back_inserter(params.sparams.grammar) ); } )); add_opt(llama_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.grammar = json_schema_to_grammar(json::parse(value)); + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = json_schema_to_grammar(json::parse(value)); } )); add_opt(llama_arg( {"--pooling"}, "{none,mean,cls,last}", "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } @@ -1406,7 +1405,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--attention"}, "{causal,non,causal}", "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } @@ -1415,7 +1414,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } @@ -1425,91 +1424,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rope_freq_base = std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rope_freq_scale = std::stof(value); } )); add_opt(llama_arg( {"--yarn-orig-ctx"}, "N", format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.yarn_orig_ctx = value; } )); add_opt(llama_arg( {"--yarn-ext-factor"}, "N", format("YaRN: extrapolation mix factor 
(default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } )); add_opt(llama_arg( {"--yarn-attn-factor"}, "N", format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-slow"}, "N", format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-fast"}, "N", format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } )); add_opt(llama_arg( {"-gan", "--grp-attn-n"}, "N", format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.grp_attn_n = value; } )); add_opt(llama_arg( {"-gaw", "--grp-attn-w"}, "N", format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.grp_attn_w = value; } )); add_opt(llama_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.dump_kv_cache = true; } )); add_opt(llama_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.no_kv_offload = true; } )); add_opt(llama_arg( {"-ctk", "--cache-type-k"}, "TYPE", format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; } @@ -1517,7 +1516,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ctv", "--cache-type-v"}, "TYPE", format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; } @@ -1525,119 +1524,119 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--all-logits"}, format("return logits for all tokens in the batch (default: %s)", params.logits_all ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag-tasks"}, "N", format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande-tasks"}, "N", format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice-tasks"}, "N", format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-stride"}, "N", format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-output-type"}, "<0|1>", format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"-dt", "--defrag-thold"}, "N", format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(llama_arg( {"-np", "--parallel"}, "N", format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, llama_sampling_params & 
sparams, int value) { + [](gpt_params & params, int value) { params.n_parallel = value; } )); add_opt(llama_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_sequences = value; } )); add_opt(llama_arg( {"-cb", "--cont-batching"}, format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.cont_batching = true; } ).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(llama_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.cont_batching = false; } ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(llama_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(llama_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); @@ -1645,7 +1644,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rpc_servers = value; } )); @@ -1653,14 +1652,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.use_mlock = true; } )); add_opt(llama_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.use_mmap = false; } )); @@ -1672,7 +1671,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } @@ -1682,7 +1681,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngl", "--gpu-layers"}, "N", "number of layers to store in VRAM", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_gpu_layers = value; if 
(!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -1693,7 +1692,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngld", "--gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_gpu_layers_draft = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1707,7 +1706,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1732,7 +1731,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1759,7 +1758,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-mg", "--main-gpu"}, "INDEX", format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.main_gpu = value; #ifndef GGML_USE_CUDA_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); @@ -1769,7 +1768,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--check-tensors"}, format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.check_tensors = true; } )); @@ -1777,7 +1776,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); } @@ -1786,21 +1785,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { + [](gpt_params & params, const std::string & fname, const std::string & scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.control_vectors.push_back({ 1.0f, value, }); } )); @@ -1808,14 +1807,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { + [](gpt_params & params, const std::string & fname, const std::string & scale) { params.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(llama_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & start, const std::string & end) { + [](gpt_params & params, const std::string & start, const std::string & end) { params.control_vector_layer_start = std::stoi(start); params.control_vector_layer_end = std::stoi(end); } @@ -1823,7 +1822,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL")); @@ -1835,49 +1834,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, 
LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(llama_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(llama_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(llama_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hf_token = value; } ).set_env("HF_TOKEN")); add_opt(llama_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1888,28 +1887,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--chunk-size"}, "N", format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--chunk-separator"}, "STRING", format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--junk"}, "N", format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(llama_arg( {"--pos"}, "N", format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); @@ -1921,7 +1920,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? 
params.cvector_outfile.c_str() : params.out_file.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.out_file = value; params.cvector_outfile = value; params.lora_outfile = value; @@ -1930,49 +1929,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ofreq", "--output-frequency"}, "N", format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--save-frequency"}, "N", format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--process-output"}, format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--no-ppl"}, format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--chunk"}, "N", format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"-pps"}, format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(llama_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } @@ -1980,7 +1979,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } @@ -1988,7 +1987,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } @@ -1996,63 +1995,63 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--embd-normalize"}, "N", format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-separator"}, "STRING", "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--host"}, "HOST", format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(llama_arg( {"--port"}, "PORT", format("port to listen (default: %d)", params.port), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(llama_arg( {"--path"}, "PATH", format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--embedding", "--embeddings"}, format("restrict to only support 
embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(llama_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(llama_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2069,21 +2068,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--timeout"}, "N", format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.timeout_read = value; params.timeout_write = value; } @@ -2091,14 +2090,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--threads-http"}, "N", format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(llama_arg( {"-spf", "--system-prompt-file"}, "FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2115,7 +2114,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-format"}, "{text, json}", "log output format: json or text (default: json)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { if (value == "json") { params.log_json = true; } else if (value == "text") { @@ -2128,21 +2127,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", 
params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(llama_arg( {"--no-slots"}, format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(llama_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -2155,7 +2154,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { if (!llama_chat_verify_template(value)) { throw std::runtime_error(format( "error: the supplied chat template is not supported: %s\n" @@ -2169,28 +2168,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--lora-init-without-apply"}, format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.logdir = value; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -2201,35 +2200,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--positive-file"}, "FNAME", format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--negative-file"}, "FNAME", format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-batch"}, "N", format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-iter"}, "N", format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } @@ -2238,7 +2237,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } @@ -2249,32 +2248,32 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-test"}, "Log test", - [](gpt_params & params, 
llama_sampling_params & sparams) { log_param_single_parse("--log-test"); } + [](gpt_params & params) { log_param_single_parse("--log-test"); } )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-disable"); } + [](gpt_params & params) { log_param_single_parse("--log-disable"); } )); add_opt(llama_arg( {"--log-enable"}, "Log enable", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-enable"); } + [](gpt_params & params) { log_param_single_parse("--log-enable"); } )); add_opt(llama_arg( {"--log-new"}, "Log new", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-new"); } + [](gpt_params & params) { log_param_single_parse("--log-new"); } )); add_opt(llama_arg( {"--log-append"}, "Log append", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-append"); } + [](gpt_params & params) { log_param_single_parse("--log-append"); } )); add_opt(llama_arg( {"--log-file"}, "FNAME", "Log file", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + [](gpt_params & params, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } )); #endif // LOG_DISABLE_LOGS diff --git a/common/common.h b/common/common.h index c4893d17481ca..1f709271d4038 100644 --- a/common/common.h +++ b/common/common.h @@ -310,29 +310,29 @@ struct llama_arg { std::string value_hint_2; // for second arg value std::string env; std::string help; - void (*handler_void) (gpt_params & params, llama_sampling_params & sparams) = nullptr; - void (*handler_string) (gpt_params & params, llama_sampling_params & sparams, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, llama_sampling_params & sparams, int) = nullptr; + void (*handler_void) (gpt_params & params) = nullptr; + void (*handler_string) (gpt_params & params, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, int) = nullptr; llama_arg( const std::initializer_list & args, const std::string & value_hint, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &) + void (*handler)(gpt_params & params, const std::string &) ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} llama_arg( const std::initializer_list & args, const std::string & value_hint, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams, int) + void (*handler)(gpt_params & params, int) ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} llama_arg( const std::initializer_list & args, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams) + void (*handler)(gpt_params & params) ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg @@ -341,7 +341,7 @@ struct llama_arg { const std::string & value_hint, const std::string & value_hint_2, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) + void (*handler)(gpt_params & params, 
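// ---------------------------------------------------------------------------
// Usage sketch (not part of any hunk in this series): how an example binary is
// expected to drive the refactored parser. This assumes the entry points shown
// in the hunks above keep their current shape, i.e. gpt_params_parser_init()
// building the std::vector<llama_arg> option table for one llama_example, and
// gpt_params_parse_ex() applying LLAMA_ARG_* environment variables first and
// then letting command-line arguments override them. Exact names and
// signatures may still change in later revisions of the patch.
// ---------------------------------------------------------------------------
#include "common.h"

#include <cstdio>
#include <exception>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;

    // build the option table for this tool; options tagged for other examples
    // (e.g. LLAMA_EXAMPLE_SERVER) are expected to stay out of its usage text
    std::vector<llama_arg> options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN);

    try {
        // env vars registered via set_env() are read first, then argv is
        // parsed and overrides them (with the warning emitted by the parser)
        if (!gpt_params_parse_ex(argc, argv, params, options)) {
            return 1;
        }
    } catch (const std::exception & e) {
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }

    fprintf(stderr, "model: %s\n", params.model.c_str());
    return 0;
}
// ---------------------------------------------------------------------------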
const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} llama_arg & set_examples(std::set examples) { From e625f5fd1ec435e8f1262d6a1a753cdd44fb7213 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 18:41:42 +0200 Subject: [PATCH 19/20] optimize more --- common/common.cpp | 32 ++++++++++++++----------- common/common.h | 36 ++++++++++++++-------------- examples/export-docs/export-docs.cpp | 7 +++--- 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 804af1d943e0f..0bf01ce2a2b66 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -373,7 +373,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } } catch (std::exception & e) { throw std::invalid_argument(format( - "error while handling environment variable \"%s\": %s\n\n", opt.env.c_str(), e.what())); + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); } } } @@ -395,7 +395,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } auto opt = *arg_to_options[arg]; if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env.c_str(), arg.c_str()); + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); } try { if (opt.handler_void) { @@ -595,15 +595,19 @@ std::string llama_arg::to_string() { std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - for (const auto & arg : args) { + for (const auto arg : args) { if (arg == args.front()) { - ss << (args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); + if (args.size() == 1) { + ss << arg; + } else { + ss << format("%-7s", arg) << ", "; + } } else { ss << arg << (arg != args.back() ? 
", " : ""); } } - if (!value_hint.empty()) ss << " " << value_hint; - if (!value_hint_2.empty()) ss << " " << value_hint_2; + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; if (ss.tellp() > n_leading_spaces - 3) { // current line is too long, add new line ss << "\n" << leading_spaces; @@ -675,7 +679,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example if (seen_args.find(a) == seen_args.end()) { seen_args.insert(a); } else { - throw std::runtime_error(format("found duplicated argument in source code: %s", a.c_str())); + throw std::runtime_error(format("found duplicated argument in source code: %s", a)); } } options.push_back(std::move(arg)); @@ -693,7 +697,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--version"}, "show version and build info", - [](gpt_params & params) { + [](gpt_params &) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); @@ -2248,32 +2252,32 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-test"}, "Log test", - [](gpt_params & params) { log_param_single_parse("--log-test"); } + [](gpt_params &) { log_param_single_parse("--log-test"); } )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - [](gpt_params & params) { log_param_single_parse("--log-disable"); } + [](gpt_params &) { log_param_single_parse("--log-disable"); } )); add_opt(llama_arg( {"--log-enable"}, "Log enable", - [](gpt_params & params) { log_param_single_parse("--log-enable"); } + [](gpt_params &) { log_param_single_parse("--log-enable"); } )); add_opt(llama_arg( {"--log-new"}, "Log new", - [](gpt_params & params) { log_param_single_parse("--log-new"); } + [](gpt_params &) { log_param_single_parse("--log-new"); } )); add_opt(llama_arg( {"--log-append"}, "Log append", - [](gpt_params & params) { log_param_single_parse("--log-append"); } + [](gpt_params &) { log_param_single_parse("--log-append"); } )); add_opt(llama_arg( {"--log-file"}, "FNAME", "Log file", - [](gpt_params & params, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } )); #endif // LOG_DISABLE_LOGS diff --git a/common/common.h b/common/common.h index 1f709271d4038..b79149da04899 100644 --- a/common/common.h +++ b/common/common.h @@ -305,10 +305,10 @@ struct gpt_params { struct llama_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; - std::vector args; - std::string value_hint; // help text or example for arg value - std::string value_hint_2; // for second arg value - std::string env; + std::vector args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; std::string help; void (*handler_void) (gpt_params & params) = nullptr; void (*handler_string) (gpt_params & params, const std::string &) = nullptr; @@ -316,42 +316,42 @@ struct llama_arg { void (*handler_int) (gpt_params & params, int) = nullptr; llama_arg( - const std::initializer_list & args, - const std::string & value_hint, + const std::initializer_list & args, + const char * value_hint, const std::string & help, void (*handler)(gpt_params & params, const std::string &) ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} llama_arg( - const 
std::initializer_list & args, - const std::string & value_hint, + const std::initializer_list & args, + const char * value_hint, const std::string & help, void (*handler)(gpt_params & params, int) ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} llama_arg( - const std::initializer_list & args, + const std::initializer_list & args, const std::string & help, void (*handler)(gpt_params & params) ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg llama_arg( - const std::initializer_list & args, - const std::string & value_hint, - const std::string & value_hint_2, + const std::initializer_list & args, + const char * value_hint, + const char * value_hint_2, const std::string & help, void (*handler)(gpt_params & params, const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - llama_arg & set_examples(std::set examples) { + llama_arg & set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; } - llama_arg & set_env(std::string env) { + llama_arg & set_env(const char * env) { help = help + "\n(env: " + env + ")"; - this->env = std::move(env); + this->env = env; return *this; } @@ -360,8 +360,8 @@ struct llama_arg { } bool get_value_from_env(std::string & output) const { - if (env.empty()) return false; - char * value = std::getenv(env.c_str()); + if (env == nullptr) return false; + char * value = std::getenv(env); if (value) { output = value; return true; @@ -370,7 +370,7 @@ struct llama_arg { } bool has_value_from_env() const { - return std::getenv(env.c_str()); + return env != nullptr && std::getenv(env); } std::string to_string(); diff --git a/examples/export-docs/export-docs.cpp b/examples/export-docs/export-docs.cpp index 86c041a811d12..a09036dcf346d 100644 --- a/examples/export-docs/export-docs.cpp +++ b/examples/export-docs/export-docs.cpp @@ -22,18 +22,19 @@ static void export_md(std::string fname, llama_example ex) { // args for (const auto & arg : opt.args) { if (arg == opt.args.front()) { - file << (opt.args.size() == 1 ? arg : (arg + ", ")); + file << arg; + if (opt.args.size() > 1) file << ", "; } else { file << arg << (arg != opt.args.back() ? 
", " : ""); } } // value hint - if (!opt.value_hint.empty()) { + if (opt.value_hint) { std::string md_value_hint(opt.value_hint); string_replace_all(md_value_hint, "|", "\\|"); file << " " << md_value_hint; } - if (!opt.value_hint_2.empty()) { + if (opt.value_hint_2) { std::string md_value_hint_2(opt.value_hint_2); string_replace_all(md_value_hint_2, "|", "\\|"); file << " " << md_value_hint_2; From 4b96c69a08a0e0e6f09883f7d8bad1375fbcaf86 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 19:20:56 +0200 Subject: [PATCH 20/20] export-docs --> gen-docs --- Makefile | 6 +++--- common/common.cpp | 2 +- common/common.h | 2 -- examples/{export-docs => gen-docs}/CMakeLists.txt | 4 ++-- .../{export-docs/export-docs.cpp => gen-docs/gen-docs.cpp} | 4 ---- 5 files changed, 6 insertions(+), 12 deletions(-) rename examples/{export-docs => gen-docs}/CMakeLists.txt (70%) rename examples/{export-docs/export-docs.cpp => gen-docs/gen-docs.cpp} (95%) diff --git a/Makefile b/Makefile index 8b8605d5558d5..6053bc17b60db 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ BUILD_TARGETS = \ llama-tokenize \ llama-vdot \ llama-cvector-generator \ - llama-export-docs \ + llama-gen-docs \ tests/test-c.o # Binaries only useful for tests @@ -1444,11 +1444,11 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -llama-export-docs: examples/export-docs/export-docs.cpp \ +llama-gen-docs: examples/gen-docs/gen-docs.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - ./llama-export-docs + ./llama-gen-docs libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ diff --git a/common/common.cpp b/common/common.cpp index 3b70fd53b50f0..e92dee7a7f6ec 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -298,7 +298,7 @@ static std::string format(const char * fmt, ...) 
{ return std::string(buf.data(), size); } -void gpt_params_handle_model_default(gpt_params & params) { +static void gpt_params_handle_model_default(gpt_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (params.hf_file.empty()) { diff --git a/common/common.h b/common/common.h index bdb16f412fe6b..d7c08f20a124b 100644 --- a/common/common.h +++ b/common/common.h @@ -386,8 +386,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto // print full usage message; it will be called internally by gpt_params_parse() if "-h" is set void gpt_params_print_usage(gpt_params & params, std::vector & options); -void gpt_params_handle_model_default(gpt_params & params); - std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); diff --git a/examples/export-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt similarity index 70% rename from examples/export-docs/CMakeLists.txt rename to examples/gen-docs/CMakeLists.txt index 0e953167ed653..c94cda7764341 100644 --- a/examples/export-docs/CMakeLists.txt +++ b/examples/gen-docs/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-export-docs) -add_executable(${TARGET} export-docs.cpp) +set(TARGET llama-gen-docs) +add_executable(${TARGET} gen-docs.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-docs/export-docs.cpp b/examples/gen-docs/gen-docs.cpp similarity index 95% rename from examples/export-docs/export-docs.cpp rename to examples/gen-docs/gen-docs.cpp index a09036dcf346d..8b1dafd63a5e8 100644 --- a/examples/export-docs/export-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -1,11 +1,7 @@ #include "common.h" -#include "llama.h" -#include -#include #include #include -#include // Export usage message (-h) to markdown format
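
A note on the shape of the API these last two patches settle on: llama_arg now stores option names and value hints as const char *, uses plain function pointers that receive gpt_params & instead of std::function, takes std::initializer_list in its constructors and in set_examples(), and can resolve a value from an optional environment variable before command-line parsing. The sketch below is a minimal, self-contained illustration of that pattern under those assumptions; my_arg, my_params, and MY_MODEL are invented stand-ins for llama_arg, gpt_params, and a real environment variable, and the handling loop is deliberately simplified, so this is not the code these patches add to common.h / common.cpp.

// Illustrative sketch only: a simplified stand-in for the llama_arg / gpt_params
// design that PATCH 19/20 converges on. Names (my_arg, my_params, MY_MODEL) are
// invented for this example.
#include <cstdio>
#include <cstdlib>
#include <initializer_list>
#include <set>
#include <string>
#include <vector>

struct my_params {                      // stand-in for gpt_params
    std::string model = "model.gguf";
};

enum my_example { EXAMPLE_COMMON, EXAMPLE_MAIN, EXAMPLE_SERVER };

struct my_arg {
    std::set<my_example> examples = {EXAMPLE_COMMON};
    std::vector<const char *> args;     // option names as plain C strings
    const char * value_hint = nullptr;  // e.g. "FNAME"; nullptr means the flag takes no value
    const char * env        = nullptr;  // optional environment-variable fallback
    std::string  help;

    // plain function pointers instead of std::function: cheap to store and copy
    void (*handler_void)  (my_params &)                      = nullptr;
    void (*handler_string)(my_params &, const std::string &) = nullptr;

    my_arg(std::initializer_list<const char *> args,
           const char * value_hint,
           const std::string & help,
           void (*handler)(my_params &, const std::string &))
        : args(args), value_hint(value_hint), help(help), handler_string(handler) {}

    my_arg(std::initializer_list<const char *> args,
           const std::string & help,
           void (*handler)(my_params &))
        : args(args), help(help), handler_void(handler) {}

    // chainable setters in the style of set_examples() / set_env()
    my_arg & set_examples(std::initializer_list<my_example> ex) {
        examples = ex;
        return *this;
    }
    my_arg & set_env(const char * e) {
        help += std::string("\n(env: ") + e + ")";
        env = e;
        return *this;
    }

    bool get_value_from_env(std::string & output) const {
        if (env == nullptr) return false;
        if (const char * value = std::getenv(env)) { output = value; return true; }
        return false;
    }
};

int main() {
    my_params params;
    std::vector<my_arg> options;

    options.push_back(my_arg(
        {"-m", "--model"}, "FNAME",
        "model path (default: " + params.model + ")",
        [](my_params & p, const std::string & value) { p.model = value; }
    ).set_examples({EXAMPLE_MAIN, EXAMPLE_SERVER}).set_env("MY_MODEL"));

    // apply environment-variable fallbacks, then print a crude usage listing
    for (auto & opt : options) {
        std::string value;
        if (opt.handler_string && opt.get_value_from_env(value)) {
            opt.handler_string(params, value);
        }
        std::printf("%s %s\n    %s\n",
                    opt.args.front(),
                    opt.value_hint ? opt.value_hint : "",
                    opt.help.c_str());
    }
    std::printf("model = %s\n", params.model.c_str());
    return 0;
}

One consequence of dropping std::function, visible in the diffs above, is that handlers cannot capture state, which is why every handler receives gpt_params & explicitly; the upside is that each option descriptor is cheaper to construct and copy than one holding several std::function objects while the option table is built.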