From 6a3a2fcc5b50431dc133eee4a86d512506d0550e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 4 Sep 2024 13:37:09 +0200
Subject: [PATCH 01/20] (wip) argparser v3

---
 common/common.cpp      | 73 ++++++++++++++++++++++++++++++++++++++++++
 common/common.h        | 63 +++++++++++++++++++++++++++++++++++-
 examples/main/main.cpp |  3 ++
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 9fa18472512ab..34d0eff78f312 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1714,6 +1714,79 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif
 
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+static std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+void gpt_params_print_usage(std::vector<llama_arg> & options) {
+    constexpr static int n_leading_spaces = 40;
+    std::string leading_spaces(n_leading_spaces, ' ');
+    for (const auto & opt : options) {
+        std::ostringstream ss;
+        for (const auto & arg : opt.args) {
+            if (&arg == &opt.args.front()) {
+                ss << format("%-7s", (arg + ",").c_str());
+            } else {
+                ss << arg << (&arg != &opt.args.back() ? ", " : "");
+            }
+        }
+        if (!opt.value_ex.empty()) ss << " " << opt.value_ex;
+        if (ss.tellp() > n_leading_spaces - 3) {
+            // current line is too long, add new line
+            ss << "\n" << leading_spaces;
+        } else {
+            // padding between arg and help, same line
+            ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
+        }
+        const auto help_lines = llama_arg::break_str_into_lines(opt.help, 50);
+        for (const auto & line : help_lines) {
+            ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
"" : leading_spaces) << line << "\n"; + } + printf("%s", ss.str().c_str()); + } +} + +std::vector gpt_params_parser_register(gpt_params & params) { + std::vector options; + options.push_back(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [¶ms, &options]() { + gpt_params_print_usage(options); + exit(0); + return true; + } + )); + options.push_back(llama_arg( + {"-m", "--model"}, + format("model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)", params.model.c_str()), + [¶ms](std::string value) { + params.model = value; + return true; + } + ).set_value_ex("FNAME")); + return options; +} + +bool gpt_params_parser_run(int argc, char ** argv, std::vector & options) { + for (const auto & opt : options) { + if (opt.handler_void) opt.handler_void(); + } + return true; +} + void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; diff --git a/common/common.h b/common/common.h index cb5e7f6df10c5..04f4476f039de 100644 --- a/common/common.h +++ b/common/common.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -123,7 +124,7 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = ""; // model path + std::string model = "model.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string model_url = ""; // model url to download @@ -277,6 +278,66 @@ struct gpt_params { std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; +enum llama_example { + LLAMA_EXAMPLE_ALL, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_MAIN, +}; + +struct llama_arg { + std::set examples = {LLAMA_EXAMPLE_ALL}; + std::vector args; + std::string value_ex; + std::string env; + std::string help; + std::function handler_void = nullptr; + std::function handler_string = nullptr; + std::function handler_bool = nullptr; + std::function handler_int = nullptr; + std::function handler_float = nullptr; + + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_string(handler) {} + + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_bool(handler) {} + + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} + + llama_arg & set_examples(std::set _examples) { + examples = std::move(_examples); + return *this; + } + + llama_arg & set_value_ex(std::string _value_ex) { + value_ex = std::move(_value_ex); + return *this; + } + + llama_arg & set_env(std::string _env) { + env = _env; + return *this; + } + + // utility function + static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string word, line; + while (iss >> word) { + if (line.length() + !line.empty() + word.length() > max_char_per_line) { + if (!line.empty()) result.push_back(line); + line = word; + } else { + line += (!line.empty() ? 
" " : "") + word; + } + } + if (!line.empty()) result.push_back(line); + return result; + } +}; + +std::vector gpt_params_parser_register(gpt_params & params); +bool gpt_params_parser_run(int argc, char ** argv, std::vector & options); + void gpt_params_parse_from_env(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c55efbb66d7c1..6a025ed512217 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -131,6 +131,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector Date: Thu, 5 Sep 2024 15:55:44 +0200 Subject: [PATCH 02/20] migrated --- common/common.cpp | 3326 +++++++++-------- common/common.h | 81 +- examples/batched-bench/batched-bench.cpp | 8 +- examples/batched/batched.cpp | 8 +- .../cvector-generator/cvector-generator.cpp | 8 +- examples/embedding/embedding.cpp | 4 +- examples/eval-callback/eval-callback.cpp | 4 +- examples/export-lora/export-lora.cpp | 8 +- examples/gritlm/gritlm.cpp | 4 +- examples/imatrix/imatrix.cpp | 8 +- examples/infill/infill.cpp | 4 +- examples/llava/llava-cli.cpp | 10 +- examples/llava/minicpmv-cli.cpp | 5 +- examples/lookahead/lookahead.cpp | 4 +- examples/lookup/lookup-create.cpp | 4 +- examples/lookup/lookup-stats.cpp | 4 +- examples/lookup/lookup.cpp | 4 +- examples/main/main.cpp | 7 +- examples/parallel/parallel.cpp | 4 +- examples/passkey/passkey.cpp | 8 +- examples/perplexity/perplexity.cpp | 4 +- examples/retrieval/retrieval.cpp | 8 +- examples/save-load-state/save-load-state.cpp | 4 +- examples/server/server.cpp | 7 +- examples/simple/simple.cpp | 8 +- examples/speculative/speculative.cpp | 4 +- 26 files changed, 1803 insertions(+), 1745 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 34d0eff78f312..09e3a992c6a06 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -306,6 +306,32 @@ bool set_process_priority(enum ggml_sched_priority prio) { // CLI argument parsing // +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) 
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
@@ -352,22 +378,60 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
     }
 }
 
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
-    bool invalid_param = false;
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
     std::string arg;
     const std::string arg_prefix = "--";
     llama_sampling_params & sparams = params.sparams;
 
+    std::unordered_map<std::string, const llama_arg *> arg_to_options;
+    for (const auto & opt : options) {
+        for (const auto & arg : opt.args) {
+            arg_to_options[arg] = &opt;
+        }
+    }
+
+    auto check_arg = [&](int i) {
+        if (i+1 >= argc) {
+            throw std::invalid_argument("expected value for argument");
+        }
+    };
+
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
-        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
-            throw std::invalid_argument("error: unknown argument: " + arg);
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
        }
-        if (invalid_param) {
-            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        try {
+            auto opt = *arg_to_options[arg];
+            if (opt.handler_void) {
+                opt.handler_void();
+                continue;
+            }
+
+            // arg with single value
+            check_arg(i);
+            std::string val = argv[++i];
+            if (opt.handler_int) {
+                opt.handler_int(std::stoi(val));
+                continue;
+            }
+            if (opt.handler_string) {
+                opt.handler_string(val);
+                continue;
+            }
+
+            // arg with 2 values
+            check_arg(i);
+            std::string val2 = argv[++i];
+            if (opt.handler_str_str) {
+                opt.handler_str_str(val, val2);
+                continue;
+            }
+        } catch (std::exception & e) {
+            throw std::invalid_argument(format("error: %s", e.what()));
        }
     }
 
@@ -404,41 +468,21 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
-void gpt_params_parse_from_env(gpt_params & params) {
-    // we only care about server-related params for now
-    get_env("LLAMA_ARG_MODEL", params.model);
-    get_env("LLAMA_ARG_MODEL_URL", params.model_url);
-    get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
-    get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
-    get_env("LLAMA_ARG_HF_FILE", params.hf_file);
-    get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
-    get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
-    get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
-    get_env("LLAMA_ARG_BATCH", params.n_batch);
-    get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
-    get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
-    get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
-    get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
-    get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
-    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
-    get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
-    get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
-    get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
-    get_env("LLAMA_ARG_DEFRAG_THOLD",
params.defrag_thold); - get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); - get_env("LLAMA_ARG_HOST", params.hostname); - get_env("LLAMA_ARG_PORT", params.port); -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { const auto params_org = params; // the example can modify the default params try { - if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { + if (!gpt_params_parse_ex(argc, argv, params, options)) { params = params_org; - params.usage = true; return false; } + if (params.usage) { + gpt_params_print_usage(options); + if (params.print_usage) { + params.print_usage(argc, argv); + } + exit(0); + } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); params = params_org; @@ -521,1646 +565,1690 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); + } else { + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? " " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); + } + }; + while (std::getline(iss, line)) { + add_line(line); + } + return result; +} + +void gpt_params_print_usage(std::vector & options) { + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + + auto print_options = [](std::vector & options) { + std::string leading_spaces(n_leading_spaces, ' '); + for (const auto & opt : options) { + std::ostringstream ss; + for (const auto & arg : opt->args) { + if (&arg == &opt->args.front()) { + ss << (opt->args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); + } else { + ss << arg << (&arg != &opt->args.back() ? ", " : ""); + } + } + if (!opt->value_hint.empty()) ss << " " << opt->value_hint; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(opt->help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? 
"" : leading_spaces) << line << "\n"; + } + printf("%s", ss.str().c_str()); + } + }; -bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { - const char split_delim = ','; + std::vector common_options; + std::vector specific_options; + for (auto & opt : options) { + if (opt.in_example(LLAMA_EXAMPLE_COMMON)) { + common_options.push_back(&opt); + } else { + specific_options.push_back(&opt); + } + } + printf("----- common options -----\n\n"); + print_options(common_options); + printf("\n\n----- example-specific options -----\n\n"); + print_options(specific_options); +} +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex) { + return gpt_params_parser_init(params, ex, nullptr); +} + +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { + std::vector options; + params.print_usage = print_usage; llama_sampling_params & sparams = params.sparams; - if (arg == "-s" || arg == "--seed") { - CHECK_ARG - // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. - params.seed = std::stoul(argv[i]); - sparams.seed = std::stoul(argv[i]); - return true; + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } - if (arg == "-t" || arg == "--threads") { - CHECK_ARG - params.cpuparams.n_threads = std::stoi(argv[i]); - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); + sampler_type_names.pop_back(); + const char split_delim = ','; + + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + std::unordered_set seen_args; + auto add_opt = [&](llama_arg arg) { + if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + // make sure there is no argument duplications + for (const auto & a : arg.args) { + if (seen_args.find(a) == seen_args.end()) { + seen_args.insert(a); + } else { + throw std::runtime_error(format("found duplicated argument in source code: %s", a.c_str())); + } + } + options.push_back(std::move(arg)); } - return true; - } - if (arg == "-C" || arg == "--cpu-mask") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); - return true; - } - if (arg == "-Cr" || arg == "--cpu-range") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); - return true; - } - if (arg == "--prio") { - CHECK_ARG - params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict") { - CHECK_ARG - params.cpuparams.strict_cpu = std::stoul(argv[i]); - return true; - } - if (arg == "--poll") { - CHECK_ARG - params.cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tb" || arg == "--threads-batch") { - CHECK_ARG - params.cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.cpuparams_batch.n_threads <= 0) { - 
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + }; + + + add_opt(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [¶ms]() { + params.usage = true; } - return true; - } - if (arg == "-Cb" || arg == "--cpu-mask-batch") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "-Crb" || arg == "--cpu-range_batch") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch") { - CHECK_ARG - params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch") { - params.cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch") { - CHECK_ARG - params.cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-td" || arg == "--threads-draft") { - CHECK_ARG - params.draft_cpuparams.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + )); + add_opt(llama_arg( + {"--version"}, + "show version and build info", + []() { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); } - return true; - } - if (arg == "-Cd" || arg == "--cpu-mask-draft") { - CHECK_ARG - std::string mask = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "-Crd" || arg == "--cpu-range-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "--prio-draft") { - CHECK_ARG - params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-draft") { - params.draft_cpuparams.strict_cpu = true; - return true; - } - if (arg == "--poll-draft") { - CHECK_ARG - params.draft_cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tbd" || arg == "--threads-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + )); + add_opt(llama_arg( + {"-v", "--verbose"}, + "print verbose information", + [¶ms]() { + params.verbosity = 1; } - return true; - } - if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch-draft") { - params.draft_cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-p" || arg == "--prompt") { - CHECK_ARG - params.prompt = argv[i]; - return true; - } - if (arg == "-e" || arg 
== "--escape") { - params.escape = true; - return true; - } - if (arg == "--no-escape") { - params.escape = false; - return true; - } - if (arg == "--prompt-cache") { - CHECK_ARG - params.path_prompt_cache = argv[i]; - return true; - } - if (arg == "--prompt-cache-all") { - params.prompt_cache_all = true; - return true; - } - if (arg == "--prompt-cache-ro") { - params.prompt_cache_ro = true; - return true; - } - if (arg == "-bf" || arg == "--binary-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - return true; - } - if (arg == "-f" || arg == "--file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--verbosity"}, "N", + format("set specific verbosity level (default: %d)", params.verbosity), + [¶ms](int value) { + params.verbosity = value; } - // store the external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); + )); + add_opt(llama_arg( + {"--verbose-prompt"}, + format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [¶ms]() { + params.verbose_prompt = true; } - return true; - } - if (arg == "--in-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--no-display-prompt"}, + format("don't print prompt at generation (default: %s)", !params.display_prompt ? 
"true" : "false"), + [¶ms]() { + params.display_prompt = false; } - params.in_files.push_back(argv[i]); - return true; - } - if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { - CHECK_ARG - params.n_predict = std::stoi(argv[i]); - return true; - } - if (arg == "--top-k") { - CHECK_ARG - sparams.top_k = std::stoi(argv[i]); - return true; - } - if (arg == "-c" || arg == "--ctx-size") { - CHECK_ARG - params.n_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-n" || arg == "-gan") { - CHECK_ARG - params.grp_attn_n = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-w" || arg == "-gaw") { - CHECK_ARG - params.grp_attn_w = std::stoi(argv[i]); - return true; - } - if (arg == "--rope-freq-base") { - CHECK_ARG - params.rope_freq_base = std::stof(argv[i]); - return true; - } - if (arg == "--rope-freq-scale") { - CHECK_ARG - params.rope_freq_scale = std::stof(argv[i]); - return true; - } - if (arg == "--rope-scaling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; } - return true; - } - if (arg == "--rope-scale") { - CHECK_ARG - params.rope_freq_scale = 1.0f / std::stof(argv[i]); - return true; - } - if (arg == "--yarn-orig-ctx") { - CHECK_ARG - params.yarn_orig_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--yarn-ext-factor") { - CHECK_ARG - params.yarn_ext_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-attn-factor") { - CHECK_ARG - params.yarn_attn_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-fast") { - CHECK_ARG - params.yarn_beta_fast = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-slow") { - CHECK_ARG - params.yarn_beta_slow = std::stof(argv[i]); - return true; - } - if (arg == "--pooling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else { invalid_param = true; } - return true; - } - if (arg == "--attention") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { invalid_param = true; } - return true; - } - if (arg == "--defrag-thold" || arg == "-dt") { - CHECK_ARG - params.defrag_thold = std::stof(argv[i]); - return true; - } - if (arg == "--samplers") { - CHECK_ARG - const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); - return true; - } - if (arg == "--sampling-seq") { - CHECK_ARG - sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); - return true; - } - if (arg == "--top-p") { - CHECK_ARG - sparams.top_p = std::stof(argv[i]); - return true; - } - if (arg == "--min-p") { - CHECK_ARG - sparams.min_p = std::stof(argv[i]); - return true; - } - if (arg == "--temp") { - CHECK_ARG - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); - return true; - } - if 
(arg == "--tfs") { - CHECK_ARG - sparams.tfs_z = std::stof(argv[i]); - return true; - } - if (arg == "--typical") { - CHECK_ARG - sparams.typical_p = std::stof(argv[i]); - return true; - } - if (arg == "--repeat-last-n") { - CHECK_ARG - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - return true; - } - if (arg == "--repeat-penalty") { - CHECK_ARG - sparams.penalty_repeat = std::stof(argv[i]); - return true; - } - if (arg == "--frequency-penalty") { - CHECK_ARG - sparams.penalty_freq = std::stof(argv[i]); - return true; - } - if (arg == "--presence-penalty") { - CHECK_ARG - sparams.penalty_present = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-range") { - CHECK_ARG - sparams.dynatemp_range = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-exp") { - CHECK_ARG - sparams.dynatemp_exponent = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat") { - CHECK_ARG - sparams.mirostat = std::stoi(argv[i]); - return true; - } - if (arg == "--mirostat-lr") { - CHECK_ARG - sparams.mirostat_eta = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat-ent") { - CHECK_ARG - sparams.mirostat_tau = std::stof(argv[i]); - return true; - } - if (arg == "--cfg-negative-prompt") { - CHECK_ARG - sparams.cfg_negative_prompt = argv[i]; - return true; - } - if (arg == "--cfg-negative-prompt-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"-co", "--color"}, + format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [¶ms]() { + params.use_color = true; } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); + )); + add_opt(llama_arg( + {"-s", "--seed"}, "SEED", + format("RNG seed (default: %d, use random seed for < 0)", params.seed), + [&sparams, ¶ms](std::string value) { + // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. 
+ params.seed = std::stoul(value); + sparams.seed = std::stoul(value); } - return true; - } - if (arg == "--cfg-scale") { - CHECK_ARG - sparams.cfg_scale = std::stof(argv[i]); - return true; - } - if (arg == "-b" || arg == "--batch-size") { - CHECK_ARG - params.n_batch = std::stoi(argv[i]); - return true; - } - if (arg == "-ub" || arg == "--ubatch-size") { - CHECK_ARG - params.n_ubatch = std::stoi(argv[i]); - return true; - } - if (arg == "--keep") { - CHECK_ARG - params.n_keep = std::stoi(argv[i]); - return true; - } - if (arg == "--draft") { - CHECK_ARG - params.n_draft = std::stoi(argv[i]); - return true; - } - if (arg == "--chunks") { - CHECK_ARG - params.n_chunks = std::stoi(argv[i]); - return true; - } - if (arg == "-np" || arg == "--parallel") { - CHECK_ARG - params.n_parallel = std::stoi(argv[i]); - return true; - } - if (arg == "-ns" || arg == "--sequences") { - CHECK_ARG - params.n_sequences = std::stoi(argv[i]); - return true; - } - if (arg == "--p-split" || arg == "-ps") { - CHECK_ARG - params.p_split = std::stof(argv[i]); - return true; - } - if (arg == "-m" || arg == "--model") { - CHECK_ARG - params.model = argv[i]; - return true; - } - if (arg == "-md" || arg == "--model-draft") { - CHECK_ARG - params.model_draft = argv[i]; - return true; - } - if (arg == "-a" || arg == "--alias") { - CHECK_ARG - params.model_alias = argv[i]; - return true; - } - if (arg == "-mu" || arg == "--model-url") { - CHECK_ARG - params.model_url = argv[i]; - return true; - } - if (arg == "-hft" || arg == "--hf-token") { - if (++i >= argc) { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"-t", "--threads"}, "N", + format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [¶ms](int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } } - params.hf_token = argv[i]; - return true; - } - if (arg == "-hfr" || arg == "--hf-repo") { - CHECK_ARG - params.hf_repo = argv[i]; - return true; - } - if (arg == "-hff" || arg == "--hf-file") { - CHECK_ARG - params.hf_file = argv[i]; - return true; - } - if (arg == "--lora") { - CHECK_ARG - params.lora_adapters.push_back({ - std::string(argv[i]), - 1.0, - }); - return true; - } - if (arg == "--lora-scaled") { - CHECK_ARG - std::string lora_adapter = argv[i]; - CHECK_ARG - params.lora_adapters.push_back({ - lora_adapter, - std::stof(argv[i]), - }); - return true; - } - if (arg == "--lora-init-without-apply") { - params.lora_init_without_apply = true; - return true; - } - if (arg == "--control-vector") { - CHECK_ARG - params.control_vectors.push_back({ 1.0f, argv[i], }); - return true; - } - if (arg == "--control-vector-scaled") { - CHECK_ARG - const char* fname = argv[i]; - CHECK_ARG - params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - return true; - } - if (arg == "--control-vector-layer-range") { - CHECK_ARG - params.control_vector_layer_start = std::stoi(argv[i]); - CHECK_ARG - params.control_vector_layer_end = std::stoi(argv[i]); - return true; - } - if (arg == "--mmproj") { - CHECK_ARG - params.mmproj = argv[i]; - return true; - } - if (arg == "--image") { - CHECK_ARG - params.image.emplace_back(argv[i]); - return true; - } - if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - return true; - } - if (arg == "-sp" || arg == "--special") { - params.special = true; - return true; - } - if (arg == "--embedding" || arg == "--embeddings") { - params.embedding = 
true; - return true; - } - if (arg == "--embd-normalize") { - CHECK_ARG - params.embd_normalize = std::stoi(argv[i]); - return true; - } - if (arg == "--embd-output-format") { - CHECK_ARG - params.embd_out = argv[i]; - return true; - } - if (arg == "--embd-separator") { - CHECK_ARG - params.embd_sep = argv[i]; - return true; - } - if (arg == "-if" || arg == "--interactive-first") { - params.interactive_first = true; - return true; - } - if (arg == "-cnv" || arg == "--conversation") { - params.conversation = true; - return true; - } - if (arg == "--infill") { - params.infill = true; - return true; - } - if (arg == "-dkvc" || arg == "--dump-kv-cache") { - params.dump_kv_cache = true; - return true; - } - if (arg == "-nkvo" || arg == "--no-kv-offload") { - params.no_kv_offload = true; - return true; - } - if (arg == "-ctk" || arg == "--cache-type-k") { - params.cache_type_k = argv[++i]; - return true; - } - if (arg == "-ctv" || arg == "--cache-type-v") { - params.cache_type_v = argv[++i]; - return true; - } - if (arg == "-mli" || arg == "--multiline-input") { - params.multiline_input = true; - return true; - } - if (arg == "--simple-io") { - params.simple_io = true; - return true; - } - if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - return true; - } - if (arg == "-nocb" || arg == "--no-cont-batching") { - params.cont_batching = false; - return true; - } - if (arg == "-fa" || arg == "--flash-attn") { - params.flash_attn = true; - return true; - } - if (arg == "-co" || arg == "--color") { - params.use_color = true; - return true; - } - if (arg == "--mlock") { - params.use_mlock = true; - return true; - } - if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - CHECK_ARG - params.n_gpu_layers = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + )); + add_opt(llama_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [¶ms](int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } } - return true; - } - if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") { - CHECK_ARG - params.n_gpu_layers_draft = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + )); + add_opt(llama_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [¶ms](int value) { + params.draft_cpuparams.n_threads = value; + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + } } - return true; - } - if (arg == "--main-gpu" || arg == "-mg") { - CHECK_ARG - params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--split-mode" || arg == "-sm") { - CHECK_ARG - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [¶ms](int value) { + params.draft_cpuparams_batch.n_threads = value; + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } } - else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", + [¶ms](std::string value) { + std::string mask = value; + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } } - else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; + )); + add_opt(llama_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. Complements --cpu-mask", + [¶ms](std::string value) { + std::string range = value; + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } } - else { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--cpu-strict"}, "<0|1>", + format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [¶ms](std::string value) { + params.cpuparams.strict_cpu = std::stoul(value); } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--tensor-split" || arg == "-ts") { - CHECK_ARG - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--poll"}, "<0...100>", + format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [¶ms](std::string value) { + params.cpuparams.poll = std::stoul(value); } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); + )); + add_opt(llama_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [¶ms](std::string value) { + std::string mask = value; + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); } - else { - params.tensor_split[i] = 0.0f; + } + )); + add_opt(llama_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. 
Complements --cpu-mask-batch", + [¶ms](std::string value) { + std::string range = value; + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [¶ms](int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(llama_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll", + [¶ms](int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(llama_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [¶ms](std::string value) { + std::string mask = value; + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft", + [¶ms](std::string value) { + std::string range = value; + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [¶ms](int value) { + params.draft_cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [¶ms](int value) { + params.draft_cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", + [¶ms](std::string value) { + std::string range = value; + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [¶ms](int value) { + params.draft_cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [¶ms](int value) { + params.draft_cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--draft"}, "N", + format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [¶ms](int value) { + params.n_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-ps", "--p-split"}, "N", + format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [¶ms](std::string value) { + params.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-lcs", "--lookup-cache-static"}, "FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)", + [¶ms](std::string value) { + params.lookup_cache_static = value; + } + )); + add_opt(llama_arg( + {"-lcd", "--lookup-cache-dynamic"}, "FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)", + [¶ms](std::string value) { + params.lookup_cache_dynamic = value; + } + )); + add_opt(llama_arg( + {"-c", "--ctx-size"}, "N", + format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [¶ms](int value) { + params.n_ctx = value; + } + )); + add_opt(llama_arg( + {"-n", "--predict"}, "N", + format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [¶ms](int value) { + params.n_predict = value; + } + )); + add_opt(llama_arg( + {"-b", "--batch-size"}, "N", + format("logical maximum batch size (default: %d)", params.n_batch), + [¶ms](int value) { + params.n_batch = value; + } + )); + add_opt(llama_arg( + {"-ub", "--ubatch-size"}, "N", + format("physical maximum batch size (default: %d)", params.n_ubatch), + [¶ms](int value) { + params.n_ubatch = value; + } + )); + add_opt(llama_arg( + {"--keep"}, "N", + format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + [¶ms](int value) { + params.n_keep = value; + } + )); + add_opt(llama_arg( + {"--chunks"}, "N", + format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [¶ms](int value) { + params.n_chunks = value; + } + )); + add_opt(llama_arg( + {"-fa", "--flash-attn"}, + format("enable Flash Attention (default: %s)", params.flash_attn ? 
"enabled" : "disabled"), + [¶ms]() { + params.flash_attn = true; + } + )); + add_opt(llama_arg( + {"-p", "--prompt"}, "PROMPT", + "prompt to start generation with\n", + [¶ms](std::string value) { + params.prompt = value; + } + )); + add_opt(llama_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [¶ms](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + )); + add_opt(llama_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [¶ms](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + )); + add_opt(llama_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [¶ms](std::string value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + )); + add_opt(llama_arg( + {"-e", "--escape"}, + format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [¶ms]() { + params.escape = true; + } + )); + add_opt(llama_arg( + {"--no-escape"}, + "do not process escape sequences", + [¶ms]() { + params.escape = false; + } + )); + add_opt(llama_arg( + {"-ptc", "--print-token-count"}, "N", + format("print token count every N tokens (default: %d)", params.n_print), + [¶ms](int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [¶ms](std::string value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [¶ms]() { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [¶ms]() { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [¶ms](std::string value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-sp", "--special"}, + format("special tokens output enabled (default: %s)", params.special ? 
"true" : "false"), + [¶ms]() { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-cnv", "--conversation"}, + "run in conversation mode, does not print special tokens and suffix/prefix\n", + [¶ms]() { + params.conversation = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-i", "--interactive"}, + format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [¶ms]() { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-if", "--interactive-first"}, + format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + [¶ms]() { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [¶ms]() { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [¶ms]() { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [¶ms](std::string value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [¶ms](std::string value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [¶ms]() { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--spm-infill"}, + format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [¶ms]() { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--samplers"}, "SAMPLERS", + format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [&sparams](std::string value) { + const auto sampler_names = string_split(value, ';'); + sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); + } + )); + add_opt(llama_arg( + {"--sampling-seq"}, "SEQUENCE", + format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [&sparams](std::string value) { + sparams.samplers_sequence = llama_sampling_types_from_chars(value); + } + )); + add_opt(llama_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [¶ms]() { + params.ignore_eos = true; + } + )); + add_opt(llama_arg( + {"--penalize-nl"}, + format("penalize newline tokens (default: %s)", sparams.penalize_nl ? 
"true" : "false"), + [&sparams]() { + sparams.penalize_nl = true; + } + )); + add_opt(llama_arg( + {"--temp"}, "N", + format("temperature (default: %.1f)", (double)sparams.temp), + [&sparams](std::string value) { + sparams.temp = std::stof(value); + sparams.temp = std::max(sparams.temp, 0.0f); + } + )); + add_opt(llama_arg( + {"--top-k"}, "N", + format("top-k sampling (default: %d, 0 = disabled)", sparams.top_k), + [&sparams](int value) { + sparams.top_k = value; + } + )); + add_opt(llama_arg( + {"--top-p"}, "N", + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p), + [&sparams](std::string value) { + sparams.top_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--min-p"}, "N", + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p), + [&sparams](std::string value) { + sparams.min_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--tfs"}, "N", + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z), + [&sparams](std::string value) { + sparams.tfs_z = std::stof(value); + } + )); + add_opt(llama_arg( + {"--typical"}, "N", + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p), + [&sparams](std::string value) { + sparams.typical_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--repeat-last-n"}, "N", + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n), + [&sparams](int value) { + sparams.penalty_last_n = value; + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + } + )); + add_opt(llama_arg( + {"--repeat-penalty"}, "N", + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat), + [&sparams](std::string value) { + sparams.penalty_repeat = std::stof(value); + } + )); + add_opt(llama_arg( + {"--presence-penalty"}, "N", + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present), + [&sparams](std::string value) { + sparams.penalty_present = std::stof(value); + } + )); + add_opt(llama_arg( + {"--frequency-penalty"}, "N", + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq), + [&sparams](std::string value) { + sparams.penalty_freq = std::stof(value); + } + )); + add_opt(llama_arg( + {"--dynatemp-range"}, "N", + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range), + [&sparams](std::string value) { + sparams.dynatemp_range = std::stof(value); + } + )); + add_opt(llama_arg( + {"--dynatemp-exp"}, "N", + format("dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent), + [&sparams](std::string value) { + sparams.dynatemp_exponent = std::stof(value); + } + )); + add_opt(llama_arg( + {"--mirostat"}, "N", + format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat), + [&sparams](int value) { + sparams.mirostat = value; + } + )); + add_opt(llama_arg( + {"--mirostat-lr"}, "N", + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta), + [&sparams](std::string value) { + sparams.mirostat_eta = std::stof(value); + } + )); + add_opt(llama_arg( + {"--mirostat-ent"}, "N", + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau), + 
[&sparams](std::string value) { + sparams.mirostat_tau = std::stof(value); + } + )); + add_opt(llama_arg( + {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", + "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", + [&sparams](std::string value) { + std::stringstream ss(value); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } else { + throw std::invalid_argument("invalid input format"); + } + } catch (const std::exception&) { + throw std::invalid_argument("invalid input format"); + } + } + )); + add_opt(llama_arg( + {"--cfg-negative-prompt"}, "PROMPT", + format("negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str()), + [&sparams](std::string value) { + sparams.cfg_negative_prompt = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--cfg-negative-prompt-file"}, "FNAME", + "negative prompt file to use for guidance", + [&sparams](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); + } + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--cfg-scale"}, "N", + format("strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale), + [&sparams](std::string value) { + sparams.cfg_scale = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--grammar"}, "GRAMMAR", + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str()), + [&sparams](std::string value) { + sparams.grammar = value; + } + )); + add_opt(llama_arg( + {"--grammar-file"}, "FNAME", + "file to read grammar from", + [&sparams](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(sparams.grammar) + ); + } + )); + add_opt(llama_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [&sparams](std::string value) { + sparams.grammar = json_schema_to_grammar(json::parse(value)); + } + )); + add_opt(llama_arg( + {"--pooling"}, "{none,mean,cls,last}", + "pooling type for embeddings, use model default if unspecified", + [¶ms](std::string value) { + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--attention"}, "{causal,non,causal}", + "attention type for embeddings, use model default if unspecified", + [¶ms](std::string value) { + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--rope-scaling"}, "{none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model", + [¶ms](std::string value) { + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"--rope-scale"}, "N", + "RoPE context scaling factor, expands context by a factor of N", + [¶ms](std::string value) { + params.rope_freq_scale = 1.0f / std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-base"}, "N", + "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", + [¶ms](std::string value) { + params.rope_freq_base = std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-scale"}, "N", + "RoPE frequency scaling factor, expands context by a factor of 1/N", + [¶ms](std::string value) { + params.rope_freq_scale = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-orig-ctx"}, "N", + format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [¶ms](int value) { + params.yarn_orig_ctx = value; + } + )); + add_opt(llama_arg( + {"--yarn-ext-factor"}, "N", + format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [¶ms](std::string value) { + params.yarn_ext_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-attn-factor"}, "N", + format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [¶ms](std::string value) { + params.yarn_attn_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-slow"}, "N", + format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [¶ms](std::string value) { + params.yarn_beta_slow = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-fast"}, "N", + format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [¶ms](std::string value) { + params.yarn_beta_fast = 
std::stof(value); + } + )); + add_opt(llama_arg( + {"-gan", "--grp-attn-n"}, "N", + format("group-attention factor (default: %d)", params.grp_attn_n), + [¶ms](int value) { + params.grp_attn_n = value; + } + )); + add_opt(llama_arg( + {"-gaw", "--grp-attn-w"}, "N", + format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + [¶ms](int value) { + params.grp_attn_w = value; + } + )); + add_opt(llama_arg( + {"-dkvc", "--dump-kv-cache"}, + "verbose print of the KV cache", + [¶ms]() { + params.dump_kv_cache = true; + } + )); + add_opt(llama_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [¶ms]() { + params.no_kv_offload = true; + } + )); + add_opt(llama_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [¶ms](std::string value) { + // TODO: get the type right here + params.cache_type_k = value; + } + )); + add_opt(llama_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [¶ms](std::string value) { + // TODO: get the type right here + params.cache_type_v = value; + } + )); + add_opt(llama_arg( + {"--all-logits"}, + format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [¶ms]() { + params.logits_all = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [¶ms]() { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag-tasks"}, "N", + format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [¶ms](int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [¶ms]() { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande-tasks"}, "N", + format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [¶ms](int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [¶ms]() { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice-tasks"}, "N", + format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [¶ms](int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via --kl-divergence-base", + [¶ms]() { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-stride"}, "N", + format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [¶ms](int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-output-type"}, "<0|1>", + format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [¶ms](int value) { + params.ppl_output_type = value; + } + 
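// --------------------------------------------------------------------------
// Illustrative sketch only, not part of the patch: one way the
// `.set_examples(...)` tags used in the hunks above could be consumed, so
// that e.g. `--ppl-stride` is only exposed by the perplexity tool while
// untagged options stay global. The parse/usage side is still WIP here, so
// this filter is an assumption; `demo_example`, `demo_opt` and
// `is_available` are hypothetical names.
#include <set>
#include <string>
#include <vector>

enum demo_example { DEMO_EXAMPLE_ALL, DEMO_EXAMPLE_MAIN, DEMO_EXAMPLE_PERPLEXITY };

struct demo_opt {
    std::vector<std::string> args;
    std::set<demo_example>   examples = {DEMO_EXAMPLE_ALL}; // default: global
};

// an option is visible to a tool if it is global or explicitly tagged for it
static bool is_available(const demo_opt & opt, demo_example ex) {
    return opt.examples.count(DEMO_EXAMPLE_ALL) > 0 || opt.examples.count(ex) > 0;
}

int main() {
    demo_opt ppl_stride;
    ppl_stride.args     = {"--ppl-stride"};
    ppl_stride.examples = {DEMO_EXAMPLE_PERPLEXITY}; // example-specific option

    demo_opt temp;
    temp.args = {"--temp"};                          // keeps the ALL default

    // perplexity sees both options; main only sees the global one
    const bool ok = is_available(ppl_stride, DEMO_EXAMPLE_PERPLEXITY)
                 && !is_available(ppl_stride, DEMO_EXAMPLE_MAIN)
                 && is_available(temp, DEMO_EXAMPLE_MAIN);
    return ok ? 0 : 1;
}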
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"-dt", "--defrag-thold"}, "N", + format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [¶ms](std::string value) { + params.defrag_thold = std::stof(value); + } + )); + add_opt(llama_arg( + {"-np", "--parallel"}, "N", + format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [¶ms](int value) { + params.n_parallel = value; + } + )); + add_opt(llama_arg( + {"-ns", "--sequences"}, "N", + format("number of sequences to decode (default: %d)", params.n_sequences), + [¶ms](int value) { + params.n_sequences = value; + } + )); + add_opt(llama_arg( + {"-cb", "--cont-batching"}, + format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [¶ms]() { + params.cont_batching = true; + } + )); + add_opt(llama_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [¶ms]() { + params.cont_batching = false; + } + )); + add_opt(llama_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + [¶ms](std::string value) { + params.mmproj = value; + } + )); + add_opt(llama_arg( + {"--image"}, "FILE", + "path to an image file. use with multimodal models. Specify multiple times for batching", + [¶ms](std::string value) { + params.image.emplace_back(value); + } + )); + add_opt(llama_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [¶ms](std::string value) { + params.rpc_servers = value; + } + )); + add_opt(llama_arg( + {"--mlock"}, + "force system to keep model in RAM rather than swapping or compressing", + [¶ms]() { + params.use_mlock = true; + } + )); + add_opt(llama_arg( + {"--no-mmap"}, + "do not memory-map model (slower load but may reduce pageouts if not using mlock)", + [¶ms]() { + params.use_mmap = false; + } + )); + add_opt(llama_arg( + {"--numa"}, "TYPE", + "attempt optimizations that help on some NUMA systems\n" + "- distribute: spread execution evenly over all nodes\n" + "- isolate: only spawn threads on CPUs on the node that execution started on\n" + "- numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437", + [¶ms](std::string value) { + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"-ngl", "--gpu-layers"}, "N", + "number of layers to store in VRAM", + [¶ms](int value) { + params.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + )); + add_opt(llama_arg( + {"-ngld", "--gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [¶ms](int value) { + params.n_gpu_layers_draft = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on 
enabling GPU BLAS support\n"); } } + )); + add_opt(llama_arg( + {"-sm", "--split-mode"}, "SPLIT_MODE", + "how to split the model across multiple GPUs, one of:\n" + "- none: use one GPU only\n" + "- layer (default): split layers and KV across GPUs\n" + "- row: split rows across GPUs", + [¶ms](std::string value) { + std::string arg_next = value; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + throw std::invalid_argument("invalid value"); + } #ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); #endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--rpc") { - CHECK_ARG - params.rpc_servers = argv[i]; - return true; - } - if (arg == "--no-mmap") { - params.use_mmap = false; - return true; - } - if (arg == "--numa") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; } - return true; - } - if (arg == "-v" || arg == "--verbose") { - params.verbosity = 1; - return true; - } - if (arg == "--verbosity") { - CHECK_ARG - params.verbosity = std::stoi(argv[i]); - return true; - } - if (arg == "--verbose-prompt") { - params.verbose_prompt = true; - return true; - } - if (arg == "--no-display-prompt") { - params.display_prompt = false; - return true; - } - if (arg == "-r" || arg == "--reverse-prompt") { - CHECK_ARG - params.antiprompt.emplace_back(argv[i]); - return true; - } - if (arg == "-ld" || arg == "--logdir") { - CHECK_ARG - params.logdir = argv[i]; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; } - return true; - } - if (arg == "-lcs" || arg == "--lookup-cache-static") { - CHECK_ARG - params.lookup_cache_static = argv[i]; - return true; - } - if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { - CHECK_ARG - params.lookup_cache_dynamic = argv[i]; - return true; - } - if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - CHECK_ARG - params.logits_file = argv[i]; - return true; - } - if (arg == "--perplexity" || arg == "--all-logits") { - params.logits_all = true; - return true; - } - if (arg == "--ppl-stride") { - CHECK_ARG - params.ppl_stride = std::stoi(argv[i]); - return true; - } - if (arg == "--ppl-output-type") { - CHECK_ARG - params.ppl_output_type = std::stoi(argv[i]); - return true; - } - if (arg == "-ptc" || arg == "--print-token-count") { - CHECK_ARG - params.n_print = std::stoi(argv[i]); - return true; - } - if (arg == "--check-tensors") { - params.check_tensors = true; - return true; - } - if (arg == "--hellaswag") { - params.hellaswag = true; - return true; - } - if (arg == "--hellaswag-tasks") { - CHECK_ARG - params.hellaswag_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--winogrande") { - params.winogrande = true; - return 
true; - } - if (arg == "--winogrande-tasks") { - CHECK_ARG - params.winogrande_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--multiple-choice") { - params.multiple_choice = true; - return true; - } - if (arg == "--multiple-choice-tasks") { - CHECK_ARG - params.multiple_choice_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--kl-divergence") { - params.kl_divergence = true; - return true; - } - if (arg == "--ignore-eos") { - params.ignore_eos = true; - return true; - } - if (arg == "--penalize-nl") { - sparams.penalize_nl = true; - return true; - } - if (arg == "-l" || arg == "--logit-bias") { - CHECK_ARG - std::stringstream ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + )); + add_opt(llama_arg( + {"-ts", "--tensor-split"}, "N0,N1,N2,...", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", + [¶ms](std::string value) { + std::string arg_next = value; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + throw std::invalid_argument( + format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + ); } - else { - throw std::exception(); + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN } - catch (const std::exception&) { - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"-mg", "--main-gpu"}, "INDEX", + format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [¶ms](int value) { + params.main_gpu = value; +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN } - return true; - } - if (arg == "-h" || arg == "--help" || arg == "--usage" ) { - params.usage = true; - return true; - } - if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - if (arg == "--in-prefix-bos") { - params.input_prefix_bos = true; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-prefix") { - CHECK_ARG - params.input_prefix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-suffix") { - CHECK_ARG - params.input_suffix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--spm-infill") { - params.spm_infill = true; - return true; - } - if (arg == "--grammar") { - CHECK_ARG - sparams.grammar = argv[i]; - return true; - } - if (arg == "--grammar-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) - ); - return true; - } - if (arg == "-j" || arg == "--json-schema") { - CHECK_ARG - sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); - return true; - } - if (arg == "--override-kv") { - CHECK_ARG - if (!string_parse_kv_override(argv[i], params.kv_overrides)) { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--check-tensors"}, + format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [¶ms]() { + params.check_tensors = true; } - return true; - } - if (arg == "--host") { - CHECK_ARG - params.hostname = argv[i]; - return true; - } - if (arg == "--port") { - CHECK_ARG - params.port = std::stoi(argv[i]); - return true; - } - if (arg == "--path") { - CHECK_ARG - params.public_path = argv[i]; - return true; - } - if (arg == "--api-key") { - CHECK_ARG - params.api_keys.push_back(argv[i]); - return true; - } - if (arg == "--api-key-file") { - CHECK_ARG - std::ifstream key_file(argv[i]); - if (!key_file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + )); + add_opt(llama_arg( + {"--override-kv"}, "KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [¶ms](std::string value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(llama_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [¶ms](std::string value) { + params.lora_adapters.push_back({ std::string(value), 1.0 }); + } + )); + add_opt(llama_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [¶ms](std::string fname, std::string scale) { + params.lora_adapters.push_back({ fname, std::stof(scale) }); + } + )); + add_opt(llama_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [¶ms](std::string value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(llama_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [¶ms](std::string fname, std::string scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(llama_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [¶ms](std::string start, std::string end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(llama_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? std::string("model path from which to load base model") + : format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [¶ms](std::string value) { + params.model = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [¶ms](std::string value) { + params.model_draft = value; + } + )); + add_opt(llama_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [¶ms](std::string value) { + params.model_url = value; + } + )); + add_opt(llama_arg( + {"-hfr", "--hf-repo"}, "REPO", + "Hugging Face model repository (default: unused)", + [¶ms](std::string value) { + params.hf_repo = value; } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); + )); + add_opt(llama_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file (default: unused)", + [¶ms](std::string value) { + params.hf_file = value; + } + )); + add_opt(llama_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [¶ms](std::string value) { + params.hf_token = value; + } + )); + add_opt(llama_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [¶ms](std::string value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-size"}, "N", + format("minimum length 
of embedded text chunks (default: %d)", params.chunk_size), + [¶ms](int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-separator"}, "STRING", + format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [¶ms](std::string value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--junk"}, "N", + format("number of times to repeat the junk text (default: %d)", params.n_junk), + [¶ms](int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"--pos"}, "N", + format("position of the passkey in the junk text (default: %d)", params.i_pos), + [¶ms](int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"-o", "--output"}, "FNAME", + format("output file (default: '%s')", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? params.lora_outfile.c_str() + : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + ? params.cvector_outfile.c_str() + : params.out_file.c_str()), + [¶ms](std::string value) { + params.out_file = value; + params.cvector_outfile = value; + params.lora_outfile = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-ofreq", "--output-frequency"}, "N", + format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [¶ms](int value) { + params.n_out_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--save-frequency"}, "N", + format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [¶ms](int value) { + params.n_save_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--process-output"}, + format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [¶ms]() { + params.process_output = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--no-ppl"}, + format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [¶ms]() { + params.compute_ppl = false; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--chunk"}, "N", + format("start processing the input from chunk N (default: %d)", params.i_chunk), + [¶ms](int value) { + params.i_chunk = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-pps"}, + format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), + [¶ms]() { + params.is_pp_shared = true; + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npp"}, "n0,n1,...", + "number of prompt tokens", + [¶ms](std::string value) { + auto p = string_split(value, split_delim); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } - key_file.close(); - return true; - } - if (arg == "--ssl-key-file") { - CHECK_ARG - params.ssl_file_key = argv[i]; - return true; - } - if (arg == "--ssl-cert-file") { - CHECK_ARG - params.ssl_file_cert = argv[i]; - return true; - } - if (arg == "--timeout" || arg == "-to") { - CHECK_ARG - params.timeout_read = std::stoi(argv[i]); - params.timeout_write = std::stoi(argv[i]); - return true; - } - if (arg == "--threads-http") { - CHECK_ARG - params.n_threads_http = std::stoi(argv[i]); - return true; - } - if (arg == "-spf" || arg == "--system-prompt-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-ntg"}, "n0,n1,...", + "number of text generation tokens", + [¶ms](std::string value) { + auto p = string_split(value, split_delim); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; - return true; - } - if (arg == "--log-format") { - CHECK_ARG - if (std::strcmp(argv[i], "json") == 0) { - params.log_json = true; - } else if (std::strcmp(argv[i], "text") == 0) { - params.log_json = false; - } else { - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npl"}, "n0,n1,...", + "number of parallel prompts", + [¶ms](std::string value) { + auto p = string_split(value, split_delim); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"--embd-normalize"}, "N", + format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [¶ms](int value) { + params.embd_normalize = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-output-format"}, "FORMAT", + "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", + [¶ms](std::string value) { + params.embd_out = value; } - return true; - } - if (arg == "--no-slots") { - params.endpoint_slots = false; - return true; - } - if (arg == "--metrics") { - params.endpoint_metrics = true; - return true; - } - if (arg == "--slot-save-path") { - CHECK_ARG - params.slot_save_path = argv[i]; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-separator"}, "STRING", + "separator of embendings (default \\n) for example \"<#sep#>\"", + [¶ms](std::string value) { + params.embd_sep = value; } - return true; - } - if (arg == "--chat-template") { - CHECK_ARG - if (!llama_chat_verify_template(argv[i])) { - fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); - fprintf(stderr, "note: llama.cpp does not use 
jinja parser, we only support commonly used templates\n"); - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--host"}, "HOST", + format("ip address to listen (default: %s)", params.hostname.c_str()), + [¶ms](std::string value) { + params.hostname = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--port"}, "PORT", + format("port to listen (default: %d)", params.port), + [¶ms](int value) { + params.port = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--path"}, "PATH", + format("path to serve static files from (default: %s)", params.public_path.c_str()), + [¶ms](std::string value) { + params.public_path = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--embedding(s)"}, + format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [¶ms]() { + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--api-key"}, "KEY", + "API key to use for authentication (default: none)", + [¶ms](std::string value) { + params.api_keys.push_back(value); } - params.chat_template = argv[i]; - return true; - } - if (arg == "--slot-prompt-similarity" || arg == "-sps") { - CHECK_ARG - params.slot_prompt_similarity = std::stof(argv[i]); - return true; - } - if (arg == "-pps") { - params.is_pp_shared = true; - return true; - } - if (arg == "-npp") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - return true; - } - if (arg == "-ntg") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - return true; - } - if (arg == "-npl") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - return true; - } - if (arg == "--context-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--api-key-file"}, "FNAME", + "path to file containing API keys (default: none)", + [¶ms](std::string value) { + std::ifstream key_file(value); + if (!key_file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); } - params.context_files.push_back(argv[i]); - return true; - } - if (arg == "--chunk-size") { - CHECK_ARG - params.chunk_size = std::stoi(argv[i]); - return true; - } - if (arg == "--chunk-separator") { - CHECK_ARG - params.chunk_separator = argv[i]; - return true; - } - if (arg == "--junk") { - CHECK_ARG - params.n_junk = std::stoi(argv[i]); - return true; - } - if (arg == "--pos") { - CHECK_ARG - params.i_pos = std::stoi(argv[i]); - return true; - } - if (arg == "-o" || arg == "--output" || arg == "--output-file") { - CHECK_ARG - params.out_file = argv[i]; - params.cvector_outfile = argv[i]; - params.lora_outfile = argv[i]; - return true; - } - if (arg == "-ofreq" || arg == "--output-frequency") { - CHECK_ARG - params.n_out_freq = std::stoi(argv[i]); - return true; - } - if (arg == "--save-frequency") { - CHECK_ARG - params.n_save_freq = std::stoi(argv[i]); - 
return true; - } - if (arg == "--process-output") { - params.process_output = true; - return true; - } - if (arg == "--no-ppl") { - params.compute_ppl = false; - return true; - } - if (arg == "--chunk" || arg == "--from-chunk") { - CHECK_ARG - params.i_chunk = std::stoi(argv[i]); - return true; - } - // cvector params - if (arg == "--positive-file") { - CHECK_ARG - params.cvector_positive_file = argv[i]; - return true; - } - if (arg == "--negative-file") { - CHECK_ARG - params.cvector_negative_file = argv[i]; - return true; - } - if (arg == "--pca-batch") { - CHECK_ARG - params.n_pca_batch = std::stoi(argv[i]); - return true; - } - if (arg == "--pca-iter") { - CHECK_ARG - params.n_pca_iterations = std::stoi(argv[i]); - return true; - } - if (arg == "--method") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { invalid_param = true; } - return true; - } - if (arg == "--no-warmup") { - params.warmup = false; - return true; - } -#ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - if (log_param_single_parse(argv[i])) { - // Do nothing, log_param_single_parse automatically does it's thing - // and returns if a match was found and parsed. - return true; - } - if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. - CHECK_ARG - if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { - invalid_param = true; - return true; + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-key-file"}, "FNAME", + "path to file a PEM-encoded SSL private key", + [¶ms](std::string value) { + params.ssl_file_key = value; } - return true; - } - // End of Parse args for logging parameters -#endif // LOG_DISABLE_LOGS - - return false; -} - -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) 
{ - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -void gpt_params_print_usage(std::vector & options) { - constexpr static int n_leading_spaces = 40; - std::string leading_spaces(n_leading_spaces, ' '); - for (const auto & opt : options) { - std::ostringstream ss; - for (const auto & arg : opt.args) { - if (&arg == &opt.args.front()) { - ss << format("%-7s", (arg + ",").c_str()); + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-cert-file"}, "FNAME", + "path to file a PEM-encoded SSL certificate", + [¶ms](std::string value) { + params.ssl_file_cert = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--timeout"}, "N", + format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [¶ms](int value) { + params.timeout_read = value; + params.timeout_write = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--threads-http"}, "N", + format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [¶ms](int value) { + params.n_threads_http = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-spf", "--system-prompt-file"}, "FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", + [¶ms](std::string value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--log-format"}, "{text, json}", + "log output format: json or text (default: json)", + [¶ms](std::string value) { + if (value == "json") { + params.log_json = true; + } else if (value == "text") { + params.log_json = false; } else { - ss << arg << (&arg != &opt.args.back() ? ", " : ""); + throw std::invalid_argument("invalid value"); } } - if (!opt.value_ex.empty()) ss << " " << opt.value_ex; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--metrics"}, + format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [¶ms]() { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--no-slots"}, + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled"), + [¶ms]() { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [¶ms](std::string value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + [¶ms](std::string value) { + if (!llama_chat_verify_template(value)) { + throw std::runtime_error(format( + "error: the supplied chat template is not supported: %s\n" + "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + value.c_str() + )); + } + params.chat_template = value; } - const auto help_lines = llama_arg::break_str_into_lines(opt.help, 50); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [¶ms](std::string value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--lora-init-without-apply"}, + format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), + [¶ms]() { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [¶ms]() { + params.simple_io = true; } - printf("%s", ss.str().c_str()); - } -} + )); + add_opt(llama_arg( + {"-ld", "--logdir"}, "LOGDIR", + "path under which to save YAML logs (no logging if unset)", + [¶ms](std::string value) { + params.logdir = value; -std::vector gpt_params_parser_register(gpt_params & params) { - std::vector options; - options.push_back(llama_arg( - {"-h", "--help", "--usage"}, - "print usage and exit", - [¶ms, &options]() { - gpt_params_print_usage(options); - exit(0); - return true; + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } } )); - options.push_back(llama_arg( - {"-m", "--model"}, - format("model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)", params.model.c_str()), + add_opt(llama_arg( + {"--positive-file"}, "FNAME", + format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), [¶ms](std::string value) { - params.model = value; - return true; + params.cvector_positive_file = value; } - ).set_value_ex("FNAME")); - return options; -} - -bool gpt_params_parser_run(int argc, char ** argv, std::vector & options) { - for (const auto & opt : options) { - if (opt.handler_void) opt.handler_void(); - } - return true; -} - -void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const llama_sampling_params & sparams = params.sparams; - - std::string sampler_type_chars; - std::string sampler_type_names; - for (const auto sampler_type : sparams.samplers_sequence) { - sampler_type_chars += static_cast(sampler_type); - sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; - } - sampler_type_names.pop_back(); - - struct option_info { - LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) - option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) { - va_list args_list; - va_start(args_list, desc); - char buffer[1024]; - vsnprintf(buffer, sizeof(buffer), desc, args_list); - va_end(args_list); - this->desc = buffer; + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--negative-file"}, "FNAME", + format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [¶ms](std::string value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-batch"}, "N", + format("batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [¶ms](int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-iter"}, "N", + format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [¶ms](int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [¶ms](std::string value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } } - - option_info(const std::string & grp) : grp(grp) {} - - std::string tags; - std::string args; - std::string desc; - std::string grp; - }; - - std::vector options; - - // TODO: filter by tags - - options.push_back({ "general" }); - options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); - options.push_back({ "*", " --version", "show version and build info" }); - options.push_back({ "*", "-v, --verbose", "print verbose information" }); - options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); - options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); - options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); - options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); - options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); - options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); - options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); - options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); - -#ifndef GGML_USE_OPENMP - // these options are available only with the internal threadpool - options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); - options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); - options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); - options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); - - options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. 
Complements cpu-range-batch (default: same as --cpu-mask)"}); - options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); - options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"}); - options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); - options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); - - options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); - options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"}); - options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"}); - options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); - options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); - - options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); - options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); - options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); - options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); - options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"}); -#endif // GGML_USE_OPENMP - - options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); - options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); - options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)" }); - options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); - - options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); - options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); - options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); - options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); - options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); - options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); - options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: 
%s)", params.flash_attn ? "enabled" : "disabled" }); - options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" - "in conversation mode, this will be used as system prompt\n" - "(default: '%s')", params.prompt.c_str() }); - options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); - options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); - options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); - options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); - options.push_back({ "*", " --no-escape", "do not process escape sequences" }); - options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); - options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); - options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" - "not supported with --interactive or other interactive options" }); - options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); - options.push_back({ "main", "-r, --reverse-prompt PROMPT", - "halt generation at PROMPT, return control in interactive mode\n" - "can be specified more than once for multiple prompts" }); - options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n" - "if suffix/prefix are not specified, default chat template will be used\n" - "(default: %s)", params.conversation ? "true" : "false" }); - options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); - options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); - options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); - options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); - options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); - options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); - options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); - options.push_back({ "server infill", - " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" }); - - options.push_back({ "sampling" }); - options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" - "(default: %s)", sampler_type_names.c_str() }); - options.push_back({ "*", " --sampling-seq SEQUENCE", - "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); - options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); - options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" }); - options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); - options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); - options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); - options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); - options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); - options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); - options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); - options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); - options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); - options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); - options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); - options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); - options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" - "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); - options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); - options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); - options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" - "i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); - options.push_back({ "main", " --cfg-negative-prompt PROMPT", - "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); - options.push_back({ "main", " --cfg-negative-prompt-file FNAME", - "negative prompt file to use for guidance" }); - options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); - options.push_back({ "main", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "grammar" }); - options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); - options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); - options.push_back({ "*", "-j, --json-schema SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n" - "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --pooling {none,mean,cls,last}", - "pooling type for embeddings, use model default if unspecified" }); - options.push_back({ "embedding", " --attention {causal,non-causal}", - "attention type for embeddings, use model default if unspecified" }); - - options.push_back({ "context hacking" }); - options.push_back({ "*", " --rope-scaling {none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model" }); - options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); - options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); - options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); - options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); - options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); - options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); - options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); - options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); - options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); - options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); - options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); - options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); - options.push_back({ "*", "-ctk, --cache-type-k 
TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() }); - options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); - - options.push_back({ "perplexity" }); - options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" }); - options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); - options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); - options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --multiple-choice-tasks N", - "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); - options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); - options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); - options.push_back({ "perplexity", " --ppl-output-type {0,1}", - "output type for perplexity calculation (default: %d)", params.ppl_output_type }); - - options.push_back({ "parallel" }); - options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); - options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); - options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); - options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); - options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); - - options.push_back({ "multi-modality" }); - options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); - options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching" }); - - options.push_back({ "backend" }); - options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); - - if (llama_supports_mlock()) { - options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); - } - if (llama_supports_mmap()) { - options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); - } - options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" - " - distribute: spread execution evenly over all nodes\n" - " - isolate: only spawn threads on CPUs on the node that execution started on\n" - " - numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437" }); - - if (llama_supports_gpu_offload()) { - options.push_back({ "*", "-ngl, --gpu-layers N", - "number of layers to store in VRAM" }); - options.push_back({ "*", "-ngld, --gpu-layers-draft N", - "number of layers to store in VRAM for the draft model" }); - options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", - "how to split the model across multiple GPUs, one of:\n" - " - none: use one GPU only\n" - " - layer (default): split layers and KV across GPUs\n" - " - row: split rows across GPUs" }); - options.push_back({ "*", "-ts, --tensor-split SPLIT", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); - options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" - "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); - } - - options.push_back({ "model" }); - options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); - options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" - "note: this argument can be repeated to add multiple control vectors" }); - options.push_back({ "*", " --control-vector-scaled FNAME SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors" }); - options.push_back({ "*", " --control-vector-layer-range START END", - "layer range to apply the control vector(s) to, start and end inclusive" }); - options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" - "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); - options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); - options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); - options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); - options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); - options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" }); - - options.push_back({ "retrieval" }); - options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); - options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); - options.push_back({ "retrieval", " --chunk-separator STRING", - "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); - - options.push_back({ "passkey" }); - options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); - options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); - - options.push_back({ "imatrix" }); - options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); - options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); - options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); - options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); - options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); - options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); - - options.push_back({ "bench" }); - options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false" }); - options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); - options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); - options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); - options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); - options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); - - options.push_back({ "server" }); - options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); - options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); - options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); - options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); - options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); - options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); - options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); - options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); - options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); - options.push_back({ "server", " --system-prompt-file FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); - options.push_back({ "server", " --log-format {text,json}", - "log output format: json or text (default: json)" }); - options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); - options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" }); - options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); - options.push_back({ "server", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", - "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); - options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"}); - + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); #ifndef LOG_DISABLE_LOGS - options.push_back({ "logging" }); - options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); - options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); - options.push_back({ "logging", " --log-test", "Run simple logging test" }); - options.push_back({ "logging", " --log-disable", "Disable trace logs" }); - options.push_back({ "logging", " --log-enable", "Enable trace logs" }); - options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); - options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " - "Each log file will have unique name: \"..log\"" }); - options.push_back({ "logging", " --log-append", "Don't truncate the old log file." }); + // TODO: make this looks less weird + add_opt(llama_arg( + {"--log-test"}, + "Log test", + []() { log_param_single_parse("--log-test"); } + )); + add_opt(llama_arg( + {"--log-disable"}, + "Log disable", + []() { log_param_single_parse("--log-disable"); } + )); + add_opt(llama_arg( + {"--log-enable"}, + "Log enable", + []() { log_param_single_parse("--log-enable"); } + )); + add_opt(llama_arg( + {"--log-new"}, + "Log new", + []() { log_param_single_parse("--log-new"); } + )); + add_opt(llama_arg( + {"--log-append"}, + "Log append", + []() { log_param_single_parse("--log-append"); } + )); + add_opt(llama_arg( + {"--log-file"}, "FNAME", + "Log file", + [](std::string value) { log_param_pair_parse(false, "--log-file", value); } + )); #endif // LOG_DISABLE_LOGS - options.push_back({ "cvector" }); - options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); - options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); - options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); - options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); - - options.push_back({ "export-lora" }); - options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); - options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); - - printf("usage: %s [options]\n", argv[0]); - - for (const auto & o : options) { - if (!o.grp.empty()) { - printf("\n%s:\n\n", o.grp.c_str()); - continue; - } - printf(" %-32s", o.args.c_str()); - if (o.args.length() > 30) { - printf("\n%34s", ""); - } - - const auto desc = o.desc; - size_t start = 0; - size_t end = desc.find('\n'); - while (end != std::string::npos) { - printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); - start = end + 1; - end = desc.find('\n', start); - } - - printf("%s\n", desc.substr(start).c_str()); - } - printf("\n"); + return options; } std::string gpt_params_get_system_info(const gpt_params & params) { diff --git a/common/common.h b/common/common.h index 04f4476f039de..27e908d7f22d6 100644 --- a/common/common.h +++ b/common/common.h @@ -170,6 +170,7 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence + std::function print_usage = nullptr; // print example-specific usage and example bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs bool special = false; // enable special token output @@ -279,73 +280,67 @@ struct gpt_params { }; enum llama_example { - LLAMA_EXAMPLE_ALL, - LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_COMMON, + LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_INFILL, + LLAMA_EXAMPLE_EMBEDDING, + LLAMA_EXAMPLE_PERPLEXITY, + LLAMA_EXAMPLE_RETRIEVAL, + LLAMA_EXAMPLE_PASSKEY, + LLAMA_EXAMPLE_IMATRIX, + LLAMA_EXAMPLE_BENCH, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_CVECTOR_GENERATOR, + LLAMA_EXAMPLE_EXPORT_LORA, + + LLAMA_EXAMPLE_COUNT, }; struct llama_arg { - std::set examples = {LLAMA_EXAMPLE_ALL}; + std::set examples = {LLAMA_EXAMPLE_COMMON}; std::vector args; - std::string value_ex; + std::string value_hint; // help text or example for arg value + std::string value_hint_2; // for second arg value std::string env; std::string help; - std::function handler_void = nullptr; - std::function handler_string = nullptr; - std::function handler_bool = nullptr; - std::function handler_int = nullptr; - std::function handler_float = nullptr; + std::function handler_void = nullptr; + std::function handler_string = nullptr; + std::function handler_str_str = nullptr; + std::function handler_int = nullptr; - llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_string(handler) {} + llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - llama_arg(std::vector args, std::string help, std::function 
handler) : args(args), help(help), handler_bool(handler) {} + llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} + llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} - llama_arg & set_examples(std::set _examples) { - examples = std::move(_examples); - return *this; - } + // support 2 values for arg + llama_arg(std::vector args, std::string value_hint, std::string value_hint_2, std::string help, std::function handler) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - llama_arg & set_value_ex(std::string _value_ex) { - value_ex = std::move(_value_ex); + llama_arg & set_examples(std::set examples) { + this->examples = std::move(examples); return *this; } - llama_arg & set_env(std::string _env) { - env = _env; + llama_arg & set_env(std::string env) { + this->env = std::move(env); return *this; } - // utility function - static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { - std::vector result; - std::istringstream iss(input); - std::string word, line; - while (iss >> word) { - if (line.length() + !line.empty() + word.length() > max_char_per_line) { - if (!line.empty()) result.push_back(line); - line = word; - } else { - line += (!line.empty() ? " " : "") + word; - } - } - if (!line.empty()) result.push_back(line); - return result; + bool in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); } }; -std::vector gpt_params_parser_register(gpt_params & params); -bool gpt_params_parser_run(int argc, char ** argv, std::vector & options); +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); +bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); +bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params, std::vector & options); +void gpt_params_print_usage(std::vector & options); -void gpt_params_parse_from_env(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params); -bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); -bool gpt_params_parse (int argc, char ** argv, gpt_params & params); -bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); -void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); - std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 25e7c775a0095..0d6076108eac8 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -28,9 +28,7 @@ static std::vector parse_list(char * p) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); 
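
Every example in the tree is migrated to the same two-step pattern: ask the common library for the option list that matches the example's llama_example id, then hand argv to gpt_params_parse. A minimal sketch of that pattern, using only names introduced by this patch (the print_usage body and the example id are placeholders, and error handling is trimmed):

    #include "common.h"

    #include <cstdio>

    // optional usage callback; gpt_params gains a print_usage field to carry it
    static void print_usage(int, char ** argv) {
        printf("\nexample usage:\n\n  %s -m model.gguf\n\n", argv[0]);
    }

    int main(int argc, char ** argv) {
        gpt_params params;

        // register the options that apply to this example (plus the common ones)
        auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);

        // the examples no longer print usage themselves when parsing fails
        if (!gpt_params_parse(argc, argv, params, options)) {
            return 1;
        }

        // params is now populated from argv (and, after the later commits, from LLAMA_ARG_* env vars)
        return 0;
    }
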
LOG_TEE("\n"); @@ -39,8 +37,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 53fbfb0a8cf2a..55c7a09d1edd9 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -7,9 +7,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); LOG_TEE("\n"); @@ -21,8 +19,8 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index a68268388389d..0795175a12a73 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -35,9 +35,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); @@ -390,8 +388,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index b05aa006e7da5..74151d24f32d3 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -79,8 +79,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 5e89988e2beda..de11d86ba2712 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -144,8 +144,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/export-lora/export-lora.cpp 
b/examples/export-lora/export-lora.cpp index 8df457e219493..544e7fff6fbcc 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -391,9 +391,7 @@ struct lora_merge_ctx { } }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); printf("\nNOTE: output model is F16\n"); @@ -403,8 +401,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 2c61c2e1eb3bc..73ad8c11b2a98 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -157,8 +157,8 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 83b85d72b043a..2a4f230740b76 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -17,9 +17,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s \\\n" " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" @@ -579,8 +577,8 @@ int main(int argc, char ** argv) { params.logits_all = true; params.verbosity = 1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 05700c1d591d9..4f5f7d028d5f8 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -106,8 +106,8 @@ int main(int argc, char ** argv) { llama_sampling_params & sparams = params.sparams; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 86b39f20eea6e..4dd17cf68ab1c 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -112,9 +112,7 @@ struct llava_context { struct llama_model * model = NULL; }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\n example usage:\n"); LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n note: a 
lower temperature value like 0.1 is recommended for better quality.\n"); @@ -280,8 +278,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } @@ -293,7 +291,7 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv, {}); + print_usage(argc, argv); return 1; } auto model = llava_init(¶ms); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index f500ea5b944f4..18a9ad09f7de1 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -253,8 +253,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } @@ -266,7 +266,6 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty())) { - gpt_params_print_usage(argc, argv, params); show_additional_info(argc, argv); return 1; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 81cf1629c5b6a..6b2a131da2c1b 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -37,8 +37,8 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 5f04709f50231..795b06c8894f0 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -13,8 +13,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 400f3e0b08957..93299ef8b738a 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -15,8 +15,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index d53a9828c2ea2..5a7b773faa2e8 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -14,8 +14,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6a025ed512217..058a6da142b0d 100644 --- 
a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -131,12 +131,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); LOG_TEE("\n"); @@ -21,8 +19,8 @@ int main(int argc, char ** argv) { params.n_keep = 32; params.i_pos = -1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 484dd589109c7..2855dd0ab68ca 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1967,8 +1967,8 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index aab9d81058af9..c3e835c864048 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -4,9 +4,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); LOG_TEE("\n"); @@ -113,8 +111,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3ea7c790d2bf7..ef82e81521e15 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -10,8 +10,8 @@ int main(int argc, char ** argv) { params.prompt = "The quick brown fox"; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 109dbc023efe0..099b224cf9c5f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2491,14 +2491,11 @@ int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } - // parse arguments from environment variables - 
gpt_params_parse_from_env(params); - // TODO: not great to use extern vars server_log_json = params.log_json; server_verbose = params.verbosity > 0; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 69a92cf7dc0c0..57ce71c1c4ba5 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,9 +6,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); LOG_TEE("\n"); @@ -20,8 +18,8 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 1616edecbbef6..849f5d9983c16 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -27,8 +27,8 @@ struct seq_draft { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } From 753782ae350f3bb00a2e5d19f7dac9a210fb8518 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 16:46:31 +0200 Subject: [PATCH 03/20] add test --- Makefile | 6 +++ common/common.cpp | 79 +++++++++++++++++++++++---------------- common/common.h | 2 + tests/CMakeLists.txt | 1 + tests/test-arg-parser.cpp | 67 +++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 33 deletions(-) create mode 100644 tests/test-arg-parser.cpp diff --git a/Makefile b/Makefile index 332496cfc39c1..9c61d3ec02b24 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,7 @@ BUILD_TARGETS = \ # Binaries only useful for tests TEST_TARGETS = \ + tests/test-arg-parser \ tests/test-autorelease \ tests/test-backend-ops \ tests/test-chat-template \ @@ -1505,6 +1506,11 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift +tests/test-arg-parser: tests/test-arg-parser.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-llama-grammar: tests/test-llama-grammar.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/common/common.cpp b/common/common.cpp index 09e3a992c6a06..ce9199c844254 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -383,8 +383,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto const std::string arg_prefix = "--"; llama_sampling_params & sparams = params.sparams; - std::unordered_map arg_to_options; - for (const auto & opt : options) { + std::unordered_map arg_to_options; + for (auto & opt : options) { for (const auto & arg : opt.args) { arg_to_options[arg] = &opt; } @@ -404,8 +404,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); } + auto opt = *arg_to_options[arg]; try { - auto 
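
Which llama_arg constructor an option is built with decides which handler_* member gets filled in, and that is what the lookup-and-dispatch code in gpt_params_parse_ex keys on: a void handler is called immediately, while a string or int handler makes the parser consume the next argv entry first. A rough sketch with a made-up option that does not exist in this patch:

    // hypothetical "-r N / --retries N" option, shown only to illustrate the handler_int path
    static void register_retries_option(std::vector<llama_arg> & options, int & n_retries) {
        options.push_back(llama_arg(
            {"-r", "--retries"}, "N",                          // value hint shown in the help column
            "number of retries (default: 3)",
            [&n_retries](int value) { n_retries = value; }     // stored in handler_int
        ).set_examples({LLAMA_EXAMPLE_SERVER}));
        // because handler_int is set, the parser reads the value after the flag and converts it
        // to an int, which is why the new test expects "-ngl hello" to fail
    }
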
opt = *arg_to_options[arg]; if (opt.handler_void) { opt.handler_void(); continue; @@ -431,7 +431,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto continue; } } catch (std::exception & e) { - throw std::invalid_argument(format("error: %s", e.what())); + throw std::invalid_argument(format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), arg_to_options[arg]->to_string(false).c_str())); } } @@ -592,39 +595,49 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -void gpt_params_print_usage(std::vector & options) { +std::string llama_arg::to_string(bool markdown) { + // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console - - auto print_options = [](std::vector & options) { - std::string leading_spaces(n_leading_spaces, ' '); - for (const auto & opt : options) { - std::ostringstream ss; - for (const auto & arg : opt->args) { - if (&arg == &opt->args.front()) { - ss << (opt->args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); - } else { - ss << arg << (&arg != &opt->args.back() ? ", " : ""); - } - } - if (!opt->value_hint.empty()) ss << " " << opt->value_hint; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(opt->help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; - } - printf("%s", ss.str().c_str()); + std::string leading_spaces(n_leading_spaces, ' '); + + std::ostringstream ss; + if (markdown) ss << "| `"; + for (const auto & arg : args) { + if (arg == args.front()) { + ss << (args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (!value_hint.empty()) ss << " " << value_hint; + if (!markdown) { + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? 
"" : leading_spaces) << line << "\n"; + } + } else { + ss << "` | " << help << " |"; + } + return ss.str(); +} + +void gpt_params_print_usage(std::vector & options) { + auto print_options = [](std::vector & options) { + for (llama_arg * opt : options) { + printf("%s", opt->to_string(false).c_str()); } }; - std::vector common_options; - std::vector specific_options; + std::vector common_options; + std::vector specific_options; for (auto & opt : options) { if (opt.in_example(LLAMA_EXAMPLE_COMMON)) { common_options.push_back(&opt); @@ -1688,7 +1701,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } )); add_opt(llama_arg( - {"-sm", "--split-mode"}, "SPLIT_MODE", + {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" diff --git a/common/common.h b/common/common.h index 27e908d7f22d6..05211bf972764 100644 --- a/common/common.h +++ b/common/common.h @@ -331,6 +331,8 @@ struct llama_arg { bool in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } + + std::string to_string(bool markdown); }; std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0207e3a5943c9..30e71cfd44c51 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -108,6 +108,7 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) # llama_target_and_test(test-double-float.cpp) # SLOW +llama_target_and_test(test-arg-parser.cpp) llama_target_and_test(test-quantize-fns.cpp) llama_target_and_test(test-quantize-perf.cpp) llama_target_and_test(test-sampling.cpp) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp new file mode 100644 index 0000000000000..8b95a59d39c86 --- /dev/null +++ b/tests/test-arg-parser.cpp @@ -0,0 +1,67 @@ +#include +#include +#include + +#undef NDEBUG +#include + +#include "common.h" + +int main(void) { + gpt_params params; + + printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); + for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { + try { + gpt_params_parser_init(params, (enum llama_example)ex); + } catch (std::exception & e) { + printf("%s\n", e.what()); + assert(false); + } + } + + auto list_str_to_char = [](std::vector & argv) -> std::vector { + std::vector res; + for (auto & arg : argv) { + res.push_back(const_cast(arg.data())); + } + return res; + }; + + std::vector argv; + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + + printf("test-arg-parser: test invalid usage\n\n"); + + argv = {"binary_name", "-m"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + argv = {"binary_name", "-ngl", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + argv = {"binary_name", "-sm", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + + printf("test-arg-parser: test valid usage\n\n"); + + argv = {"binary_name", "-m", "model_file.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "model_file.gguf"); + + argv = {"binary_name", "-t", "1234"}; + assert(true == 
gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.cpuparams.n_threads == 1234); + + argv = {"binary_name", "--verbose"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.verbosity == 1); + + argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "abc.gguf"); + assert(params.n_predict == 6789); + assert(params.n_batch == 9090); + + printf("test-arg-parser: all tests OK\n\n"); +} From 60ae92bd5430640609e45afa62931fcaec08dae1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 19:26:21 +0200 Subject: [PATCH 04/20] handle env --- common/common.cpp | 113 +++++++++++++++++--------------------- common/common.h | 16 ++++++ tests/test-arg-parser.cpp | 24 ++++++++ 3 files changed, 90 insertions(+), 63 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ce9199c844254..49db551ae6339 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -77,41 +77,6 @@ using json = nlohmann::ordered_json; -// -// Environment variable utils -// - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::string(value) : target; -} - -template -static typename std::enable_if::value && std::is_integral::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::stoi(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? 
std::stof(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - if (value) { - std::string val(value); - target = val == "1" || val == "true"; - } -} - // // CPU utils // @@ -390,6 +355,29 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } } + // handle environment variables + for (auto & opt : options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(); + } + if (opt.handler_int) { + opt.handler_int(std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling environment variable \"%s\": %s\n\n", opt.env.c_str(), e.what())); + } + } + } + + // handle command line arguments auto check_arg = [&](int i) { if (i+1 >= argc) { throw std::invalid_argument("expected value for argument"); @@ -405,6 +393,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); } auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env.c_str(), arg.c_str()); + } try { if (opt.handler_void) { opt.handler_void(); @@ -449,10 +440,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto gpt_params_handle_model_default(params); - if (params.hf_token.empty()) { - get_env("HF_TOKEN", params.hf_token); - } - if (params.escape) { string_process_escapes(params.prompt); string_process_escapes(params.input_prefix); @@ -762,7 +749,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example params.cpuparams.n_threads = std::thread::hardware_concurrency(); } } - )); + ).set_env("LLAMA_ARG_THREADS")); add_opt(llama_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", @@ -960,28 +947,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](int value) { params.n_ctx = value; } - )); + ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( {"-n", "--predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [¶ms](int value) { params.n_predict = value; } - )); + ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(llama_arg( {"-b", "--batch-size"}, "N", format("logical maximum batch size (default: %d)", params.n_batch), [¶ms](int value) { params.n_batch = value; } - )); + ).set_env("LLAMA_ARG_BATCH")); add_opt(llama_arg( {"-ub", "--ubatch-size"}, "N", format("physical maximum batch size (default: %d)", params.n_ubatch), [¶ms](int value) { params.n_ubatch = value; } - )); + ).set_env("LLAMA_ARG_UBATCH")); add_opt(llama_arg( {"--keep"}, "N", format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), @@ -1002,7 +989,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.flash_attn = true; } - )); + ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(llama_arg( {"-p", "--prompt"}, "PROMPT", "prompt to start generation with\n", @@ -1599,7 +1586,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { 
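
From this commit on, an option can also be bound to an environment variable: the same handler runs whether the value arrives via argv or via the environment, and an explicit command-line argument wins over the variable (the parser only prints a warning). The -n/--predict registration below is taken from this commit, followed by the kind of invocation it enables:

    // registration as it appears in this commit: bind -n/--predict to LLAMA_ARG_N_PREDICT
    add_opt(llama_arg(
        {"-n", "--predict"}, "N",
        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
        [&params](int value) { params.n_predict = value; }
    ).set_env("LLAMA_ARG_N_PREDICT"));   // set_env() also appends "(env: ...)" to the help text

    // which allows configuration without touching the command line, for example:
    //
    //   LLAMA_ARG_MODEL=model.gguf LLAMA_ARG_N_PREDICT=128 ./llama-server
    //
    // while ./llama-server -n 256 would still override LLAMA_ARG_N_PREDICT, with a warning
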
params.defrag_thold = std::stof(value); } - )); + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(llama_arg( {"-np", "--parallel"}, "N", format("number of parallel sequences to decode (default: %d)", params.n_parallel), @@ -1620,14 +1607,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.cont_batching = true; } - )); + ).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(llama_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", [¶ms]() { params.cont_batching = false; } - )); + ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(llama_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", @@ -1688,7 +1675,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - )); + ).set_env("LLAMA_ARG_N_GPU_LAYERS")); add_opt(llama_arg( {"-ngld", "--gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", @@ -1830,7 +1817,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.model = value; } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", @@ -1844,28 +1831,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.model_url = value; } - )); + ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(llama_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", [¶ms](std::string value) { params.hf_repo = value; } - )); + ).set_env("LLAMA_ARG_HF_REPO")); add_opt(llama_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", [¶ms](std::string value) { params.hf_file = value; } - )); + ).set_env("LLAMA_ARG_HF_FILE")); add_opt(llama_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", [¶ms](std::string value) { params.hf_token = value; } - )); + ).set_env("HF_TOKEN")); add_opt(llama_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", @@ -2012,14 +1999,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.hostname = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(llama_arg( {"--port"}, "PORT", format("port to listen (default: %d)", params.port), [¶ms](int value) { params.port = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(llama_arg( {"--path"}, "PATH", format("path to serve static files from (default: %s)", params.public_path.c_str()), @@ -2028,19 +2015,19 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( - {"--embedding(s)"}, + {"--embedding", "--embeddings"}, format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? 
"enabled" : "disabled"), [¶ms]() { params.embedding = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(llama_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", [¶ms](std::string value) { params.api_keys.push_back(value); } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(llama_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", @@ -2086,7 +2073,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](int value) { params.n_threads_http = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(llama_arg( {"-spf", "--system-prompt-file"}, "FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", @@ -2123,14 +2110,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.endpoint_metrics = true; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(llama_arg( {"--no-slots"}, format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), [¶ms]() { params.endpoint_slots = false; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(llama_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", @@ -2157,7 +2144,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(llama_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), diff --git a/common/common.h b/common/common.h index 05211bf972764..c6f476ec34586 100644 --- a/common/common.h +++ b/common/common.h @@ -316,6 +316,7 @@ struct llama_arg { llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} // support 2 values for arg + // note: env variable is not yet support for 2 values llama_arg(std::vector args, std::string value_hint, std::string value_hint_2, std::string help, std::function handler) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} llama_arg & set_examples(std::set examples) { @@ -324,6 +325,7 @@ struct llama_arg { } llama_arg & set_env(std::string env) { + help = help + "\n(env: " + env + ")"; this->env = std::move(env); return *this; } @@ -332,6 +334,20 @@ struct llama_arg { return examples.find(ex) != examples.end(); } + bool get_value_from_env(std::string & output) { + if (env.empty()) return false; + char * value = std::getenv(env.c_str()); + if (value) { + output = value; + return true; + } + return false; + } + + bool has_value_from_env() { + return std::getenv(env.c_str()); + } + std::string to_string(bool markdown); }; diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 8b95a59d39c86..ff1a626c39761 100644 --- a/tests/test-arg-parser.cpp +++ 
b/tests/test-arg-parser.cpp @@ -63,5 +63,29 @@ int main(void) { assert(params.n_predict == 6789); assert(params.n_batch == 9090); + printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n"); + + setenv("LLAMA_ARG_THREADS", "blah", true); + argv = {"binary_name"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "blah.gguf"); + assert(params.cpuparams.n_threads == 1010); + + + printf("test-arg-parser: test environment variables being overwritten\n\n"); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name", "-m", "overwritten.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "overwritten.gguf"); + assert(params.cpuparams.n_threads == 1010); + + printf("test-arg-parser: all tests OK\n\n"); } From 286dcc9dbef0485435dd34142f781605afe5f1b2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 19:28:06 +0200 Subject: [PATCH 05/20] fix linux build --- common/common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common/common.h b/common/common.h index c6f476ec34586..f849483307d0a 100644 --- a/common/common.h +++ b/common/common.h @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' From 75d0869ef5e79fb8d31b5f120ffe1cacd1fdabf9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 19:59:55 +0200 Subject: [PATCH 06/20] add export-docs example --- .gitignore | 1 + Makefile | 7 +++++ common/common.cpp | 29 +++++++---------- common/common.h | 2 +- examples/export-docs/CMakeLists.txt | 5 +++ examples/export-docs/export-docs.cpp | 47 ++++++++++++++++++++++++++++ 6 files changed, 73 insertions(+), 18 deletions(-) create mode 100644 examples/export-docs/CMakeLists.txt create mode 100644 examples/export-docs/export-docs.cpp diff --git a/.gitignore b/.gitignore index 9986ac6b19d4e..1092d097a7542 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ llama-batched-swift /rpc-server out/ tmp/ +autogen-*.md # Deprecated diff --git a/Makefile b/Makefile index 9c61d3ec02b24..ba3f11c5352bf 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,7 @@ BUILD_TARGETS = \ llama-tokenize \ llama-vdot \ llama-cvector-generator \ + llama-export-docs \ tests/test-c.o # Binaries only useful for tests @@ -1449,6 +1450,12 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ +llama-export-docs: examples/export-docs/export-docs.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + ./llama-export-docs + libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ diff --git a/common/common.cpp b/common/common.cpp index 49db551ae6339..2d99bfc255252 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -425,7 +425,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto throw std::invalid_argument(format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), 
arg_to_options[arg]->to_string(false).c_str())); + arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); } } @@ -582,14 +582,13 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string llama_arg::to_string(bool markdown) { +std::string llama_arg::to_string() { // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - if (markdown) ss << "| `"; for (const auto & arg : args) { if (arg == args.front()) { ss << (args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); @@ -598,20 +597,16 @@ std::string llama_arg::to_string(bool markdown) { } } if (!value_hint.empty()) ss << " " << value_hint; - if (!markdown) { - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; - } + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; } else { - ss << "` | " << help << " |"; + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; } return ss.str(); } @@ -619,7 +614,7 @@ std::string llama_arg::to_string(bool markdown) { void gpt_params_print_usage(std::vector & options) { auto print_options = [](std::vector & options) { for (llama_arg * opt : options) { - printf("%s", opt->to_string(false).c_str()); + printf("%s", opt->to_string().c_str()); } }; diff --git a/common/common.h b/common/common.h index f849483307d0a..7536120fc1588 100644 --- a/common/common.h +++ b/common/common.h @@ -349,7 +349,7 @@ struct llama_arg { return std::getenv(env.c_str()); } - std::string to_string(bool markdown); + std::string to_string(); }; std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); diff --git a/examples/export-docs/CMakeLists.txt b/examples/export-docs/CMakeLists.txt new file mode 100644 index 0000000000000..0e953167ed653 --- /dev/null +++ b/examples/export-docs/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-export-docs) +add_executable(${TARGET} export-docs.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-docs/export-docs.cpp b/examples/export-docs/export-docs.cpp new file mode 100644 index 0000000000000..e21c4b89d53eb --- /dev/null +++ b/examples/export-docs/export-docs.cpp @@ -0,0 +1,47 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include + +// Export usage message (-h) to markdown format + +static void export_md(std::string fname, llama_example ex) { + std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); + + gpt_params params; + auto options = gpt_params_parser_init(params, ex); + + file << "| Argument | Explanation |\n"; + file << "| -------- | ----------- 
|\n"; + for (auto & opt : options) { + file << "| `"; + // args + for (const auto & arg : opt.args) { + if (arg == opt.args.front()) { + file << (opt.args.size() == 1 ? arg : (arg + ", ")); + } else { + file << arg << (arg != opt.args.back() ? ", " : ""); + } + } + // value hint + std::string md_value_hint(opt.value_hint); + string_replace_all(md_value_hint, "|", "\\|"); + file << " " << md_value_hint; + // help text + std::string md_help(opt.help); + string_replace_all(md_help, "\n", "
"); + string_replace_all(md_help, "|", "\\|"); + file << "` | " << md_help << " |\n"; + } +} + +int main(int, char **) { + export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); + export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + + return 0; +} From f5e6a80c3f941d08d9cd8a8c5e3ab9a46c5ffa8a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:00:52 +0200 Subject: [PATCH 07/20] fix build (2) --- common/common.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/common.cpp b/common/common.cpp index 2d99bfc255252..838f59f4e1bfa 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include From 88e3a4f3bc4a4f4f93a9e17c186eac4822579e7c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:20:46 +0200 Subject: [PATCH 08/20] skip build test-arg-parser on windows --- tests/test-arg-parser.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index ff1a626c39761..f3e24e9d8ec10 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -8,6 +8,9 @@ #include "common.h" int main(void) { +#ifdef _WIN32 + printf("test-arg-parser: skip on windows build\n"); +#else gpt_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); @@ -88,4 +91,5 @@ int main(void) { printf("test-arg-parser: all tests OK\n\n"); +#endif // __MINGW32__ } From fe6df473a355acaedec38d34255f79309d3d8b24 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:26:26 +0200 Subject: [PATCH 09/20] update server docs --- examples/server/README.md | 389 +++++++++++++------------------------- 1 file changed, 132 insertions(+), 257 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 805e05b4a5114..6570c64f93093 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -17,262 +17,137 @@ The project is under active development, and we are [looking for feedback and co ## Usage -``` -usage: ./llama-server [options] - -general: - - -h, --help, --usage print usage and exit - --version show version and build info - -v, --verbose print verbose information - --verbosity N set specific verbosity level (default: 0) - --verbose-prompt print a verbose prompt before generation (default: false) - --no-display-prompt don't print prompt at generation (default: false) - -co, --color colorise output to distinguish prompt and user input from generations (default: false) - -s, --seed SEED RNG seed (default: -1, use random seed for < 0) - -t, --threads N number of threads to use during generation (default: 8) - -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) - -td, --threads-draft N number of threads to use during generation (default: same as --threads) - -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) - --draft N number of tokens to draft for speculative decoding (default: 5) - -ps, --p-split N speculative decoding split probability (default: 0.1) - -lcs, --lookup-cache-static FNAME - path to static lookup cache to use for lookup decoding (not updated by generation) - -lcd, --lookup-cache-dynamic FNAME - path to dynamic lookup cache to use for lookup decoding (updated by generation) - -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) - -n, --predict N number of tokens 
to predict (default: -1, -1 = infinity, -2 = until context filled) - -b, --batch-size N logical maximum batch size (default: 2048) - -ub, --ubatch-size N physical maximum batch size (default: 512) - --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all) - --chunks N max number of chunks to process (default: -1, -1 = all) - -fa, --flash-attn enable Flash Attention (default: disabled) - -p, --prompt PROMPT prompt to start generation with - in conversation mode, this will be used as system prompt - (default: '') - -f, --file FNAME a file containing the prompt (default: none) - --in-file FNAME an input file (repeat to specify multiple files) - -bf, --binary-file FNAME binary file containing the prompt (default: none) - -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) - --no-escape do not process escape sequences - -ptc, --print-token-count N print token count every N tokens (default: -1) - --prompt-cache FNAME file to cache prompt state for faster startup (default: none) - --prompt-cache-all if specified, saves user input and generations to cache as well - not supported with --interactive or other interactive options - --prompt-cache-ro if specified, uses the prompt cache but does not update it - -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode - can be specified more than once for multiple prompts - -sp, --special special tokens output enabled (default: false) - -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix - if suffix/prefix are not specified, default chat template will be used - (default: false) - -i, --interactive run in interactive mode (default: false) - -if, --interactive-first run in interactive mode and wait for input right away (default: false) - -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\' - --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string - --in-prefix STRING string to prefix user inputs with (default: empty) - --in-suffix STRING string to suffix after user inputs with (default: empty) - --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. 
(default: disabled) - -sampling: - - --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';' - (default: top_k;tfs_z;typical_p;top_p;min_p;temperature) - --sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt) - --ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf) - --penalize-nl penalize newline tokens (default: false) - --temp N temperature (default: 0.8) - --top-k N top-k sampling (default: 40, 0 = disabled) - --top-p N top-p sampling (default: 0.9, 1.0 = disabled) - --min-p N min-p sampling (default: 0.1, 0.0 = disabled) - --tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled) - --typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) - --repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) - --repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) - --presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) - --frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) - --dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) - --dynatemp-exp N dynamic temperature exponent (default: 1.0) - --mirostat N use Mirostat sampling. - Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. - (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) - --mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) - --mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) - -l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, - i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', - or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' - --cfg-negative-prompt PROMPT - negative prompt to use for guidance (default: '') - --cfg-negative-prompt-file FNAME - negative prompt file to use for guidance - --cfg-scale N strength of guidance (default: 1.0, 1.0 = disable) - --chat-template JINJA_TEMPLATE - set custom jinja chat template (default: template taken from model's metadata) - if suffix/prefix are specified, template will be disabled - only commonly used templates are accepted: - https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template - -grammar: - - --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') - --grammar-file FNAME file to read grammar from - -j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object - For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead - -embedding: - - --pooling {none,mean,cls,last} - pooling type for embeddings, use model default if unspecified - --attention {causal,non-causal} - attention type for embeddings, use model default if unspecified - -context hacking: - - --rope-scaling {none,linear,yarn} - RoPE frequency scaling method, defaults to linear unless specified by the model - --rope-scale N RoPE context scaling factor, expands context by a factor of N - --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model) - --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N - --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size) - --yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) - --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0) - --yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0) - --yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0) - -gan, --grp-attn-n N group-attention factor (default: 1) - -gaw, --grp-attn-w N group-attention width (default: 512.0) - -dkvc, --dump-kv-cache verbose print of the KV cache - -nkvo, --no-kv-offload disable KV offload - -ctk, --cache-type-k TYPE KV cache data type for K (default: f16) - -ctv, --cache-type-v TYPE KV cache data type for V (default: f16) - -perplexity: - - --all-logits return logits for all tokens in the batch (default: false) - --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f - --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400) - --winogrande compute Winogrande score over random tasks from datafile supplied with -f - --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0) - --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f - --multiple-choice-tasks N - number of tasks to use when computing the multiple choice score (default: 0) - --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base - --ppl-stride N stride for perplexity calculation (default: 0) - --ppl-output-type {0,1} output type for perplexity calculation (default: 0) - -parallel: - - -dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled) - -np, --parallel N number of parallel sequences to decode (default: 1) - -ns, --sequences N number of sequences to decode (default: 1) - -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) - -multi-modality: - - --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md - --image FILE path to an image file. use with multimodal models. 
Specify multiple times for batching - -backend: - - --rpc SERVERS comma separated list of RPC servers - --mlock force system to keep model in RAM rather than swapping or compressing - --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock) - --numa TYPE attempt optimizations that help on some NUMA systems - - distribute: spread execution evenly over all nodes - - isolate: only spawn threads on CPUs on the node that execution started on - - numactl: use the CPU map provided by numactl - if run without this previously, it is recommended to drop the system page cache before using this - see https://github.com/ggerganov/llama.cpp/issues/1437 - -model: - - --check-tensors check model tensor data for invalid values (default: false) - --override-kv KEY=TYPE:VALUE - advanced option to override model metadata by key. may be specified multiple times. - types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false - --lora FNAME apply LoRA adapter (implies --no-mmap) - --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap) - --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter - --control-vector FNAME add a control vector - note: this argument can be repeated to add multiple control vectors - --control-vector-scaled FNAME SCALE - add a control vector with user defined scaling SCALE - note: this argument can be repeated to add multiple scaled control vectors - --control-vector-layer-range START END - layer range to apply the control vector(s) to, start and end inclusive - -m, --model FNAME model path (default: models/$filename with filename from --hf-file - or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) - -md, --model-draft FNAME draft model for speculative decoding (default: unused) - -mu, --model-url MODEL_URL model download url (default: unused) - -hfr, --hf-repo REPO Hugging Face model repository (default: unused) - -hff, --hf-file FILE Hugging Face model file (default: unused) - -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) - -server: - - --host HOST ip address to listen (default: 127.0.0.1) - --port PORT port to listen (default: 8080) - --path PATH path to serve static files from (default: ) - --embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled) - --api-key KEY API key to use for authentication (default: none) - --api-key-file FNAME path to file containing API keys (default: none) - --ssl-key-file FNAME path to file a PEM-encoded SSL private key - --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate - --timeout N server read/write timeout in seconds (default: 600) - --threads-http N number of threads used to process HTTP requests (default: -1) - --system-prompt-file FNAME - set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications - --log-format {text,json} - log output format: json or text (default: json) - --metrics enable prometheus compatible metrics endpoint (default: disabled) - --no-slots disables slots monitoring endpoint (default: enabled) - --slot-save-path PATH path to save slot kv cache (default: disabled) - --chat-template JINJA_TEMPLATE - set custom jinja chat template (default: template taken from model's metadata) - only commonly used templates are accepted: - 
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template - -sps, --slot-prompt-similarity SIMILARITY - how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) - --lora-init-without-apply - load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) - -logging: - - --simple-io use basic IO for better compatibility in subprocesses and limited consoles - -ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset) - --log-test Run simple logging test - --log-disable Disable trace logs - --log-enable Enable trace logs - --log-file FNAME Specify a log filename (without extension) - --log-new Create a separate new log file on start. Each log file will have unique name: "..log" - --log-append Don't truncate the old log file. -``` - -Available environment variables (if specified, these variables will override parameters specified in arguments): - -- `LLAMA_CACHE`: cache directory, used by `--hf-repo` -- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo` -- `LLAMA_ARG_MODEL`: equivalent to `-m` -- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu` -- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a` -- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo` -- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file` -- `LLAMA_ARG_THREADS`: equivalent to `-t` -- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c` -- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np` -- `LLAMA_ARG_BATCH`: equivalent to `-b` -- `LLAMA_ARG_UBATCH`: equivalent to `-ub` -- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl` -- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http` -- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template` -- `LLAMA_ARG_N_PREDICT`: equivalent to `-n` -- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`) -- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default. -- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`) -- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`) -- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default. -- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt` -- `LLAMA_ARG_HOST`: equivalent to `--host` -- `LLAMA_ARG_PORT`: equivalent to `--port` +| Argument | Explanation | +| -------- | ----------- | +| `-h, --help, --usage ` | print usage and exit | +| `--version ` | show version and build info | +| `-v, --verbose ` | print verbose information | +| `--verbosity N` | set specific verbosity level (default: 0) | +| `--verbose-prompt ` | print a verbose prompt before generation (default: false) | +| `--no-display-prompt ` | don't print prompt at generation (default: false) | +| `-co, --color ` | colorise output to distinguish prompt and user input from generations (default: false) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | +| `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | +| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | +| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | +| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)
| +| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)
| +| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | +| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | +| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll | +| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | +| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) |
+| `-n, --predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
+| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
+| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
+| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
+| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
+| `-fa, --flash-attn ` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
+| `-p, --prompt PROMPT` | prompt to start generation with
| +| `-f, --file FNAME` | a file containing the prompt (default: none) | +| `--in-file FNAME` | an input file (repeat to specify multiple files) | +| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-e, --escape ` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--no-escape ` | do not process escape sequences | +| `--spm-infill ` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) | +| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | +| `--ignore-eos ` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--penalize-nl ` | penalize newline tokens (default: false) | +| `--temp N` | temperature (default: 0.8) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | +| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
+| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
+| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
+| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
+| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
+| `--grammar-file FNAME` | file to read grammar from |
+| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model | +| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N | +| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) | +| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N | +| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) | +| `-gan, --grp-attn-n N` | group-attention factor (default: 1) | +| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) | +| `-dkvc, --dump-kv-cache ` | verbose print of the KV cache | +| `-nkvo, --no-kv-offload ` | disable KV offload | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | +| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1) | +| `-ns, --sequences N` | number of sequences to decode (default: 1) | +| `-cb, --cont-batching ` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-nocb, --no-cont-batching ` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | +| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md | +| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching | +| `--rpc SERVERS` | comma separated list of RPC servers | +| `--mlock ` | force system to keep model in RAM rather than swapping or compressing | +| `--no-mmap ` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
+| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
+| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | +| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | +| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | +| `--check-tensors ` | check model tensor data for invalid values (default: false) | +| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | +| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | +| `--lora-scaled FNAME` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | +| `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | +| `--control-vector-scaled FNAME` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | +| `--control-vector-layer-range START` | layer range to apply the control vector(s) to, start and end inclusive | +| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | +| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) |
+| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
+| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
+| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
+| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
+| `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | +| `--path PATH` | path to serve static files from (default: ) | +| `--embedding, --embeddings ` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | +| `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | +| `--api-key-file FNAME` | path to file containing API keys (default: none) | +| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key | +| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate | +| `--timeout N` | server read/write timeout in seconds (default: 600) | +| `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | +| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | +| `--log-format {text, json}` | log output format: json or text (default: json) | +| `--metrics ` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) |
+| `--no-slots ` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| +| `--lora-init-without-apply ` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | +| `--simple-io ` | use basic IO for better compatibility in subprocesses and limited consoles | +| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | +| `--log-test ` | Log test | +| `--log-disable ` | Log disable | +| `--log-enable ` | Log enable | +| `--log-new ` | Log new | +| `--log-append ` | Log append | +| `--log-file FNAME` | Log file | + +Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Example usage of docker compose with environment variables: @@ -289,7 +164,7 @@ services: LLAMA_ARG_MODEL: /models/my_model.gguf LLAMA_ARG_CTX_SIZE: 4096 LLAMA_ARG_N_PARALLEL: 2 - LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0 + LLAMA_ARG_ENDPOINT_METRICS: 1 LLAMA_ARG_PORT: 8080 ``` From b1657cb934d95ee28c3bd5667e48d46ce1c7da91 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 20:58:10 +0200 Subject: [PATCH 10/20] bring back missing --alias --- common/common.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 534cbe35ce4f8..9e959b02bb037 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -650,7 +650,6 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } sampler_type_names.pop_back(); - const char split_delim = ','; /** @@ -1804,6 +1803,13 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example params.control_vector_layer_end = std::stoi(end); } )); + add_opt(llama_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [¶ms](std::string value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA @@ -1950,7 +1956,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"-npp"}, "n0,n1,...", "number of prompt tokens", [¶ms](std::string value) { - auto p = string_split(value, split_delim); + auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); @@ -1958,7 +1964,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"-ntg"}, "n0,n1,...", "number of text generation tokens", [¶ms](std::string value) { - auto p = string_split(value, split_delim); + auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); @@ -1966,7 +1972,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"-npl"}, "n0,n1,...", "number of parallel prompts", [¶ms](std::string value) { - auto p = string_split(value, split_delim); + auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); From 509ec08e5751bc9c51160f5b731ea94ddd4906ee Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Sep 2024 21:03:50 +0200 Subject: [PATCH 11/20] bring back --n-predict --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 9e959b02bb037..d04fc5f7ff5ca 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -944,7 
+944,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( - {"-n", "--predict"}, "N", + {"-n", "--predict", "--n-predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [¶ms](int value) { params.n_predict = value; From d545ffcb6deac469a0b4af0be69bf9b47997181e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 09:39:08 +0200 Subject: [PATCH 12/20] clarify test-arg-parser --- tests/test-arg-parser.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index f3e24e9d8ec10..2f3cf815d2a2c 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -8,9 +8,6 @@ #include "common.h" int main(void) { -#ifdef _WIN32 - printf("test-arg-parser: skip on windows build\n"); -#else gpt_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); @@ -66,6 +63,10 @@ int main(void) { assert(params.n_predict == 6789); assert(params.n_batch == 9090); +// skip this part on windows, because setenv is not supported +#ifdef _WIN32 + printf("test-arg-parser: skip on windows build\n"); +#else printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n"); setenv("LLAMA_ARG_THREADS", "blah", true); @@ -91,5 +92,5 @@ int main(void) { printf("test-arg-parser: all tests OK\n\n"); -#endif // __MINGW32__ +#endif // _WIN32 } From 79ce128d2a51e847f502204b519de14e3840a9ee Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 09:41:04 +0200 Subject: [PATCH 13/20] small correction --- tests/test-arg-parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 2f3cf815d2a2c..8852bfc7e63b6 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -89,8 +89,8 @@ int main(void) { assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); assert(params.model == "overwritten.gguf"); assert(params.cpuparams.n_threads == 1010); +#endif // _WIN32 printf("test-arg-parser: all tests OK\n\n"); -#endif // _WIN32 } From 961bd19da102c8dec63a23acc01976bb84ed2565 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 13:42:20 +0200 Subject: [PATCH 14/20] add comments --- common/common.cpp | 22 +++++++++++++------- common/common.h | 52 +++++++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d04fc5f7ff5ca..d8d2caac31ec3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -456,6 +456,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto params.kv_overrides.back().key[0] = 0; } + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + sparams.seed = params.seed; + } + return true; } @@ -468,7 +473,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { +void gpt_params_print_usage(gpt_params & params, std::vector & options) { auto print_options = [](std::vector & options) { for (llama_arg * opt : options) { printf("%s", opt->to_string().c_str()); @@ -622,14 +627,16 @@ void gpt_params_print_usage(std::vector & options) { std::vector common_options; std::vector specific_options; for (auto & opt : options) { - if (opt.in_example(LLAMA_EXAMPLE_COMMON)) { - common_options.push_back(&opt); 
- } else { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.in_example(params.curr_ex)) { specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); } } printf("----- common options -----\n\n"); print_options(common_options); + // TODO: maybe convert enum llama_example to string printf("\n\n----- example-specific options -----\n\n"); print_options(specific_options); } @@ -641,6 +648,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { std::vector options; params.print_usage = print_usage; + params.curr_ex = ex; llama_sampling_params & sparams = params.sparams; std::string sampler_type_chars; @@ -1772,14 +1780,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } - )); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", [¶ms](std::string fname, std::string scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } - )); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", diff --git a/common/common.h b/common/common.h index 7536120fc1588..8f5e3a96ab06d 100644 --- a/common/common.h +++ b/common/common.h @@ -63,6 +63,24 @@ int32_t cpu_get_num_math(); // CLI argument parsing // +enum llama_example { + LLAMA_EXAMPLE_COMMON, + LLAMA_EXAMPLE_SPECULATIVE, + LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_INFILL, + LLAMA_EXAMPLE_EMBEDDING, + LLAMA_EXAMPLE_PERPLEXITY, + LLAMA_EXAMPLE_RETRIEVAL, + LLAMA_EXAMPLE_PASSKEY, + LLAMA_EXAMPLE_IMATRIX, + LLAMA_EXAMPLE_BENCH, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_CVECTOR_GENERATOR, + LLAMA_EXAMPLE_EXPORT_LORA, + + LLAMA_EXAMPLE_COUNT, +}; + // dimensionality reduction methods, used by cvector-generator enum dimre_method { DIMRE_METHOD_PCA, @@ -79,6 +97,7 @@ struct cpu_params { }; struct gpt_params { + enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON; uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed int32_t n_predict = -1; // new tokens to predict @@ -125,7 +144,7 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = "model.gguf"; // model path + std::string model = ""; // model path std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string model_url = ""; // model url to download @@ -280,24 +299,6 @@ struct gpt_params { std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; -enum llama_example { - LLAMA_EXAMPLE_COMMON, - LLAMA_EXAMPLE_SPECULATIVE, - LLAMA_EXAMPLE_MAIN, - LLAMA_EXAMPLE_INFILL, - LLAMA_EXAMPLE_EMBEDDING, - LLAMA_EXAMPLE_PERPLEXITY, - LLAMA_EXAMPLE_RETRIEVAL, - LLAMA_EXAMPLE_PASSKEY, - LLAMA_EXAMPLE_IMATRIX, - LLAMA_EXAMPLE_BENCH, - LLAMA_EXAMPLE_SERVER, - LLAMA_EXAMPLE_CVECTOR_GENERATOR, - LLAMA_EXAMPLE_EXPORT_LORA, - - LLAMA_EXAMPLE_COUNT, -}; - struct llama_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; std::vector args; @@ -352,11 +353,18 @@ struct llama_arg { std::string to_string(); }; +// initialize list of options (arguments) that can be used by the current 
example std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); +// optionally, we can provide "print_usage" to print example usage std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); -bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); -bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params, std::vector & options); -void gpt_params_print_usage(std::vector & options); + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options); + +// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set +void gpt_params_print_usage(gpt_params & params, std::vector & options); void gpt_params_handle_model_default(gpt_params & params); From 53244f9c58883b78534ce867ab2fdee8a52fd641 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 13:47:10 +0200 Subject: [PATCH 15/20] fix args with 2 values --- common/common.cpp | 3 +- examples/export-docs/export-docs.cpp | 13 ++++-- examples/server/README.md | 70 ++++++++++++++-------------- 3 files changed, 47 insertions(+), 39 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d8d2caac31ec3..526fff05782bf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -603,6 +603,7 @@ std::string llama_arg::to_string() { } } if (!value_hint.empty()) ss << " " << value_hint; + if (!value_hint_2.empty()) ss << " " << value_hint_2; if (ss.tellp() > n_leading_spaces - 3) { // current line is too long, add new line ss << "\n" << leading_spaces; @@ -850,7 +851,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example )); add_opt(llama_arg( {"--poll-batch"}, "<0|1>", - "use polling to wait for work (default: same as --poll", + "use polling to wait for work (default: same as --poll)", [¶ms](int value) { params.cpuparams_batch.poll = value; } diff --git a/examples/export-docs/export-docs.cpp b/examples/export-docs/export-docs.cpp index e21c4b89d53eb..86c041a811d12 100644 --- a/examples/export-docs/export-docs.cpp +++ b/examples/export-docs/export-docs.cpp @@ -28,9 +28,16 @@ static void export_md(std::string fname, llama_example ex) { } } // value hint - std::string md_value_hint(opt.value_hint); - string_replace_all(md_value_hint, "|", "\\|"); - file << " " << md_value_hint; + if (!opt.value_hint.empty()) { + std::string md_value_hint(opt.value_hint); + string_replace_all(md_value_hint, "|", "\\|"); + file << " " << md_value_hint; + } + if (!opt.value_hint_2.empty()) { + std::string md_value_hint_2(opt.value_hint_2); + string_replace_all(md_value_hint_2, "|", "\\|"); + file << " " << md_value_hint_2; + } // help text std::string md_help(opt.help); string_replace_all(md_help, "\n", "
"); diff --git a/examples/server/README.md b/examples/server/README.md index 6570c64f93093..62250fd8df672 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -19,13 +19,13 @@ The project is under active development, and we are [looking for feedback and co | Argument | Explanation | | -------- | ----------- | -| `-h, --help, --usage ` | print usage and exit | -| `--version ` | show version and build info | -| `-v, --verbose ` | print verbose information | +| `-h, --help, --usage` | print usage and exit | +| `--version` | show version and build info | +| `-v, --verbose` | print verbose information | | `--verbosity N` | set specific verbosity level (default: 0) | -| `--verbose-prompt ` | print a verbose prompt before generation (default: false) | -| `--no-display-prompt ` | don't print prompt at generation (default: false) | -| `-co, --color ` | colorise output to distinguish prompt and user input from generations (default: false) | +| `--verbose-prompt` | print a verbose prompt before generation (default: false) | +| `--no-display-prompt` | don't print prompt at generation (default: false) | +| `-co, --color` | colorise output to distinguish prompt and user input from generations (default: false) | | `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | | `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | @@ -36,27 +36,27 @@ The project is under active development, and we are [looking for feedback and co | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | -| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | -| `-n, --predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | +| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | | `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--chunks N` | max number of chunks to process (default: -1, -1 = all) | -| `-fa, --flash-attn ` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | +| `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | | `-p, --prompt PROMPT` | prompt to start generation with
| | `-f, --file FNAME` | a file containing the prompt (default: none) | | `--in-file FNAME` | an input file (repeat to specify multiple files) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | -| `-e, --escape ` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | -| `--no-escape ` | do not process escape sequences | -| `--spm-infill ` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | +| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--no-escape` | do not process escape sequences | +| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | -| `--ignore-eos ` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--penalize-nl ` | penalize newline tokens (default: false) | +| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--penalize-nl` | penalize newline tokens (default: false) | | `--temp N` | temperature (default: 0.8) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | @@ -87,33 +87,33 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) | | `-gan, --grp-attn-n N` | group-attention factor (default: 1) | | `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) | -| `-dkvc, --dump-kv-cache ` | verbose print of the KV cache | -| `-nkvo, --no-kv-offload ` | disable KV offload | +| `-dkvc, --dump-kv-cache` | verbose print of the KV cache | +| `-nkvo, --no-kv-offload` | disable KV offload | | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1) | | `-ns, --sequences N` | number of sequences to decode (default: 1) | -| `-cb, --cont-batching ` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | -| `-nocb, --no-cont-batching ` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | +| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | | `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md | | `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching | -| `--rpc SERVERS` | comma separated list of RPC servers | -| `--mlock ` | force system to keep model in RAM rather than swapping or compressing | -| `--no-mmap ` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | +| `--mlock` | force system to keep model in RAM rather than swapping or compressing | +| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437 | | `-ngl, --gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | | `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | -| `--check-tensors ` | check model tensor data for invalid values (default: false) | +| `--check-tensors` | check model tensor data for invalid values (default: false) | | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | -| `--lora-scaled FNAME` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | +| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | -| `--control-vector-scaled FNAME` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | -| `--control-vector-layer-range START` | layer range to apply the control vector(s) to, start and end inclusive | +| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | +| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | +| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_MODEL) | | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | @@ -123,7 +123,7 @@ The project is under active development, and we are [looking for feedback and co | `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: ) | -| `--embedding, --embeddings ` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | +| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | | `--api-key-file FNAME` | path to file containing API keys (default: none) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key | @@ -132,19 +132,19 @@ The project is under active development, and we are [looking for feedback and co | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | | `--log-format {text, json}` | log output format: json or text (default: json) | -| `--metrics ` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | -| `--no-slots ` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | +| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | +| `--no-slots` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| -| `--lora-init-without-apply ` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--simple-io ` | use basic IO for better compatibility in subprocesses and limited consoles | +| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | +| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | -| `--log-test ` | Log test | -| `--log-disable ` | Log disable | -| `--log-enable ` | Log enable | -| `--log-new ` | Log new | -| `--log-append ` | Log append | +| `--log-test` | Log test | +| `--log-disable` | Log disable | +| `--log-enable` | Log enable | +| `--log-new` | Log new | +| `--log-append` | Log append | | `--log-file FNAME` | Log file | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. From e1281d0d7ae4736fc6d6ec2964f885eebb39a452 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Sep 2024 14:05:51 +0200 Subject: [PATCH 16/20] refine example-specific args --- common/common.cpp | 28 ++++++++++++++++++---------- common/common.h | 1 + examples/llava/llava-cli.cpp | 2 +- examples/main/main.cpp | 9 ++++++++- examples/server/README.md | 8 +------- 5 files changed, 29 insertions(+), 19 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 526fff05782bf..d28f918ef6dda 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -720,21 +720,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.verbose_prompt = true; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--no-display-prompt"}, format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), [¶ms]() { params.display_prompt = false; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-co", "--color"}, format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [¶ms]() { params.use_color = true; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-s", "--seed"}, "SEED", format("RNG seed (default: %d, use random seed for < 0)", params.seed), @@ -996,7 +996,9 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(llama_arg( {"-p", "--prompt"}, "PROMPT", - "prompt to start generation with\n", + ex == LLAMA_EXAMPLE_MAIN + ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" + : "prompt to start generation with", [¶ms](std::string value) { params.prompt = value; } @@ -1102,7 +1104,13 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-cnv", "--conversation"}, - "run in conversation mode, does not print special tokens and suffix/prefix\n", + format( + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: %s)", + params.conversation ? 
"true" : "false" + ), [¶ms]() { params.conversation = true; } @@ -1625,14 +1633,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.mmproj = value; } - )); + ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(llama_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", [¶ms](std::string value) { params.image.emplace_back(value); } - )); + ).set_examples({LLAMA_EXAMPLE_LLAVA})); #ifdef GGML_USE_RPC add_opt(llama_arg( {"--rpc"}, "SERVERS", @@ -1692,7 +1700,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - )); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" @@ -1837,7 +1845,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms](std::string value) { params.model_draft = value; } - )); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", @@ -2178,7 +2186,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example [¶ms]() { params.simple_io = true; } - )); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", diff --git a/common/common.h b/common/common.h index 8f5e3a96ab06d..a8aa6fe144a30 100644 --- a/common/common.h +++ b/common/common.h @@ -77,6 +77,7 @@ enum llama_example { LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, + LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_COUNT, }; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 4dd17cf68ab1c..8a64fe1bbdc8b 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -278,7 +278,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage); if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 058a6da142b0d..c434ff608b06b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -40,6 +40,13 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + printf("\n"); +} + static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); @@ -131,7 +138,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | @@ -46,7 +45,7 @@ The project is under active development, and we are [looking for feedback and co | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--chunks N` | max number of chunks to process (default: -1, -1 = all) | | `-fa, 
--flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | -| `-p, --prompt PROMPT` | prompt to start generation with
| +| `-p, --prompt PROMPT` | prompt to start generation with | | `-f, --file FNAME` | a file containing the prompt (default: none) | | `--in-file FNAME` | an input file (repeat to specify multiple files) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | @@ -96,13 +95,10 @@ The project is under active development, and we are [looking for feedback and co | `-ns, --sequences N` | number of sequences to decode (default: 1) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | -| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md | -| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching | | `--mlock` | force system to keep model in RAM rather than swapping or compressing | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437 | | `-ngl, --gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | -| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | @@ -115,7 +111,6 @@ The project is under active development, and we are [looking for feedback and co | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | | `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_MODEL) | | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | -| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)
(env: LLAMA_ARG_HF_FILE) | @@ -138,7 +133,6 @@ The project is under active development, and we are [looking for feedback and co | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | | `--log-test` | Log test | | `--log-disable` | Log disable | From ceddafa0e152d6213413773550d27a51ff7caabd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 18:19:41 +0200 Subject: [PATCH 17/20] no more lamba capture Co-authored-by: slaren@users.noreply.github.com --- common/common.cpp | 398 +++++++++++++++++++++++----------------------- common/common.h | 47 ++++-- 2 files changed, 232 insertions(+), 213 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3694c12762a0c..012dd1adc98ef 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -362,13 +362,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto if (opt.get_value_from_env(value)) { try { if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(); + opt.handler_void(params, sparams); } if (opt.handler_int) { - opt.handler_int(std::stoi(value)); + opt.handler_int(params, sparams, std::stoi(value)); } if (opt.handler_string) { - opt.handler_string(value); + opt.handler_string(params, sparams, value); continue; } } catch (std::exception & e) { @@ -399,7 +399,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } try { if (opt.handler_void) { - opt.handler_void(); + opt.handler_void(params, sparams); continue; } @@ -407,11 +407,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val = argv[++i]; if (opt.handler_int) { - opt.handler_int(std::stoi(val)); + opt.handler_int(params, sparams, std::stoi(val)); continue; } if (opt.handler_string) { - opt.handler_string(val); + opt.handler_string(params, sparams, val); continue; } @@ -419,7 +419,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val2 = argv[++i]; if (opt.handler_str_str) { - opt.handler_str_str(val, val2); + opt.handler_str_str(params, sparams, val, val2); continue; } } catch (std::exception & e) { @@ -687,14 +687,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.usage = true; } )); add_opt(llama_arg( {"--version"}, "show version and build info", - []() { + [](gpt_params & params, llama_sampling_params & sparams) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); @@ -703,42 +703,42 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-v", "--verbose"}, "print verbose information", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.verbosity = 1; } )); add_opt(llama_arg( {"--verbosity"}, "N", format("set specific verbosity level (default: %d)", params.verbosity), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.verbosity = value; } )); add_opt(llama_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.verbose_prompt = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--no-display-prompt"}, format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-co", "--color"}, format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-s", "--seed"}, "SEED", format("RNG seed (default: %d, use random seed for < 0)", params.seed), - [&sparams, ¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. params.seed = std::stoul(value); sparams.seed = std::stoul(value); @@ -747,7 +747,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { params.cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -757,7 +757,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -767,7 +767,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams.n_threads = value; if (params.draft_cpuparams.n_threads <= 0) { params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -777,7 +777,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams_batch.n_threads = value; if (params.draft_cpuparams_batch.n_threads <= 0) { params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -787,7 +787,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. 
Complements cpu-range (default: \"\")", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string mask = value; params.cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { @@ -798,7 +798,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. Complements --cpu-mask", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams.cpumask)) { @@ -809,21 +809,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict"}, "<0|1>", format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(llama_arg( {"--poll"}, "<0...100>", format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cpuparams.poll = std::stoul(value); } )); add_opt(llama_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string mask = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { @@ -834,7 +834,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { @@ -845,21 +845,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams_batch.strict_cpu = value; } )); add_opt(llama_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.cpuparams_batch.poll = value; } )); add_opt(llama_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string mask = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { @@ -870,7 +870,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { @@ -881,21 +881,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string range = value; params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { @@ -906,91 +906,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.draft_cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--draft"}, "N", format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-ps", "--p-split"}, "N", format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.lookup_cache_static = value; } )); add_opt(llama_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.lookup_cache_dynamic = value; } )); add_opt(llama_arg( {"-c", "--ctx-size"}, "N", format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, 
int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( {"-n", "--predict", "--n-predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(llama_arg( {"-b", "--batch-size"}, "N", format("logical maximum batch size (default: %d)", params.n_batch), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(llama_arg( {"-ub", "--ubatch-size"}, "N", format("physical maximum batch size (default: %d)", params.n_ubatch), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(llama_arg( {"--keep"}, "N", format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_keep = value; } )); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_chunks = value; } )); add_opt(llama_arg( {"-fa", "--flash-attn"}, format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); @@ -999,14 +999,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ex == LLAMA_EXAMPLE_MAIN ? 
"prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.prompt = value; } )); add_opt(llama_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1022,7 +1022,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1033,7 +1033,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1049,56 +1049,56 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-e", "--escape"}, format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.escape = true; } )); add_opt(llama_arg( {"--no-escape"}, "do not process escape sequences", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.escape = false; } )); add_opt(llama_arg( {"-ptc", "--print-token-count"}, "N", format("print token count every N tokens (default: %d)", params.n_print), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-sp", "--special"}, format("special tokens output enabled (default: %s)", params.special ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1111,35 +1111,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "(default: %s)", params.conversation ? "true" : "false" ), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-i", "--interactive"}, format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-if", "--interactive-first"}, format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.input_prefix_bos = true; params.enable_chat_template = false; } @@ -1147,7 +1147,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.input_prefix = value; params.enable_chat_template = false; } @@ -1155,7 +1155,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.input_suffix = value; params.enable_chat_template = false; } @@ -1163,7 +1163,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1173,14 +1173,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--samplers"}, "SAMPLERS", format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { const auto sampler_names = string_split(value, ';'); sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); } @@ -1188,28 +1188,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--sampling-seq"}, "SEQUENCE", format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.samplers_sequence = llama_sampling_types_from_chars(value); } )); add_opt(llama_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.ignore_eos = true; } )); add_opt(llama_arg( {"--penalize-nl"}, format("penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false"), - [&sparams]() { + [](gpt_params & params, llama_sampling_params & sparams) { sparams.penalize_nl = true; } )); add_opt(llama_arg( {"--temp"}, "N", format("temperature (default: %.1f)", (double)sparams.temp), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.temp = std::stof(value); sparams.temp = std::max(sparams.temp, 0.0f); } @@ -1217,42 +1217,42 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--top-k"}, "N", format("top-k sampling (default: %d, 0 = disabled)", sparams.top_k), - [&sparams](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { sparams.top_k = value; } )); add_opt(llama_arg( {"--top-p"}, "N", format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.top_p = std::stof(value); } )); add_opt(llama_arg( {"--min-p"}, "N", format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.min_p = std::stof(value); } )); add_opt(llama_arg( {"--tfs"}, "N", format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.tfs_z = std::stof(value); } )); add_opt(llama_arg( {"--typical"}, "N", format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.typical_p = std::stof(value); } )); add_opt(llama_arg( {"--repeat-last-n"}, "N", format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n), - [&sparams](int value) { + [](gpt_params & params, 
llama_sampling_params & sparams, int value) { sparams.penalty_last_n = value; sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); } @@ -1260,35 +1260,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--repeat-penalty"}, "N", format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.penalty_repeat = std::stof(value); } )); add_opt(llama_arg( {"--presence-penalty"}, "N", format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.penalty_present = std::stof(value); } )); add_opt(llama_arg( {"--frequency-penalty"}, "N", format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.penalty_freq = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-range"}, "N", format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.dynatemp_range = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-exp"}, "N", format("dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.dynatemp_exponent = std::stof(value); } )); @@ -1296,21 +1296,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--mirostat"}, "N", format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat), - [&sparams](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { sparams.mirostat = value; } )); add_opt(llama_arg( {"--mirostat-lr"}, "N", format("Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.mirostat_eta = std::stof(value); } )); add_opt(llama_arg( {"--mirostat-ent"}, "N", format("Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.mirostat_tau = std::stof(value); } )); @@ -1319,7 +1319,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "modifies the likelihood of token appearing in the completion,\n" "i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; @@ -1338,14 +1338,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cfg-negative-prompt"}, "PROMPT", format("negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.cfg_negative_prompt = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--cfg-negative-prompt-file"}, "FNAME", "negative prompt file to use for guidance", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1359,21 +1359,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cfg-scale"}, "N", format("strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.cfg_scale = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--grammar"}, "GRAMMAR", format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str()), - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.grammar = value; } )); add_opt(llama_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1388,14 +1388,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [&sparams](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { sparams.grammar = json_schema_to_grammar(json::parse(value)); } )); add_opt(llama_arg( {"--pooling"}, "{none,mean,cls,last}", "pooling type for embeddings, use model default if unspecified", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } @@ -1406,7 +1406,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--attention"}, "{causal,non,causal}", "attention type for embeddings, use model default if unspecified", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } @@ -1415,7 +1415,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } @@ -1425,91 +1425,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rope_freq_base = std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rope_freq_scale = std::stof(value); } )); add_opt(llama_arg( {"--yarn-orig-ctx"}, "N", format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.yarn_orig_ctx = value; } )); add_opt(llama_arg( {"--yarn-ext-factor"}, "N", format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_ext_factor = std::stof(value); } )); add_opt(llama_arg( 
{"--yarn-attn-factor"}, "N", format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_attn_factor = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-slow"}, "N", format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_beta_slow = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-fast"}, "N", format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.yarn_beta_fast = std::stof(value); } )); add_opt(llama_arg( {"-gan", "--grp-attn-n"}, "N", format("group-attention factor (default: %d)", params.grp_attn_n), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.grp_attn_n = value; } )); add_opt(llama_arg( {"-gaw", "--grp-attn-w"}, "N", format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.grp_attn_w = value; } )); add_opt(llama_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.dump_kv_cache = true; } )); add_opt(llama_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.no_kv_offload = true; } )); add_opt(llama_arg( {"-ctk", "--cache-type-k"}, "TYPE", format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; } @@ -1517,7 +1517,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ctv", "--cache-type-v"}, "TYPE", format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; } @@ -1525,119 +1525,119 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--all-logits"}, format("return logits for all tokens in the batch (default: %s)", params.logits_all ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag-tasks"}, "N", format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande-tasks"}, "N", format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice-tasks"}, "N", format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-stride"}, "N", format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-output-type"}, "<0|1>", format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"-dt", "--defrag-thold"}, "N", format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(llama_arg( {"-np", "--parallel"}, "N", format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_parallel = value; } )); add_opt(llama_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", 
params.n_sequences), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_sequences = value; } )); add_opt(llama_arg( {"-cb", "--cont-batching"}, format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.cont_batching = true; } ).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(llama_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.cont_batching = false; } ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(llama_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(llama_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); @@ -1645,7 +1645,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.rpc_servers = value; } )); @@ -1653,14 +1653,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.use_mlock = true; } )); add_opt(llama_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.use_mmap = false; } )); @@ -1672,7 +1672,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } @@ -1682,7 +1682,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngl", "--gpu-layers"}, "N", "number of layers to store in VRAM", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -1693,7 +1693,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngld", "--gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params 
& sparams, int value) { params.n_gpu_layers_draft = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1707,7 +1707,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1732,7 +1732,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1759,7 +1759,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-mg", "--main-gpu"}, "INDEX", format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.main_gpu = value; #ifndef GGML_USE_CUDA_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); @@ -1769,7 +1769,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--check-tensors"}, format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.check_tensors = true; } )); @@ -1777,7 +1777,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); } @@ -1786,21 +1786,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [¶ms](std::string fname, std::string scale) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.control_vectors.push_back({ 1.0f, value, }); } )); @@ -1808,14 +1808,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [¶ms](std::string fname, std::string scale) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { params.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(llama_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [¶ms](std::string start, std::string end) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & start, const std::string & end) { params.control_vector_layer_start = std::stoi(start); params.control_vector_layer_end = std::stoi(end); } @@ -1823,7 +1823,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL")); @@ -1835,49 +1835,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const 
std::string & value) { params.model_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(llama_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(llama_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(llama_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hf_token = value; } ).set_env("HF_TOKEN")); add_opt(llama_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1888,28 +1888,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--chunk-size"}, "N", format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--chunk-separator"}, "STRING", format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--junk"}, "N", format("number of times to repeat the junk text (default: %d)", params.n_junk), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(llama_arg( {"--pos"}, "N", format("position of the passkey in the junk text (default: %d)", params.i_pos), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); @@ -1921,7 +1921,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? 
params.cvector_outfile.c_str() : params.out_file.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.out_file = value; params.cvector_outfile = value; params.lora_outfile = value; @@ -1930,49 +1930,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ofreq", "--output-frequency"}, "N", format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--save-frequency"}, "N", format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--process-output"}, format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--no-ppl"}, format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--chunk"}, "N", format("start processing the input from chunk N (default: %d)", params.i_chunk), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"-pps"}, format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(llama_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } @@ -1980,7 +1980,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } @@ -1988,7 +1988,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } @@ -1996,63 +1996,63 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--embd-normalize"}, "N", format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-separator"}, "STRING", "separator of embendings (default \\n) for example \"<#sep#>\"", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--host"}, "HOST", format("ip address to listen (default: %s)", params.hostname.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(llama_arg( {"--port"}, "PORT", format("port to listen (default: %d)", params.port), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(llama_arg( {"--path"}, "PATH", format("path to serve static files from (default: %s)", params.public_path.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--embedding", "--embeddings"}, format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? 
"enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(llama_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(llama_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2069,21 +2069,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--timeout"}, "N", format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.timeout_read = value; params.timeout_write = value; } @@ -2091,14 +2091,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--threads-http"}, "N", format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(llama_arg( {"-spf", "--system-prompt-file"}, "FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2115,7 +2115,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-format"}, "{text, json}", "log output format: json or text (default: json)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { if (value == "json") { params.log_json = true; } else if (value == "text") { @@ -2128,21 +2128,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? 
"enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(llama_arg( {"--no-slots"}, format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(llama_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -2155,7 +2155,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { if (!llama_chat_verify_template(value)) { throw std::runtime_error(format( "error: the supplied chat template is not supported: %s\n" @@ -2169,28 +2169,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--lora-init-without-apply"}, format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [¶ms]() { + [](gpt_params & params, llama_sampling_params & sparams) { params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.logdir = value; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -2201,35 +2201,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--positive-file"}, "FNAME", format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--negative-file"}, "FNAME", format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-batch"}, "N", format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-iter"}, "N", format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [¶ms](int value) { + [](gpt_params & params, llama_sampling_params & sparams, int value) { params.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } @@ -2238,7 +2238,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [¶ms](std::string value) { + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } @@ -2249,32 +2249,32 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-test"}, "Log test", - []() { log_param_single_parse("--log-test"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-test"); } )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - []() { 
log_param_single_parse("--log-disable"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-disable"); } )); add_opt(llama_arg( {"--log-enable"}, "Log enable", - []() { log_param_single_parse("--log-enable"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-enable"); } )); add_opt(llama_arg( {"--log-new"}, "Log new", - []() { log_param_single_parse("--log-new"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-new"); } )); add_opt(llama_arg( {"--log-append"}, "Log append", - []() { log_param_single_parse("--log-append"); } + [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-append"); } )); add_opt(llama_arg( {"--log-file"}, "FNAME", "Log file", - [](std::string value) { log_param_pair_parse(false, "--log-file", value); } + [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } )); #endif // LOG_DISABLE_LOGS diff --git a/common/common.h b/common/common.h index e8dd040e9d994..c4893d17481ca 100644 --- a/common/common.h +++ b/common/common.h @@ -310,20 +310,39 @@ struct llama_arg { std::string value_hint_2; // for second arg value std::string env; std::string help; - std::function handler_void = nullptr; - std::function handler_string = nullptr; - std::function handler_str_str = nullptr; - std::function handler_int = nullptr; - - llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - - llama_arg(std::vector args, std::string value_hint, std::string help, std::function handler) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - - llama_arg(std::vector args, std::string help, std::function handler) : args(args), help(help), handler_void(handler) {} + void (*handler_void) (gpt_params & params, llama_sampling_params & sparams) = nullptr; + void (*handler_string) (gpt_params & params, llama_sampling_params & sparams, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, llama_sampling_params & sparams, int) = nullptr; + + llama_arg( + const std::initializer_list & args, + const std::string & value_hint, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & value_hint, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams) + ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg - // note: env variable is not yet support for 2 values - llama_arg(std::vector args, std::string value_hint, std::string value_hint_2, std::string help, std::function handler) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + llama_arg( + const std::initializer_list & args, + const std::string & value_hint, + 
const std::string & value_hint_2, + const std::string & help, + void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} llama_arg & set_examples(std::set examples) { this->examples = std::move(examples); @@ -340,7 +359,7 @@ struct llama_arg { return examples.find(ex) != examples.end(); } - bool get_value_from_env(std::string & output) { + bool get_value_from_env(std::string & output) const { if (env.empty()) return false; char * value = std::getenv(env.c_str()); if (value) { @@ -350,7 +369,7 @@ struct llama_arg { return false; } - bool has_value_from_env() { + bool has_value_from_env() const { return std::getenv(env.c_str()); } From eb7d8f85a27a24e65a68c5727c4a0e3a2ea86805 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 18:24:44 +0200 Subject: [PATCH 18/20] params.sparams --- common/common.cpp | 499 +++++++++++++++++++++++----------------------- common/common.h | 16 +- 2 files changed, 257 insertions(+), 258 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 012dd1adc98ef..804af1d943e0f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -362,13 +362,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto if (opt.get_value_from_env(value)) { try { if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(params, sparams); + opt.handler_void(params); } if (opt.handler_int) { - opt.handler_int(params, sparams, std::stoi(value)); + opt.handler_int(params, std::stoi(value)); } if (opt.handler_string) { - opt.handler_string(params, sparams, value); + opt.handler_string(params, value); continue; } } catch (std::exception & e) { @@ -399,7 +399,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } try { if (opt.handler_void) { - opt.handler_void(params, sparams); + opt.handler_void(params); continue; } @@ -407,11 +407,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val = argv[++i]; if (opt.handler_int) { - opt.handler_int(params, sparams, std::stoi(val)); + opt.handler_int(params, std::stoi(val)); continue; } if (opt.handler_string) { - opt.handler_string(params, sparams, val); + opt.handler_string(params, val); continue; } @@ -419,7 +419,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto check_arg(i); std::string val2 = argv[++i]; if (opt.handler_str_str) { - opt.handler_str_str(params, sparams, val, val2); + opt.handler_str_str(params, val, val2); continue; } } catch (std::exception & e) { @@ -650,11 +650,10 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example std::vector options; params.print_usage = print_usage; params.curr_ex = ex; - llama_sampling_params & sparams = params.sparams; std::string sampler_type_chars; std::string sampler_type_names; - for (const auto sampler_type : sparams.samplers_sequence) { + for (const auto sampler_type : params.sparams.samplers_sequence) { sampler_type_chars += static_cast(sampler_type); sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } @@ -687,14 +686,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.usage = true; } )); 
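As a minimal, self-contained sketch of the pattern these hunks converge on (the struct definitions and option names below are simplified stand-ins, not the real common.h declarations): every handler becomes a capture-less lambda that takes only gpt_params by reference and reaches sampling settings through params.sparams, which is what allows llama_arg to store plain function pointers instead of std::function objects.

    #include <cstdio>
    #include <string>
    #include <vector>

    // simplified stand-ins for the real llama.cpp structs (illustration only)
    struct llama_sampling_params { float temp = 0.8f; };
    struct gpt_params { int n_ctx = 0; llama_sampling_params sparams; };

    // capture-less lambdas convert to these plain function pointers
    struct llama_arg_sketch {
        std::vector<std::string> args;
        std::string help;
        void (*handler_int)(gpt_params &, int) = nullptr;
        void (*handler_string)(gpt_params &, const std::string &) = nullptr;
    };

    int main() {
        std::vector<llama_arg_sketch> options;

        options.push_back({{"-c", "--ctx-size"}, "size of the prompt context",
            [](gpt_params & params, int value) { params.n_ctx = value; },
            nullptr});
        options.push_back({{"--temp"}, "sampling temperature",
            nullptr,
            [](gpt_params & params, const std::string & value) {
                // sampling state is reached through params.sparams; no separate sparams argument
                params.sparams.temp = std::stof(value);
            }});

        // dispatch the way gpt_params_parse_ex does: pass the target params explicitly
        gpt_params params;
        options[0].handler_int(params, 4096);
        options[1].handler_string(params, "0.7");
        printf("n_ctx=%d temp=%.1f\n", params.n_ctx, params.sparams.temp);
        return 0;
    }

The visible effect in the diff is that the option table no longer closes over one particular params/sparams pair; gpt_params_parse_ex hands the target gpt_params to each handler at dispatch time, and since get_value_from_env/has_value_from_env are now const, the same handlers can also be driven from environment variables registered via set_env (e.g. LLAMA_ARG_CTX_SIZE).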
add_opt(llama_arg( {"--version"}, "show version and build info", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); @@ -703,51 +702,51 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-v", "--verbose"}, "print verbose information", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.verbosity = 1; } )); add_opt(llama_arg( {"--verbosity"}, "N", format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.verbosity = value; } )); add_opt(llama_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.verbose_prompt = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--no-display-prompt"}, format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-co", "--color"}, format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-s", "--seed"}, "SEED", format("RNG seed (default: %d, use random seed for < 0)", params.seed), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. 
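            // note (descriptive, not in the original hunk): both copies of the seed are updated
            // below -- params.seed and params.sparams.seed -- so they stay in sync until the
            // sampling state is consolidated into llama_sampling_context as the TODO describes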
params.seed = std::stoul(value); - sparams.seed = std::stoul(value); + params.sparams.seed = std::stoul(value); } )); add_opt(llama_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { params.cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -757,7 +756,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -767,7 +766,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams.n_threads = value; if (params.draft_cpuparams.n_threads <= 0) { params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -777,7 +776,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams_batch.n_threads = value; if (params.draft_cpuparams_batch.n_threads <= 0) { params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -787,7 +786,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string mask = value; params.cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { @@ -798,7 +797,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. 
Complements --cpu-mask", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams.cpumask)) { @@ -809,21 +808,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict"}, "<0|1>", format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(llama_arg( {"--poll"}, "<0...100>", format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cpuparams.poll = std::stoul(value); } )); add_opt(llama_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string mask = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { @@ -834,7 +833,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { @@ -845,21 +844,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams_batch.strict_cpu = value; } )); add_opt(llama_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.cpuparams_batch.poll = value; } )); add_opt(llama_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string mask = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { @@ -870,7 +869,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.draft_cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { @@ -881,21 +880,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string range = value; params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { @@ -906,91 +905,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.draft_cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"--draft"}, "N", format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-ps", "--p-split"}, "N", format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.lookup_cache_static = value; } )); add_opt(llama_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.lookup_cache_dynamic = value; } 
)); add_opt(llama_arg( {"-c", "--ctx-size"}, "N", format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( {"-n", "--predict", "--n-predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(llama_arg( {"-b", "--batch-size"}, "N", format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(llama_arg( {"-ub", "--ubatch-size"}, "N", format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(llama_arg( {"--keep"}, "N", format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_keep = value; } )); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_chunks = value; } )); add_opt(llama_arg( {"-fa", "--flash-attn"}, format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); @@ -999,14 +998,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example ex == LLAMA_EXAMPLE_MAIN ? 
"prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.prompt = value; } )); add_opt(llama_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1022,7 +1021,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1033,7 +1032,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1049,56 +1048,56 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-e", "--escape"}, format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.escape = true; } )); add_opt(llama_arg( {"--no-escape"}, "do not process escape sequences", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.escape = false; } )); add_opt(llama_arg( {"-ptc", "--print-token-count"}, "N", format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-sp", "--special"}, format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1111,35 +1110,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "(default: %s)", params.conversation ? "true" : "false" ), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"-i", "--interactive"}, format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-if", "--interactive-first"}, format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.input_prefix_bos = true; params.enable_chat_template = false; } @@ -1147,7 +1146,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.input_prefix = value; params.enable_chat_template = false; } @@ -1155,7 +1154,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.input_suffix = value; params.enable_chat_template = false; } @@ -1163,7 +1162,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -1173,145 +1172,145 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--samplers"}, "SAMPLERS", format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); + params.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); } )); add_opt(llama_arg( {"--sampling-seq"}, "SEQUENCE", format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.samplers_sequence = llama_sampling_types_from_chars(value); + [](gpt_params & params, const std::string & value) { + params.sparams.samplers_sequence = llama_sampling_types_from_chars(value); } )); add_opt(llama_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.ignore_eos = true; } )); add_opt(llama_arg( {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { - sparams.penalize_nl = true; + format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + [](gpt_params & params) { + params.sparams.penalize_nl = true; } )); add_opt(llama_arg( {"--temp"}, "N", - format("temperature (default: %.1f)", (double)sparams.temp), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.temp = std::stof(value); - sparams.temp = std::max(sparams.temp, 0.0f); + format("temperature (default: %.1f)", (double)params.sparams.temp), + [](gpt_params & params, const std::string & value) { + params.sparams.temp = std::stof(value); + params.sparams.temp = std::max(params.sparams.temp, 0.0f); } )); add_opt(llama_arg( {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", sparams.top_k), - [](gpt_params & params, llama_sampling_params & sparams, int value) { - sparams.top_k = value; + format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](gpt_params & params, int value) { + params.sparams.top_k = value; } )); add_opt(llama_arg( {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.top_p = std::stof(value); + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](gpt_params & params, const std::string & value) { + params.sparams.top_p = std::stof(value); } )); add_opt(llama_arg( {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.min_p = std::stof(value); + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](gpt_params & params, const std::string & value) { + params.sparams.min_p 
= std::stof(value); } )); add_opt(llama_arg( {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.tfs_z = std::stof(value); + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](gpt_params & params, const std::string & value) { + params.sparams.tfs_z = std::stof(value); } )); add_opt(llama_arg( {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.typical_p = std::stof(value); + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typical_p), + [](gpt_params & params, const std::string & value) { + params.sparams.typical_p = std::stof(value); } )); add_opt(llama_arg( {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n), - [](gpt_params & params, llama_sampling_params & sparams, int value) { - sparams.penalty_last_n = value; - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](gpt_params & params, int value) { + params.sparams.penalty_last_n = value; + params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); } )); add_opt(llama_arg( {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.penalty_repeat = std::stof(value); + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_repeat = std::stof(value); } )); add_opt(llama_arg( {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.penalty_present = std::stof(value); + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_present = std::stof(value); } )); add_opt(llama_arg( {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.penalty_freq = std::stof(value); + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_freq = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.dynatemp_range = std::stof(value); + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", 
(double)params.sparams.dynatemp_range), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_range = std::stof(value); } )); add_opt(llama_arg( {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.dynatemp_exponent = std::stof(value); + format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_exponent = std::stof(value); } )); add_opt(llama_arg( {"--mirostat"}, "N", format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat), - [](gpt_params & params, llama_sampling_params & sparams, int value) { - sparams.mirostat = value; + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + [](gpt_params & params, int value) { + params.sparams.mirostat = value; } )); add_opt(llama_arg( {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.mirostat_eta = std::stof(value); + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_eta = std::stof(value); } )); add_opt(llama_arg( {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.mirostat_tau = std::stof(value); + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_tau = std::stof(value); } )); add_opt(llama_arg( @@ -1319,14 +1318,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; std::string value_str; try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + params.sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? 
-1.0f : 1.0f); } else { throw std::invalid_argument("invalid input format"); } @@ -1337,43 +1336,43 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example )); add_opt(llama_arg( {"--cfg-negative-prompt"}, "PROMPT", - format("negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.cfg_negative_prompt = value; + format("negative prompt to use for guidance (default: '%s')", params.sparams.cfg_negative_prompt.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.cfg_negative_prompt = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--cfg-negative-prompt-file"}, "FNAME", "negative prompt file to use for guidance", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.sparams.cfg_negative_prompt)); + if (!params.sparams.cfg_negative_prompt.empty() && params.sparams.cfg_negative_prompt.back() == '\n') { + params.sparams.cfg_negative_prompt.pop_back(); } } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--cfg-scale"}, "N", - format("strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.cfg_scale = std::stof(value); + format("strength of guidance (default: %.1f, 1.0 = disable)", (double)params.sparams.cfg_scale), + [](gpt_params & params, const std::string & value) { + params.sparams.cfg_scale = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.grammar = value; + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = value; } )); add_opt(llama_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1381,21 +1380,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) + std::back_inserter(params.sparams.grammar) ); } )); add_opt(llama_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { - sparams.grammar = json_schema_to_grammar(json::parse(value)); + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = json_schema_to_grammar(json::parse(value)); } )); add_opt(llama_arg( {"--pooling"}, "{none,mean,cls,last}", "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } @@ -1406,7 +1405,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--attention"}, "{causal,non,causal}", "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } @@ -1415,7 +1414,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } @@ -1425,91 +1424,91 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rope_freq_base = std::stof(value); } )); add_opt(llama_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rope_freq_scale = std::stof(value); } )); add_opt(llama_arg( {"--yarn-orig-ctx"}, "N", format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.yarn_orig_ctx = value; } )); add_opt(llama_arg( {"--yarn-ext-factor"}, "N", format("YaRN: extrapolation mix factor 
(default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } )); add_opt(llama_arg( {"--yarn-attn-factor"}, "N", format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-slow"}, "N", format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } )); add_opt(llama_arg( {"--yarn-beta-fast"}, "N", format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } )); add_opt(llama_arg( {"-gan", "--grp-attn-n"}, "N", format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.grp_attn_n = value; } )); add_opt(llama_arg( {"-gaw", "--grp-attn-w"}, "N", format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.grp_attn_w = value; } )); add_opt(llama_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.dump_kv_cache = true; } )); add_opt(llama_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.no_kv_offload = true; } )); add_opt(llama_arg( {"-ctk", "--cache-type-k"}, "TYPE", format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; } @@ -1517,7 +1516,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ctv", "--cache-type-v"}, "TYPE", format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; } @@ -1525,119 +1524,119 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--all-logits"}, format("return logits for all tokens in the batch (default: %s)", params.logits_all ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--hellaswag-tasks"}, "N", format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--winogrande-tasks"}, "N", format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--multiple-choice-tasks"}, "N", format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-stride"}, "N", format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"--ppl-output-type"}, "<0|1>", format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(llama_arg( {"-dt", "--defrag-thold"}, "N", format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(llama_arg( {"-np", "--parallel"}, "N", format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, llama_sampling_params & 
sparams, int value) { + [](gpt_params & params, int value) { params.n_parallel = value; } )); add_opt(llama_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_sequences = value; } )); add_opt(llama_arg( {"-cb", "--cont-batching"}, format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.cont_batching = true; } ).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(llama_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.cont_batching = false; } ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(llama_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(llama_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); @@ -1645,7 +1644,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.rpc_servers = value; } )); @@ -1653,14 +1652,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.use_mlock = true; } )); add_opt(llama_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.use_mmap = false; } )); @@ -1672,7 +1671,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } @@ -1682,7 +1681,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngl", "--gpu-layers"}, "N", "number of layers to store in VRAM", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_gpu_layers = value; if 
(!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -1693,7 +1692,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ngld", "--gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_gpu_layers_draft = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1707,7 +1706,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1732,7 +1731,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1759,7 +1758,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-mg", "--main-gpu"}, "INDEX", format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.main_gpu = value; #ifndef GGML_USE_CUDA_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); @@ -1769,7 +1768,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--check-tensors"}, format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.check_tensors = true; } )); @@ -1777,7 +1776,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); } @@ -1786,21 +1785,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { + [](gpt_params & params, const std::string & fname, const std::string & scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(llama_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.control_vectors.push_back({ 1.0f, value, }); } )); @@ -1808,14 +1807,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & fname, const std::string & scale) { + [](gpt_params & params, const std::string & fname, const std::string & scale) { params.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(llama_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & start, const std::string & end) { + [](gpt_params & params, const std::string & start, const std::string & end) { params.control_vector_layer_start = std::stoi(start); params.control_vector_layer_end = std::stoi(end); } @@ -1823,7 +1822,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL")); @@ -1835,49 +1834,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, 
LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(llama_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(llama_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(llama_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(llama_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hf_token = value; } ).set_env("HF_TOKEN")); add_opt(llama_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1888,28 +1887,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--chunk-size"}, "N", format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--chunk-separator"}, "STRING", format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(llama_arg( {"--junk"}, "N", format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(llama_arg( {"--pos"}, "N", format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); @@ -1921,7 +1920,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? 
params.cvector_outfile.c_str() : params.out_file.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.out_file = value; params.cvector_outfile = value; params.lora_outfile = value; @@ -1930,49 +1929,49 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ofreq", "--output-frequency"}, "N", format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--save-frequency"}, "N", format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--process-output"}, format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--no-ppl"}, format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"--chunk"}, "N", format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(llama_arg( {"-pps"}, format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(llama_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } @@ -1980,7 +1979,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } @@ -1988,7 +1987,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } @@ -1996,63 +1995,63 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--embd-normalize"}, "N", format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--embd-separator"}, "STRING", "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(llama_arg( {"--host"}, "HOST", format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(llama_arg( {"--port"}, "PORT", format("port to listen (default: %d)", params.port), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(llama_arg( {"--path"}, "PATH", format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--embedding", "--embeddings"}, format("restrict to only support 
embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(llama_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(llama_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2069,21 +2068,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--timeout"}, "N", format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.timeout_read = value; params.timeout_write = value; } @@ -2091,14 +2090,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--threads-http"}, "N", format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(llama_arg( {"-spf", "--system-prompt-file"}, "FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -2115,7 +2114,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-format"}, "{text, json}", "log output format: json or text (default: json)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { if (value == "json") { params.log_json = true; } else if (value == "text") { @@ -2128,21 +2127,21 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", 
params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(llama_arg( {"--no-slots"}, format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(llama_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -2155,7 +2154,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { if (!llama_chat_verify_template(value)) { throw std::runtime_error(format( "error: the supplied chat template is not supported: %s\n" @@ -2169,28 +2168,28 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--lora-init-without-apply"}, format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params, llama_sampling_params & sparams) { + [](gpt_params & params) { params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.logdir = value; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -2201,35 +2200,35 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--positive-file"}, "FNAME", format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--negative-file"}, "FNAME", format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-batch"}, "N", format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--pca-iter"}, "N", format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, llama_sampling_params & sparams, int value) { + [](gpt_params & params, int value) { params.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(llama_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } @@ -2238,7 +2237,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { + [](gpt_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } @@ -2249,32 +2248,32 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-test"}, "Log test", - [](gpt_params & params, 
llama_sampling_params & sparams) { log_param_single_parse("--log-test"); } + [](gpt_params & params) { log_param_single_parse("--log-test"); } )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-disable"); } + [](gpt_params & params) { log_param_single_parse("--log-disable"); } )); add_opt(llama_arg( {"--log-enable"}, "Log enable", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-enable"); } + [](gpt_params & params) { log_param_single_parse("--log-enable"); } )); add_opt(llama_arg( {"--log-new"}, "Log new", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-new"); } + [](gpt_params & params) { log_param_single_parse("--log-new"); } )); add_opt(llama_arg( {"--log-append"}, "Log append", - [](gpt_params & params, llama_sampling_params & sparams) { log_param_single_parse("--log-append"); } + [](gpt_params & params) { log_param_single_parse("--log-append"); } )); add_opt(llama_arg( {"--log-file"}, "FNAME", "Log file", - [](gpt_params & params, llama_sampling_params & sparams, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + [](gpt_params & params, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } )); #endif // LOG_DISABLE_LOGS diff --git a/common/common.h b/common/common.h index c4893d17481ca..1f709271d4038 100644 --- a/common/common.h +++ b/common/common.h @@ -310,29 +310,29 @@ struct llama_arg { std::string value_hint_2; // for second arg value std::string env; std::string help; - void (*handler_void) (gpt_params & params, llama_sampling_params & sparams) = nullptr; - void (*handler_string) (gpt_params & params, llama_sampling_params & sparams, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, llama_sampling_params & sparams, int) = nullptr; + void (*handler_void) (gpt_params & params) = nullptr; + void (*handler_string) (gpt_params & params, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, int) = nullptr; llama_arg( const std::initializer_list & args, const std::string & value_hint, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &) + void (*handler)(gpt_params & params, const std::string &) ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} llama_arg( const std::initializer_list & args, const std::string & value_hint, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams, int) + void (*handler)(gpt_params & params, int) ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} llama_arg( const std::initializer_list & args, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams) + void (*handler)(gpt_params & params) ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg @@ -341,7 +341,7 @@ struct llama_arg { const std::string & value_hint, const std::string & value_hint_2, const std::string & help, - void (*handler)(gpt_params & params, llama_sampling_params & sparams, const std::string &, const std::string &) + void (*handler)(gpt_params & params, 
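// ---------------------------------------------------------------------------
// Usage sketch (not part of any hunk in this series): how an example binary is
// expected to drive the refactored parser. This assumes the entry points shown
// in the hunks above keep their current shape, i.e. gpt_params_parser_init()
// building the std::vector<llama_arg> option table for one llama_example, and
// gpt_params_parse_ex() applying LLAMA_ARG_* environment variables first and
// then letting command-line arguments override them. Exact names and
// signatures may still change in later revisions of the patch.
// ---------------------------------------------------------------------------
#include "common.h"

#include <cstdio>
#include <exception>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;

    // build the option table for this tool; options tagged for other examples
    // (e.g. LLAMA_EXAMPLE_SERVER) are expected to stay out of its usage text
    std::vector<llama_arg> options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN);

    try {
        // env vars registered via set_env() are read first, then argv is
        // parsed and overrides them (with the warning emitted by the parser)
        if (!gpt_params_parse_ex(argc, argv, params, options)) {
            return 1;
        }
    } catch (const std::exception & e) {
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }

    fprintf(stderr, "model: %s\n", params.model.c_str());
    return 0;
}
// ---------------------------------------------------------------------------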
const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} llama_arg & set_examples(std::set examples) { From e625f5fd1ec435e8f1262d6a1a753cdd44fb7213 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 18:41:42 +0200 Subject: [PATCH 19/20] optimize more --- common/common.cpp | 32 ++++++++++++++----------- common/common.h | 36 ++++++++++++++-------------- examples/export-docs/export-docs.cpp | 7 +++--- 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 804af1d943e0f..0bf01ce2a2b66 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -373,7 +373,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } } catch (std::exception & e) { throw std::invalid_argument(format( - "error while handling environment variable \"%s\": %s\n\n", opt.env.c_str(), e.what())); + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); } } } @@ -395,7 +395,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto } auto opt = *arg_to_options[arg]; if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env.c_str(), arg.c_str()); + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); } try { if (opt.handler_void) { @@ -595,15 +595,19 @@ std::string llama_arg::to_string() { std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - for (const auto & arg : args) { + for (const auto arg : args) { if (arg == args.front()) { - ss << (args.size() == 1 ? arg : format("%-7s", (arg + ",").c_str())); + if (args.size() == 1) { + ss << arg; + } else { + ss << format("%-7s", arg) << ", "; + } } else { ss << arg << (arg != args.back() ? 
", " : ""); } } - if (!value_hint.empty()) ss << " " << value_hint; - if (!value_hint_2.empty()) ss << " " << value_hint_2; + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; if (ss.tellp() > n_leading_spaces - 3) { // current line is too long, add new line ss << "\n" << leading_spaces; @@ -675,7 +679,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example if (seen_args.find(a) == seen_args.end()) { seen_args.insert(a); } else { - throw std::runtime_error(format("found duplicated argument in source code: %s", a.c_str())); + throw std::runtime_error(format("found duplicated argument in source code: %s", a)); } } options.push_back(std::move(arg)); @@ -693,7 +697,7 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--version"}, "show version and build info", - [](gpt_params & params) { + [](gpt_params &) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); @@ -2248,32 +2252,32 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example add_opt(llama_arg( {"--log-test"}, "Log test", - [](gpt_params & params) { log_param_single_parse("--log-test"); } + [](gpt_params &) { log_param_single_parse("--log-test"); } )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - [](gpt_params & params) { log_param_single_parse("--log-disable"); } + [](gpt_params &) { log_param_single_parse("--log-disable"); } )); add_opt(llama_arg( {"--log-enable"}, "Log enable", - [](gpt_params & params) { log_param_single_parse("--log-enable"); } + [](gpt_params &) { log_param_single_parse("--log-enable"); } )); add_opt(llama_arg( {"--log-new"}, "Log new", - [](gpt_params & params) { log_param_single_parse("--log-new"); } + [](gpt_params &) { log_param_single_parse("--log-new"); } )); add_opt(llama_arg( {"--log-append"}, "Log append", - [](gpt_params & params) { log_param_single_parse("--log-append"); } + [](gpt_params &) { log_param_single_parse("--log-append"); } )); add_opt(llama_arg( {"--log-file"}, "FNAME", "Log file", - [](gpt_params & params, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } )); #endif // LOG_DISABLE_LOGS diff --git a/common/common.h b/common/common.h index 1f709271d4038..b79149da04899 100644 --- a/common/common.h +++ b/common/common.h @@ -305,10 +305,10 @@ struct gpt_params { struct llama_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; - std::vector args; - std::string value_hint; // help text or example for arg value - std::string value_hint_2; // for second arg value - std::string env; + std::vector args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; std::string help; void (*handler_void) (gpt_params & params) = nullptr; void (*handler_string) (gpt_params & params, const std::string &) = nullptr; @@ -316,42 +316,42 @@ struct llama_arg { void (*handler_int) (gpt_params & params, int) = nullptr; llama_arg( - const std::initializer_list & args, - const std::string & value_hint, + const std::initializer_list & args, + const char * value_hint, const std::string & help, void (*handler)(gpt_params & params, const std::string &) ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} llama_arg( - const 
std::initializer_list & args, - const std::string & value_hint, + const std::initializer_list & args, + const char * value_hint, const std::string & help, void (*handler)(gpt_params & params, int) ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} llama_arg( - const std::initializer_list & args, + const std::initializer_list & args, const std::string & help, void (*handler)(gpt_params & params) ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg llama_arg( - const std::initializer_list & args, - const std::string & value_hint, - const std::string & value_hint_2, + const std::initializer_list & args, + const char * value_hint, + const char * value_hint_2, const std::string & help, void (*handler)(gpt_params & params, const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - llama_arg & set_examples(std::set examples) { + llama_arg & set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; } - llama_arg & set_env(std::string env) { + llama_arg & set_env(const char * env) { help = help + "\n(env: " + env + ")"; - this->env = std::move(env); + this->env = env; return *this; } @@ -360,8 +360,8 @@ struct llama_arg { } bool get_value_from_env(std::string & output) const { - if (env.empty()) return false; - char * value = std::getenv(env.c_str()); + if (env == nullptr) return false; + char * value = std::getenv(env); if (value) { output = value; return true; @@ -370,7 +370,7 @@ struct llama_arg { } bool has_value_from_env() const { - return std::getenv(env.c_str()); + return env != nullptr && std::getenv(env); } std::string to_string(); diff --git a/examples/export-docs/export-docs.cpp b/examples/export-docs/export-docs.cpp index 86c041a811d12..a09036dcf346d 100644 --- a/examples/export-docs/export-docs.cpp +++ b/examples/export-docs/export-docs.cpp @@ -22,18 +22,19 @@ static void export_md(std::string fname, llama_example ex) { // args for (const auto & arg : opt.args) { if (arg == opt.args.front()) { - file << (opt.args.size() == 1 ? arg : (arg + ", ")); + file << arg; + if (opt.args.size() > 1) file << ", "; } else { file << arg << (arg != opt.args.back() ? 
", " : ""); } } // value hint - if (!opt.value_hint.empty()) { + if (opt.value_hint) { std::string md_value_hint(opt.value_hint); string_replace_all(md_value_hint, "|", "\\|"); file << " " << md_value_hint; } - if (!opt.value_hint_2.empty()) { + if (opt.value_hint_2) { std::string md_value_hint_2(opt.value_hint_2); string_replace_all(md_value_hint_2, "|", "\\|"); file << " " << md_value_hint_2; From 4b96c69a08a0e0e6f09883f7d8bad1375fbcaf86 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Sep 2024 19:20:56 +0200 Subject: [PATCH 20/20] export-docs --> gen-docs --- Makefile | 6 +++--- common/common.cpp | 2 +- common/common.h | 2 -- examples/{export-docs => gen-docs}/CMakeLists.txt | 4 ++-- .../{export-docs/export-docs.cpp => gen-docs/gen-docs.cpp} | 4 ---- 5 files changed, 6 insertions(+), 12 deletions(-) rename examples/{export-docs => gen-docs}/CMakeLists.txt (70%) rename examples/{export-docs/export-docs.cpp => gen-docs/gen-docs.cpp} (95%) diff --git a/Makefile b/Makefile index 8b8605d5558d5..6053bc17b60db 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ BUILD_TARGETS = \ llama-tokenize \ llama-vdot \ llama-cvector-generator \ - llama-export-docs \ + llama-gen-docs \ tests/test-c.o # Binaries only useful for tests @@ -1444,11 +1444,11 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -llama-export-docs: examples/export-docs/export-docs.cpp \ +llama-gen-docs: examples/gen-docs/gen-docs.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - ./llama-export-docs + ./llama-gen-docs libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ diff --git a/common/common.cpp b/common/common.cpp index 3b70fd53b50f0..e92dee7a7f6ec 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -298,7 +298,7 @@ static std::string format(const char * fmt, ...) 
{ return std::string(buf.data(), size); } -void gpt_params_handle_model_default(gpt_params & params) { +static void gpt_params_handle_model_default(gpt_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (params.hf_file.empty()) { diff --git a/common/common.h b/common/common.h index bdb16f412fe6b..d7c08f20a124b 100644 --- a/common/common.h +++ b/common/common.h @@ -386,8 +386,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vecto // print full usage message; it will be called internally by gpt_params_parse() if "-h" is set void gpt_params_print_usage(gpt_params & params, std::vector & options); -void gpt_params_handle_model_default(gpt_params & params); - std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); diff --git a/examples/export-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt similarity index 70% rename from examples/export-docs/CMakeLists.txt rename to examples/gen-docs/CMakeLists.txt index 0e953167ed653..c94cda7764341 100644 --- a/examples/export-docs/CMakeLists.txt +++ b/examples/gen-docs/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET llama-export-docs) -add_executable(${TARGET} export-docs.cpp) +set(TARGET llama-gen-docs) +add_executable(${TARGET} gen-docs.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-docs/export-docs.cpp b/examples/gen-docs/gen-docs.cpp similarity index 95% rename from examples/export-docs/export-docs.cpp rename to examples/gen-docs/gen-docs.cpp index a09036dcf346d..8b1dafd63a5e8 100644 --- a/examples/export-docs/export-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -1,11 +1,7 @@ #include "common.h" -#include "llama.h" -#include -#include #include #include -#include // Export usage message (-h) to markdown format
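
A note on the shape of the API these last two patches settle on: llama_arg now stores option names and value hints as const char *, uses plain function pointers that receive gpt_params & instead of std::function, takes std::initializer_list in its constructors and in set_examples(), and can resolve a value from an optional environment variable before command-line parsing. The sketch below is a minimal, self-contained illustration of that pattern under those assumptions; my_arg, my_params, and MY_MODEL are invented stand-ins for llama_arg, gpt_params, and a real environment variable, and the handling loop is deliberately simplified, so this is not the code these patches add to common.h / common.cpp.

// Illustrative sketch only: a simplified stand-in for the llama_arg / gpt_params
// design that PATCH 19/20 converges on. Names (my_arg, my_params, MY_MODEL) are
// invented for this example.
#include <cstdio>
#include <cstdlib>
#include <initializer_list>
#include <set>
#include <string>
#include <vector>

struct my_params {                      // stand-in for gpt_params
    std::string model = "model.gguf";
};

enum my_example { EXAMPLE_COMMON, EXAMPLE_MAIN, EXAMPLE_SERVER };

struct my_arg {
    std::set<my_example> examples = {EXAMPLE_COMMON};
    std::vector<const char *> args;     // option names as plain C strings
    const char * value_hint = nullptr;  // e.g. "FNAME"; nullptr means the flag takes no value
    const char * env        = nullptr;  // optional environment-variable fallback
    std::string  help;

    // plain function pointers instead of std::function: cheap to store and copy
    void (*handler_void)  (my_params &)                      = nullptr;
    void (*handler_string)(my_params &, const std::string &) = nullptr;

    my_arg(std::initializer_list<const char *> args,
           const char * value_hint,
           const std::string & help,
           void (*handler)(my_params &, const std::string &))
        : args(args), value_hint(value_hint), help(help), handler_string(handler) {}

    my_arg(std::initializer_list<const char *> args,
           const std::string & help,
           void (*handler)(my_params &))
        : args(args), help(help), handler_void(handler) {}

    // chainable setters in the style of set_examples() / set_env()
    my_arg & set_examples(std::initializer_list<my_example> ex) {
        examples = ex;
        return *this;
    }
    my_arg & set_env(const char * e) {
        help += std::string("\n(env: ") + e + ")";
        env = e;
        return *this;
    }

    bool get_value_from_env(std::string & output) const {
        if (env == nullptr) return false;
        if (const char * value = std::getenv(env)) { output = value; return true; }
        return false;
    }
};

int main() {
    my_params params;
    std::vector<my_arg> options;

    options.push_back(my_arg(
        {"-m", "--model"}, "FNAME",
        "model path (default: " + params.model + ")",
        [](my_params & p, const std::string & value) { p.model = value; }
    ).set_examples({EXAMPLE_MAIN, EXAMPLE_SERVER}).set_env("MY_MODEL"));

    // apply environment-variable fallbacks, then print a crude usage listing
    for (auto & opt : options) {
        std::string value;
        if (opt.handler_string && opt.get_value_from_env(value)) {
            opt.handler_string(params, value);
        }
        std::printf("%s %s\n    %s\n",
                    opt.args.front(),
                    opt.value_hint ? opt.value_hint : "",
                    opt.help.c_str());
    }
    std::printf("model = %s\n", params.model.c_str());
    return 0;
}

One consequence of dropping std::function, visible in the diffs above, is that handlers cannot capture state, which is why every handler receives gpt_params & explicitly; the upside is that each option descriptor is cheaper to construct and copy than one holding several std::function objects while the option table is built.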