Skip to content

Commit 226460c

Browse files
Authored commit message:
llama-bench : add no-kv-offload parameter (#4812)
Parent: d5a410e · Commit: 226460c

File tree

1 file changed

+31
-3
lines changed

1 file changed

+31
-3
lines changed

examples/llama-bench/llama-bench.cpp

Lines changed: 31 additions & 3 deletions
Columns: original file line number | diff line number | diff line change
@@ -138,6 +138,7 @@ struct cmd_params {
138138
std::vector<int> n_threads;
139139
std::vector<int> n_gpu_layers;
140140
std::vector<int> main_gpu;
141+
std::vector<bool> no_kv_offload;
141142
std::vector<bool> mul_mat_q;
142143
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
143144
int reps;
@@ -155,6 +156,7 @@ static const cmd_params cmd_params_defaults = {
155156
/* n_threads */ {get_num_physical_cores()},
156157
/* n_gpu_layers */ {99},
157158
/* main_gpu */ {0},
159+
/* no_kv_offload */ {false},
158160
/* mul_mat_q */ {true},
159161
/* tensor_split */ {{}},
160162
/* reps */ 5,
@@ -176,6 +178,7 @@ static void print_usage(int /* argc */, char ** argv) {
176178
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
177179
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
178180
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
181+
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
179182
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
180183
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
181184
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -309,6 +312,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
309312
break;
310313
}
311314
params.main_gpu = split<int>(argv[i], split_delim);
315+
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
316+
if (++i >= argc) {
317+
invalid_param = true;
318+
break;
319+
}
320+
auto p = split<bool>(argv[i], split_delim);
321+
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
312322
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
313323
if (++i >= argc) {
314324
invalid_param = true;
@@ -383,6 +393,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
383393
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
384394
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
385395
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
396+
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
386397
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
387398
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
388399
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
@@ -400,6 +411,7 @@ struct cmd_params_instance {
400411
int n_threads;
401412
int n_gpu_layers;
402413
int main_gpu;
414+
bool no_kv_offload;
403415
bool mul_mat_q;
404416
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
405417

@@ -428,6 +440,7 @@ struct cmd_params_instance {
428440
cparams.type_k = type_k;
429441
cparams.type_v = type_v;
430442
cparams.mul_mat_q = mul_mat_q;
443+
cparams.offload_kqv = !no_kv_offload;
431444

432445
return cparams;
433446
}
@@ -444,6 +457,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
444457
for (const auto & tk : params.type_k)
445458
for (const auto & tv : params.type_v)
446459
for (const auto & mmq : params.mul_mat_q)
460+
for (const auto & nkvo : params.no_kv_offload)
447461
for (const auto & nt : params.n_threads) {
448462
cmd_params_instance instance = {
449463
/* .model = */ m,
@@ -455,6 +469,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
455469
/* .n_threads = */ nt,
456470
/* .n_gpu_layers = */ nl,
457471
/* .main_gpu = */ mg,
472+
/* .no_kv_offload= */ nkvo,
458473
/* .mul_mat_q = */ mmq,
459474
/* .tensor_split = */ ts,
460475
};
@@ -476,6 +491,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
476491
for (const auto & tk : params.type_k)
477492
for (const auto & tv : params.type_v)
478493
for (const auto & mmq : params.mul_mat_q)
494+
for (const auto & nkvo : params.no_kv_offload)
479495
for (const auto & nt : params.n_threads) {
480496
for (const auto & n_prompt : params.n_prompt) {
481497
if (n_prompt == 0) {
@@ -491,6 +507,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
491507
/* .n_threads = */ nt,
492508
/* .n_gpu_layers = */ nl,
493509
/* .main_gpu = */ mg,
510+
/* .no_kv_offload= */ nkvo,
494511
/* .mul_mat_q = */ mmq,
495512
/* .tensor_split = */ ts,
496513
};
@@ -511,6 +528,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
511528
/* .n_threads = */ nt,
512529
/* .n_gpu_layers = */ nl,
513530
/* .main_gpu = */ mg,
531+
/* .no_kv_offload= */ nkvo,
514532
/* .mul_mat_q = */ mmq,
515533
/* .tensor_split = */ ts,
516534
};
@@ -559,6 +577,7 @@ struct test {
559577
ggml_type type_v;
560578
int n_gpu_layers;
561579
int main_gpu;
580+
bool no_kv_offload;
562581
bool mul_mat_q;
563582
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
564583
int n_prompt;
@@ -579,6 +598,7 @@ struct test {
579598
type_v = inst.type_v;
580599
n_gpu_layers = inst.n_gpu_layers;
581600
main_gpu = inst.main_gpu;
601+
no_kv_offload = inst.no_kv_offload;
582602
mul_mat_q = inst.mul_mat_q;
583603
tensor_split = inst.tensor_split;
584604
n_prompt = inst.n_prompt;
@@ -640,7 +660,8 @@ struct test {
640660
"cpu_info", "gpu_info",
641661
"model_filename", "model_type", "model_size", "model_n_params",
642662
"n_batch", "n_threads", "type_k", "type_v",
643-
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
663+
"n_gpu_layers", "main_gpu", "no_kv_offload",
664+
"mul_mat_q", "tensor_split",
644665
"n_prompt", "n_gen", "test_time",
645666
"avg_ns", "stddev_ns",
646667
"avg_ts", "stddev_ts"
@@ -659,7 +680,7 @@ struct test {
659680
return INT;
660681
}
661682
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
662-
field == "f16_kv" || field == "mul_mat_q") {
683+
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
663684
return BOOL;
664685
}
665686
if (field == "avg_ts" || field == "stddev_ts") {
@@ -690,7 +711,8 @@ struct test {
690711
cpu_info, gpu_info,
691712
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
692713
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
693-
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
714+
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload),
715+
std::to_string(mul_mat_q), tensor_split_str,
694716
std::to_string(n_prompt), std::to_string(n_gen), test_time,
695717
std::to_string(avg_ns()), std::to_string(stdev_ns()),
696718
std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -851,6 +873,9 @@ struct markdown_printer : public printer {
851873
if (field == "mul_mat_q") {
852874
return "mmq";
853875
}
876+
if (field == "no_kv_offload") {
877+
return "nkvo";
878+
}
854879
if (field == "tensor_split") {
855880
return "ts";
856881
}
@@ -885,6 +910,9 @@ struct markdown_printer : public printer {
885910
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
886911
fields.push_back("mul_mat_q");
887912
}
913+
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
914+
fields.push_back("no_kv_offload");
915+
}
888916
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
889917
fields.push_back("tensor_split");
890918
}

0 commit comments

Comments (0)