@@ -138,6 +138,7 @@ struct cmd_params {
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;
+    std::vector<bool> no_kv_offload;
     std::vector<bool> mul_mat_q;
     std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
     int reps;
@@ -155,6 +156,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_threads     */ {get_num_physical_cores()},
     /* n_gpu_layers  */ {99},
     /* main_gpu      */ {0},
+    /* no_kv_offload */ {false},
     /* mul_mat_q     */ {true},
     /* tensor_split  */ {{}},
     /* reps          */ 5,
@@ -176,6 +178,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -nkvo, --no-kv-offload <0|1>      (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>\n");
     printf("  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
@@ -309,6 +312,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.main_gpu = split<int>(argv[i], split_delim);
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
         } else if (arg == "-mmq" || arg == "--mul-mat-q") {
             if (++i >= argc) {
                 invalid_param = true;
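The new branch reuses the file's `split<T>` helper to turn an argument such as `0,1` into a `std::vector<bool>`, then appends the result so repeated `-nkvo` flags accumulate. A minimal sketch of such a stream-extraction helper, consistent in spirit with the one llama-bench defines (an illustration, not the exact source):

    #include <sstream>
    #include <string>
    #include <vector>

    // Split "0,1" on `delim` and stream-extract each token into a T.
    // For T = bool, operator>> parses "0" as false and "1" as true.
    template <typename T>
    static std::vector<T> split(const std::string & str, char delim) {
        std::vector<T> values;
        std::istringstream str_stream(str);
        std::string token;
        while (std::getline(str_stream, token, delim)) {
            T value;
            std::istringstream token_stream(token);
            token_stream >> value;
            values.push_back(value);
        }
        return values;
    }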
@@ -383,6 +393,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_v.empty())        { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty())  { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())      { params.main_gpu = cmd_params_defaults.main_gpu; }
+    if (params.no_kv_offload.empty()) { params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
     if (params.mul_mat_q.empty())     { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty())  { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.n_threads.empty())     { params.n_threads = cmd_params_defaults.n_threads; }
@@ -400,6 +411,7 @@ struct cmd_params_instance {
     int n_threads;
     int n_gpu_layers;
     int main_gpu;
+    bool no_kv_offload;
     bool mul_mat_q;
     std::array<float, LLAMA_MAX_DEVICES> tensor_split;
@@ -428,6 +440,7 @@ struct cmd_params_instance {
         cparams.type_k = type_k;
         cparams.type_v = type_v;
         cparams.mul_mat_q = mul_mat_q;
+        cparams.offload_kqv = !no_kv_offload;

         return cparams;
     }
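Note the negation: the flag is phrased as "no KV offload" while `llama_context_params::offload_kqv` is positive, so `-nkvo 1` turns offloading off. A minimal standalone sketch of the same wiring (the helper name is invented for illustration):

    #include "llama.h"

    // Hypothetical helper, not part of the patch: builds context params
    // with KV offloading toggled the way cmd_params_instance does above.
    static llama_context_params make_ctx_params(bool no_kv_offload) {
        llama_context_params cparams = llama_context_default_params();
        cparams.offload_kqv = !no_kv_offload; // keep KV cache on CPU when set
        return cparams;
    }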
@@ -444,6 +457,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model        = */ m,
@@ -455,6 +469,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
             /* .n_threads    = */ nt,
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,
+            /* .no_kv_offload= */ nkvo,
             /* .mul_mat_q    = */ mmq,
             /* .tensor_split = */ ts,
         };
@@ -476,6 +491,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
@@ -491,6 +507,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
                 /* .main_gpu     = */ mg,
+                /* .no_kv_offload= */ nkvo,
                 /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
             };
@@ -511,6 +528,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
                 /* .main_gpu     = */ mg,
+                /* .no_kv_offload= */ nkvo,
                 /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
             };
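The stacked range-for loops at these three sites enumerate the full cross product of every parameter vector, so the single added `nkvo` loop multiplies the generated test matrix by the number of `-nkvo` values. A toy illustration of the pattern (all names and values are placeholders):

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int>  n_gpu_layers  = {0, 99};
        std::vector<bool> no_kv_offload = {false, true};
        // Same stacked-loop pattern: 2 * 2 = 4 instances are produced.
        for (const auto & nl   : n_gpu_layers)
        for (const auto & nkvo : no_kv_offload) {
            std::printf("instance: ngl=%d nkvo=%d\n", nl, (int) nkvo);
        }
        return 0;
    }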
@@ -559,6 +577,7 @@ struct test {
     ggml_type type_v;
     int n_gpu_layers;
     int main_gpu;
+    bool no_kv_offload;
     bool mul_mat_q;
     std::array<float, LLAMA_MAX_DEVICES> tensor_split;
     int n_prompt;
@@ -579,6 +598,7 @@ struct test {
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
+        no_kv_offload = inst.no_kv_offload;
         mul_mat_q = inst.mul_mat_q;
         tensor_split = inst.tensor_split;
         n_prompt = inst.n_prompt;
@@ -640,7 +660,8 @@ struct test {
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_threads", "type_k", "type_v",
-            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
+            "n_gpu_layers", "main_gpu", "no_kv_offload",
+            "mul_mat_q", "tensor_split",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
@@ -659,7 +680,7 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q") {
+            field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
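Registering `no_kv_offload` as BOOL here matters downstream: the structured printers in this file consult this classification when deciding how to render a field, so the value comes out as a boolean/integer rather than an opaque string. Any new column must be added to this classifier as well as to the fields list.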
@@ -690,7 +711,8 @@ struct test {
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload),
+            std::to_string(mul_mat_q), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
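The value order here must stay in lockstep with the fields list above; both insert `no_kv_offload` between `main_gpu` and `mul_mat_q`. A hypothetical sanity check, assuming the `test` accessors keep the shapes they have in this file (not part of the patch):

    #include <cassert>

    // A length mismatch between the two lists would silently shift
    // every column after the insertion point.
    static void check_schema(const test & t) {
        assert(test::get_fields().size() == t.get_values().size());
    }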
@@ -851,6 +873,9 @@ struct markdown_printer : public printer {
         if (field == "mul_mat_q") {
             return "mmq";
         }
+        if (field == "no_kv_offload") {
+            return "nkvo";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -885,6 +910,9 @@ struct markdown_printer : public printer {
         if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
             fields.push_back("mul_mat_q");
         }
+        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
+            fields.push_back("no_kv_offload");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.push_back("tensor_split");
         }
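With this guard, the markdown table only gains an `nkvo` column (via the `no_kv_offload` to `nkvo` display-name mapping above) when the flag is varied or set away from its default, so existing default runs keep their table layout unchanged.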