Commit 82b34d8

KV cache quantized to q8_0
1 parent 9e20231 commit 82b34d8

File tree

14 files changed: +799 −318 lines changed
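For scale, ggml's q8_0 format stores each block of 32 values as 32 int8 quants plus one fp16 scale, i.e. 34 bytes per 32 values (8.5 bits per value), versus 16 bits per value for f16 and 32 for f32. A minimal sketch of what that means for KV cache size, assuming illustrative 7B-style dimensions (n_layer = 32, n_embd = 4096) and a 4096-token context; none of these numbers come from the commit itself:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative dimensions (assumptions, not taken from this commit).
    const uint64_t n_layer = 32, n_embd = 4096, n_ctx = 4096;
    const uint64_t n_elems = 2ULL * n_layer * n_ctx * n_embd; // K and V tensors

    // Bytes per element: f32 = 4, f16 = 2, q8_0 = 34 bytes per 32-value block.
    printf("f32:  %6.0f MiB\n", n_elems * 4.0           / (1024.0 * 1024.0));
    printf("f16:  %6.0f MiB\n", n_elems * 2.0           / (1024.0 * 1024.0));
    printf("q8_0: %6.0f MiB\n", n_elems * (34.0 / 32.0) / (1024.0 * 1024.0));
    return 0;
}

With these dimensions the cache shrinks from 2048 MiB at f16 to 1088 MiB at q8_0, which is where the memory savings of this change come from.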

common/common.cpp

Lines changed: 26 additions & 5 deletions

@@ -198,8 +198,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--kv-type" || arg == "-kvt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -643,8 +665,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf("  --no-penalize-nl      do not penalize newline token\n");
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("  -kvt, --kv-type       the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     printf("  --perplexity          compute perplexity over each ctx window of the prompt\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");

@@ -725,7 +746,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.low_vram   = params.low_vram;
     lparams.mul_mat_q  = params.mul_mat_q;
     lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
+    lparams.kv_type    = params.kv_type;
     lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.logits_all = params.perplexity;

@@ -1191,6 +1212,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "kv_type: %s # default: q8_0\n", ggml_type_name(params.kv_type));
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");

@@ -1205,7 +1227,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);

common/common.h

Lines changed: 2 additions & 1 deletion

@@ -84,9 +84,10 @@ struct gpt_params {
     bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
 
+    ggml_type kv_type = GGML_TYPE_Q8_0; // the type to use for the KV cache
+
     bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
     bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
examples/llama-bench/llama-bench.cpp

Lines changed: 49 additions & 19 deletions

@@ -135,7 +135,7 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> kv_type;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;

@@ -152,7 +152,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_prompt     */ {512},
     /* n_gen        */ {128},
     /* n_batch      */ {512},
-    /* f32_kv       */ {false},
+    /* kv_type      */ {GGML_TYPE_Q8_0},
     /* n_threads    */ {get_num_physical_cores()},
     /* n_gpu_layers */ {99},
     /* main_gpu     */ {0},

@@ -173,7 +173,16 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -p, --n-prompt <n>          (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>             (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf("  -b, --batch-size <n>        (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  --memory-f32 <0|1>          (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+
+    std::string kv_type_default;
+    for (unsigned int i = 0; i < cmd_params_defaults.kv_type.size(); ++i) {
+        if (i > 0) {
+            kv_type_default += ",";
+        }
+        kv_type_default += ggml_type_name(cmd_params_defaults.kv_type[i]);
+    }
+    printf("  -kvt, --kv-type <q8_0|f16|f32> (default: %s)\n", kv_type_default.c_str());
+
     printf("  -t, --threads <n>           (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl N, --n-gpu-layers <n>  (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -mg i, --main-gpu <n>       (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());

@@ -185,7 +194,6 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose               (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
-
 }
 
 static cmd_params parse_cmd_params(int argc, char ** argv) {

@@ -236,13 +244,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-kvt" || arg == "--kv-type") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+
+            std::vector<ggml_type> kvt;
+            for (const std::string & type_name : p) {
+                if (type_name == "q8_0") {
+                    kvt.push_back(GGML_TYPE_Q8_0);
+                } else if (type_name == "f16") {
+                    kvt.push_back(GGML_TYPE_F16);
+                } else if (type_name == "f32") {
+                    kvt.push_back(GGML_TYPE_F32);
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                break;
+            }
+
+            params.kv_type.insert(params.kv_type.end(), kvt.begin(), kvt.end());
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -340,7 +367,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
     if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.kv_type.empty())      { params.kv_type = cmd_params_defaults.kv_type; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }

@@ -356,7 +383,7 @@ struct cmd_params_instance {
     int n_prompt;
     int n_gen;
     int n_batch;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_threads;
     int n_gpu_layers;
     int main_gpu;

@@ -368,7 +395,7 @@ struct cmd_params_instance {
         llama_context_params lparams = llama_context_default_params();
         lparams.n_ctx = n_prompt + n_gen;
         lparams.n_batch = n_batch;
-        lparams.f16_kv = !f32_kv;
+        lparams.kv_type = kv_type;
         lparams.n_gpu_layers = n_gpu_layers;
         lparams.main_gpu = main_gpu;
         lparams.mul_mat_q = mul_mat_q;

@@ -384,7 +411,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
 
     for (const auto & m : params.model)
     for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & kvt : params.kv_type)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
     for (const auto & mmq : params.mul_mat_q)

@@ -396,7 +423,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
             /* .n_prompt     = */ n_prompt,
             /* .n_gen        = */ n_gen,
             /* .n_batch      = */ nb,
-            /* .f32_kv       = */ fk,
+            /* .kv_type      = */ kvt,
             /* .n_threads    = */ nt,
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,

@@ -447,7 +474,7 @@ struct test {
     uint64_t model_n_params;
     int n_batch;
     int n_threads;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;

@@ -467,7 +494,7 @@ struct test {
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        kv_type = inst.kv_type;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
         mul_mat_q = inst.mul_mat_q;

@@ -531,7 +558,7 @@ struct test {
         "cuda", "opencl", "metal", "gpu_blas", "blas",
         "cpu_info", "gpu_info",
         "model_filename", "model_type", "model_size", "model_n_params",
-        "n_batch", "n_threads", "f16_kv",
+        "n_batch", "n_threads", "kv_type",
         "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",

@@ -551,7 +578,7 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            field == "mul_mat_q" || field == "low_vram") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {

@@ -581,7 +608,7 @@ struct test {
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), std::string(ggml_type_name(kv_type)),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),

@@ -765,8 +792,8 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.push_back("n_batch");
         }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.kv_type.size() > 1 || params.kv_type != cmd_params_defaults.kv_type) {
+            fields.push_back("kv_type");
         }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.push_back("main_gpu");

@@ -834,6 +861,9 @@ struct markdown_printer : public printer {
             } else if (field == "t/s") {
                 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                 value = buf;
+            } else if (field == "kv_type") {
+                snprintf(buf, sizeof(buf), "%s", ggml_type_name(t.kv_type));
+                value = buf;
             } else if (vmap.find(field) != vmap.end()) {
                 value = vmap.at(field);
             } else {
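Because the new option goes through the same comma-separated split path as the other llama-bench parameters, several cache types can be compared in one run, crossed with every other multi-valued parameter; an illustrative invocation (model path hypothetical):

./llama-bench -m models/7B/ggml-model-q4_0.gguf -kvt q8_0,f16,f32

Each combination becomes its own test instance, and kv_type is printed as a column in the markdown output whenever more than one value, or a non-default value, was given.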

examples/main/README.md

Lines changed: 2 additions & 2 deletions

@@ -276,9 +276,9 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
 
-### Memory Float 32
+### KV cache type
 
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Defaults to `q8_0`; the `f16` and `f32` alternatives increase memory consumption for at most marginal quality gains.
 
 ### Batch Size
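An illustrative invocation of main with the new flag from the section above (model path and prompt hypothetical); leaving -kvt unset keeps the q8_0 default:

./main -m models/7B/ggml-model-q4_0.gguf -p "Once upon a time" -kvt f16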

examples/quantize-stats/quantize-stats.cpp

Lines changed: 1 addition & 1 deletion

@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx     = 256;
     lparams.seed      = 1;
-    lparams.f16_kv    = false;
+    lparams.kv_type   = GGML_TYPE_F32;
     lparams.use_mlock = false;
 
     model = llama_load_model_from_file(params.model.c_str(), lparams);

examples/save-load-state/save-load-state.cpp

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx     = params.n_ctx;
     lparams.seed      = params.seed;
-    lparams.f16_kv    = params.memory_f16;
+    lparams.kv_type   = params.kv_type;
     lparams.use_mmap  = params.use_mmap;
     lparams.use_mlock = params.use_mlock;

examples/server/README.md

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ Command line options:
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Defaults to `q8_0`; the `f16` and `f32` alternatives increase memory consumption for at most marginal quality gains.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
 - `--numa`: Attempt optimizations that help on some NUMA systems.

examples/server/server.cpp

Lines changed: 26 additions & 3 deletions

@@ -704,8 +704,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("  -kvt, --kv-type       the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     if (llama_mlock_supported())
     {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");

@@ -838,9 +837,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.rope_freq_scale = std::stof(argv[i]);
         }
+        else if (arg == "--kv-type" || arg == "-kvt")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+        }
         else if (arg == "--memory-f32" || arg == "--memory_f32")
         {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         }
         else if (arg == "--threads" || arg == "-t")
         {
