From b860f654464903b2f07351de46aff5e1014c32fd Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 31 Aug 2023 05:41:20 -0600 Subject: [PATCH 1/2] Allow quantize to only copy tensors, other improvements --- examples/quantize/quantize.cpp | 16 +++++++++++++++- llama.cpp | 25 +++++++++++++++++-------- llama.h | 1 + 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index df9a214fc5864..bb0ca5d0c14a0 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -35,6 +35,8 @@ static const std::vector QUANT_OPTIONS = { { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + // Note: Keep COPY after F32 so that ftype 0 does not match COPY. + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; @@ -76,7 +78,12 @@ void usage(const char * executable) { fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); fprintf(stderr, "\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { - printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); + if (it.name != "COPY") { + printf(" %2d or ", it.ftype); + } else { + printf(" "); + } + printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str()); } exit(1); } @@ -121,6 +128,9 @@ int main(int argc, char ** argv) { // export as [inp path]/ggml-model-[ftype].gguf fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; arg_idx++; + if (ftype_str == "COPY") { + params.only_copy = true; + } } else { fname_out = argv[arg_idx]; @@ -133,6 +143,10 @@ int main(int argc, char ** argv) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); return 1; + } else { + if (ftype_str == "COPY") { + params.only_copy = true; + } } arg_idx++; } diff --git a/llama.cpp b/llama.cpp index 95ee6ffe41c3a..0a135744d9a7d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4678,6 +4678,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_arch(*ml, model); llm_load_hparams(*ml, model, 0, 0, 0); + if (params->only_copy) { + ftype = model.ftype; + } + const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); @@ -4764,18 +4768,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // quantize only 2D tensors quantize &= (tensor->n_dims == 2); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= quantized_type != tensor->type; + quantize &= !params->only_copy; enum ggml_type new_type; void * new_data; size_t new_size; - if (!quantize) { - new_type = tensor->type; - new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); - } else { + if (quantize) { new_type = 
quantized_type; #ifdef GGML_USE_K_QUANTS // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -4874,7 +4873,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } #endif - + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + quantize = tensor->type != new_type; + } + if (!quantize) { + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); + } else { const size_t nelements = ggml_nelements(tensor); float * f32_data; @@ -5305,6 +5313,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, + /*.only_copy =*/ false, }; return result; diff --git a/llama.h b/llama.h index 6e5e1df633f7f..422f28527a0c1 100644 --- a/llama.h +++ b/llama.h @@ -164,6 +164,7 @@ extern "C" { enum llama_ftype ftype; // quantize to this llama_ftype bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored } llama_model_quantize_params; // grammar types From 1e05731a33039ef157b8434a037c07671fa015f7 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Fri, 1 Sep 2023 07:57:41 -0600 Subject: [PATCH 2/2] quantize: Use stdout for help message. 
--- examples/quantize/quantize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index bb0ca5d0c14a0..c174be069a922 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -73,10 +73,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std: // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // void usage(const char * executable) { - fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); - fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - fprintf(stderr, "\nAllowed quantization types:\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); + printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { if (it.name != "COPY") { printf(" %2d or ", it.ftype);