From b860f654464903b2f07351de46aff5e1014c32fd Mon Sep 17 00:00:00 2001
From: KerfuffleV2 <kerfliffle@keemail.me>
Date: Thu, 31 Aug 2023 05:41:20 -0600
Subject: [PATCH 1/2] Allow quantize to only copy tensors, other improvements

---
 examples/quantize/quantize.cpp | 16 +++++++++++++++-
 llama.cpp                      | 25 +++++++++++++++++--------
 llama.h                        |  1 +
 3 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index df9a214fc5864..bb0ca5d0c14a0 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
+    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
+    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
 };
 
 
@@ -76,7 +78,12 @@ void usage(const char * executable) {
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     fprintf(stderr, "\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
-        printf("  %2d  or  %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+        if (it.name != "COPY") {
+            printf("  %2d  or  ", it.ftype);
+        } else {
+            printf("          ");
+        }
+        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
@@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
         // export as [inp path]/ggml-model-[ftype].gguf
         fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
+        if (ftype_str == "COPY") {
+            params.only_copy = true;
+        }
     }
     else {
         fname_out = argv[arg_idx];
@@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
         if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
             fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
             return 1;
+        } else {
+            if (ftype_str == "COPY") {
+               params.only_copy = true;
+            }
         }
         arg_idx++;
     }
diff --git a/llama.cpp b/llama.cpp
index 95ee6ffe41c3a..0a135744d9a7d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4678,6 +4678,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4764,18 +4768,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4874,7 +4873,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
@@ -5305,6 +5313,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
+        /*.only_copy                   =*/ false,
     };
 
     return result;
diff --git a/llama.h b/llama.h
index 6e5e1df633f7f..422f28527a0c1 100644
--- a/llama.h
+++ b/llama.h
@@ -164,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;
 
     // grammar types

From 1e05731a33039ef157b8434a037c07671fa015f7 Mon Sep 17 00:00:00 2001
From: KerfuffleV2 <kerfliffle@keemail.me>
Date: Fri, 1 Sep 2023 07:57:41 -0600
Subject: [PATCH 2/2] quantize: Use stdout for help message.

---
 examples/quantize/quantize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index bb0ca5d0c14a0..c174be069a922 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -73,10 +73,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 //  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
-    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    fprintf(stderr, "\nAllowed quantization types:\n");
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf("  %2d  or  ", it.ftype);