From 63c10d1804aabb80d11db7cf6eefb254117d16d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 25 Oct 2024 15:29:12 +0200
Subject: [PATCH] Convert: mixed k-quant with legacy fallback

---
 examples/cli/main.cpp | 33 +++++++++++++++++++++++++++--
 model.cpp             | 48 ++++++++++++++++++++++++++++++-------------
 model.h               |  8 +++++---
 stable-diffusion.h    |  2 +-
 4 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index f1bdc698b..18c405cd7 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -82,6 +82,7 @@ struct SDParams {
     std::string stacked_id_embeddings_path;
     std::string input_id_images_path;
     sd_type_t wtype = SD_TYPE_COUNT;
+    sd_type_t ftype = SD_TYPE_COUNT;
     std::string lora_model_dir;
     std::string output_path = "output.png";
     std::string input_path;
@@ -126,7 +127,8 @@ void print_params(SDParams params) {
     printf("    n_threads:         %d\n", params.n_threads);
     printf("    mode:              %s\n", modes_str[params.mode]);
     printf("    model_path:        %s\n", params.model_path.c_str());
-    printf("    wtype:             %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
+    printf("    wtype:             %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
+    printf("    fallback_type:     %s\n", params.ftype < SD_TYPE_COUNT ? sd_type_name(params.ftype) : "unspecified");
     printf("    clip_l_path:       %s\n", params.clip_l_path.c_str());
     printf("    clip_g_path:       %s\n", params.clip_g_path.c_str());
     printf("    t5xxl_path:        %s\n", params.t5xxl_path.c_str());
@@ -190,6 +192,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --upscale-repeats                Run the ESRGAN upscaler this many times (default 1)\n");
     printf("  --type [TYPE]                    weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)\n");
     printf("                                   If not specified, the default is the type of the weight file\n");
+    printf("  --fallback-type [TYPE]           weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0) to use as a fallback for convert\n");
+    printf("                                   Only used when --type is q2_k, q3_k or q4_k. The default is the type of the weight file\n");
     printf("  --lora-model-dir [DIR]           lora model directory\n");
     printf("  -i, --init-img [IMAGE]           path to the input image, required by img2img\n");
     printf("  --control-image [IMAGE]          path to image condition, control net\n");
@@ -355,6 +359,31 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                         type.c_str());
                 exit(1);
             }
+        } else if (arg == "--fallback-type") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            std::string type = argv[i];
+            if (type == "f32") {
+                params.ftype = SD_TYPE_F32;
+            } else if (type == "f16") {
+                params.ftype = SD_TYPE_F16;
+            } else if (type == "q4_0") {
+                params.ftype = SD_TYPE_Q4_0;
+            } else if (type == "q4_1") {
+                params.ftype = SD_TYPE_Q4_1;
+            } else if (type == "q5_0") {
+                params.ftype = SD_TYPE_Q5_0;
+            } else if (type == "q5_1") {
+                params.ftype = SD_TYPE_Q5_1;
+            } else if (type == "q8_0") {
+                params.ftype = SD_TYPE_Q8_0;
+            } else {
+                fprintf(stderr, "error: invalid fallback weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
+                        type.c_str());
+                exit(1);
+            }
         } else if (arg == "--lora-model-dir") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -694,7 +723,7 @@ int main(int argc, const char* argv[]) {
     }
 
     if (params.mode == CONVERT) {
-        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype);
+        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.ftype);
         if (!success) {
             fprintf(stderr,
                     "convert '%s'/'%s' to '%s' failed\n",
diff --git a/model.cpp b/model.cpp
index 26451cdc5..9025e1669 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1758,9 +1758,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
 bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
     const std::string& name = tensor_storage.name;
     if (type != GGML_TYPE_COUNT) {
-        if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
-            // Pass, do not convert
-        } else if (ends_with(name, ".bias")) {
+        if (ends_with(name, ".bias")) {
             // Pass, do not convert
         } else if (ends_with(name, ".scale")) {
             // Pass, do not convert
@@ -1786,11 +1784,37 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
     return false;
 }
 
-bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
+bool ModelLoader::tensor_can_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
+    return !ggml_is_quantized(type) || tensor_storage.ne[0] % ggml_blck_size(type) == 0;
+}
+
+void ModelLoader::tensor_set_type(ggml_type& tensor_type, const TensorStorage& tensor_storage, ggml_type type, ggml_type fallback_type) {
+    if (tensor_should_be_converted(tensor_storage, type)) {
+        if (tensor_can_be_converted(tensor_storage, type)) {
+            tensor_type = type;
+        } else {
+            if ((type == GGML_TYPE_Q2_K ||
+                 type == GGML_TYPE_Q3_K ||
+                 type == GGML_TYPE_Q4_K ||
+                 type == GGML_TYPE_Q5_K ||
+                 type == GGML_TYPE_Q6_K ||
+                 type == GGML_TYPE_Q8_K) &&
+                fallback_type != GGML_TYPE_COUNT) {
+                // try the fallback quant instead of the k-quant
+                if (tensor_can_be_converted(tensor_storage, fallback_type)) {
+                    // fallback works
+                    tensor_type = fallback_type;
+                }
+            }
+        }
+    }
+}
+
+bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, ggml_type fallback_type /*= GGML_TYPE_COUNT*/) {
     auto backend    = ggml_backend_cpu_init();
     size_t mem_size = 1 * 1024 * 1024;  // for padding
     mem_size += tensor_storages.size() * ggml_tensor_overhead();
-    mem_size += get_params_mem_size(backend, type);
+    mem_size += get_params_mem_size(backend, type, fallback_type);
 
     LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
     ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false});
@@ -1800,9 +1824,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         const std::string& name = tensor_storage.name;
 
         ggml_type tensor_type = tensor_storage.type;
-        if (tensor_should_be_converted(tensor_storage, type)) {
-            tensor_type = type;
-        }
+        tensor_set_type(tensor_type, tensor_storage, type, fallback_type);
 
         ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
         if (tensor == NULL) {
@@ -1836,7 +1858,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
     return success;
 }
 
-int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) {
+int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type, ggml_type fallback_type /*= GGML_TYPE_COUNT*/) {
     size_t alignment = 128;
     if (backend != NULL) {
         alignment = ggml_backend_get_alignment(backend);
@@ -1851,16 +1873,14 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     }
 
     for (auto& tensor_storage : processed_tensor_storages) {
-        if (tensor_should_be_converted(tensor_storage, type)) {
-            tensor_storage.type = type;
-        }
+        tensor_set_type(tensor_storage.type, tensor_storage, type, fallback_type);
         mem_size += tensor_storage.nbytes() + alignment;
     }
 
     return mem_size;
 }
 
-bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
+bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, sd_type_t fallback_type /*= SD_TYPE_COUNT*/) {
     ModelLoader model_loader;
 
     if (!model_loader.init_from_file(input_path)) {
@@ -1874,6 +1894,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
             return false;
         }
     }
-    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type);
+    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, (ggml_type)fallback_type);
     return success;
 }
diff --git a/model.h b/model.h
index 4efbdf813..6beb5c7a5 100644
--- a/model.h
+++ b/model.h
@@ -157,12 +157,14 @@ class ModelLoader {
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                       ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});
-    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool save_to_gguf_file(const std::string& file_path, ggml_type type, ggml_type fallback_type = GGML_TYPE_COUNT);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
-    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
+    bool tensor_can_be_converted(const TensorStorage& tensor_storage, ggml_type type);
+    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT, ggml_type fallback_type = GGML_TYPE_COUNT);
     ~ModelLoader() = default;
+    void tensor_set_type(ggml_type& tensor_type, const TensorStorage& tensor_storage, ggml_type type = GGML_TYPE_COUNT, ggml_type fallback_type = GGML_TYPE_COUNT);
 
-    static std::string load_merges();
+    static std::string load_merges();
     static std::string load_t5_tokenizer_json();
 };
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 812e8fc94..f461a5317 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -208,7 +208,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
 
-SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type);
+SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, enum sd_type_t fallback_type = SD_TYPE_COUNT);
 
 SD_API uint8_t* preprocess_canny(uint8_t* img,
                                  int width,
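
--
Usage note (editor's sketch, not part of the patch): k-quants (q2_k ... q8_k) pack
weights into 256-element super-blocks, while the legacy quants (q4_0 ... q8_0) use
32-element blocks and f16/f32 have no block constraint. A tensor whose first
dimension is not a multiple of 256 therefore cannot be k-quantized; before this
patch such tensors were simply left at their original type. With a fallback type,
they are converted to the given legacy quant instead. A minimal sketch of driving
the extended convert() API directly (file names are placeholders, and SD_TYPE_Q4_K
is assumed to be among the sd_type_t values exposed by stable-diffusion.h):

    #include <cstdio>
    #include "stable-diffusion.h"

    int main() {
        // Quantize to q4_k; any tensor whose first dimension is not a multiple
        // of the 256-wide k-quant super-block falls back to q8_0, which only
        // requires a multiple of 32.
        bool ok = convert("model.safetensors",  // placeholder input path
                          "",                   // no external VAE
                          "model-q4_k.gguf",    // placeholder output path
                          SD_TYPE_Q4_K,         // target type
                          SD_TYPE_Q8_0);        // legacy fallback
        if (!ok) {
            fprintf(stderr, "conversion failed\n");
            return 1;
        }
        return 0;
    }

From the CLI the equivalent should be something like
`sd -M convert -m model.safetensors -o model-q4_k.gguf --type q4_k --fallback-type q8_0`.
Omitting --fallback-type (it defaults to SD_TYPE_COUNT) preserves the previous
behavior: tensors that cannot take the k-quant keep the type stored in the weight file.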