From 63c10d1804aabb80d11db7cf6eefb254117d16d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 25 Oct 2024 15:29:12 +0200
Subject: [PATCH] Convert: mixed k-quant with legacy fallback

---
 examples/cli/main.cpp | 33 +++++++++++++++++++++++++++--
 model.cpp             | 48 ++++++++++++++++++++++++++++++-------------
 model.h               |  8 +++++---
 stable-diffusion.h    |  2 +-
 4 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index f1bdc698b..18c405cd7 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -82,6 +82,7 @@ struct SDParams {
     std::string stacked_id_embeddings_path;
     std::string input_id_images_path;
     sd_type_t wtype = SD_TYPE_COUNT;
+    sd_type_t ftype = SD_TYPE_COUNT;
     std::string lora_model_dir;
     std::string output_path = "output.png";
     std::string input_path;
@@ -126,7 +127,8 @@ void print_params(SDParams params) {
     printf("    n_threads:         %d\n", params.n_threads);
     printf("    mode:              %s\n", modes_str[params.mode]);
     printf("    model_path:        %s\n", params.model_path.c_str());
-    printf("    wtype:             %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
+    printf("    wtype:             %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
+    printf("    fallback_type:     %s\n", params.ftype < SD_TYPE_COUNT ? sd_type_name(params.ftype) : "unspecified");
     printf("    clip_l_path:       %s\n", params.clip_l_path.c_str());
     printf("    clip_g_path:       %s\n", params.clip_g_path.c_str());
     printf("    t5xxl_path:        %s\n", params.t5xxl_path.c_str());
@@ -190,6 +192,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --upscale-repeats                Run the ESRGAN upscaler this many times (default 1)\n");
     printf("  --type [TYPE]                    weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)\n");
     printf("                                   If not specified, the default is the type of the weight file\n");
+    printf("  --fallback-type [TYPE]           weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0) to use as a fallback for convert\n");
+    printf("                                   Only used when --type is q2_k, q3_k or q4_k. The default is the type of the weight file\n");
     printf("  --lora-model-dir [DIR]           lora model directory\n");
     printf("  -i, --init-img [IMAGE]           path to the input image, required by img2img\n");
     printf("  --control-image [IMAGE]          path to image condition, control net\n");
@@ -355,6 +359,31 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                         type.c_str());
                 exit(1);
             }
+        } else if (arg == "--fallback-type") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            std::string type = argv[i];
+            if (type == "f32") {
+                params.ftype = SD_TYPE_F32;
+            } else if (type == "f16") {
+                params.ftype = SD_TYPE_F16;
+            } else if (type == "q4_0") {
+                params.ftype = SD_TYPE_Q4_0;
+            } else if (type == "q4_1") {
+                params.ftype = SD_TYPE_Q4_1;
+            } else if (type == "q5_0") {
+                params.ftype = SD_TYPE_Q5_0;
+            } else if (type == "q5_1") {
+                params.ftype = SD_TYPE_Q5_1;
+            } else if (type == "q8_0") {
+                params.ftype = SD_TYPE_Q8_0;
+            } else {
+                fprintf(stderr, "error: invalid fallback weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
+                        type.c_str());
+                exit(1);
+            }
         } else if (arg == "--lora-model-dir") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -694,7 +723,7 @@ int main(int argc, const char* argv[]) {
     }
 
     if (params.mode == CONVERT) {
-        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype);
+        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.ftype);
         if (!success) {
             fprintf(stderr,
                     "convert '%s'/'%s' to '%s' failed\n",
diff --git a/model.cpp b/model.cpp
index 26451cdc5..9025e1669 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1758,9 +1758,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
 bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
     const std::string& name = tensor_storage.name;
     if (type != GGML_TYPE_COUNT) {
-        if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
-            // Pass, do not convert
-        } else if (ends_with(name, ".bias")) {
+        if (ends_with(name, ".bias")) {
             // Pass, do not convert
         } else if (ends_with(name, ".scale")) {
             // Pass, do not convert
@@ -1786,11 +1784,37 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
     return false;
 }
 
-bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
+bool ModelLoader::tensor_can_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
+    return !ggml_is_quantized(type) || tensor_storage.ne[0] % ggml_blck_size(type) == 0;
+}
+
+void ModelLoader::tensor_set_type(ggml_type& tensor_type, const TensorStorage& tensor_storage, ggml_type type, ggml_type fallback_type) {
+    if (tensor_should_be_converted(tensor_storage, type)) {
+        if (tensor_can_be_converted(tensor_storage, type)) {
+            tensor_type = type;
+        } else {
+            if ((type == GGML_TYPE_Q2_K ||
+                 type == GGML_TYPE_Q3_K ||
+                 type == GGML_TYPE_Q4_K ||
+                 type == GGML_TYPE_Q5_K ||
+                 type == GGML_TYPE_Q6_K ||
+                 type == GGML_TYPE_Q8_K) &&
+                fallback_type != GGML_TYPE_COUNT) {
+                // try the fallback quant instead of the k-quant
+                if (tensor_can_be_converted(tensor_storage, fallback_type)) {
+                    // fallback works
+                    tensor_type = fallback_type;
+                }
+            }
+        }
+    }
+}
+
+bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, ggml_type fallback_type /*= GGML_TYPE_COUNT*/) {
     auto backend    = ggml_backend_cpu_init();
     size_t mem_size = 1 * 1024 * 1024;  // for padding
     mem_size += tensor_storages.size() * ggml_tensor_overhead();
-    mem_size += get_params_mem_size(backend, type);
+    mem_size += get_params_mem_size(backend, type, fallback_type);
 
     LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
     ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false});
@@ -1800,9 +1824,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         const std::string& name = tensor_storage.name;
 
         ggml_type tensor_type = tensor_storage.type;
-        if (tensor_should_be_converted(tensor_storage, type)) {
-            tensor_type = type;
-        }
+        tensor_set_type(tensor_type, tensor_storage, type, fallback_type);
 
         ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
         if (tensor == NULL) {
@@ -1836,7 +1858,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
     return success;
 }
 
-int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) {
+int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type, ggml_type fallback_type /*= GGML_TYPE_COUNT*/) {
     size_t alignment = 128;
     if (backend != NULL) {
         alignment = ggml_backend_get_alignment(backend);
@@ -1851,16 +1873,14 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     }
 
     for (auto& tensor_storage : processed_tensor_storages) {
-        if (tensor_should_be_converted(tensor_storage, type)) {
-            tensor_storage.type = type;
-        }
+        tensor_set_type(tensor_storage.type, tensor_storage, type, fallback_type);
         mem_size += tensor_storage.nbytes() + alignment;
     }
 
     return mem_size;
 }
 
-bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
+bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, sd_type_t fallback_type /*= SD_TYPE_COUNT*/) {
     ModelLoader model_loader;
 
     if (!model_loader.init_from_file(input_path)) {
@@ -1874,6 +1894,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
             return false;
         }
     }
-    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type);
+    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, (ggml_type)fallback_type);
     return success;
 }
diff --git a/model.h b/model.h
index 4efbdf813..6beb5c7a5 100644
--- a/model.h
+++ b/model.h
@@ -157,12 +157,14 @@ class ModelLoader {
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                       ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});
-    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool save_to_gguf_file(const std::string& file_path, ggml_type type, ggml_type fallback_type = GGML_TYPE_COUNT);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
-    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
+    bool tensor_can_be_converted(const TensorStorage& tensor_storage, ggml_type type);
+    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT, ggml_type fallback_type = GGML_TYPE_COUNT);
     ~ModelLoader() = default;
+    void tensor_set_type(ggml_type& tensor_type, const TensorStorage& tensor_storage, ggml_type type = GGML_TYPE_COUNT, ggml_type fallback_type = GGML_TYPE_COUNT);
 
-    static std::string load_merges();
+    static std::string load_merges();
     static std::string load_t5_tokenizer_json();
 };
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 812e8fc94..f461a5317 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -208,7 +208,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
 
-SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type);
+SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, enum sd_type_t fallback_type = SD_TYPE_COUNT);
 
 SD_API uint8_t* preprocess_canny(uint8_t* img,
                                  int width,
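
--
Usage note (editor's sketch, not part of the patch): k-quants (q2_k ... q8_k) pack
weights into 256-element super-blocks, while the legacy quants (q4_0 ... q8_0) use
32-element blocks and f16/f32 have no block constraint. A tensor whose first
dimension is not a multiple of 256 therefore cannot be k-quantized; before this
patch such tensors were simply left at their original type. With a fallback type,
they are converted to the given legacy quant instead. A minimal sketch of driving
the extended convert() API directly (file names are placeholders, and SD_TYPE_Q4_K
is assumed to be among the sd_type_t values exposed by stable-diffusion.h):

    #include <cstdio>
    #include "stable-diffusion.h"

    int main() {
        // Quantize to q4_k; any tensor whose first dimension is not a multiple
        // of the 256-wide k-quant super-block falls back to q8_0, which only
        // requires a multiple of 32.
        bool ok = convert("model.safetensors",  // placeholder input path
                          "",                   // no external VAE
                          "model-q4_k.gguf",    // placeholder output path
                          SD_TYPE_Q4_K,         // target type
                          SD_TYPE_Q8_0);        // legacy fallback
        if (!ok) {
            fprintf(stderr, "conversion failed\n");
            return 1;
        }
        return 0;
    }

From the CLI the equivalent should be something like
`sd -M convert -m model.safetensors -o model-q4_k.gguf --type q4_k --fallback-type q8_0`.
Omitting --fallback-type (it defaults to SD_TYPE_COUNT) preserves the previous
behavior: tensors that cannot take the k-quant keep the type stored in the weight file.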