From 8f7a7ee3702f88d932a27e874405865c0874c1be Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 19:23:21 +0100
Subject: [PATCH 01/18] Update quantize.cpp - Quant option IQ1_XS

---
 examples/quantize/quantize.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 79e60ea7ba6b9..debbafedcad93 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,13 +26,14 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
+    { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
     { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
     { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
-    { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
+    { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M", },
+    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
     { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },

From 3d88431113de1f18d741b4cfa4ccbf2acf70cb90 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 19:25:31 +0100
Subject: [PATCH 02/18] Update llama.h - Enum IQ1_XS

---
 llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama.h b/llama.h
index 74f0e56de71c6..e83fbe78b7304 100644
--- a/llama.h
+++ b/llama.h
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_XS = 31, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

From 8eff402498c8f6707479408b431597232ef32100 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 19:30:19 +0100
Subject: [PATCH 03/18] Update llama.cpp - Case IQ1_XS

---
 llama.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 61587cb7abf5a..3aafb0a0bc1ca 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3018,6 +3018,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+            case GGML_TYPE_IQ1_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
             case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
             case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -12754,6 +12755,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_XS: default_type = GGML_TYPE_IQ1_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;

From 51ff04e77e6f90e4f086d1fcf73f7210033f323f Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 19:31:51 +0100
Subject: [PATCH 04/18] Update llama.cpp - Fix possible typo

LLAMA_FTYPE should be GGML_TYPE there.
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 3aafb0a0bc1ca..7f1a54e21fca0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12656,7 +12656,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
+        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {

From 1c4da5ddac23de5e29209a05a7e13bccdd8d59e4 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 20:37:11 +0100
Subject: [PATCH 05/18] Update llama.cpp - Embeddings and output tensors strategy.

---
 llama.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 7f1a54e21fca0..6427fbe4aa1c1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12447,6 +12447,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_IQ4_XS;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_Q5_K;
@@ -12462,6 +12466,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q2_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q2_K;
+            new_type = GGML_TYPE_IQ2_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_IQ3_S;
         }

From ddc7701588d10686aba1fec8a392228e7bd56923 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 21:04:01 +0100
Subject: [PATCH 06/18] Update llama.cpp - Non-FFN layer-tensors strategy

---
 llama.cpp | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 6427fbe4aa1c1..5360f13ceff21 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12468,7 +12468,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q2_K;
-            new_type = GGML_TYPE_IQ2_S;
+            else new_type = GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_IQ3_S;
@@ -12477,6 +12477,35 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_IQ3_S;
         }
     }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+        if (name.find("attn_q.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S;
+        }
+        if (name.find("attn_k.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else if (qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ2_XS;
+            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS;
+        }
+        else if (name.find("attn_v.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_Q2_K;
+            ++qs.i_attention_wv;
+        }
+        else if (name.find("attn_output.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_IQ2_XXS;
+        }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
+            else new_type = GGML_TYPE_Q2_K;
+        }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
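
The selection rules that patch 06 introduces key off only two hyperparameters: the expert count (n_expert, 2 or more for MoE models) and the grouped-query-attention ratio n_gqa(), which in llama.cpp is n_head / n_head_kv. The standalone sketch below restates the attn_v.weight tiering outside the diff, with stand-in strings in place of the real ggml_type values, purely for readability; it mirrors the rules as of patch 06, which later patches in this series revise.

    // Illustrative restatement of patch 06's attn_v.weight tiering; stand-in
    // strings replace the ggml_type values assigned by llama_tensor_get_type().
    #include <cstdint>
    #include <cstdio>

    static const char * iq1_xs_attn_v_type(uint32_t n_expert, uint32_t n_gqa) {
        if (n_expert >= 8)               return "IQ4_XS"; // large MoE: spend bits on attention
        if (n_gqa >= 8 || n_expert >= 4) return "IQ4_XS";
        if (n_gqa >= 4 || n_expert >= 2) return "IQ3_S";
        if (n_gqa >= 2)                  return "IQ3_XXS";
        return "Q2_K";                                    // no GQA, no experts
    }

    int main() {
        // Example hyperparameters only; the GQA factors are publicly documented values.
        printf("no GQA (e.g. Llama-2-7B)  -> %s\n", iq1_xs_attn_v_type(0, 1));
        printf("GQA 4  (e.g. Mistral-7B)  -> %s\n", iq1_xs_attn_v_type(0, 4));
        printf("GQA 8  (e.g. Llama-2-70B) -> %s\n", iq1_xs_attn_v_type(0, 8));
        printf("8 experts (e.g. Mixtral)  -> %s\n", iq1_xs_attn_v_type(8, 4));
        return 0;
    }

Under these rules a dense model without GQA keeps its V tensors at Q2_K, while GQA and MoE models are granted progressively more bits on attention.
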
From b3553335a3282940aa7fd415b7c65477e28d6626 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 21:06:46 +0100
Subject: [PATCH 07/18] Update llama.h - change IQ1_XS enum number

From 31 to 32, because IQ1_M will come with 31.
---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index e83fbe78b7304..da9597b677bde 100644
--- a/llama.h
+++ b/llama.h
@@ -117,7 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_XS = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_XS = 32, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

From 066efbb18f5f512a11d845abc2c09721a7247bba Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 23:08:19 +0100
Subject: [PATCH 08/18] Update llama.cpp - adjustments to non-FFN layer tensors

---
 llama.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 5360f13ceff21..2469e5efd8dc1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12479,17 +12479,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if (name.find("attn_q.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S;
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ2_S;
+            else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ2_XS;
+            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XXS;
         }
         if (name.find("attn_k.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
             else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_IQ2_S;
             else if (qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ2_XS;
             else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K;
             else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS;

From 3031c01db0e66499f5f45602d659a39c310701c9 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 00:06:41 +0100
Subject: [PATCH 09/18] Update llama.cpp - correction of a wrong case declaration

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 2469e5efd8dc1..3910d50f74b0f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3018,7 +3018,6 @@ struct llama_model_loader {
             case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
-            case GGML_TYPE_IQ1_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
             case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
             case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -3414,6 +3413,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_XS :return "IQ1_S mix - 1.6-17 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";

From 9c27b0e6ea7e46b7cf9800b73a7486538b58829f Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 01:12:35 +0100
Subject: [PATCH 10/18] Update quantize.cpp - mix label

---
 examples/quantize/quantize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index debbafedcad93..ec9848d8bae23 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,7 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
-    { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization", },
+    { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization mix", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
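
For context on how the new label is reached from the command line: examples/quantize resolves the user-supplied type string against QUANT_OPTIONS before handing the chosen ftype to llama.cpp. The sketch below is a simplified stand-in for that lookup, not the actual code of the example; the integer values are illustrative stand-ins for the llama_ftype enumerators (33 is where LLAMA_FTYPE_MOSTLY_IQ1_XS finally lands in patch 18).

    // Simplified stand-in for the quantize example's type-string lookup.
    #include <cctype>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct quant_option { std::string name; int ftype; std::string desc; };

    static bool try_parse_ftype(const std::vector<quant_option> & options,
                                const std::string & arg, int & ftype_out) {
        std::string upper = arg;
        for (auto & c : upper) c = (char) std::toupper((unsigned char) c);
        for (const auto & opt : options) {
            if (opt.name == upper) { ftype_out = opt.ftype; return true; }
        }
        return false;
    }

    int main() {
        const std::vector<quant_option> options = {
            { "IQ1_S",  24, " 1.56 bpw quantization"        }, // value as in upstream llama.h
            { "IQ1_XS", 33, " 1.6-1.7 bpw quantization mix" }, // value after patch 18
        };
        int ftype = 0;
        if (try_parse_ftype(options, "iq1_xs", ftype)) {
            printf("IQ1_XS resolves to llama_ftype %d\n", ftype); // prints 33
        }
        return 0;
    }
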
From f162b2ef3fd267dfe4076e204e551de8d2d8c07e Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 02:22:04 +0100
Subject: [PATCH 11/18] Update llama.cpp - correct embd.weight (GQA-4) & qkv.weight to K-Quants

Q2_K embedding for GQA-4, because it helps Mistral 7b. I didn't test a model
with an attn.qkv weight, so it's better to be conservative with a K-Quant.
---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 3910d50f74b0f..f38d159f16358 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12467,7 +12467,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
-            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q2_K;
+            if (qs.model.hparams.n_gqa() == 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q2_K;
             else new_type = GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
@@ -12506,7 +12506,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (name.find("attn_qkv.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q2_K;
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||

From 62c1f5b68179e16aafe9162d2bc7c18a790d167d Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 02:25:07 +0100
Subject: [PATCH 12/18] Update llama.cpp typo

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index f38d159f16358..0fabcc75dafb5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3413,7 +3413,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_XS :return "IQ1_S mix - 1.6-17 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_XS :return "IQ1_S mix - 1.6-1.7 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";

From d1839362fc14bdc1b0df3f222cdabc1c5d2450e7 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 09:17:09 +0100
Subject: [PATCH 13/18] Update llama.cpp - remove trailing space

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 0fabcc75dafb5..68f18453b8173 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12486,7 +12486,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (name.find("attn_k.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
             else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
-            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; 
+            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_IQ2_S;
             else if (qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ2_XS;
             else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS;

From eaf9571d9b85712d9e37843e76abaa9ad85bc3a8 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 10:11:46 +0100
Subject: [PATCH 14/18] Update llama.cpp - exception for the IQ2_S token embedding error

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 68f18453b8173..b940ea3d2a7d8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12984,8 +12984,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
         if ((new_type == GGML_TYPE_IQ2_XXS ||
              new_type == GGML_TYPE_IQ2_XS ||
-             new_type == GGML_TYPE_IQ2_S ||
              new_type == GGML_TYPE_IQ1_S ||
+            (new_type == GGML_TYPE_IQ2_S && strcmp(tensor->name, "token_embd.weight")) ||
             (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
             LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);

From 599a4b2cc6fd7ff0b676b64a5951eebb063c2a66 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 13:41:16 +0100
Subject: [PATCH 15/18] Update llama.cpp - switch from IQ4_XS to Q4_K in related cases.

- Q4_K brings a small but worthwhile quality bonus over IQ4_XS for a very
  cheap size cost, especially on the attention K & V tensors.
- Obsessing over size matters little for the smallest models, which are tiny
  anyway and deserve a shift toward quality, while the bigger, actually usable
  models are barely impacted in size and still benefit from the slight quality
  bump of Q4_K over IQ4_XS.
---
 llama.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index b940ea3d2a7d8..c8fa464965708 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12449,7 +12449,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
-            else new_type = GGML_TYPE_IQ4_XS;
+            else new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
@@ -12484,7 +12484,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XXS;
         }
         if (name.find("attn_k.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
             else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_IQ2_S;
@@ -12493,14 +12493,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K;
-            else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
             else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_IQ2_XXS;

From ed4be6bb0d7f6645aca23f2a87b01a634f0a23cd Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Fri, 29 Mar 2024 13:12:27 +0100
Subject: [PATCH 16/18] Update llama.cpp - IQ4_XS output for models with fewer than 8 experts and below GQA 8

- granularity for the QKV tensor when it exists
- also, drop attn.k.weight for Mistral & Yi from IQ2_XS to IQ2_XXS
---
 llama.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 30dae82ab2f6f..6798995edf3b6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12773,7 +12773,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
-            else new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
@@ -12814,7 +12815,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_IQ2_S;
-            else if (qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ2_XS;
             else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (name.find("attn_v.weight") != std::string::npos) {
@@ -12832,7 +12832,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (name.find("attn_qkv.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() > 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_Q2_K;
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
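
The model names cited throughout these messages map onto the n_gqa() thresholds very directly, since n_gqa() in the llama.cpp hparams is simply n_head / n_head_kv. The sketch below computes it for the models the author mentions (head counts are the publicly documented values, used here only for illustration); it also explains the dedicated n_gqa() == 7 branch that the next patch adds for Yi 34b.

    // n_gqa() = n_head / n_head_kv; example values showing which tier each
    // commonly cited model falls into.
    #include <cstdint>
    #include <cstdio>

    static uint32_t n_gqa(uint32_t n_head, uint32_t n_head_kv) {
        return n_head_kv == 0 ? 0 : n_head / n_head_kv;
    }

    int main() {
        struct model { const char * name; uint32_t n_head, n_head_kv; };
        const model models[] = {
            { "Llama-2-7B",    32, 32 }, // 1: no GQA, falls through to the lowest tiers
            { "Mistral-7B",    32,  8 }, // 4: the case patches 11 and 17 tune for
            { "Yi-34B",        56,  8 }, // 7: the dedicated n_gqa() == 7 branch
            { "CodeLlama-34B", 64,  8 }, // 8
            { "Llama-2-70B",   64,  8 }, // 8
        };
        for (const auto & m : models) {
            printf("%-14s -> n_gqa() = %u\n", m.name, n_gqa(m.n_head, m.n_head_kv));
        }
        return 0;
    }
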
From dce3e27ba29e1594ecdc980a2c23b4188ba6d9fa Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 1 Apr 2024 14:11:20 +0200
Subject: [PATCH 17/18] Update llama.cpp - adjustments to attn.v.weight

Q4_K for all MoEs and for models with GQA-4: Mistral (PPL-4096 benefits quite
a lot) and, incidentally, CodeLlama 34b (which is meant for coding anyway and
isn't usable in IQ1 quants). Yi 34b gets IQ3_S for now; more tests are needed
because of large perplexity increases with IQ4_XS and Q4_K on attn.v.weight
with my test model (Kyllene 1.1).
---
 llama.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 6798995edf3b6..49ce7a59280ff 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12819,9 +12819,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K;
-            else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
-            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() == 7) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }

From e4ac8ae720847b674c681e3b6218fc1c67683725 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Fri, 10 May 2024 14:57:26 +0200
Subject: [PATCH 18/18] Update llama.h - respect current numbering

---
 llama.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.h b/llama.h
index 9f89a31fc0cd8..02c8e1ec7d0cb 100644
--- a/llama.h
+++ b/llama.h
@@ -139,8 +139,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_XS = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_BF16 = 33, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_XS = 33, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
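
Once the series is applied, the new file type is reachable either through the quantize example or directly through the C API. The fragment below is a minimal sketch of the latter, assuming the llama.h of this branch; the struct fields shown are the standard llama_model_quantize_params fields, and, as with IQ1_S, an IQ1_XS quantization is expected to be driven with an importance matrix, which the quantize example wires in via its --imatrix option.

    // Minimal sketch: produce an IQ1_XS model through the public quantize API.
    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 3) {
            fprintf(stderr, "usage: %s input-f16.gguf output-iq1_xs.gguf\n", argv[0]);
            return 1;
        }
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype   = LLAMA_FTYPE_MOSTLY_IQ1_XS; // the ftype added by this series
        params.nthread = 8;
        // params.imatrix is normally filled in by the caller from an imatrix file
        // (examples/quantize does this when --imatrix is given).
        if (llama_model_quantize(argv[1], argv[2], &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }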