
Commit 66d575c

ikawrakow and Kawrakow authored
llama : add Q3_K_XS (#5060)
* Add Q3_K_XS - intermediate size between Q2_K and Q3_K_S

* Q3_K_XS: quantize first 1/8 of ffn_down layers with Q4_K

Together with an importance matrix, this brings perplexity for LLaMA-v2-70B below the perplexity of the former Q2_K with an 800 MB smaller quantized model size.

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 5774493 commit 66d575c

3 files changed: +48, -16 lines

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
     { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },

llama.cpp

Lines changed: 46 additions & 16 deletions
@@ -2661,6 +2661,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
 
         default: return "unknown, may not work";
     }
@@ -8765,9 +8766,13 @@ struct quantize_state_internal {
     const llama_model_quantize_params * params;
 
     int n_attention_wv = 0;
-    int n_feed_forward_w2 = 0;
+    int n_ffn_down = 0;
+    int n_ffn_gate = 0;
+    int n_ffn_up = 0;
     int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
+    int i_ffn_down = 0;
+    int i_ffn_gate = 0;
+    int i_ffn_up = 0;
 
     int n_k_quantized = 0;
     int n_fallback = 0;
@@ -8870,8 +8875,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             ++qs.i_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
-            ++qs.i_feed_forward_w2;
+            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
+            ++qs.i_ffn_down;
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -8908,18 +8913,21 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+            new_type = GGML_TYPE_Q2_K;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
         int i_layer, n_layer;
         if (n_expert == 1) {
-            i_layer = qs.i_feed_forward_w2;
-            n_layer = qs.n_feed_forward_w2;
+            i_layer = qs.i_ffn_down;
+            n_layer = qs.n_ffn_down;
         } else {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
-            n_layer = qs.n_feed_forward_w2 / n_expert;
+            n_layer = qs.n_ffn_down / n_expert;
             if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
                 throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
             }
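One detail worth spelling out from the hunk above: for Mixtral-style models (n_expert > 1) the ffn_down expert tensors are not visited in layer order, so the layer index has to be recovered from the tensor name itself rather than from the running i_ffn_down counter. A standalone sketch of that parse, using the same "blk.%d.ffn_down" pattern (the function name is illustrative):

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// Extract the layer index from a tensor name such as "blk.17.ffn_down.weight",
// mirroring the sscanf-based parse in the hunk above (illustrative helper name).
static int layer_from_ffn_down_name(const std::string & name) {
    int i_layer = -1;
    if (std::sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
        throw std::runtime_error("failed to determine layer for tensor " + name);
    }
    return i_layer; // layer_from_ffn_down_name("blk.17.ffn_down.weight") == 17
}
```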
@@ -8928,7 +8936,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             }
         }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
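This is the rule the commit message describes: for Q3_K_XS, exactly as already done for Q2_K_S, the first eighth of the ffn_down tensors is promoted to Q4_K while the rest keep the 3-bit base type. In isolation the rule looks like the sketch below (a hypothetical helper; in llama.cpp this is a branch inside get_k_quant_type, and GGML_TYPE_Q3_K stands in for the base type chosen elsewhere):

```cpp
#include "ggml.h" // for ggml_type / GGML_TYPE_*

// Q3_K_XS ffn_down rule in isolation: spend extra bits (Q4_K) on the first 1/8
// of the layers, keep the 3-bit base type everywhere else. Illustrative only.
static ggml_type ffn_down_type_q3_k_xs(int i_layer, int n_layer) {
    return i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
// For an 80-layer model such as LLaMA-v2-70B, layers 0..9 come out as Q4_K.
```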
@@ -8958,11 +8966,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
-        ++qs.i_feed_forward_w2;
+        ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
@@ -8980,6 +8989,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
+    else if (name.find("ffn_gate") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_gate;
+    }
+    else if (name.find("ffn_up") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_up;
+    }
+    // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
     //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
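These two new branches are where the extra size saving over Q3_K_S comes from: ffn_gate and ffn_up drop to Q2_K on most layers and keep the 3-bit base type only where use_more_bits() marks the layer as worth the extra bits. At the time of this commit that helper is roughly the early/late/every-third-layer heuristic sketched below (reproduced here for readability; see llama.cpp for the authoritative definition):

```cpp
// Approximate shape of llama.cpp's use_more_bits() heuristic: spend extra bits on the
// first and last eighth of the layers, plus every third layer in between.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}
// For Q3_K_XS, ffn_gate / ffn_up keep the 3-bit base type only on layers where this
// returns true; everywhere else they fall back to GGML_TYPE_Q2_K (see the hunk above).
```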
@@ -9034,8 +9057,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
         // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9103,12 +9127,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            ++qs.n_feed_forward_w2;
+            ++qs.n_ffn_down;
+        }
+        else if (name.find("ffn_gate") != std::string::npos) {
+            ++qs.n_ffn_gate;
+        }
+        else if (name.find("ffn_up") != std::string::npos) {
+            ++qs.n_ffn_up;
         }
     }
-    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
+                __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
 
     size_t total_size_org = 0;

llama.h

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
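Because the new value is part of the public llama_ftype enum, Q3_K_XS can also be requested through the C API instead of the quantize tool. A minimal sketch, assuming the placeholder GGUF paths below and using llama_model_quantize() as declared in llama.h:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    // Start from the library defaults and override only what we need.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q3_K_XS; // the value added in this commit
    params.nthread = 8;                          // <= 0 lets the library pick a thread count

    // File names are placeholders; a non-zero return value indicates failure.
    if (llama_model_quantize("model-f16.gguf", "model-q3_k_xs.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```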
