@@ -2661,6 +2661,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
 
         default: return "unknown, may not work";
     }
@@ -8765,9 +8766,13 @@ struct quantize_state_internal {
     const llama_model_quantize_params * params;
 
     int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
+    int n_ffn_down        = 0;
+    int n_ffn_gate        = 0;
+    int n_ffn_up          = 0;
     int i_attention_wv    = 0;
-    int i_feed_forward_w2 = 0;
+    int i_ffn_down        = 0;
+    int i_ffn_gate        = 0;
+    int i_ffn_up          = 0;
 
     int n_k_quantized     = 0;
     int n_fallback        = 0;
@@ -8870,8 +8875,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             ++qs.i_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
-            ++qs.i_feed_forward_w2;
+            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
+            ++qs.i_ffn_down;
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -8908,18 +8913,21 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+            new_type = GGML_TYPE_Q2_K;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
         int i_layer, n_layer;
         if (n_expert == 1) {
-            i_layer = qs.i_feed_forward_w2;
-            n_layer = qs.n_feed_forward_w2;
+            i_layer = qs.i_ffn_down;
+            n_layer = qs.n_ffn_down;
         } else {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
-            n_layer = qs.n_feed_forward_w2 / n_expert;
+            n_layer = qs.n_ffn_down / n_expert;
             if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
                 throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
             }
@@ -8928,7 +8936,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             }
         }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8958,11 +8966,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
-        ++qs.i_feed_forward_w2;
+        ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
@@ -8980,6 +8989,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
+    else if (name.find("ffn_gate") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_gate;
+    }
+    else if (name.find("ffn_up") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_up;
+    }
+    // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
     //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -9034,8 +9057,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
 
         // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S:  quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K; break;
@@ -9103,12 +9127,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            ++qs.n_feed_forward_w2;
+            ++qs.n_ffn_down;
+        }
+        else if (name.find("ffn_gate") != std::string::npos) {
+            ++qs.n_ffn_gate;
+        }
+        else if (name.find("ffn_up") != std::string::npos) {
+            ++qs.n_ffn_up;
         }
     }
-    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
+                __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
 
     size_t total_size_org = 0;
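
For readers who want the net effect of the new file type in one place, below is a minimal, self-contained C++ sketch of the Q3_K_XS tensor-type choices made by get_k_quant_type in the hunks above. It is not part of the patch: pick_q3_k_xs_type and the sketch_type enum are illustrative stand-ins, and use_more_bits() is reproduced here on the assumption that it matches the upstream helper of the same name.

#include <cstdio>
#include <string>

enum sketch_type { SKETCH_Q2_K, SKETCH_Q3_K, SKETCH_Q4_K };

// Assumption: mirrors llama.cpp's use_more_bits() - extra bits for the first and last
// eighth of the layers, plus every third layer in between.
static bool use_more_bits(int i_layer, int n_layer) {
    return i_layer < n_layer/8 || i_layer >= 7*n_layer/8 || (i_layer - n_layer/8)%3 == 2;
}

// Summary of the choices the patched get_k_quant_type() makes for LLAMA_FTYPE_MOSTLY_Q3_K_XS:
//  - ffn_down       : first 1/8 of the layers promoted to Q4_K, the rest keep the Q3_K baseline
//  - ffn_gate/ffn_up: demoted to Q2_K except on the layers selected by use_more_bits()
//  - attn_v         : demoted to Q2_K (for the non-70B case shown in the diff)
static sketch_type pick_q3_k_xs_type(const std::string & name, int i_layer, int n_layer) {
    if (name.find("ffn_down") != std::string::npos) {
        return i_layer < n_layer/8 ? SKETCH_Q4_K : SKETCH_Q3_K;
    }
    if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
        return use_more_bits(i_layer, n_layer) ? SKETCH_Q3_K : SKETCH_Q2_K;
    }
    if (name.find("attn_v.weight") != std::string::npos) {
        return SKETCH_Q2_K;
    }
    return SKETCH_Q3_K; // everything else stays at the Q3_K baseline
}

int main() {
    const char * names[] = { "ffn_down.weight", "ffn_gate.weight", "ffn_up.weight", "attn_v.weight" };
    for (const char * n : names) {
        printf("%-16s -> type %d\n", n, (int) pick_q3_k_xs_type(n, /*i_layer=*/4, /*n_layer=*/32));
    }
    return 0;
}

Assuming the new name is also registered in examples/quantize (not shown in this diff), a model could then be produced with an invocation along the lines of ./quantize model-f16.gguf model-Q3_K_XS.gguf Q3_K_XS.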