@@ -15847,9 +15847,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);

+    auto use_few_bits = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8;
+    };
+    // few_bits has a broad 25% bump to the upper quant.
+    auto use_some_bits = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+    };
+    // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
+    // The intervals of 3 are replaced by a broad bump in the central layers. some_bits has a broad 37.5% bump to the upper quant.
     auto use_more_bits = [](int i_layer, int n_layers) -> bool {
-        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
+        return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+    };
+    // more_bits has a broad 50% bump to the upper quant.
+    auto use_many_bits = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 4*n_layers/8);
     };
+    // many_bits has a broad 75% bump to the upper quant.
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
         if (n_expert > 1) {
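The four selectors differ only in how wide a band of layers they promote. As a sanity check, the standalone sketch below (not part of the patch) reproduces the lambdas verbatim and prints which layers each one selects for a hypothetical 32-layer model; the resulting coverage matches the 25% / 37.5% / 50% / 75% figures quoted in the comments.

// bump_coverage.cpp -- standalone sketch, not part of the patch.
// Reproduces the four layer-window lambdas and prints, for a hypothetical
// 32-layer model, which layer indices each selector promotes.
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

int main() {
    auto use_few_bits  = [](int i, int n) -> bool { return i <= n/8 || i > 7*n/8; };
    auto use_some_bits = [](int i, int n) -> bool { return i <= n/8 || i > 7*n/8 || (i >= 2*n/8 && i < 3*n/8); };
    auto use_more_bits = [](int i, int n) -> bool { return i <= n/8 || i > 6*n/8 || (i >= 2*n/8 && i < 3*n/8); };
    auto use_many_bits = [](int i, int n) -> bool { return i <= n/8 || i > 5*n/8 || (i >= 2*n/8 && i < 4*n/8); };

    const int n_layers = 32; // hypothetical layer count
    const std::vector<std::pair<const char *, std::function<bool(int, int)>>> selectors = {
        {"few_bits ", use_few_bits},  {"some_bits", use_some_bits},
        {"more_bits", use_more_bits}, {"many_bits", use_many_bits},
    };
    for (const auto & s : selectors) {
        int hits = 0;
        printf("%s:", s.first);
        for (int i = 0; i < n_layers; ++i) {
            if (s.second(i, n_layers)) { printf(" %2d", i); ++hits; }
        }
        printf("  -> %d/%d = %.1f%%\n", hits, n_layers, 100.0 * hits / n_layers);
    }
    return 0;
}

Note that the switch from `<` / `>=` bounds in the old formulation to `<=` / `>` in the new lambdas shifts each window by one layer, which is what makes the counts land exactly on the quoted percentages.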
@@ -15917,10 +15931,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S;
-            else new_type = GGML_TYPE_IQ4_XS;
-        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
@@ -15969,7 +15980,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
@@ -15988,7 +16000,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
-                    use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+                    use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             }
         }
         ++qs.i_attention_wv;
@@ -16027,9 +16039,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
-            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q4_K;
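The attn_k change is easiest to read as a pure function of (ftype, head configuration, layer index). The sketch below is an illustration only, not patch code; pick_attn_k_type is a hypothetical helper and the quant types are reduced to strings. It mirrors the two new branches: IQ3_M alternates between Q5_K/IQ4_XS (GQA or MoE models) or IQ4_XS/IQ3_S (dense models) over the some_bits window, and IQ3_XL does the same over the wider many_bits window.

// attn_k_choice.cpp -- illustration only, not patch code; pick_attn_k_type
// is a hypothetical helper and quant types are reduced to strings.
#include <cstdio>

// Same windows as the patch's lambdas.
static bool use_some_bits(int i, int n) { return i <= n/8 || i > 7*n/8 || (i >= 2*n/8 && i < 3*n/8); }
static bool use_many_bits(int i, int n) { return i <= n/8 || i > 5*n/8 || (i >= 2*n/8 && i < 4*n/8); }

// high_heads stands in for qs.model.hparams.n_gqa() >= 2 || n_expert >= 2.
static const char * pick_attn_k_type(bool is_iq3_xl, bool high_heads, int i_layer, int n_layer) {
    const bool bump = is_iq3_xl ? use_many_bits(i_layer, n_layer)
                                : use_some_bits(i_layer, n_layer);
    if (high_heads) return bump ? "Q5_K" : "IQ4_XS";
    return bump ? "IQ4_XS" : "IQ3_S";
}

int main() {
    // Example: IQ3_XL attn_k on a hypothetical 32-layer GQA model.
    for (int i = 0; i < 32; ++i) {
        printf("layer %2d -> %s\n", i, pick_attn_k_type(/*is_iq3_xl=*/true, /*high_heads=*/true, i, 32));
    }
    return 0;
}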
@@ -16059,8 +16077,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
-                    use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS ;
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S :
+                    use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ3_S;
@@ -16091,11 +16109,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (use_some_bits(i_layer, n_layer) ||
                  (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
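For ffn_down, the IQ3_M bump condition now composes two windows: every model gets the some_bits band, and large MoE models (n_expert >= 4) additionally get the wider more_bits band. A compact restatement follows; it is a sketch, and should_bump_ffn_down_iq3_m is a hypothetical name, not patch code.

// ffn_down_iq3_m.cpp -- illustrative predicate for the ffn_down IQ3_M
// branch above; a sketch, not the patch's code.
#include <cstdio>

static bool use_some_bits(int i, int n) { return i <= n/8 || i > 7*n/8 || (i >= 2*n/8 && i < 3*n/8); }
static bool use_more_bits(int i, int n) { return i <= n/8 || i > 6*n/8 || (i >= 2*n/8 && i < 3*n/8); }

// Bump ffn_down to Q4_K when the layer falls in the some_bits band, or,
// for large MoE models (n_expert >= 4), also in the wider more_bits band.
static bool should_bump_ffn_down_iq3_m(int i_layer, int n_layer, int n_expert) {
    return use_some_bits(i_layer, n_layer) ||
           (n_expert >= 4 && use_more_bits(i_layer, n_layer));
}

int main() {
    // Example: dense model (n_expert = 1) vs 8-expert MoE, 32 layers.
    for (int i = 0; i < 32; ++i) {
        printf("layer %2d: dense=%d moe8=%d\n", i,
               should_bump_ffn_down_iq3_m(i, 32, 1),
               should_bump_ffn_down_iq3_m(i, 32, 8));
    }
    return 0;
}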
@@ -16193,30 +16213,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_up;
     }
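Across ffn_gate and ffn_up, the flat `i_layer < n_layer/8` rule is thus replaced by a graded ladder: each low-bit ftype pairs one of the four windows with the next quant up. The sketch below restates that mapping as data; it is illustrative only, and whether the IQ2_XS and IQ2_S rows are real bumps depends on each ftype's default tensor type, which this hunk does not show.

// ffn_ladder.cpp -- illustrative restatement of the ffn_gate / ffn_up rules
// after this patch; a sketch, not code from the diff.
#include <cstdio>

struct Rule { const char * ftype; const char * window; const char * bumped_to; };

int main() {
    // Both ffn_gate and ffn_up now share this table (window coverage in parentheses):
    const Rule rules[] = {
        {"Q2_K_L",  "more_bits (50%)",   "Q3_K"},
        {"IQ1_XS",  "few_bits (25%)",    "IQ1_M"},
        {"IQ1_S",   "some_bits (37.5%)", "IQ2_XXS"},
        {"IQ1_M",   "some_bits (37.5%)", "IQ2_XXS"},
        {"IQ1_XL",  "more_bits (50%)",   "IQ2_XXS"},
        {"IQ2_XXS", "some_bits (37.5%)", "IQ2_XS"},
        {"IQ2_XS",  "few_bits (25%)",    "IQ2_XS"},
        {"IQ2_S",   "more_bits (50%)",   "IQ2_S"},
        {"IQ2_M",   "some_bits (37.5%)", "IQ3_XXS"},
        {"IQ2_XL",  "more_bits (50%)",   "IQ3_XXS"},
        {"IQ3_XS",  "more_bits (50%)",   "IQ3_S"},
        {"IQ3_XL",  "many_bits (75%)",   "IQ4_XS"},
    };
    for (const Rule & r : rules) {
        printf("%-8s layers in %-18s -> %s\n", r.ftype, r.window, r.bumped_to);
    }
    return 0;
}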