
Commit cd92ba6 (1 parent: 3e2eb6d)

IQ4_XSR (test FTYPE) and attention_wv logic for all attn_*.weights
Also, Advise iMatrix for IQ2_M and Q2_K FTypes

File tree: 3 files changed, +68 −5 lines

examples/quantize/quantize.cpp

Lines changed: 6 additions & 5 deletions

@@ -43,6 +43,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 4.03G, +0.5562 ppl @ Llama-3-8B", },
     { "IQ4_NL",  LLAMA_FTYPE_MOSTLY_IQ4_NL,  " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
+    { "IQ4_XSR", LLAMA_FTYPE_MOSTLY_IQ4_XSR, " 4.xx bpw non-linear quantization", },
     { "Q4_K",    LLAMA_FTYPE_MOSTLY_Q4_K_M,  "alias for Q4_K_M", },
     { "Q4_K_S",  LLAMA_FTYPE_MOSTLY_Q4_K_S,  " 4.37G, +0.2689 ppl @ Llama-3-8B", },
     { "Q4_K_M",  LLAMA_FTYPE_MOSTLY_Q4_K_M,  " 4.58G, +0.1754 ppl @ Llama-3-8B", },

@@ -409,13 +410,13 @@ int main(int argc, char ** argv) {
     }

     if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
-        fprintf(stderr, "\n==========================================================================================================\n");
-        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
-        fprintf(stderr, "==========================================================================================================\n\n\n");
+        fprintf(stderr, "\n==========================================================================================\n");
+        fprintf(stderr, "Please do not use IQ1_*, IQ2_*, Q2_K_S, or Q2_K quantization without an importance matrix!\n");
+        fprintf(stderr, "==========================================================================================\n\n\n");
         return 1;
     }
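Note: besides registering the IQ4_XSR name in QUANT_OPTIONS, this hunk widens the no-imatrix guard to cover IQ2_M and Q2_K, as announced in the commit message. In practice that means an invocation such as ./llama-quantize --imatrix model.imatrix model-f16.gguf model-iq4_xsr.gguf IQ4_XSR 8 (binary name and file names are placeholders, assuming a recent build) passes the new IQ4_XSR type through this table, while requesting IQ2_M or Q2_K without --imatrix now prints the banner above and exits with status 1.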

include/llama.h

Lines changed: 1 addition & 0 deletions

@@ -171,6 +171,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_L  = 38, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_XS  = 39, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_XL  = 40, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_XSR = 41, // except 1d tensors
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
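Note: the enum value is the only public-API surface of the new type. A minimal sketch of driving it through the C API rather than the CLI, assuming this patched llama.h is on the include path and using placeholder file names:

// Quantize a GGUF file to the new IQ4_XSR mix via the C API (illustrative sketch, not an official example).
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_XSR; // value 41, added above
    params.nthread = 8;                          // <= 0 lets llama.cpp pick a thread count

    // llama_model_quantize() returns 0 on success.
    if (llama_model_quantize("model-f16.gguf", "model-iq4_xsr.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}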

src/llama.cpp

Lines changed: 61 additions & 0 deletions

@@ -4489,6 +4489,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.70 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
@@ -15347,10 +15348,17 @@ struct quantize_state_internal {
     const llama_model_quantize_params * params;

     int n_attention_wv = 0;
+    int n_attention_wk = 0;
+    int n_attention_wq = 0;
+    int n_attention_wo = 0;
     int n_ffn_down     = 0;
     int n_ffn_gate     = 0;
     int n_ffn_up       = 0;
+
     int i_attention_wv = 0;
+    int i_attention_wk = 0;
+    int i_attention_wq = 0;
+    int i_attention_wo = 0;
     int i_ffn_down     = 0;
     int i_ffn_gate     = 0;
     int i_ffn_up       = 0;
@@ -15505,6 +15513,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15556,9 +15565,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
             use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+            }
+        }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
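Note: the IQ4_XSR branches added here and in the following hunks all share one shape: when the model uses GQA (n_gqa() >= 2) or is a MoE (n_expert >= 2), tensors in the first eighth of the layers get one step more precision, while the rest fall back to the base type (both arms of the use_more_bits ternary resolve to the same type in several of these branches, so that call is effectively a no-op there). A standalone sketch of the attn_v case for a hypothetical 32-layer model; the use_more_bits helper is copied here only for illustration and mirrors the one already defined in llama.cpp:

// Sketch: which quant type attn_v.weight gets per layer under IQ4_XSR (GQA/MoE path).
#include <cstdio>

static bool use_more_bits(int i_layer, int n_layers) {
    // first eighth, last eighth, and every third layer in between
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}

int main() {
    const int n_attention_wv = 32; // hypothetical 32-layer model
    for (int i = 0; i < n_attention_wv; ++i) {
        const char * t = i < n_attention_wv/8 ? "Q6_K"              // layers 0..3
                       : use_more_bits(i, n_attention_wv) ? "Q5_K"  // same result either way,
                       : "Q5_K";                                    // matching the diff above
        printf("blk.%d.attn_v.weight -> %s\n", i, t);
    }
    return 0;
}

With 32 layers, n_attention_wv/8 = 4, so attn_v in layers 0-3 is promoted to Q6_K and layers 4-31 stay at Q5_K.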
@@ -15606,6 +15624,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
+                           use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+            }
+        }
+        ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
@@ -15618,6 +15643,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
+                           use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ3_S;
+        ++qs.i_attention_wq;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -15674,6 +15707,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+            }
+        }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
@@ -15682,6 +15721,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -15700,10 +15740,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+                    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                        new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q5_K :
+                                   use_more_bits(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                    }
+                }
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
+        ++qs.i_attention_wo;
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -15723,8 +15770,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        ++qs.i_attention_wv;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -15739,6 +15788,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {

@@ -15754,6 +15804,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_up;
     }
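Note: for ffn_gate and ffn_up the IQ4_XSR bump is restricted to the first eighth of layers and does not depend on GQA or MoE. For a hypothetical 40-layer model, n_layer/8 = 5, so only the ffn_gate/ffn_up tensors of layers 0-4 are promoted to Q5_K; the remaining layers keep the IQ4_XS default selected below.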

@@ -15900,6 +15951,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_XL:  default_type = GGML_TYPE_IQ1_M;  break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL:  default_type = GGML_TYPE_IQ3_S;  break;
@@ -15998,6 +16050,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos ||
             name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
+        } else if (name.find("attn_k.weight") != std::string::npos) {
+            ++qs.n_attention_wk;
+        } else if (name.find("attn_q.weight") != std::string::npos) {
+            ++qs.n_attention_wq;
+        } else if (name.find("attn_output.weight") != std::string::npos) {
+            ++qs.n_attention_wo;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
@@ -16012,6 +16070,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
     //
     GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
+    GGML_ASSERT((qs.n_attention_wk == 0 || qs.n_attention_wk == (int)model.hparams.n_layer || qs.n_attention_wk == 3 * (int)model.hparams.n_layer) && "n_attention_wk is unexpected");
+    GGML_ASSERT((qs.n_attention_wq == 0 || qs.n_attention_wq == (int)model.hparams.n_layer || qs.n_attention_wq == 3 * (int)model.hparams.n_layer) && "n_attention_wq is unexpected");
+    GGML_ASSERT((qs.n_attention_wo == 0 || qs.n_attention_wo == (int)model.hparams.n_layer || qs.n_attention_wo == 3 * (int)model.hparams.n_layer) && "n_attention_wo is unexpected");

     size_t total_size_org = 0;
     size_t total_size_new = 0;
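Note: the two hunks above make the new per-tensor counters usable. A pre-pass counts every tensor whose name contains attn_k.weight, attn_q.weight, or attn_output.weight, and the added GGML_ASSERTs then require each count to be 0, n_layer, or 3 * n_layer (the encoder-decoder case), mirroring the existing n_attention_wv check. A toy illustration of that expectation, with hypothetical tensor names in the usual blk.N.* layout:

// Toy version of the counting + assert pattern added above; tensor names are made up for the example.
#include <cassert>
#include <string>
#include <vector>

int main() {
    const int n_layer = 4;
    std::vector<std::string> names;
    for (int i = 0; i < n_layer; ++i) {
        names.push_back("blk." + std::to_string(i) + ".attn_k.weight");
        names.push_back("blk." + std::to_string(i) + ".attn_q.weight");
        names.push_back("blk." + std::to_string(i) + ".attn_output.weight");
    }

    int n_attention_wk = 0, n_attention_wq = 0, n_attention_wo = 0;
    for (const auto & name : names) {
        if      (name.find("attn_k.weight")      != std::string::npos) ++n_attention_wk;
        else if (name.find("attn_q.weight")      != std::string::npos) ++n_attention_wq;
        else if (name.find("attn_output.weight") != std::string::npos) ++n_attention_wo;
    }

    // Same shape as the GGML_ASSERTs in the diff: one such tensor per layer (or none, or 3x for enc-dec).
    assert(n_attention_wk == 0 || n_attention_wk == n_layer || n_attention_wk == 3*n_layer);
    assert(n_attention_wq == 0 || n_attention_wq == n_layer || n_attention_wq == 3*n_layer);
    assert(n_attention_wo == 0 || n_attention_wo == n_layer || n_attention_wo == 3*n_layer);
    return 0;
}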
