@@ -4489,6 +4489,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.70 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
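The new case label only compiles if LLAMA_FTYPE_MOSTLY_IQ4_XSR is also declared in the llama_ftype enum in llama.h, which is outside the hunks shown here. A minimal sketch of what that declaration could look like; the numeric value is a placeholder chosen for illustration, not taken from this diff, and must not collide with existing entries:

// llama.h (sketch only -- the real enum value is not part of this diff)
enum llama_ftype {
    // ... existing LLAMA_FTYPE_MOSTLY_* entries ...
    LLAMA_FTYPE_MOSTLY_IQ4_XSR = 99, // except 1d tensors (placeholder value for illustration)
};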
@@ -15347,10 +15348,17 @@ struct quantize_state_internal {
     const llama_model_quantize_params * params;
 
     int n_attention_wv = 0;
+    int n_attention_wk = 0;
+    int n_attention_wq = 0;
+    int n_attention_wo = 0;
     int n_ffn_down = 0;
     int n_ffn_gate = 0;
     int n_ffn_up = 0;
+
     int i_attention_wv = 0;
+    int i_attention_wk = 0;
+    int i_attention_wq = 0;
+    int i_attention_wo = 0;
     int i_ffn_down = 0;
     int i_ffn_gate = 0;
     int i_ffn_up = 0;
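The new n_attention_wk/wq/wo and i_attention_wk/wq/wo pairs mirror the existing n_attention_wv/i_attention_wv pair: the n_* field is filled by the counting pass over all tensors before quantization starts (see the hunk around line 16050 below), and the i_* field is the running index of the tensor currently being assigned a type. Both feed the per-layer rules added below. For reference, the use_more_bits helper those rules lean on is already present in llama.cpp and looks roughly like this (quoted from memory, so treat it as a sketch rather than the authoritative definition):

// Roughly the existing llama.cpp helper: give extra bits to the first and last
// eighth of the layers, plus every third layer in between.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}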
@@ -15505,6 +15513,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15556,9 +15565,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+            }
+        }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
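A worked illustration of the attn_v rule above for a hypothetical 32-layer GQA model (so qs.n_attention_wv == 32): the first eighth of the layers (0-3) get GGML_TYPE_Q6_K, and because both arms of the inner ternary are GGML_TYPE_Q5_K, every remaining layer gets Q5_K regardless of what use_more_bits returns. The same "first eighth of the layers gets one tier more" pattern, shifted down a tier, recurs below for attn_k, attn_q, attn_output and ffn_down. The sketch only replays the selection; the helper is copied in the same shape as the upstream one quoted earlier:

#include <cstdio>

// Same shape as the llama.cpp helper quoted above.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}

int main() {
    const int n_attention_wv = 32; // hypothetical 32-layer GQA model
    for (int i = 0; i < n_attention_wv; ++i) {
        const char * t = i < n_attention_wv/8 ? "Q6_K"
                       : use_more_bits(i, n_attention_wv) ? "Q5_K" : "Q5_K";
        std::printf("attn_v layer %2d -> %s\n", i, t);
    }
    return 0;
}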
@@ -15606,6 +15624,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
+                           use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+            }
+        }
+        ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
@@ -15618,6 +15643,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
+                           use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+            }
+            else new_type = GGML_TYPE_IQ3_S;
+        }
+        ++qs.i_attention_wq;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -15674,6 +15707,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+            }
+        }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
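For ffn_down (and ffn_gate/ffn_up further down), the per-layer index does not come from the raw i_ffn_down counter but from layer_info(...), which maps MoE expert tensors back onto their layer so that a rule like i_layer < n_layer/8 keeps meaning "first eighth of the layers". As an illustration of the kind of mapping such a helper has to produce (this is a sketch, not the llama.cpp implementation), the layer index can be recovered from tensor names of the blk.<n>. form:

#include <cstdio>

// Sketch: recover the layer index from a tensor name such as "blk.7.ffn_down.weight".
static int layer_from_name(const char * name) {
    int layer = -1;
    if (std::sscanf(name, "blk.%d.", &layer) != 1) {
        return -1; // not a per-layer tensor (e.g. "output.weight")
    }
    return layer;
}

int main() {
    std::printf("%d\n", layer_from_name("blk.7.ffn_down.weight")); // prints 7
    std::printf("%d\n", layer_from_name("output.weight"));         // prints -1
    return 0;
}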
@@ -15682,6 +15721,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -15700,10 +15740,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+                    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                        new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q5_K :
+                                   use_more_bits(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                    }
+                }
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
+        ++qs.i_attention_wo;
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -15723,8 +15770,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        ++qs.i_attention_wv;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -15739,6 +15788,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
@@ -15754,6 +15804,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K;
         ++qs.i_ffn_up;
     }
 
@@ -15900,6 +15951,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_XL:  default_type = GGML_TYPE_IQ1_M;  break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL:  default_type = GGML_TYPE_IQ3_S;  break;
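With the new ftype resolving to GGML_TYPE_IQ4_XS as its default tensor type, it can be requested through the regular quantization API. A usage sketch, assuming the enum value exists in llama.h (not part of the hunks shown here); the file names are placeholders:

#include "llama.h"

int main() {
    // Start from the library defaults and only override the target ftype.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ4_XSR; // assumes the enum addition in llama.h

    // Quantize an f16 GGUF into the new mixed IQ4_XS variant; returns 0 on success.
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_xsr.gguf", &params);
    return rc == 0 ? 0 : 1;
}

From the command line the equivalent would presumably be llama-quantize model-f16.gguf model-iq4_xsr.gguf IQ4_XSR, but that alias only works if the quantize example's type table is also extended with an "IQ4_XSR" entry, which is not shown in this diff.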
@@ -15998,6 +16050,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos ||
             name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
+        } else if (name.find("attn_k.weight") != std::string::npos) {
+            ++qs.n_attention_wk;
+        } else if (name.find("attn_q.weight") != std::string::npos) {
+            ++qs.n_attention_wq;
+        } else if (name.find("attn_output.weight") != std::string::npos) {
+            ++qs.n_attention_wo;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
@@ -16012,6 +16070,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
     //
     GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
+    GGML_ASSERT((qs.n_attention_wk == 0 || qs.n_attention_wk == (int)model.hparams.n_layer || qs.n_attention_wk == 3 * (int)model.hparams.n_layer) && "n_attention_wk is unexpected");
+    GGML_ASSERT((qs.n_attention_wq == 0 || qs.n_attention_wq == (int)model.hparams.n_layer || qs.n_attention_wq == 3 * (int)model.hparams.n_layer) && "n_attention_wq is unexpected");
+    GGML_ASSERT((qs.n_attention_wo == 0 || qs.n_attention_wo == (int)model.hparams.n_layer || qs.n_attention_wo == 3 * (int)model.hparams.n_layer) && "n_attention_wo is unexpected");
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;