@@ -16495,6 +16495,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16495
16495
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K;
16496
16496
else new_type = GGML_TYPE_Q3_K;
16497
16497
}
16498
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
16499
+ if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q3_K;
16500
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
16501
+ }
16498
16502
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
16499
16503
ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
16500
16504
new_type = GGML_TYPE_IQ2_S;
@@ -16512,7 +16516,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16512
16516
}
16513
16517
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
16514
16518
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S;
16515
- new_type = GGML_TYPE_IQ4_XS;
16519
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
16516
16520
}
16517
16521
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
16518
16522
new_type = GGML_TYPE_Q4_0;
@@ -16833,7 +16837,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16833
16837
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
16834
16838
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16835
16839
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
16836
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
16840
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
16837
16841
}
16838
16842
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
16839
16843
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16852,7 +16856,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16852
16856
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
16853
16857
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16854
16858
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16855
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16859
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
16856
16860
}
16857
16861
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
16858
16862
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16871,7 +16875,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16871
16875
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
16872
16876
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16873
16877
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16874
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16878
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
16875
16879
}
16876
16880
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
16877
16881
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16886,7 +16890,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16886
16890
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
16887
16891
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16888
16892
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16889
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16893
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16890
16894
}
16891
16895
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
16892
16896
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16896,7 +16900,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16896
16900
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
16897
16901
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16898
16902
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16899
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16903
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
16900
16904
}
16901
16905
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
16902
16906
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16905,7 +16909,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16905
16909
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16906
16910
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16907
16911
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16908
- else new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16912
+ else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16909
16913
}
16910
16914
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
16911
16915
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16924,7 +16928,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16924
16928
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
16925
16929
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16926
16930
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16927
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16931
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16928
16932
}
16929
16933
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
16930
16934
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17055,7 +17059,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17055
17059
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
17056
17060
difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
17057
17061
} else {
17058
- if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K ;
17062
+ if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K ;
17059
17063
}
17060
17064
}
17061
17065
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -17081,7 +17085,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17081
17085
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
17082
17086
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17083
17087
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
17084
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
17088
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
17085
17089
}
17086
17090
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
17087
17091
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17100,7 +17104,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17100
17104
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
17101
17105
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17102
17106
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17103
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17107
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
17104
17108
}
17105
17109
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
17106
17110
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17119,7 +17123,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17119
17123
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
17120
17124
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17121
17125
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
17122
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
17126
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
17123
17127
}
17124
17128
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
17125
17129
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17134,7 +17138,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17134
17138
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
17135
17139
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17136
17140
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
17137
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
17141
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
17138
17142
}
17139
17143
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
17140
17144
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17144,7 +17148,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17144
17148
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
17145
17149
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17146
17150
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
17147
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
17151
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
17148
17152
}
17149
17153
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
17150
17154
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17153,7 +17157,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17153
17157
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
17154
17158
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17155
17159
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
17156
- else new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
17160
+ else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
17157
17161
}
17158
17162
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
17159
17163
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17172,7 +17176,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17172
17176
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
17173
17177
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17174
17178
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17175
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17179
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17176
17180
}
17177
17181
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
17178
17182
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
0 commit comments