Skip to content

Commit 8fc46df

Browse files
committed
Bump a bit ffn_gate and down for some GQA<2 models
1 parent 53b8eaa commit 8fc46df

File tree

1 file changed

+20
-16
lines changed

1 file changed

+20
-16
lines changed

src/llama.cpp

Lines changed: 20 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -16495,6 +16495,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K;
             else new_type = GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+            if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q3_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = GGML_TYPE_IQ2_S;
@@ -16512,7 +16516,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S;
-            new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
         }
         else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
             new_type = GGML_TYPE_Q4_0;
@@ -16833,7 +16837,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16852,7 +16856,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16871,7 +16875,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16886,7 +16890,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16896,7 +16900,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16905,7 +16909,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16924,7 +16928,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17055,7 +17059,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
                            difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -17081,7 +17085,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17100,7 +17104,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17119,7 +17123,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17134,7 +17138,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17144,7 +17148,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17153,7 +17157,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17172,7 +17176,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)

0 commit comments

Comments
 (0)