
Commit f1814f1

Rebump attn_v

1 parent b94a9b0

File tree

1 file changed: +25 -11 lines changed

src/llama.cpp

Lines changed: 25 additions & 11 deletions
@@ -18241,14 +18241,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         //     new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         // }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
-            new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
@@ -18270,47 +18270,61 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_seven_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q5_K;
         }
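
For readers skimming the diff: every branch above follows the same shape. The GQA/MoE threshold for bumping attn_v is lowered from n_gqa() >= 4 to >= 2 for the Q2_K_S, Q3_K_S, IQ3_XXS and IQ3_XS ftypes, and the IQ3_S through IQ3_UXL ftypes gain a new first check that sends attn_v straight to Q6_K when the model also has a very large vocabulary (n_vocab >= 127999, e.g. Llama-3-class tokenizers). The sketch below mirrors that shape for a single branch; the struct, the enum and the first_last() helper are simplified stand-ins for illustration only, not the actual definitions in src/llama.cpp (the real IQ3_M branch uses difquant_three_eights_tensors() in its final else).

// Self-contained sketch of the attn_v type-selection pattern shown above,
// modelled on the IQ3_M branch. All names here are simplified stand-ins
// (fake_*), not the real llama.cpp definitions.
#include <cstdint>
#include <cstdio>

enum fake_ggml_type { FAKE_Q4_K, FAKE_Q5_K, FAKE_Q6_K };

struct fake_hparams {
    uint32_t n_vocab;
    uint32_t n_head;
    uint32_t n_head_kv;
    uint32_t n_expert;
    // grouped-query attention factor, as in llama.cpp: query heads per KV head
    uint32_t n_gqa() const { return n_head_kv ? n_head / n_head_kv : 0; }
};

// Stand-in for the difquant_*_tensors() helpers: bump only the first and
// last attn_v tensors (the real IQ3_M branch spreads the bump over 3/8 of them).
static bool first_last(int i, int n) { return i == 0 || i == n - 1; }

static fake_ggml_type pick_attn_v_iq3_m(const fake_hparams & hp, int i_layer, int n_layer) {
    const bool gqa_or_moe = hp.n_gqa() >= 2 || hp.n_expert >= 2;
    if (hp.n_vocab >= 127999 && gqa_or_moe) {
        return FAKE_Q6_K;   // large-vocab fast path added by this commit
    }
    if (gqa_or_moe) {
        return first_last(i_layer, n_layer) ? FAKE_Q6_K : FAKE_Q5_K;
    }
    return first_last(i_layer, n_layer) ? FAKE_Q5_K : FAKE_Q4_K;
}

static const char * type_name(fake_ggml_type t) {
    switch (t) {
        case FAKE_Q4_K: return "Q4_K";
        case FAKE_Q5_K: return "Q5_K";
        case FAKE_Q6_K: return "Q6_K";
    }
    return "?";
}

int main() {
    // Llama-3-style shape: 128256-token vocab, 32 heads, 8 KV heads (GQA = 4)
    const fake_hparams hp = { 128256, 32, 8, 0 };
    printf("layer  0: attn_v -> %s\n", type_name(pick_attn_v_iq3_m(hp,  0, 32)));
    printf("layer 15: attn_v -> %s\n", type_name(pick_attn_v_iq3_m(hp, 15, 32)));
}

Compiled with a plain g++ -std=c++11, the sketch prints Q6_K for every layer of this Llama-3-style configuration, which matches the intent of the new n_vocab gate; with a smaller vocabulary it falls back to the per-layer Q6_K/Q5_K split.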
