@@ -18241,14 +18241,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         // }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
-            new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
@@ -18270,47 +18270,61 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = difquant_seven_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q5_K;
         }
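
For reference, below is a minimal standalone sketch of the selection order this diff introduces for the IQ3_S and larger branches: a large-vocab GQA/MoE model is sent straight to Q6_K, other GQA/MoE models bump only selected attn_v tensors to Q6_K, and the remaining models keep the Q5_K/Q4_K schedule. The pick_attn_v_type() helper, the simplified enum, and the first_last() stand-in for difquant_first_last_tensors() are assumptions for illustration only; the actual code runs inside llama_tensor_get_type() and uses a different difquant_* schedule per ftype in the final branch.

// Illustrative sketch only, not part of the patch.
#include <cstdint>
#include <cstdio>

enum sketch_type { SKETCH_Q4_K, SKETCH_Q5_K, SKETCH_Q6_K }; // stand-ins for ggml_type values

// Assumed behaviour of difquant_first_last_tensors(): true for the first and
// last tensor of the attn_v series.
static bool first_last(int i_layer, int n_layers) {
    return i_layer == 0 || i_layer == n_layers - 1;
}

// Hypothetical helper mirroring the IQ3_S branch after this change.
static sketch_type pick_attn_v_type(uint32_t n_vocab, uint32_t n_gqa, uint32_t n_expert,
                                    int i_attention_wv, int n_attention_wv) {
    const bool gqa_or_moe = n_gqa >= 2 || n_expert >= 2;
    if (n_vocab >= 127999 && gqa_or_moe) {
        return SKETCH_Q6_K; // new large-vocabulary gate added by this diff
    }
    if (gqa_or_moe) {
        return first_last(i_attention_wv, n_attention_wv) ? SKETCH_Q6_K : SKETCH_Q5_K;
    }
    // the real code uses ftype-specific difquant_* schedules here; simplified for the sketch
    return first_last(i_attention_wv, n_attention_wv) ? SKETCH_Q5_K : SKETCH_Q4_K;
}

int main() {
    // a Llama-3-style model (128256 vocab, GQA) gets Q6_K for every attn_v tensor
    printf("%d\n", pick_attn_v_type(128256, 4, 0, 3, 32) == SKETCH_Q6_K);
    // a small non-GQA model keeps Q4_K on its middle layers
    printf("%d\n", pick_attn_v_type(32000, 1, 0, 3, 32) == SKETCH_Q4_K);
    return 0;
}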