@@ -18744,7 +18744,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
-                new_type = difquant_seven_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S;
+                new_type = difquant_first_last_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
            else new_type = GGML_TYPE_Q3_K;
        }
        // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_MR) {
@@ -18933,7 +18933,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
-                new_type = (difquant_seven_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S ;
+                new_type = (difquant_seven_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS ;
            else new_type = GGML_TYPE_IQ4_XS;
        }
        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
@@ -19133,11 +19133,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        //     new_type = GGML_TYPE_Q3_K;
        //     else new_type = difquant_seven_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
        // }
-        // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
-        //     if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
-        //         new_type = difquant_seven_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S ;
-        //     else new_type = GGML_TYPE_Q3_K;
-        // }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = difquant_seven_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS ;
+            else new_type = GGML_TYPE_Q3_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_MR) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q5_K :
@@ -19496,7 +19496,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_seven_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+                new_type = (difquant_first_last_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_MR) {
@@ -19640,7 +19640,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_seven_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S ;
+                new_type = (difquant_seven_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS ;
            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_MR) {
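
A note on the pattern every hunk above touches: the `difquant_*` calls are layer-selection predicates that decide which share of a tensor class gets bumped to the higher quant type in each ternary (true branch, e.g. GGML_TYPE_IQ4_XS) versus the baseline type (false branch, e.g. GGML_TYPE_IQ3_S). The sketch below is a minimal, self-contained illustration of that gating pattern only; the eighth-of-the-stack function bodies are assumptions for illustration and are not taken from this commit or from the actual difquant helpers in the tree.

```cpp
// Illustrative sketch only, NOT the implementation in this commit.
// Assumption: difquant_* predicates partition layers by eighths of the stack.
#include <cstdio>

// Hypothetical: only the first and last eighth of the layers get the higher quant.
static bool difquant_first_last_tensors(int i_layer, int n_layer) {
    return i_layer < n_layer/8 || i_layer >= 7*n_layer/8;
}

// Hypothetical: roughly seven eighths of the layers get the higher quant.
static bool difquant_seven_eights_tensors(int i_layer, int n_layer) {
    return i_layer < 7*n_layer/8;
}

int main() {
    const int n_layer = 32;
    for (int i_layer = 0; i_layer < n_layer; ++i_layer) {
        // Mirrors the ternaries in the diff: predicate true -> upper quant type,
        // predicate false -> lower quant type for that layer's tensor.
        std::printf("layer %2d: first_last=%d seven_eights=%d\n",
                    i_layer,
                    (int) difquant_first_last_tensors(i_layer, n_layer),
                    (int) difquant_seven_eights_tensors(i_layer, n_layer));
    }
    return 0;
}
```

Under that assumed shape, a ternary whose two branches name the same type (e.g. `? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS`) simply applies that type to the whole tensor class regardless of the predicate.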