@@ -16567,7 +16567,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                     difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
@@ -16710,7 +16710,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
                     difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
-            else difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         ++qs.i_attention_wq;
     } else if (name.find("attn_output.weight") != std::string::npos) {
@@ -16737,7 +16737,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q5_K :
                     difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
@@ -16787,7 +16787,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
                     difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("ffn_gate") != std::string::npos) {
@@ -16983,7 +16983,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
                     difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
-            else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
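All five hunks fix the same bug class: in the `else` branches, a bare ternary was evaluated as an expression statement and its result discarded, so `new_type` silently kept its previous value instead of receiving the intended fallback quantization type. Each hunk adds the missing `new_type =` assignment. A minimal standalone sketch of the pattern follows; the enum and its values are hypothetical stand-ins for illustration, not llama.cpp's actual `ggml_type`:

#include <cstdio>

// Hypothetical stand-ins for GGML quantization type ids.
enum demo_type { DEMO_Q5_K = 13, DEMO_Q6_K = 14 };

int main() {
    demo_type new_type = DEMO_Q5_K;
    bool want_higher = true;

    // Before the fix: a bare ternary is an expression statement whose
    // value is discarded, so new_type is never updated. Compilers can
    // flag this pattern (e.g. with -Wunused-value).
    want_higher ? DEMO_Q6_K : DEMO_Q5_K;
    std::printf("without assignment: %d\n", new_type);  // still 13

    // After the fix: assign the ternary's result, as each hunk now does.
    new_type = want_higher ? DEMO_Q6_K : DEMO_Q5_K;
    std::printf("with assignment:    %d\n", new_type);  // now 14

    return 0;
}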