Skip to content

Commit 584b369

Browse files
committed
iq1_s: use IQ2_XXS for attn_output
At a cost of 0.04 extra bpw this gives a big improvement in PPL.
1 parent 92e1d21 commit 584b369

File tree

1 file changed

+3
-0
lines changed

1 file changed

+3
-0
lines changed

llama.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10112,6 +10112,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
1011210112
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
1011310113
++qs.i_ffn_down;
1011410114
}
10115+
else if (name.find("attn_output.weight") != std::string::npos) {
10116+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10117+
}
1011510118
} else if (name.find("attn_v.weight") != std::string::npos) {
1011610119
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
1011710120
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;

0 commit comments

Comments (0)