@@ -16567,7 +16567,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                     difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
@@ -16710,7 +16710,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
                     difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
-            else difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         ++qs.i_attention_wq;
     } else if (name.find("attn_output.weight") != std::string::npos) {
@@ -16737,7 +16737,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q5_K :
                     difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
@@ -16787,7 +16787,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
                     difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             }
-            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("ffn_gate") != std::string::npos) {
@@ -16983,7 +16983,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
                     difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
-            else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
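All five hunks fix the same bug class: in the `else` branches, a bare ternary was evaluated as an expression statement and its result discarded, so `new_type` silently kept its previous value instead of receiving the intended fallback quantization type. Each hunk adds the missing `new_type =` assignment. A minimal standalone sketch of the pattern follows; the enum and its values are hypothetical stand-ins for illustration, not llama.cpp's actual `ggml_type`:

#include <cstdio>

// Hypothetical stand-ins for GGML quantization type ids.
enum demo_type { DEMO_Q5_K = 13, DEMO_Q6_K = 14 };

int main() {
    demo_type new_type = DEMO_Q5_K;
    bool want_higher = true;

    // Before the fix: a bare ternary is an expression statement whose
    // value is discarded, so new_type is never updated. Compilers can
    // flag this pattern (e.g. with -Wunused-value).
    want_higher ? DEMO_Q6_K : DEMO_Q5_K;
    std::printf("without assignment: %d\n", new_type);  // still 13

    // After the fix: assign the ternary's result, as each hunk now does.
    new_type = want_higher ? DEMO_Q6_K : DEMO_Q5_K;
    std::printf("with assignment:    %d\n", new_type);  // now 14

    return 0;
}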