@@ -4697,6 +4697,116 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     }
 }
 
+#ifdef GGML_USE_K_QUANTS
+ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
+    }
+
+    return new_type;
+}
+#endif
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
     llama_ftype ftype = params->ftype;
@@ -4782,12 +4892,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
     std::vector<uint8_t> read_data;
@@ -4838,101 +4942,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(ml->get_arch());
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
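For readers who want to see which layers the use_more_bits() heuristic moved into get_k_quant_type() actually selects, here is a minimal standalone sketch. It is not part of the patch; the 32-layer count is an assumption chosen only for illustration.

// Standalone illustration (not from the patch): print the layers that the
// use_more_bits() heuristic upgrades, assuming a hypothetical 32-layer model.
#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    // Same expression as the lambda inside get_k_quant_type().
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layers = 32; // assumed layer count, for illustration only
    for (int i = 0; i < n_layers; ++i) {
        if (use_more_bits(i, n_layers)) {
            std::printf("layer %2d gets the higher-bit quantization type\n", i);
        }
    }
    return 0;
}

With these assumptions the first and last eighth of the layers, plus every third layer in between, are the ones quantized with more bits.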