
Commit fb2bf51

llama : refactor k-quant mixture logic into a function
1 parent 6eeb4d9 commit fb2bf51

File tree

1 file changed: +113 -101 lines

llama.cpp

Lines changed: 113 additions & 101 deletions
@@ -4697,6 +4697,116 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     }
 }
 
+#ifdef GGML_USE_K_QUANTS
+ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
+    }
+
+    return new_type;
+}
+#endif
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
     llama_ftype ftype = params->ftype;
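
Note (commentary, not part of the diff): the use_more_bits() heuristic above upgrades the first eighth and the last eighth of the layers, plus every third layer in between. A minimal standalone sketch, assuming a hypothetical 32-layer model chosen only for illustration, that prints which layer indices the heuristic selects:

#include <cstdio>

// Same expression as the lambda inside get_k_quant_type above.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layers = 32; // assumed layer count, for illustration only
    for (int i = 0; i < n_layers; ++i) {
        printf("layer %2d -> %s\n", i, use_more_bits(i, n_layers) ? "more bits" : "base k-quant");
    }
    return 0;
}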
@@ -4782,12 +4892,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
     std::vector<uint8_t> read_data;
@@ -4838,101 +4942,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(ml->get_arch());
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
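
Note (commentary, not part of the diff): the refactor turns the per-layer counters from locals mutated inline into int * parameters that get_k_quant_type() advances itself across calls. A minimal sketch of that counter-by-pointer pattern, using toy names (pick_bits and the blk.N tensor names are illustrative only, not llama.cpp API):

#include <cstdio>
#include <string>

// Toy helper mirroring the pattern: it both reads and advances the caller's
// running index of attn_v.weight tensors seen so far.
static int pick_bits(const std::string & name, int * i_attention_wv) {
    int bits = 4;                          // default width for this sketch
    if (name.find("attn_v.weight") != std::string::npos) {
        if (*i_attention_wv < 2) bits = 5; // earliest layers get more bits
        ++*i_attention_wv;                 // counter persists across calls
    }
    return bits;
}

int main() {
    int i_attention_wv = 0;
    const char * names[] = { "blk.0.attn_v.weight", "blk.1.attn_v.weight", "blk.2.attn_v.weight" };
    for (const char * name : names) {
        int bits = pick_bits(name, &i_attention_wv);
        printf("%s -> %d bits (counter = %d)\n", name, bits, i_attention_wv);
    }
    return 0;
}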
