Commit 0da3fd2

llama : valign + remove unused ftype
1 parent 090fca7 commit 0da3fd2

File tree

3 files changed (+72, -76 lines)

examples/quantize/quantize.cpp

Lines changed: 37 additions & 37 deletions
@@ -16,44 +16,44 @@ struct quant_option {
 };
 
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
-    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
-    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
-    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
-    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
-    { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
-    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
-    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
-    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
-    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
-    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
-    { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
-    { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
-    { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
-    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
-    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
-    { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
-    { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
-    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
-    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
-    { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
-    { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+    { "Q4_0",     LLAMA_FTYPE_MOSTLY_Q4_0,     " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_1",     LLAMA_FTYPE_MOSTLY_Q4_1,     " 4.78G, +0.4511 ppl @ Llama-3-8B", },
+    { "Q5_0",     LLAMA_FTYPE_MOSTLY_Q5_0,     " 5.21G, +0.1316 ppl @ Llama-3-8B", },
+    { "Q5_1",     LLAMA_FTYPE_MOSTLY_Q5_1,     " 5.65G, +0.1062 ppl @ Llama-3-8B", },
+    { "IQ2_XXS",  LLAMA_FTYPE_MOSTLY_IQ2_XXS,  " 2.06 bpw quantization", },
+    { "IQ2_XS",   LLAMA_FTYPE_MOSTLY_IQ2_XS,   " 2.31 bpw quantization", },
+    { "IQ2_S",    LLAMA_FTYPE_MOSTLY_IQ2_S,    " 2.5 bpw quantization", },
+    { "IQ2_M",    LLAMA_FTYPE_MOSTLY_IQ2_M,    " 2.7 bpw quantization", },
+    { "IQ1_S",    LLAMA_FTYPE_MOSTLY_IQ1_S,    " 1.56 bpw quantization", },
+    { "IQ1_M",    LLAMA_FTYPE_MOSTLY_IQ1_M,    " 1.75 bpw quantization", },
+    { "Q2_K",     LLAMA_FTYPE_MOSTLY_Q2_K,     " 2.96G, +3.5199 ppl @ Llama-3-8B", },
+    { "Q2_K_S",   LLAMA_FTYPE_MOSTLY_Q2_K_S,   " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization", },
+    { "IQ3_S",    LLAMA_FTYPE_MOSTLY_IQ3_S,    " 3.44 bpw quantization", },
+    { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.66 bpw quantization mix", },
+    { "Q3_K",     LLAMA_FTYPE_MOSTLY_Q3_K_M,   "alias for Q3_K_M" },
+    { "IQ3_XS",   LLAMA_FTYPE_MOSTLY_IQ3_XS,   " 3.3 bpw quantization", },
+    { "Q3_K_S",   LLAMA_FTYPE_MOSTLY_Q3_K_S,   " 3.41G, +1.6321 ppl @ Llama-3-8B", },
+    { "Q3_K_M",   LLAMA_FTYPE_MOSTLY_Q3_K_M,   " 3.74G, +0.6569 ppl @ Llama-3-8B", },
+    { "Q3_K_L",   LLAMA_FTYPE_MOSTLY_Q3_K_L,   " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+    { "IQ4_NL",   LLAMA_FTYPE_MOSTLY_IQ4_NL,   " 4.50 bpw non-linear quantization", },
+    { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", },
+    { "Q4_K",     LLAMA_FTYPE_MOSTLY_Q4_K_M,   "alias for Q4_K_M", },
+    { "Q4_K_S",   LLAMA_FTYPE_MOSTLY_Q4_K_S,   " 4.37G, +0.2689 ppl @ Llama-3-8B", },
+    { "Q4_K_M",   LLAMA_FTYPE_MOSTLY_Q4_K_M,   " 4.58G, +0.1754 ppl @ Llama-3-8B", },
+    { "Q5_K",     LLAMA_FTYPE_MOSTLY_Q5_K_M,   "alias for Q5_K_M", },
+    { "Q5_K_S",   LLAMA_FTYPE_MOSTLY_Q5_K_S,   " 5.21G, +0.1049 ppl @ Llama-3-8B", },
+    { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B", },
+    { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B", },
+    { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B", },
+    { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B", },
+    { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
+    { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing", },
 };
 
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
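For context, each entry in QUANT_OPTIONS pairs a user-facing type name with a llama_ftype value and a short size/perplexity note, and the quantize tool resolves its command-line type argument against this table. Below is a minimal sketch of such a lookup, assuming quant_option exposes name and ftype members; the helper name and logic are illustrative, not the file's actual parser.

```cpp
// Illustrative sketch, not code from this commit: resolve a type name such
// as "Q4_K_M" (or an alias like "Q4_K") against the QUANT_OPTIONS table.
// Assumes quant_option has `name` and `ftype` members, as the diff suggests.
static bool find_ftype(const std::string & name, llama_ftype & out) {
    for (const auto & opt : QUANT_OPTIONS) {
        if (opt.name == name) {
            out = opt.ftype; // first match wins
            return true;
        }
    }
    return false; // unknown name: the caller can print the table and exit
}
```

A first-match scan is also why the note above insists that COPY stay after F32: both rows map to LLAMA_FTYPE_ALL_F32 (value 0), so a lookup by numeric ftype should land on F32 rather than COPY.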

include/llama.h

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_F16  = 1, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
         // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
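Since every llama_ftype enumerator carries an explicit value, retiring LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 by commenting it out leaves all other values untouched and keeps 4 visibly reserved, matching how the removed Q4_2/Q4_3 types are documented. A small self-contained sketch of that property (the enum below is illustrative, not the header's):

```cpp
// Illustrative sketch, not part of llama.h: with explicit enumerator
// values, commenting one member out does not renumber the others.
enum sketch_ftype {
    SKETCH_F16  = 1,
    SKETCH_Q4_0 = 2,
    SKETCH_Q4_1 = 3,
    // SKETCH_Q4_1_SOME_F16 = 4, // retired; the value 4 stays reserved
    SKETCH_Q8_0 = 7,
};
static_assert(SKETCH_Q8_0 == 7, "explicit values survive the removal");
```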

src/llama.cpp

Lines changed: 34 additions & 38 deletions
@@ -4450,40 +4450,36 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 }
 
     switch (ftype) {
-        case LLAMA_FTYPE_ALL_F32: return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
-        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
-        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-            return "Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
-
-        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+        case LLAMA_FTYPE_ALL_F32:        return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:     return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16:    return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0:    return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1:    return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0:    return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1:    return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0:    return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:  return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:  return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:  return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:  return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:  return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:   return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   return "IQ3_S mix - 3.66 bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
@@ -17958,10 +17954,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
     //}
     bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
+    if (new_type == GGML_TYPE_Q2_K    || new_type == GGML_TYPE_Q3_K    || new_type == GGML_TYPE_Q4_K   ||
+        new_type == GGML_TYPE_Q5_K    || new_type == GGML_TYPE_Q6_K    || new_type == GGML_TYPE_IQ4_XS ||
+        new_type == GGML_TYPE_IQ2_XS  || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S  ||
+        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S   || new_type == GGML_TYPE_IQ3_S  ||
         new_type == GGML_TYPE_IQ1_M) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
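The realigned condition enumerates the block-quantized types whose rows must divide evenly into super-blocks; when a tensor's row length does not, convert_incompatible_tensor forces a fallback type. Below is a sketch of the underlying constraint, assuming ggml's usual K-quant super-block size of QK_K = 256 (an assumption, not something this diff states).

```cpp
// Illustrative sketch, not code from this commit: K-quants and i-quants
// pack weights in super-blocks of QK_K values, so a row length that is
// not a multiple of QK_K cannot be represented in those types.
#include <cstdint>

constexpr int64_t QK_K = 256; // assumed super-block size, as in ggml

static bool row_fits_kquants(int64_t nx /* row length, i.e. tensor->ne[0] */) {
    return nx % QK_K == 0; // otherwise a smaller-block fallback is chosen
}
```

For example, a 4096-wide weight row fits (4096 = 16 × 256), while a 1000-wide row would not and would be converted to a compatible fallback type.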
