Skip to content

Commit e693074

Browse files
committed
ggml : sync latest ggml
- New Q4 and Q5 formats
- Various improvements
1 parent d652cf1 commit e693074

File tree

10 files changed

+4737
-2327
lines changed

10 files changed

+4737
-2327
lines changed

examples/common-ggml.cpp

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
77
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
88
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
9-
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
109
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
1110
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
1211
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
4645
switch (ftype) {
4746
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
4847
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
49-
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
5048
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
5149
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
5250
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
171169
{
172170
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
173171
} break;
174-
case GGML_TYPE_Q4_2:
175-
{
176-
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
177-
} break;
178172
case GGML_TYPE_Q5_0:
179173
{
180174
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());

examples/common.cpp

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
3838
} else if (arg == "-h" || arg == "--help") {
3939
gpt_print_usage(argc, argv, params);
4040
exit(0);
41+
} else if (arg == "-f" || arg == "--file") {
42+
if (++i > argc) {
43+
fprintf(stderr, "Invalid file param");
44+
break;
45+
}
46+
std::ifstream file(argv[i]);
47+
if (!file) {
48+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
49+
break;
50+
}
51+
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
52+
if (params.prompt.back() == '\n') {
53+
params.prompt.pop_back();
54+
}
4155
} else {
4256
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
4357
gpt_print_usage(argc, argv, params);
@@ -57,6 +71,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
5771
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
5872
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
5973
fprintf(stderr, " prompt to start generation with (default: random)\n");
74+
fprintf(stderr, " -f FNAME, --file FNAME\n");
75+
fprintf(stderr, " load prompt from a file\n");
6076
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
6177
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
6278
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
@@ -192,6 +208,10 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
192208
return result;
193209
}
194210

211+
void gpt_vocab::add_special_token(const std::string & token) {
212+
special_tokens.push_back(token);
213+
}
214+
195215
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
196216
std::vector<std::string> words;
197217

@@ -200,6 +220,20 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
200220
std::string str = text;
201221
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
202222

223+
// Generate the subpattern from the special_tokens vector if it's not empty
224+
if (!vocab.special_tokens.empty()) {
225+
std::string special_tokens_subpattern;
226+
for (const auto & token : vocab.special_tokens) {
227+
if (!special_tokens_subpattern.empty()) {
228+
special_tokens_subpattern += "|";
229+
}
230+
special_tokens_subpattern += token;
231+
}
232+
233+
// Modify the regex pattern with the generated special tokens subpattern
234+
pat = special_tokens_subpattern + "|" + pat;
235+
}
236+
203237
std::regex re(pat);
204238
std::smatch m;
205239

examples/common.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,9 @@ struct gpt_vocab {
5353

5454
std::map<token, id> token_to_id;
5555
std::map<id, token> id_to_token;
56+
std::vector<std::string> special_tokens;
57+
58+
void add_special_token(const std::string & token);
5659
};
5760

5861
// poor-man's JSON parsing

examples/quantize/quantize.cpp

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ struct whisper_hparams {
2525
int32_t n_text_head = 6;
2626
int32_t n_text_layer = 4;
2727
int32_t n_mels = 80;
28-
int32_t f16 = 1;
28+
int32_t ftype = 1;
2929
};
3030

3131
struct whisper_filters {
@@ -79,7 +79,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
7979
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
8080
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
8181
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
82-
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
82+
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
83+
84+
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
85+
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
8386

8487
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
8588
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@@ -91,7 +94,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
9194
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
9295
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
9396
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
94-
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
97+
fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
98+
fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
99+
fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
100+
fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
95101

96102
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
97103
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
@@ -103,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
103109
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
104110
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
105111
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
106-
fout.write((char *) &ftype, sizeof(hparams.f16));
112+
fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
107113
}
108114

109115
// load mel filters

0 commit comments

Comments (0)