Skip to content

Commit 120cf37

Browse files
committed
models : add phi-3, mpt, gpt-2, starcoder
1 parent c21ab18 commit 120cf37

20 files changed

+645
-10
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,13 @@ test: $(TEST_TARGETS)
6464
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
6565
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
6666
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
67+
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
6768
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
6869
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
6970
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
7071
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
72+
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
73+
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
7174
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
7275
continue; \
7376
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

convert-hf-to-gguf-update.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,16 @@ class TOKENIZER_TYPE(IntEnum):
4646

4747
# TODO: add models here, base models preferred
4848
models = [
49-
{ "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
50-
{ "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
51-
{ "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
52-
{ "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
53-
{ "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
54-
{ "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
49+
{ "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
50+
{ "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
51+
{ "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
52+
{ "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
53+
{ "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
54+
{ "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
55+
{ "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
56+
{ "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
57+
{ "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
58+
{ "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
5559
]
5660

5761
# make directory "models/tokenizers" if it doesn't exist

convert-hf-to-gguf.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
296296
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
297297
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
298298
res = "bert-bge"
299+
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
300+
# ref: https://huggingface.co/mosaicml/mpt-7b
301+
res = "mpt"
302+
if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
303+
# ref: https://huggingface.co/bigcode/starcoder2-3b
304+
res = "starcoder"
305+
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
306+
# ref: https://huggingface.co/openai-community/gpt2
307+
res = "gpt-2"
299308

300309
if res is None:
301310
print("\n")

llama.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4352,6 +4352,15 @@ static void llm_load_vocab(
43524352
} else if (
43534353
tokenizer_pre == "falcon") {
43544354
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4355+
} else if (
4356+
tokenizer_pre == "mpt") {
4357+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4358+
} else if (
4359+
tokenizer_pre == "starcoder") {
4360+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4361+
} else if (
4362+
tokenizer_pre == "gpt-2") {
4363+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
43554364
} else {
43564365
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
43574366
}
@@ -12124,6 +12133,23 @@ struct llm_tokenizer_bpe {
1212412133
"[0-9][0-9][0-9]",
1212512134
});
1212612135
break;
12136+
case LLAMA_VOCAB_PRE_TYPE_MPT:
12137+
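// TODO: MPT pre-tokenization regexes are unknown (NOTE(review): the GGML_ASSERT below is given a string literal, which is always true, so it presumably never fires - confirm intent)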
12138+
// the regexes below are close, but not exact. to check, run:
12139+
// ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12140+
GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12141+
word_collection = unicode_regex_split(text, {
12142+
"\\s?\\p{L}+",
12143+
"\\s?\\p{P}+",
12144+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12145+
});
12146+
break;
12147+
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12148+
case LLAMA_VOCAB_PRE_TYPE_GPT2:
12149+
word_collection = unicode_regex_split(text, {
12150+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12151+
});
12152+
break;
1212712153
default:
1212812154
// default regex for BPE tokenization pre-processing
1212912155
word_collection = unicode_regex_split(text, {

llama.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ extern "C" {
7676
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
7777
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
7878
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
79+
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
80+
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
81+
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
7982
};
8083

8184
// note: these values should be synchronized with ggml_rope

models/ggml-vocab-gpt-2.gguf

1.68 MB
Binary file not shown.

models/ggml-vocab-gpt-2.gguf.inp

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
2+
__ggml_vocab_test__
3+
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
13+
__ggml_vocab_test__
14+
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
22+
__ggml_vocab_test__
23+
24+
25+
__ggml_vocab_test__
26+
Hello world
27+
__ggml_vocab_test__
28+
Hello world
29+
__ggml_vocab_test__
30+
Hello World
31+
__ggml_vocab_test__
32+
Hello World
33+
__ggml_vocab_test__
34+
Hello World!
35+
__ggml_vocab_test__
36+
Hello, world!
37+
__ggml_vocab_test__
38+
Hello, world!
39+
__ggml_vocab_test__
40+
this is 🦙.cpp
41+
__ggml_vocab_test__
42+
w048 7tuijk dsdfhu
43+
__ggml_vocab_test__
44+
нещо на Български
45+
__ggml_vocab_test__
46+
កាន់តែពិសេសអាចខលចេញ
47+
__ggml_vocab_test__
48+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
49+
__ggml_vocab_test__
50+
Hello
51+
__ggml_vocab_test__
52+
Hello
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
Hello
62+
__ggml_vocab_test__
63+
(
64+
__ggml_vocab_test__
65+
66+
=
67+
__ggml_vocab_test__
68+
' era
69+
__ggml_vocab_test__
70+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
71+
__ggml_vocab_test__
72+
3
73+
__ggml_vocab_test__
74+
33
75+
__ggml_vocab_test__
76+
333
77+
__ggml_vocab_test__
78+
3333
79+
__ggml_vocab_test__
80+
33333
81+
__ggml_vocab_test__
82+
333333
83+
__ggml_vocab_test__
84+
3333333
85+
__ggml_vocab_test__
86+
33333333
87+
__ggml_vocab_test__
88+
333333333
89+
__ggml_vocab_test__
90+
91+
92+
93+
94+
95+
96+
97+
98+
99+
100+
101+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
102+
__ggml_vocab_test__

models/ggml-vocab-gpt-2.gguf.out

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
220
3+
220 220
4+
220 220 220
5+
197
6+
198
7+
628
8+
628 198
9+
197 198
10+
15496 995
11+
18435 995
12+
15496 2159
13+
18435 2159
14+
18435 2159 0
15+
15496 11 995 0
16+
18435 11 995 0
17+
428 318 12520 99 247 13 20322
18+
86 47202 767 28047 45961 288 82 7568 13415
19+
22177 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849
20+
157 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 157 252 223 157 252 249 157 252 227 157 253 223 157 252 231
21+
8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 357 8807 44805 326 468 663 898 11241 8
22+
15496
23+
18435
24+
220 18435
25+
220 220 18435
26+
220 220 220 18435
27+
220 220 220 18435 198 220 220 220 18435
28+
357
29+
198 796
30+
6 6980
31+
15496 11 331 6 439 0 1374 389 345 30325 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252
32+
18
33+
2091
34+
20370
35+
24840
36+
2091 20370
37+
24840 2091
38+
24840 20370
39+
24840 24840
40+
24840 2091 20370
41+
198 220 628 220 628 198 220 197 220 197 197 220 197 198 220 220 198 220 220 220 198 220 220 220 220 198 220 220 220 220 220 198 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 12520 99 247 8582 99 247 513 4747 23460 513 20370 23460 2091 23460 20370 23460 24840 23460 2091 20370 513 13 18 513 492 18 513 986 18 28053 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 47249 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252 40103 1421 18604 12466 121 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849 705 39115 6 33153 15506 63 15931 15931 16317 13896 3228 9805 3548 314 1053 587 705 44040 339 338 612 11 705 2200 345 1654 30 705 44 407 1654 314 1183 787 340 11 705 35 345 588 617 8887 30 775 6 26979 257 6 75 43

models/ggml-vocab-mpt.gguf

-13 Bytes
Binary file not shown.

models/ggml-vocab-mpt.gguf.inp

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
2+
__ggml_vocab_test__
3+
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
13+
__ggml_vocab_test__
14+
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
22+
__ggml_vocab_test__
23+
24+
25+
__ggml_vocab_test__
26+
Hello world
27+
__ggml_vocab_test__
28+
Hello world
29+
__ggml_vocab_test__
30+
Hello World
31+
__ggml_vocab_test__
32+
Hello World
33+
__ggml_vocab_test__
34+
Hello World!
35+
__ggml_vocab_test__
36+
Hello, world!
37+
__ggml_vocab_test__
38+
Hello, world!
39+
__ggml_vocab_test__
40+
this is 🦙.cpp
41+
__ggml_vocab_test__
42+
w048 7tuijk dsdfhu
43+
__ggml_vocab_test__
44+
нещо на Български
45+
__ggml_vocab_test__
46+
កាន់តែពិសេសអាចខលចេញ
47+
__ggml_vocab_test__
48+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
49+
__ggml_vocab_test__
50+
Hello
51+
__ggml_vocab_test__
52+
Hello
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
Hello
62+
__ggml_vocab_test__
63+
(
64+
__ggml_vocab_test__
65+
66+
=
67+
__ggml_vocab_test__
68+
' era
69+
__ggml_vocab_test__
70+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
71+
__ggml_vocab_test__
72+
3
73+
__ggml_vocab_test__
74+
33
75+
__ggml_vocab_test__
76+
333
77+
__ggml_vocab_test__
78+
3333
79+
__ggml_vocab_test__
80+
33333
81+
__ggml_vocab_test__
82+
333333
83+
__ggml_vocab_test__
84+
3333333
85+
__ggml_vocab_test__
86+
33333333
87+
__ggml_vocab_test__
88+
333333333
89+
__ggml_vocab_test__
90+
91+
92+
93+
94+
95+
96+
97+
98+
99+
100+
101+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
102+
__ggml_vocab_test__

0 commit comments

Comments
 (0)