Skip to content

Commit b619d2b

Browse files
ggerganov and teleprint-me
authored and committed
tests : add test-tokenizer-0.sh + fix some tokenizers (ggml-org#7036)
* tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update
1 parent 2782855 commit b619d2b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+922
-728
lines changed

.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[flake8]
22
max-line-length = 125
33
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
4-
exclude = examples/*,examples/*/**,*/**/__init__.py
4+
exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py

Makefile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
7777
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
7878
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
7979
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
80-
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
81-
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
8280
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
8381
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
8482
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
83+
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
8584
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
8685
continue; \
8786
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

convert-hf-to-gguf-update.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import requests
3232
from transformers import AutoTokenizer
3333

34+
logging.basicConfig(level=logging.DEBUG)
3435
logger = logging.getLogger("convert-hf-to-gguf-update")
3536

3637

@@ -69,8 +70,8 @@ class TOKENIZER_TYPE(IntEnum):
6970
{"name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
7071
{"name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
7172
{"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
72-
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
7373
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
74+
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
7475
]
7576

7677
# make directory "models/tokenizers" if it doesn't exist
@@ -167,8 +168,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
167168
chktok = tokenizer.encode(chktxt)
168169
chkhsh = sha256(str(chktok).encode()).hexdigest()
169170
170-
print(f"chktok: {{chktok}}")
171-
print(f"chkhsh: {{chkhsh}}")
171+
logger.debug(f"chktok: {{chktok}}")
172+
logger.debug(f"chkhsh: {{chkhsh}}")
172173
173174
res = None
174175
@@ -177,22 +178,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
177178
# don't edit the hashes manually!
178179
{src_ifs}
179180
if res is None:
180-
print("\\n")
181-
print("**************************************************************************************")
182-
print("** WARNING: The BPE pre-tokenizer was not recognized!")
183-
print("** There are 2 possible reasons for this:")
184-
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
185-
print("** - the pre-tokenization config has changed upstream")
186-
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
187-
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
188-
print("**")
189-
print(f"** chkhsh: {{chkhsh}}")
190-
print("**************************************************************************************")
191-
print("\\n")
181+
logger.warning("\\n")
182+
logger.warning("**************************************************************************************")
183+
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
184+
logger.warning("** There are 2 possible reasons for this:")
185+
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
186+
logger.warning("** - the pre-tokenization config has changed upstream")
187+
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
188+
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
189+
logger.warning("**")
190+
logger.warning(f"** chkhsh: {{chkhsh}}")
191+
logger.warning("**************************************************************************************")
192+
logger.warning("\\n")
192193
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
193194
194-
print(f"tokenizer.ggml.pre: {{repr(res)}}")
195-
print(f"chkhsh: {{chkhsh}}")
195+
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
196+
logger.debug(f"chkhsh: {{chkhsh}}")
196197
197198
return res
198199
"""
@@ -206,6 +207,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
206207
# generate tests for each tokenizer model
207208

208209
tests = [
210+
"ied 4 ½ months",
211+
"Führer",
209212
"",
210213
" ",
211214
" ",
@@ -285,17 +288,16 @@ def get_vocab_base_pre(self, tokenizer) -> str:
285288

286289
# generate commands for creating vocab files
287290
shscript = "#!/usr/bin/env bash\n\n"
288-
logging.info("\nRun the following commands to generate the vocab files for testing:\n")
289-
with open("generate-vocab.sh", "w", encoding="utf-8") as f:
290-
f.writelines(shscript)
291-
logging.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")
292291

293292
for model in models:
294293
name = model["name"]
295294
tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
296295
shscript += tmpline
297-
logging.info(tmpline)
298-
logging.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
296+
logging.info(tmpline.strip())
297+
298+
with open("generate-vocab.sh", "w", encoding="utf-8") as f:
299+
f.writelines(shscript)
300+
logging.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")
299301

300302
logging.info("Run the following commands to generate the vocab files for testing:")
301303
logging.info("Enable execution: chmod +x generate-vocab.sh")

convert-hf-to-gguf.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,27 @@
22

33
from __future__ import annotations
44

5-
import logging
65
import argparse
76
import contextlib
87
import json
8+
import logging
99
import os
1010
import re
1111
import sys
1212
from abc import ABC, abstractmethod
1313
from enum import IntEnum
14-
from pathlib import Path
1514
from hashlib import sha256
16-
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
15+
from pathlib import Path
16+
from typing import (
17+
TYPE_CHECKING,
18+
Any,
19+
Callable,
20+
ContextManager,
21+
Iterator,
22+
Sequence,
23+
TypeVar,
24+
cast,
25+
)
1726

1827
import numpy as np
1928
import torch
@@ -323,6 +332,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
323332
if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
324333
# ref: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
325334
res = "mixtral-bpe"
335+
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
336+
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
337+
res = "refact"
338+
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
339+
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
340+
res = "command-r"
326341

327342
if res is None:
328343
logger.warning("\n")
@@ -339,7 +354,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
339354
logger.warning("\n")
340355
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
341356

342-
logger.debug(f"tokenizer.ggml.pre: {res}")
357+
logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
343358
logger.debug(f"chkhsh: {chkhsh}")
344359

345360
return res

llama.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
43834383
} else if (
43844384
tokenizer_pre == "gpt-2") {
43854385
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4386+
} else if (
4387+
tokenizer_pre == "refact") {
4388+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
43864389
} else {
43874390
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
43884391
}
@@ -11952,7 +11955,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
1195211955
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
1195311956
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
1195411957
GGML_ASSERT(llama_is_byte_token(vocab, id));
11955-
const auto& token_data = vocab.id_to_token.at(id);
11958+
const auto & token_data = vocab.id_to_token.at(id);
1195611959
switch (llama_vocab_get_type(vocab)) {
1195711960
case LLAMA_VOCAB_TYPE_SPM: {
1195811961
auto buf = token_data.text.substr(3, 2);
@@ -12212,14 +12215,13 @@ struct llm_tokenizer_bpe {
1221212215
"\\s?\\p{L}+",
1221312216
"\\s?\\p{P}+",
1221412217
"[一-龥ࠀ-一가-퟿]+",
12215-
"\\p{N}+",
12218+
"\\p{N}",
1221612219
});
1221712220
break;
1221812221
case LLAMA_VOCAB_PRE_TYPE_FALCON:
1221912222
word_collection = unicode_regex_split(text, {
1222012223
"[\\p{P}\\$\\+<=>\\^~\\|]+",
1222112224
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222-
"\\p{N}+",
1222312225
"[0-9][0-9][0-9]",
1222412226
});
1222512227
break;
@@ -12235,6 +12237,12 @@ struct llm_tokenizer_bpe {
1223512237
});
1223612238
break;
1223712239
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12240+
case LLAMA_VOCAB_PRE_TYPE_REFACT:
12241+
word_collection = unicode_regex_split(text, {
12242+
"\\p{N}",
12243+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12244+
});
12245+
break;
1223812246
case LLAMA_VOCAB_PRE_TYPE_GPT2:
1223912247
word_collection = unicode_regex_split(text, {
1224012248
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -17466,9 +17474,10 @@ int32_t llama_tokenize(
1746617474

1746717475
static std::string llama_decode_text(const std::string & text) {
1746817476
std::string decoded_text;
17469-
auto unicode_sequences = unicode_cpts_from_utf8(text);
17470-
for (auto & unicode_sequence : unicode_sequences) {
17471-
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
17477+
17478+
const auto cpts = unicode_cpts_from_utf8(text);
17479+
for (const auto cpt : cpts) {
17480+
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
1747217481
}
1747317482

1747417483
return decoded_text;

llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ extern "C" {
7979
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
8080
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
8181
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
82+
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
8283
};
8384

8485
// note: these values should be synchronized with ggml_rope

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
29464 2094 1018 1092 2706
2+
11865 17875
13

24

35

models/ggml-vocab-deepseek-coder.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-deepseek-coder.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
1050 207 19 207 19192 4217
2+
37 32009 71 6247
13

24
207
35
243

models/ggml-vocab-deepseek-llm.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-deepseek-llm.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
1052 207 19 207 19109 4223
2+
37 100014 71 6245
13

24
207
35
243

models/ggml-vocab-falcon.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-falcon.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
878 204 31 3068 133 2137
2+
28611 132 30042
13

24
204
35
258

models/ggml-vocab-gpt-2.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-gpt-2.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
798 604 25208 1933
2+
37 9116 71 11751
13

24
220
35
220 220

models/ggml-vocab-llama-bpe.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-llama-bpe.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
1142 220 19 220 27154 4038
2+
37 51853 261
13

24
220
35
256

models/ggml-vocab-llama-spm.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-llama-spm.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
474 287 29871 29946 29871 30226 7378
2+
383 4000 261
13

24
259
35
1678

models/ggml-vocab-mpt.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-mpt.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
728 577 24142 2607
2+
39 26288 6554
13

24
209
35
50276

models/ggml-vocab-phi-3.gguf

-99 Bytes
Binary file not shown.

models/ggml-vocab-phi-3.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-phi-3.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
474 287 29871 29946 29871 30226 7378
2+
383 4000 261
13

24
259
35
1678

models/ggml-vocab-refact.gguf

44 Bytes
Binary file not shown.

0 commit comments

Comments (0)