
Commit ce7d3a0

tests : add test-tokenizer-0.sh

1 parent c4ec9c0

5 files changed (+110, -240 lines)

tests/test-tokenizer-0-bpe.py

Lines changed: 0 additions & 117 deletions
This file was deleted.

tests/test-tokenizer-0-spm.py

Lines changed: 0 additions & 114 deletions
This file was deleted.

tests/test-tokenizer-0.cpp

Lines changed: 30 additions & 9 deletions
@@ -55,8 +55,10 @@
 // return _k_tests;
 //}
 
-static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
-    std::map<std::string, std::vector<llama_token>> tests;
+using llama_tests = std::map<std::string, std::vector<llama_token>>;
+
+static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    llama_tests tests;
 
     std::ifstream ifs_inp(fname_inp);
     if (!ifs_inp) {
@@ -175,12 +177,20 @@ int main(int argc, char **argv) {
 
     bool success = true;
 
-    const auto k_tests = read_tests(fname_inp, fname_out);
+    const auto k_tests = [&]() -> llama_tests {
+        if (!fname_text.empty()) {
+            return {};
+        }
 
-    if (k_tests.empty()) {
-        fprintf(stderr, "%s : error: no tests found\n", __func__);
-        return 1;
-    }
+        const auto res = read_tests(fname_inp, fname_out);
+
+        if (res.empty()) {
+            fprintf(stderr, "%s : error: no tests found\n", __func__);
+            exit(1);
+        }
+
+        return res;
+    }();
 
     const bool add_special = false;
 
@@ -238,7 +248,17 @@ int main(int argc, char **argv) {
 
         fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
+        std::vector<llama_token> res;
+
+        {
+            const auto t_start = ggml_time_us();
+
+            res = llama_tokenize(ctx, text, add_special);
+
+            const auto t_end = ggml_time_us();
+
+            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
+        }
 
         fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -252,7 +272,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+            //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+            ofs << tok << "\n";
         }
     }
 
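With these changes, the text-file path of test-tokenizer-0 times the tokenization and writes one plain token id per line, which is what the new shell script compares against the Python output. A minimal sketch of a manual invocation, assuming a vocab file named ggml-vocab-llama-spm.gguf and an input file wiki.test.raw (both file names are placeholders for illustration, not part of this commit):

  # build the test binary, as the new shell script also does
  make -j tests/test-tokenizer-0

  # tokenize the text file with the C++ tokenizer; the new timing line goes to stderr
  ./tests/test-tokenizer-0 ./models/ggml-vocab-llama-spm.gguf ./wiki.test.raw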

tests/test-tokenizer-0.py

Lines changed: 46 additions & 0 deletions
import time
import argparse

from transformers import AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True)
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer
fname_tok = args.fname_tok

tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok'
with open(fname_tok, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    s = ''.join(lines)
    t_start = time.time()
    res = tokenizer.encode(s, add_special_tokens=False)
    t_end = time.time()
    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')
    with open(fname_out, 'w', encoding='utf-8') as f:
        for x in res:
            # LLaMA v3 for some reason strips the space for these tokens (and others)
            # if x == 662:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 1174:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 2564:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 758:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 949:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 5354:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # else:
            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
            f.write(str(x) + '\n')
    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
print('results written to: ', fname_out)
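This script is the Python counterpart of the C++ test: it loads a Hugging Face tokenizer, encodes the whole file, prints the elapsed time, and writes one token id per line to <fname_tok>.tok. A usage sketch, assuming the tokenizer files have been downloaded to ./models/tokenizers/llama-spm (the directory name is only an example):

  # tokenize wiki.test.raw with the HF tokenizer; output goes to ./wiki.test.raw.tok
  python3 ./tests/test-tokenizer-0.py ./models/tokenizers/llama-spm --fname-tok ./wiki.test.raw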

tests/test-tokenizer-0.sh

Lines changed: 34 additions & 0 deletions
#!/bin/bash
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#

if [ $# -ne 2 ]; then
    printf "Usage: $0 <name> <input>\n"
    exit 1
fi

name=$1
input=$2

make -j tests/test-tokenizer-0

printf "Testing %s on %s ...\n" $name $input

python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"

./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"

diff $input.tok $input.tokcpp > /dev/null 2>&1

if [ $? -eq 0 ]; then
    printf "Tokenization is correct!\n"
else
    diff $input.tok $input.tokcpp | head -n 32

    printf "Tokenization differs!\n"
fi
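The script ties the two halves together: it builds the C++ test, runs the Python and C++ tokenizers on the same input, echoes their "tokenized in" timing lines, and diffs <input>.tok against <input>.tokcpp, printing either a success message or the first 32 differing lines. A usage sketch, assuming a tokenizer checked out as llama-spm under ./models/tokenizers/ with a matching ./models/ggml-vocab-llama-spm.gguf (the name is a placeholder):

  # compare HF and llama.cpp tokenization of wiki.test.raw
  ./tests/test-tokenizer-0.sh llama-spm ./wiki.test.raw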
