
Commit ac793a2: Fix for ggml-org#2023
Parent: 0db14fe

4 files changed: 110 additions, 14 deletions


convert.py (4 additions, 13 deletions)
@@ -231,19 +231,10 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
-            text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            # TODO: How do we want to support is_unknown, is_control, is_byte and is_unused(i)?
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+
             score: float = tokenizer.get_score(i)
             yield text, score
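
Note on what the removed branch did: byte tokens in a SentencePiece vocab are pieces of the exact form <0xNN>, and the old code unpacked them into single raw bytes. A minimal C++ sketch of that decoding (the helper name decode_byte_piece is hypothetical, not part of the commit):

#include <cstdint>
#include <stdexcept>
#include <string>

// Decode a SentencePiece byte piece such as "<0x0A>" into the raw byte it
// represents. Mirrors the removed Python branch: the piece must be exactly
// 6 characters, and piece[3:-1] (offsets 3..4 here) holds two hex digits.
static uint8_t decode_byte_piece(const std::string & piece) {
    if (piece.size() != 6) {
        throw std::runtime_error("invalid byte piece: " + piece);
    }
    return static_cast<uint8_t>(std::stoi(piece.substr(3, 2), nullptr, 16));
}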

llama.cpp (2 additions, 1 deletion)
@@ -1805,7 +1805,8 @@ struct llama_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            assert(utf8_len(text[offs]) <= text.size() - offs);
+            size_t char_len = utf8_len(text[offs]);
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
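
This change trusts utf8_len() outright instead of clamping it with std::min, and asserts the invariant that a multi-byte sequence never runs past the end of the buffer. The definition of utf8_len() is not part of this hunk; a typical lead-byte lookup, consistent with how the assert uses it, looks like this (a sketch, not necessarily llama.cpp's exact code):

#include <cstddef>
#include <cstdint>

// Length of a UTF-8 sequence from its first byte: the top four bits of the
// lead byte determine the byte count (1 for ASCII and for stray continuation
// bytes, so malformed input still advances; 2, 3, or 4 for multi-byte leads).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}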

tests/CMakeLists.txt (1 addition, 0 deletions)
@@ -11,5 +11,6 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+llama_add_test(test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
 llama_add_test(test-grad0.c) # SLOW
 # llama_add_test(test-opt.c) # SLOW

tests/test-tokenizer-1.cpp (103 additions, 0 deletions)
@@ -0,0 +1,103 @@
+#include "llama.h"
+
+#include <cassert>
+#include <cstdio>
+#include <string>
+#include <codecvt>
+#include <map>
+#include <vector>
+
+std::string detokenize(llama_context * ctx, llama_token * tokens, int count) {
+    std::string result;
+    for (int i = 0; i < count; ++i) {
+        result += llama_token_to_str(ctx, tokens[i]);
+        if (i < count - 1) {
+            result += "_";
+        }
+    }
+    return result;
+}
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+    const int n_vocab = llama_n_vocab(ctx);
+
+    if (n_vocab != 32000) {
+        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+        llama_free_model(model);
+        llama_free(ctx);
+        return 2;
+    }
+
+    for (int i = 0; i < n_vocab; ++i) {
+        const char * forward = llama_token_to_str(ctx, i);
+        llama_token tokens[strlen(forward)];
+        auto n = llama_tokenize(ctx, forward, tokens, strlen(forward), false);
+        if (n == 1) {
+            if (i != tokens[0]) {
+                const char* backward = llama_token_to_str(ctx, tokens[0]);
+                fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n", __func__, i, forward, tokens[0], backward);
+            }
+        } else {
+            if (i <= 258) {
+                fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n", __func__, i, forward, detokenize(ctx, tokens, n).c_str());
+            } else {
+                fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n", __func__, i, forward, detokenize(ctx, tokens, n).c_str());
+            }
+        }
+    }
+
+    std::wstring string_to_convert;
+    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
+    for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
+        std::wstring wstr(1, ch);
+        std::string str = converter.to_bytes(wstr);
+        llama_token tokens[strlen(str.c_str())];
+        auto n = llama_tokenize(ctx, str.c_str(), tokens, str.length(), false);
+        if (n == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
+        }
+    }
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}
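
One portability note on the new test: llama_token tokens[strlen(forward)] is a variable-length array, a GCC/Clang extension rather than standard C++ (MSVC rejects it). A portable sketch of the same idea using std::vector, assuming the llama_tokenize signature used in the test; the negative-return convention for an undersized buffer is an assumption, labeled as such in the comment:

#include <cstring>
#include <vector>

#include "llama.h"

// Portable replacement for the test's VLA: size a std::vector from the input
// length, which bounds the token count when no BOS token is added.
static std::vector<llama_token> tokenize_portable(llama_context * ctx, const char * text) {
    std::vector<llama_token> tokens(strlen(text));
    const int n = llama_tokenize(ctx, text, tokens.data(), (int) tokens.size(), false);
    // Assumption: a negative return signals an undersized buffer; treat as empty.
    tokens.resize(n < 0 ? 0 : n);
    return tokens;
}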
