Skip to content

Commit 0e74a72

Browse files
committed
Added whitespace escaping and unescaping
Now we see some resemblance to the Meta-Tokenizer, I think. The only remaining problem is how to integrate this into the `llama.cpp` kernel.
1 parent 94a0ee1 commit 0e74a72

File tree

4 files changed

+94
-35
lines changed

4 files changed

+94
-35
lines changed

convert.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -233,12 +233,7 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
233233
for i in range(tokenizer.vocab_size()):
234234
# TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
235235
piece = tokenizer.id_to_piece(i)
236-
text: bytes
237-
if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
238-
text: bytes = piece.encode("utf-8")
239-
else:
240-
text = piece.replace("\u2581", " ").encode("utf-8")
241-
236+
text: bytes = piece.encode("utf-8")
242237
score: float = tokenizer.get_score(i)
243238
yield text, score
244239

llama.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1832,13 +1832,13 @@ struct llama_tokenizer {
18321832
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
18331833

18341834
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
1835-
// split string into utf8 chars
1835+
// split string into utf8 chars / token?
18361836
int index = 0;
18371837
size_t offs = 0;
18381838
while (offs < text.size()) {
18391839
llama_sp_symbol sym;
1840-
// size_t len = utf8_len(text[offs]);
1841-
size_t len = llama_trie_find(vocab_.trie, text, offs);
1840+
size_t len = utf8_len(text[offs]);
1841+
// size_t len = llama_trie_find(vocab_.trie, text, offs);
18421842
if (len == 0) {
18431843
len = utf8_len(text[offs]);
18441844
}

tests/test-tokenizer-0.cpp

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,59 @@
55
#include <map>
66
#include <vector>
77

8-
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
8+
static std::string escape_whitespace(const std::string& text) {
99
std::string result;
10-
for (int i = 0; i < count; ++i) {
11-
result += llama_token_to_str(ctx, tokens[i]);
12-
if (i < count - 1) {
13-
result += "_";
10+
bool escaping = false;
11+
result += char(0xe2);
12+
result += char(0x96);
13+
result += char(0x81);
14+
for (size_t offs = 0; offs < text.length(); ++offs) {
15+
if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
16+
if (!escaping) {
17+
result += char(0xe2);
18+
result += char(0x96);
19+
result += char(0x81);
20+
escaping = true;
21+
}
22+
}
23+
else {
24+
escaping = false;
25+
result += text[offs];
1426
}
1527
}
1628
return result;
1729
}
1830

31+
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
32+
const char* word = llama_token_to_str(ctx, token);
33+
if (strlen(word) >= 3 &&
34+
word[0] == char(0xe2) &&
35+
word[1] == char(0x96) &&
36+
word[2] == char(0x81)) {
37+
return std::string(" ") + (word + 3);
38+
}
39+
return word;
40+
}
41+
42+
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
43+
std::string result;
44+
for (int i = 0; i < count; ++i) {
45+
result += unescape_whitespace(ctx, tokens[i]);
46+
}
47+
return result;
48+
}
49+
1950
static const std::map<std::string, std::vector<llama_token>> & k_tests()
2051
{
2152
static std::map<std::string, std::vector<llama_token>> _k_tests = {
22-
{ "Hello World", { 1, 10994, 2787, }, },
23-
{ " Hello World", { 1, 15043, 2787, }, },
24-
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
25-
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
26-
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
27-
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
53+
{ "Hello world", { 1, 15043, 3186, }, },
54+
{ " Hello world", { 1, 29871, 15043, 3186, }, },
55+
{ "Hello World", { 1, 15043, 2787, }, },
56+
{ " Hello World", { 1, 29871, 15043, 2787, }, },
57+
{" Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
58+
{" this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
59+
{"w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
60+
{"нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
2861
};
2962
return _k_tests;
3063
};
@@ -77,9 +110,9 @@ int main(int argc, char **argv) {
77110

78111
for (const auto & test_kv : k_tests()) {
79112
std::vector<llama_token> res(test_kv.first.size());
80-
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
113+
const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
81114
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
82-
__func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
115+
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
83116
res.resize(n);
84117

85118
bool correct = res.size() == test_kv.second.size();

tests/test-tokenizer-1.cpp

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,48 @@
88
#include <map>
99
#include <vector>
1010

11-
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
11+
static std::string escape_whitespace(const std::string& text) {
1212
std::string result;
13-
for (int i = 0; i < count; ++i) {
14-
result += llama_token_to_str(ctx, tokens[i]);
15-
if (i < count - 1) {
16-
result += "_";
13+
bool escaping = false;
14+
result += char(0xe2);
15+
result += char(0x96);
16+
result += char(0x81);
17+
for (size_t offs = 0; offs < text.length(); ++offs) {
18+
if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
19+
if (!escaping) {
20+
result += char(0xe2);
21+
result += char(0x96);
22+
result += char(0x81);
23+
escaping = true;
24+
}
25+
}
26+
else {
27+
escaping = false;
28+
result += text[offs];
1729
}
1830
}
1931
return result;
2032
}
2133

34+
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
35+
const char* word = llama_token_to_str(ctx, token);
36+
if (strlen(word) >= 3 &&
37+
word[0] == char(0xe2) &&
38+
word[1] == char(0x96) &&
39+
word[2] == char(0x81)) {
40+
return std::string(" ") + (word + 3);
41+
}
42+
return word;
43+
}
44+
45+
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
46+
std::string result;
47+
for (int i = 0; i < count; ++i) {
48+
result += unescape_whitespace(ctx, tokens[i]);
49+
}
50+
return result;
51+
}
52+
2253
int main(int argc, char **argv) {
2354
if (argc < 2) {
2455
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
@@ -66,22 +97,22 @@ int main(int argc, char **argv) {
6697
}
6798

6899
for (int i = 0; i < n_vocab; ++i) {
69-
const char * forward = llama_token_to_str(ctx, i);
70-
std::vector<llama_token> tokens(strlen(forward));
71-
auto n = llama_tokenize(ctx, forward, tokens.data(), strlen(forward), false);
100+
std::string forward = llama_token_to_str(ctx, i);
101+
std::vector<llama_token> tokens(forward.length());
102+
int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
72103
if (n == 1) {
73104
if (i != tokens[0]) {
74-
const char* backward = llama_token_to_str(ctx, tokens[0]);
105+
std::string backward = unescape_whitespace(ctx, tokens[0]);
75106
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
76-
__func__, i, forward, tokens[0], backward);
107+
__func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
77108
}
78109
} else {
79110
if (i <= 258) {
80111
fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
81-
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
112+
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
82113
} else {
83114
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
84-
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
115+
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
85116
}
86117
}
87118
}
@@ -91,7 +122,7 @@ int main(int argc, char **argv) {
91122
std::wstring wstr(1, ch);
92123
std::string str = converter.to_bytes(wstr);
93124
std::vector<llama_token> tokens(strlen(str.c_str()));
94-
auto n = llama_tokenize(ctx, str.c_str(), tokens.data(), str.length(), false);
125+
auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
95126
if (n == 1) {
96127
fprintf(stderr, "%s : info: %s tokenized to %d \n",
97128
__func__, str.c_str(), tokens[0]);

0 commit comments

Comments
 (0)