Commit edd4c14

ggerganov and klosax authored
llama : more tokenizer fixes (#2810)
* tests : write a Python tokenizer test (wip)
* llama : prefix input text for tokenization with whitespace
* llama : distinguish pieces from decoded text + fix detokenization
* common : add comments
* examples : no longer manually add leading space when tokenizing
* tests : use Python to generate tokenizer tests for C++
* tests : add option to tokenize text files (ggml-ci)
* tests : add test-tokenizer-1.py
* llama.cpp : fix LF token
* hellaswag : move the concat space for clarity
* tests : add falcon tests (py + cpp, currently do not pass Unicode) (ggml-ci)
* common : temporary separate llama_detokenize calls for SPM and BPE

Co-authored-by: klosax <[email protected]>
1 parent 1591e2e commit edd4c14

20 files changed, +671 -224 lines

common/common.cpp

Lines changed: 36 additions & 3 deletions
@@ -733,16 +733,49 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
     }
 
     return std::string(result.data(), result.size());
 }
+
+std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    const llama_token bos_id = llama_token_bos(ctx);
+
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        // remove the leading space of the first non-BOS token
+        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
+            piece = piece.substr(1);
+        }
+
+        result += piece;
+    }
+
+    return result;
+}
+
+std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        result += piece;
+    }
+
+    return result;
+}
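For illustration only (not part of the diff), here is a minimal sketch of how the renamed and new helpers fit together, assuming `ctx` is an already loaded `llama_context` with an SPM (LLaMA-style) vocab; the `show_tokens` name is made up for this example:

```cpp
// Sketch only: assumes common.h / llama.h / <cstdio> are included and `ctx` is a loaded model context.
static void show_tokens(llama_context * ctx, const std::string & text) {
    // encode with BOS; leading-space handling now lives inside the tokenizer
    const std::vector<llama_token> tokens = ::llama_tokenize(ctx, text, true);

    // per-token pieces; an SPM piece may render with a leading space, e.g. " Hello"
    for (const llama_token t : tokens) {
        fprintf(stderr, "%6d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
    }

    // decoded text: llama_detokenize_spm drops the leading space of the first non-BOS piece
    fprintf(stderr, "decoded: '%s'\n", llama_detokenize_spm(ctx, tokens).c_str());
}
```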

common/common.h

Lines changed: 21 additions & 1 deletion
@@ -116,11 +116,31 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 // Vocab utils
 //
 
+// tokenizes a string into a vector of tokens
+// should work similar to Python's `tokenizer.encode`
 std::vector<llama_token> llama_tokenize(
         struct llama_context * ctx,
         const std::string & text,
         bool add_bos);
 
-std::string llama_token_to_str(
+// tokenizes a token into a piece
+// should work similar to Python's `tokenizer.id_to_piece`
+std::string llama_token_to_piece(
         const struct llama_context * ctx,
         llama_token token);
+
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+//       that takes into account the tokenizer type and decides how to handle the leading space
+//
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize_spm(
+        llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
+        llama_context * ctx,
+        const std::vector<llama_token> & tokens);
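The TODO above points at a single `llama_detokenize` entry point; a hypothetical sketch of that dispatch (not something this commit adds) might look like:

```cpp
// Hypothetical wrapper, not part of this commit: pick the detokenizer based on the
// model's vocab type so callers don't have to choose between SPM and BPE themselves.
static std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
    if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
        return llama_detokenize_spm(ctx, tokens); // strips the leading space of the first non-BOS piece
    }
    return llama_detokenize_bpe(ctx, tokens);     // plain concatenation of pieces
}
```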

examples/beam_search/beam_search.cpp

Lines changed: 3 additions & 3 deletions
@@ -35,7 +35,7 @@ struct ostream_beam_view {
 std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
     os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
     for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
-        os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
+        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
     }
     return os << ')';
 }
@@ -156,7 +156,7 @@ int main(int argc, char ** argv)
 
     for( auto id : tokens_list )
     {
-        std::cout << llama_token_to_str(ctx, id);
+        std::cout << llama_token_to_piece(ctx, id);
     }
     std::cout << std::flush;
 
@@ -175,7 +175,7 @@ int main(int argc, char ** argv)
 
     std::cout << "\n\n";
     for (llama_token const token_id : callback_data.response) {
-        std::cout << llama_token_to_str(ctx,token_id);
+        std::cout << llama_token_to_piece(ctx,token_id);
     }
     std::cout << std::endl;

examples/embd-input/embd-input-lib.cpp

Lines changed: 1 addition & 1 deletion
@@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) {
     if (id == llama_token_eos(ctx)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_str(ctx, id);
+        ret = llama_token_to_piece(ctx, id);
     }
     eval_id(mymodel, id);
     return ret.c_str();

examples/embedding/embedding.cpp

Lines changed: 1 addition & 4 deletions
@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
 
     int n_past = 0;
 
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
@@ -67,7 +64,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }

examples/main/main.cpp

Lines changed: 7 additions & 13 deletions
@@ -195,11 +195,6 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
 
-    if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
-    }
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
     } else {
@@ -216,7 +211,6 @@
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        params.cfg_negative_prompt.insert(0, 1, ' ');
         guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
 
         std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -285,22 +279,22 @@
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (ctx_guidance) {
             fprintf(stderr, "\n");
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
             }
         }
 
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
+                fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -456,7 +450,7 @@ int main(int argc, char ** argv) {
             //printf("\n---\n");
            //printf("resetting: '");
            //for (int i = 0; i < (int) embd.size(); i++) {
-            //    printf("%s", llama_token_to_str(ctx, embd[i]));
+            //    printf("%s", llama_token_to_piece(ctx, embd[i]));
            //}
            //printf("'\n");
            //printf("\n---\n");
@@ -509,7 +503,7 @@
                 input_size = embd_guidance.size();
                 //fprintf(stderr, "\n---------------------\n");
                 //for (int i = 0; i < (int) embd_guidance.size(); i++) {
-                //    fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                //    fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
                 //}
                 //fprintf(stderr, "\n---------------------\n");
             } else {
@@ -673,7 +667,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id).c_str());
+                printf("%s", llama_token_to_piece(ctx, id).c_str());
             }
             fflush(stdout);
         }
@@ -689,7 +683,7 @@
         if (params.antiprompt.size()) {
             std::string last_output;
             for (auto id : last_n_tokens) {
-                last_output += llama_token_to_str(ctx, id);
+                last_output += llama_token_to_piece(ctx, id);
             }
 
             is_antiprompt = false;
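For context (not from the diff): the caller-side space insertion that used to precede tokenization is gone, so prompt handling reduces to a single call. A sketch of the old versus new pattern, with `ctx`, `params`, and `add_bos` as they appear in main.cpp:

```cpp
// Old caller-side pattern (removed by this commit):
//     if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
//         params.prompt.insert(0, 1, ' ');  // manually match OG llama tokenizer behavior
//     }
//     embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
//
// New pattern: the tokenizer prefixes the input with whitespace itself,
// so the prompt is passed through unchanged.
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
```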

examples/perplexity/perplexity.cpp

Lines changed: 2 additions & 2 deletions
@@ -392,7 +392,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_data[i].context = prompt_lines[idx*6];
         hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j=0; j < 4; j++) {
-            hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
+            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
         }
 
         // Delete the selected random example from the prompt
@@ -417,7 +417,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         size_t context_size = context_embd.size();
 
         for (int i = 0; i < 4; ++i) {
-            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
+            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
             for (int k = 0; k < int(context_size); ++k) {
                 if (ending_tokens[i][k] != context_embd[k]) {
                     fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);

examples/save-load-state/save-load-state.cpp

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ int main(int argc, char ** argv) {
     }
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     auto next_token = llama_sample_token(ctx, &candidates_p);
-    auto next_token_str = llama_token_to_str(ctx, next_token);
+    auto next_token_str = llama_token_to_piece(ctx, next_token);
     last_n_tokens_data.push_back(next_token);
 
     printf("%s", next_token_str.c_str());
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
     }
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     auto next_token = llama_sample_token(ctx2, &candidates_p);
-    auto next_token_str = llama_token_to_str(ctx2, next_token);
+    auto next_token_str = llama_token_to_piece(ctx2, next_token);
     last_n_tokens_data.push_back(next_token);
 
     printf("%s", next_token_str.c_str());

examples/server/server.cpp

Lines changed: 7 additions & 9 deletions
@@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += llama_token_to_str(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
     }
     return ret;
 }
@@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -286,7 +286,6 @@ struct llama_server_context
                 std::vector<llama_token> p;
                 if (first)
                 {
-                    s.insert(0, 1, ' '); // add a space if it's the first
                     p = ::llama_tokenize(ctx, s, add_bos);
                     first = false;
                 }
@@ -309,7 +308,6 @@
         else
         {
             auto s = json_prompt.template get<std::string>();
-            s.insert(0, 1, ' '); // always add a first space
             prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
         }
 
@@ -566,7 +564,7 @@ struct llama_server_context
 
         if (!embd.empty() && embd.back() == llama_token_eos(ctx))
         {
-            // stopping_word = llama_token_to_str(ctx, embd.back());
+            // stopping_word = llama_token_to_piece(ctx, embd.back());
            has_next_token = false;
            stopped_eos = true;
            LOG_VERBOSE("eos token found", {});
@@ -613,7 +611,7 @@
    {
        const completion_token_output token_with_probs = nextToken();
 
-        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
+        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;
 
        if (params.n_probs > 0)
@@ -1254,7 +1252,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
 
 struct token_translator {
     llama_context * ctx;
-    std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
+    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
     std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
 };
 
@@ -1364,7 +1362,7 @@ int main(int argc, char **argv)
 
             while (llama.has_next_token) {
                 const completion_token_output token_with_probs = llama.doCompletion();
-                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
+                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
 
                 stop_pos = llama.findStoppingStrings(llama.generated_text,
                     token_text.size(), STOP_FULL);
@@ -1395,7 +1393,7 @@
                 if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
                     continue;
                 }
-                const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
+                const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
 
                 size_t pos = std::min(sent_count, llama.generated_text.size());
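As an aside (not in the diff), the single-byte check in `tokens_to_output_formatted_string` relies on a basic UTF-8 property; a small sketch of the idea, with the helper name made up for illustration:

```cpp
// Sketch of the check used above: in UTF-8, any byte with the top bit set
// (0b1xxxxxxx) is either the lead byte or a continuation byte of a multi-byte
// sequence, so a piece consisting of exactly one such byte cannot be a complete
// character and should be reported as a raw byte rather than emitted as text.
static bool is_partial_utf8_piece(const std::string & out) {
    return out.size() == 1 && (out[0] & 0x80) == 0x80;
}
```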

examples/simple/simple.cpp

Lines changed: 2 additions & 2 deletions
@@ -63,7 +63,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "\n\n");
 
     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -112,7 +112,7 @@ int main(int argc, char ** argv) {
        }
 
        // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
        fflush(stdout);
 
        // push this new token for next evaluation

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 2 additions & 2 deletions
@@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token).c_str());
+    printf("%s", llama_token_to_piece(ctx, token).c_str());
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2202,7 +2202,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
         const char * in = buf.data();
         const char * end = buf.data() + buf.size();
         for (int i = 0; i < (int) out.size(); ++i) {
-            std::string s = llama_token_to_str(lctx, out[i]);
+            std::string s = llama_token_to_piece(lctx, out[i]);
             int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
