
Commit d3286d6

tokenize: fix double BOS token
1 parent 858f6b7 commit d3286d6

21 files changed: +78 -58 lines changed


common/common.cpp

Lines changed: 15 additions & 3 deletions
@@ -2343,15 +2343,17 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+        bool parse_special,
+        bool fix_double_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special, fix_double_bos);
 }
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
+        bool parse_special,
+        bool fix_double_bos) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
@@ -2363,9 +2365,19 @@ std::vector<llama_token> llama_tokenize(
     } else {
         result.resize(n_tokens);
     }
+    if (fix_double_bos) {
+        llama_fix_double_bos(model, result);
+    }
     return result;
 }
 
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt) {
+    const llama_token bos = llama_token_bos(model);
+    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
+        prompt.erase(prompt.begin(), prompt.begin() + 1);
+    }
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
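
The new helper drops the leading token when a prompt starts with two BOS tokens, which can happen when add_special prepends a BOS to text that already begins with one. Below is a minimal standalone sketch of that behavior (not part of the commit): it uses plain int ids and an illustrative BOS id of 1 instead of a real model and vocabulary, so it can be compiled and run on its own.

// standalone_sketch.cpp -- illustrates the double-BOS fix with plain ints
#include <cassert>
#include <vector>

// mirrors the logic of llama_fix_double_bos(): if the prompt starts with two
// BOS tokens, remove the first one
static void fix_double_bos(int bos, std::vector<int> & prompt) {
    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
        prompt.erase(prompt.begin());
    }
}

int main() {
    const int bos = 1; // illustrative BOS id, not tied to any real vocabulary

    std::vector<int> doubled = {1, 1, 15043, 3186}; // BOS added twice
    std::vector<int> normal  = {1, 15043, 3186};    // single BOS

    fix_double_bos(bos, doubled);
    fix_double_bos(bos, normal);

    assert(doubled == normal);  // the duplicate BOS has been removed
    assert(normal.size() == 3); // a single BOS is left untouched
    return 0;
}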

common/common.h

Lines changed: 7 additions & 2 deletions
@@ -238,13 +238,18 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);
+
+// if the first and the second token in the prompt are both BOS, remove the first token
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt);
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
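
For call sites, the pattern used throughout the updated examples below is to pass the new flag right after parse_special. A minimal sketch, assuming an initialized llama_context * ctx obtained elsewhere in the program (the helper name tokenize_prompt is hypothetical and not part of the commit):

#include "common.h"

// hypothetical helper: tokenize a user prompt, parsing special tokens in the
// text and collapsing an accidental double BOS at the start
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    return ::llama_tokenize(ctx, prompt,
                            /*add_special=*/    true,
                            /*parse_special=*/  true,
                            /*fix_double_bos=*/ true);
}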

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = ::llama_tokenize(model, params.prompt, true, true, true);
 
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

examples/beam-search/beam-search.cpp

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ int main(int argc, char ** argv)
     // Tokenize the prompt :
     //---------------------------------
 
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true, true, true);
 
     const size_t max_context_size     = llama_n_ctx( ctx );
     const size_t max_tokens_list_size = max_context_size - 4 ;

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false, true);
         if (inp.size() > n_batch) {
             fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);

examples/imatrix/imatrix.cpp

Lines changed: 1 addition & 1 deletion
@@ -378,7 +378,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

examples/infill/infill.cpp

Lines changed: 7 additions & 7 deletions
@@ -248,8 +248,8 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
     const int space_token = 29871;
     if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
         inp_sfx.erase(inp_sfx.begin());
@@ -280,10 +280,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();
@@ -630,8 +630,8 @@ int main(int argc, char ** argv) {
                 suff_rm_leading_spc = false;
             }
             // tokenize new prefix and suffix
-            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
             if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                 inp_sfx.erase(inp_sfx.begin());
             }
@@ -703,7 +703,7 @@ int main(int argc, char ** argv) {
 
                 const size_t original_size = embd_inp.size();
 
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                const auto line_inp = ::llama_tokenize(ctx, buffer, false, true, false);
                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
 
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

examples/llava/llava-cli.cpp

Lines changed: 4 additions & 4 deletions
@@ -35,7 +35,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true, add_bos);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
@@ -156,14 +156,14 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
         LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
         LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
@@ -173,7 +173,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
         user_prompt = prompt + "\nASSISTANT:";
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }

examples/lookahead/lookahead.cpp

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     all = inp;
 
     const int max_context_size = llama_n_ctx(ctx);

examples/lookup/lookup-create.cpp

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
 

examples/lookup/lookup-stats.cpp

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

examples/lookup/lookup.cpp

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

examples/main/main.cpp

Lines changed: 15 additions & 15 deletions
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
         if (params.chatml) {
             params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
         }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     } else {
         LOG("use session tokens\n");
         embd_inp = session_tokens;
@@ -277,10 +277,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();
@@ -339,15 +339,15 @@ int main(int argc, char ** argv) {
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true, false);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true, false);
 
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
 
     // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
-    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true, false);
+    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true, false);
 
     LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
     LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
@@ -418,7 +418,7 @@ int main(int argc, char ** argv) {
         for (const auto & antiprompt : params.antiprompt) {
             LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                auto tmp = ::llama_tokenize(ctx, antiprompt, false, true, false);
                 for (int i = 0; i < (int) tmp.size(); i++) {
                     LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
@@ -433,7 +433,7 @@ int main(int argc, char ** argv) {
        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
            if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
@@ -443,7 +443,7 @@ int main(int argc, char ** argv) {
        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
            if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
                for (int i = 0; i < (int) tmp.size(); i++) {
                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
@@ -516,7 +516,7 @@ int main(int argc, char ** argv) {
 
     antiprompt_ids.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
-        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true, false));
     }
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@@ -801,7 +801,7 @@ int main(int argc, char ** argv) {
        if (params.interactive) {
            if (!params.antiprompt.empty()) {
                // tokenize and inject first reverse prompt
-                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true, false);
                embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                is_antiprompt = true;
            }
@@ -875,9 +875,9 @@ int main(int argc, char ** argv) {
                    process_escapes(buffer);
                }
 
-                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
-                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false, false);
+                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
 
                LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
 

examples/parallel/parallel.cpp

Lines changed: 2 additions & 2 deletions
@@ -164,7 +164,7 @@ int main(int argc, char ** argv) {
     }
 
     std::vector<llama_token> tokens_system;
-    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    tokens_system = ::llama_tokenize(ctx, k_system, true, true, true);
     const int32_t n_tokens_system = tokens_system.size();
 
     llama_seq_id g_seq_id = 0;
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
 
             // do not prepend BOS because we have a system prompt!
             std::vector<llama_token> tokens_prompt;
-            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false, true, false);
 
             for (size_t i = 0; i < tokens_prompt.size(); ++i) {
                 llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);

examples/passkey/passkey.cpp

Lines changed: 2 additions & 2 deletions
@@ -108,10 +108,10 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true, true, true).size();
 
     const int n_tokens_all = tokens_list.size();
