Skip to content

Commit 149603e

Browse files
tokenization: no double BOS tokens
1 parent 858f6b7 commit 149603e

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

llama.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12674,11 +12674,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // tokenizer.encode('', add_special_tokens=True) returns [1]
                 // tokenizer.encode('', add_special_tokens=False) returns []

-                if (add_special && vocab.special_add_bos != 0) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
-                    output.push_back(vocab.special_bos_id);
-                }
-
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12705,18 +12700,18 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }

+                if (add_special && vocab.special_add_bos != 0 && output[0] != vocab.special_bos_id) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.insert(output.begin(), vocab.special_bos_id);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos != 0) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
-                    output.push_back(vocab.special_bos_id);
-                }
-
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -12731,6 +12726,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }

+                if (add_special && vocab.special_add_bos != 0 && output[0] != vocab.special_bos_id) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.insert(output.begin(), vocab.special_bos_id);
+                }
+
                 GGML_ASSERT(vocab.special_add_eos != 1);
             } break;
         case LLAMA_VOCAB_TYPE_WPM:

0 commit comments

Comments
 (0)