Commit 33de247

Author: jaime-m-p

    bugfix: assertions, wrong special token list

Parent: 3ead1b9

1 file changed, 21 insertions(+), 8 deletions(-)

--- a/llama.cpp
+++ b/llama.cpp
@@ -4903,16 +4903,19 @@ static void llm_load_vocab(
             return false;
         };
 
-        auto _set_token_attrib = [&vocab] (const std::string & token, llama_token_attrib attrib, bool value) {
-            llama_vocab::id id = vocab.token_to_id.at(token);
+        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
             uint32_t attribs = vocab.id_to_token[id].attribs;
             attribs = value ? (attribs | attrib) : (attribs & ~attrib);
             vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
         };
 
+        auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
+            _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+        };
+
         if (_contains_any({"phi-3", "phi3"})) {
-            for (auto token : vocab.cache_token_to_piece_special) {
-                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+            for (auto id : vocab.cache_special_tokens) {
+                _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
             }
             for (auto token : {"</s>"}) {
                 _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
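For reference, this first hunk splits attribute setting into an id-based core (_set_tokenid_attrib) plus a thin string-lookup wrapper (_set_token_attrib), both built on the usual bitmask set/clear idiom: OR the bit in to set it, AND with the complement to clear it. A minimal standalone sketch of that pattern follows; the enum values and struct are illustrative, not the actual llama.cpp definitions.

#include <cstdint>
#include <string>

// Illustrative attribute bits (not the real llama_token_attrib values).
enum token_attrib : uint32_t {
    ATTRIB_LSTRIP = 1u << 0,
    ATTRIB_RSTRIP = 1u << 1,
};

struct token_data {
    std::string text;
    uint32_t    attribs = 0;
};

// Set or clear exactly one attribute bit, leaving the others untouched.
static void set_attrib(token_data & t, token_attrib attrib, bool value) {
    t.attribs = value ? (t.attribs | attrib) : (t.attribs & ~attrib);
}

int main() {
    token_data t{"</s>"};
    set_attrib(t, ATTRIB_RSTRIP, true);   // attribs == 0x2
    set_attrib(t, ATTRIB_LSTRIP, true);   // attribs == 0x3
    set_attrib(t, ATTRIB_RSTRIP, false);  // attribs == 0x1: only RSTRIP cleared
}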
@@ -13312,7 +13315,8 @@ struct fragment_buffer_variant {
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & special_token = vocab.id_to_token[special_id].text;
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;
 
         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13349,13 +13353,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
                     if (match > raw_text_base_offset) {
                         // left
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        const int64_t left_reminder_length = match - raw_text_base_offset;
-                        buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attribs & LLAMA_TOKEN_ATTRIB_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }
 
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-                        it++;
                     }
 
                     // special token
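The behavioral fix in this last hunk: when the matched special token carries LLAMA_TOKEN_ATTRIB_LSTRIP, whitespace immediately to its left is trimmed from the preceding text fragment, and an empty left fragment is no longer inserted into the buffer. A self-contained sketch of just the trimming step, under an assumed helper name (trim_left_fragment is hypothetical, not part of llama.cpp):

#include <cctype>
#include <cstdint>
#include <cstdio>
#include <string>

// Shrink the left fragment [offset, offset + length) so whitespace
// directly before an LSTRIP special token is absorbed by the token
// rather than the fragment. A result of 0 means the caller should
// emit no left fragment at all.
static int64_t trim_left_fragment(const std::string & raw_text,
                                  int64_t offset, int64_t length, bool lstrip) {
    if (lstrip) {
        while (length > 0 && isspace((unsigned char) raw_text[offset + length - 1])) {
            length--;
        }
    }
    return length;
}

int main() {
    const std::string text = "Hello   <|special|>";  // hypothetical LSTRIP token starting at offset 8
    printf("%lld\n", (long long) trim_left_fragment(text, 0, 8, true));   // 5: "Hello"
    printf("%lld\n", (long long) trim_left_fragment(text, 0, 8, false));  // 8: "Hello   "
}

With LSTRIP set, the fragment before the token shrinks from "Hello   " to "Hello"; a fragment that is all whitespace shrinks to length 0 and is skipped entirely, which plausibly corresponds to the "assertions" part of the commit message.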
