@@ -4903,16 +4903,19 @@ static void llm_load_vocab(
         return false;
     };
 
-    auto _set_token_attrib = [&vocab] (const std::string & token, llama_token_attrib attrib, bool value) {
-        llama_vocab::id id = vocab.token_to_id.at(token);
+    auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
         uint32_t attribs = vocab.id_to_token[id].attribs;
         attribs = value ? (attribs | attrib) : (attribs & ~attrib);
         vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
     };
 
+    auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
+        _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+    };
+
     if (_contains_any({"phi-3", "phi3"})) {
-        for (auto token : vocab.cache_token_to_piece_special) {
-            _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+        for (auto id : vocab.cache_special_tokens) {
+            _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
         }
         for (auto token : {"</s>"}) {
             _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
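Note: the `_set_tokenid_attrib` helper uses the standard set/clear idiom for bit flags: OR-ing the mask in to set a bit, AND-ing with its complement to clear it. A minimal standalone sketch of the same idiom follows; the enumerator values below are illustrative stand-ins, not the actual `llama_token_attrib` definitions from the llama.cpp headers.

#include <cstdint>
#include <cstdio>

// Illustrative flag values only -- the real llama_token_attrib enumerators
// live in the llama.cpp headers and are not reproduced here.
enum llama_token_attrib : uint32_t {
    LLAMA_TOKEN_ATTRIB_LSTRIP = 1u << 0,
    LLAMA_TOKEN_ATTRIB_RSTRIP = 1u << 1,
};

static uint32_t set_attrib(uint32_t attribs, llama_token_attrib attrib, bool value) {
    // OR sets the bit; AND with the complement clears it.
    return value ? (attribs | attrib) : (attribs & ~attrib);
}

int main() {
    uint32_t a = 0;
    a = set_attrib(a, LLAMA_TOKEN_ATTRIB_RSTRIP, true);  // a == 2
    a = set_attrib(a, LLAMA_TOKEN_ATTRIB_LSTRIP, true);  // a == 3
    a = set_attrib(a, LLAMA_TOKEN_ATTRIB_RSTRIP, false); // a == 1
    printf("%u\n", a); // prints 1
    return 0;
}

Keeping `_set_token_attrib` as a thin forwarder over `_set_tokenid_attrib` preserves a single write path for the attribute bits, whether the caller has a token string or an id.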
@@ -13312,7 +13315,8 @@ struct fragment_buffer_variant {
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & special_token = vocab.id_to_token[special_id].text;
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;
 
         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
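Binding `data` to the whole vocab entry (rather than just its `.text`) is what lets the next hunk consult `data.attribs` without a second `id_to_token` lookup. Judging only from the fields this patch touches, each entry looks roughly like the sketch below; this is an assumption for illustration (reusing the illustrative `llama_token_attrib` enum from the previous note), not the actual llama.cpp definition.

#include <string>

// Assumed shape of one entry of vocab.id_to_token, inferred from the
// .text and .attribs accesses in this patch; any other fields are omitted.
struct token_data {
    std::string        text;    // the token piece, e.g. "</s>"
    llama_token_attrib attribs; // bitmask of LSTRIP/RSTRIP/... flags
};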
@@ -13349,13 +13353,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
             if (match > raw_text_base_offset) {
                 // left
                 const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                const int64_t left_reminder_length = match - raw_text_base_offset;
-                buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                int64_t left_reminder_length = match - raw_text_base_offset;
+
+                if (data.attribs & LLAMA_TOKEN_ATTRIB_LSTRIP) {
+                    while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                        left_reminder_length--;
+                    }
+                }
+
+                if (left_reminder_length > 0) {
+                    buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                    it++;
+                }
 
 #ifdef PRETOKENIZERDEBUG
                 LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-                it++;
             }
 
             // special token
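The behavioral change in this hunk: when the matched special token carries LLAMA_TOKEN_ATTRIB_LSTRIP, trailing whitespace is trimmed off the text fragment to its left, and a fragment that becomes empty is no longer emplaced at all (which is why the `it++` moved inside the length check). One reviewer nit: `isspace` on a plain `char` is undefined for negative values, so an `(unsigned char)` cast would be safer. A standalone sketch of just the trimming step, with illustrative names that are not part of the patch:

#include <cctype>
#include <cstdio>
#include <string>

// Compute the left text fragment preceding a special-token match, trimming
// trailing whitespace when the token has LSTRIP semantics -- mirroring the
// patched logic in tokenizer_st_partition.
static std::string left_fragment(const std::string & raw_text, size_t match, bool lstrip) {
    size_t len = match; // the fragment spans [0, match) in this sketch
    if (lstrip) {
        while (len > 0 && isspace((unsigned char) raw_text[len - 1])) {
            len--;
        }
    }
    return raw_text.substr(0, len); // empty result => caller emits no fragment
}

int main() {
    const std::string text = "Hello   <|assistant|> world";
    const size_t match = text.find("<|assistant|>");
    printf("'%s'\n", left_fragment(text, match, false).c_str()); // 'Hello   '
    printf("'%s'\n", left_fragment(text, match, true).c_str());  // 'Hello'
    return 0;
}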