@@ -2149,12 +2149,12 @@ struct llama_control_vector {
2149
2149
struct llama_vocab {
2150
2150
using id = int32_t;
2151
2151
using token = std::string;
2152
- using ttype = llama_token_type ;
2152
+ using tattr = llama_token_attr ;
2153
2153
2154
2154
struct token_data {
2155
2155
token text;
2156
2156
float score;
2157
- ttype type ;
2157
+ tattr attr ;
2158
2158
};
2159
2159
2160
2160
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -4750,7 +4750,20 @@ static void llm_load_vocab(
4750
4750
auto & token_data = vocab.id_to_token[i];
4751
4751
token_data.text = std::move(word);
4752
4752
token_data.score = scores ? scores[i] : 0.0f;
4753
- token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
4753
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
4754
+
4755
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
4756
+ switch(toktypes[i]) {
4757
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
4758
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
4759
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
4760
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
4761
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
4762
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
4763
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
4764
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
4765
+ }
4766
+ }
4754
4767
}
4755
4768
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
4756
4769
@@ -4841,7 +4854,7 @@ static void llm_load_vocab(
4841
4854
// build special tokens cache
4842
4855
{
4843
4856
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
4844
- if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL ) {
4857
+ if (!( vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL) ) {
4845
4858
vocab.cache_special_tokens.push_back(id);
4846
4859
}
4847
4860
}
@@ -4871,6 +4884,59 @@ static void llm_load_vocab(
4871
4884
4872
4885
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
4873
4886
}
4887
+
4888
+ // Handle per token attributes
4889
+ //NOTE: Each model customizes per token attributes.
4890
+ //NOTE: Per token attributes are missing from the GGUF file.
4891
+ //TODO: Extract attributes from GGUF file.
4892
+ {
4893
+ auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
4894
+ for (const auto & substr : substrs) {
4895
+ if (str.find(substr) != std::string::npos) {
4896
+ return true;
4897
+ }
4898
+ }
4899
+ return false;
4900
+ };
4901
+
4902
+ auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
4903
+ uint32_t current = vocab.id_to_token.at(id).attr;
4904
+ current = value ? (current | attr) : (current & ~attr);
4905
+ vocab.id_to_token[id].attr = (llama_token_attr) current;
4906
+ };
4907
+
4908
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
4909
+ _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
4910
+ };
4911
+
4912
+ std::string model_name;
4913
+ std::string tokenizer_pre;
4914
+
4915
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
4916
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4917
+
4918
+ // model name to lowercase
4919
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
4920
+ [] (const std::string::value_type x) {
4921
+ return std::tolower(static_cast<unsigned char>(x));
4922
+ }
4923
+ );
4924
+
4925
+ // set attributes by model/tokenizer name
4926
+ if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
4927
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
4928
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
4929
+ for (auto id : vocab.cache_special_tokens) {
4930
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
4931
+ }
4932
+ for (auto token : {"</s>"}) {
4933
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
4934
+ }
4935
+ for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
4936
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
4937
+ }
4938
+ }
4939
+ }
4874
4940
}
4875
4941
4876
4942
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -12620,27 +12686,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
12620
12686
12621
12687
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
12622
12688
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12623
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL ;
12689
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL ;
12624
12690
}
12625
12691
12626
12692
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
12627
12693
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12628
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN ;
12694
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN ;
12629
12695
}
12630
12696
12631
12697
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
12632
12698
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12633
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL ;
12699
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL ;
12634
12700
}
12635
12701
12636
12702
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
12637
12703
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12638
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE ;
12704
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE ;
12639
12705
}
12640
12706
12641
12707
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
12642
12708
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12643
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED ;
12709
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED ;
12644
12710
}
12645
12711
12646
12712
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -13258,7 +13324,8 @@ struct fragment_buffer_variant {
13258
13324
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
13259
13325
// for each special token
13260
13326
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
13261
- const auto & special_token = vocab.id_to_token[special_id].text;
13327
+ const auto & data = vocab.id_to_token[special_id];
13328
+ const auto & special_token = data.text;
13262
13329
13263
13330
// for each text fragment
13264
13331
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13295,13 +13362,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
13295
13362
if (match > raw_text_base_offset) {
13296
13363
// left
13297
13364
const int64_t left_reminder_offset = raw_text_base_offset + 0;
13298
- const int64_t left_reminder_length = match - raw_text_base_offset;
13299
- buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
13365
+ int64_t left_reminder_length = match - raw_text_base_offset;
13366
+
13367
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
13368
+ while (left_reminder_length > 0 && isspace(static_cast<unsigned char>(raw_text[left_reminder_offset + left_reminder_length - 1]))) {
13369
+ left_reminder_length--;
13370
+ }
13371
+ }
13372
+
13373
+ if (left_reminder_length > 0) {
13374
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
13375
+ it++;
13376
+ }
13300
13377
13301
13378
#ifdef PRETOKENIZERDEBUG
13302
13379
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
13303
13380
#endif
13304
- it++;
13305
13381
}
13306
13382
13307
13383
// special token
@@ -13310,16 +13386,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
13310
13386
13311
13387
// right
13312
13388
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
13313
- const int64_t right_reminder_offset = match + special_token.length();
13314
- const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13315
- buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13389
+ int64_t right_reminder_offset = match + special_token.length();
13390
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13391
+
13392
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
13393
+ while (right_reminder_length > 0 && isspace(static_cast<unsigned char>(raw_text[right_reminder_offset]))) {
13394
+ right_reminder_offset++;
13395
+ right_reminder_length--;
13396
+ }
13397
+ }
13398
+
13399
+ if (right_reminder_length > 0) {
13400
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13401
+ it++;
13402
+ }
13316
13403
13317
13404
#ifdef PRETOKENIZERDEBUG
13318
13405
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
13319
13406
#endif
13320
13407
13321
- it++;
13322
-
13323
13408
if (source == 0) {
13324
13409
buffer.erase_after(buffer.before_begin());
13325
13410
} else {
@@ -13365,9 +13450,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13365
13450
// tokenizer.encode('', add_special_tokens=True) returns [1]
13366
13451
// tokenizer.encode('', add_special_tokens=False) returns []
13367
13452
13368
- static const bool rtrim = true; //TODO: as param
13369
13453
bool is_prev_special = false;
13370
- bool special_token_rtrim = false;
13371
13454
13372
13455
if (add_special && vocab.special_add_bos != 0) {
13373
13456
GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13377,25 +13460,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13377
13460
13378
13461
for (const auto & fragment : fragment_buffer) {
13379
13462
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13380
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
13381
-
13382
- // TODO: It's likely possible to get rid of this string copy entirely
13383
- // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
13384
- // and passing 'add space prefix' as bool argument
13385
- //
13386
13463
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
13387
13464
13388
- if (special_token_rtrim) {
13389
- size_t num_whitespaces = 0;
13390
- while (isspace(raw_text[num_whitespaces])) {
13391
- num_whitespaces++;
13392
- }
13393
- if (num_whitespaces == raw_text.size()) {
13394
- continue; // skip if all whitespaces
13395
- }
13396
- raw_text = raw_text.substr(num_whitespaces);
13397
- }
13398
-
13399
13465
if (vocab.add_space_prefix) {
13400
13466
if (!output.size() || is_prev_special) { // prefix with space if first token
13401
13467
raw_text = " " + raw_text;
@@ -13411,11 +13477,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13411
13477
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13412
13478
output.push_back(fragment.token);
13413
13479
is_prev_special = true;
13414
- // phi-3 special tokens without rtrim, works fine for llama-spm too
13415
- special_token_rtrim = rtrim
13416
- && fragment.token != vocab.special_bos_id
13417
- && fragment.token != vocab.special_unk_id
13418
- && fragment.token != vocab.special_eos_id;
13419
13480
}
13420
13481
}
13421
13482
@@ -18221,9 +18282,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
18221
18282
return model->vocab.id_to_token[token].score;
18222
18283
}
18223
18284
18224
- llama_token_type llama_token_get_type (const struct llama_model * model, llama_token token) {
18285
+ llama_token_attr llama_token_get_attr (const struct llama_model * model, llama_token token) {
18225
18286
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
18226
- return model->vocab.id_to_token[token].type ;
18287
+ return model->vocab.id_to_token[token].attr ;
18227
18288
}
18228
18289
18229
18290
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
0 commit comments