From cec6a3bde95a74bb59f09f0b8ac2010a56b33d49 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 1 Jun 2024 19:42:21 +0200 Subject: [PATCH 01/10] Add per token attrib enum --- llama.cpp | 32 ++++++++++++++++++++++++++------ llama.h | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 841be1de7291e..02f7be2c1e43d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2147,14 +2147,16 @@ struct llama_control_vector { }; struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + using tattrib = llama_token_attrib; struct token_data { - token text; - float score; - ttype type; + token text; + float score; + ttype type; + tattrib attribs; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4865,6 +4867,24 @@ static void llm_load_vocab( LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); } + + // Handle per token attributes + //NOTE: Each model customizes per token attributes. + //NOTE: Per token attributes are missing from the GGUF file. + //TODO: Merge llama_token_type and llama_token_attrib. + { + // convert token type as an attribute + for (auto data : vocab.id_to_token) { + uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED; + attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN); + attrib |= LLAMA_TOKEN_ATTRIB_UNUSED * (data.type == LLAMA_TOKEN_TYPE_UNUSED); + attrib |= LLAMA_TOKEN_ATTRIB_NORMAL * (data.type == LLAMA_TOKEN_TYPE_NORMAL); + attrib |= LLAMA_TOKEN_ATTRIB_CONTROL * (data.type == LLAMA_TOKEN_TYPE_CONTROL); + attrib |= LLAMA_TOKEN_ATTRIB_USER_DEFINED * (data.type == LLAMA_TOKEN_TYPE_USER_DEFINED); + attrib |= LLAMA_TOKEN_ATTRIB_BYTE * (data.type == LLAMA_TOKEN_TYPE_BYTE); + data.attribs = (llama_token_attrib) attrib; + } + } } static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { diff --git a/llama.h b/llama.h index 95105c28e5e42..a9952d6e01ee8 100644 --- a/llama.h +++ b/llama.h @@ -107,6 +107,20 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; + enum llama_token_attrib { + LLAMA_TOKEN_ATTRIB_UNDEFINED = 0, + LLAMA_TOKEN_ATTRIB_UNKNOWN = 1 << 1, + LLAMA_TOKEN_ATTRIB_UNUSED = 1 << 2, + LLAMA_TOKEN_ATTRIB_NORMAL = 1 << 3, + LLAMA_TOKEN_ATTRIB_CONTROL = 1 << 4, // SPECIAL? 
+        LLAMA_TOKEN_ATTRIB_USER_DEFINED = 1 << 5,
+        LLAMA_TOKEN_ATTRIB_BYTE         = 1 << 6,
+        LLAMA_TOKEN_ATTRIB_NORMALIZED   = 1 << 7,
+        LLAMA_TOKEN_ATTRIB_LSTRIP       = 1 << 8,
+        LLAMA_TOKEN_ATTRIB_RSTRIP       = 1 << 9,
+        LLAMA_TOKEN_ATTRIB_SINGLE_WORD  = 1 << 10,
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,

From 3ead1b9757e417533408101e9287313c2965cdeb Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 19:45:14 +0200
Subject: [PATCH 02/10] Using phi-3 for testing 'rstrip'

---
 llama.cpp                      | 38 ++++++++++++++++++++++++++++++++++
 tests/test-tokenizer-random.py |  6 +++---
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 02f7be2c1e43d..0e77585b56c92 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4884,6 +4884,44 @@ static void llm_load_vocab(
             attrib |= LLAMA_TOKEN_ATTRIB_BYTE * (data.type == LLAMA_TOKEN_TYPE_BYTE);
             data.attribs = (llama_token_attrib) attrib;
         }
+
+        // set attributes by model name
+        std::string model_name;
+        if (ml.get_key(LLM_KV_GENERAL_NAME, model_name, false)) {
+            std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+                [] (const std::string::value_type x) {
+                    return std::tolower(x);
+                }
+            );
+
+            auto _contains_any = [&model_name] (const std::vector<std::string> & substrs) -> bool {
+                for (auto substr : substrs) {
+                    if (model_name.find(substr) < std::string::npos) {
+                        return true;
+                    }
+                }
+                return false;
+            };
+
+            auto _set_token_attrib = [&vocab] (const std::string & token, llama_token_attrib attrib, bool value) {
+                llama_vocab::id id = vocab.token_to_id.at(token);
+                uint32_t attribs = vocab.id_to_token[id].attribs;
+                attribs = value ? (attribs | attrib) : (attribs & ~attrib);
+                vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
+            };
+
+            if (_contains_any({"phi-3", "phi3"})) {
+                for (auto token : vocab.cache_token_to_piece_special) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                }
+                for (auto token : {"</s>"}) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                }
+                for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
+                }
+            }
+        }
     }
 }

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index ec1b2837cfab5..14f544c4d58b9 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -329,9 +329,9 @@ def func_tokenize2(text: str):
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
         # "llama-spm",   # SPM
-        # "phi-3",       # SPM
-        "jina-v2-en",    # WPM
-        "bert-bge",      # WPM
+        "phi-3",         # SPM
+        # "jina-v2-en",  # WPM
+        # "bert-bge",    # WPM
     ]

     for tokenizer in tokenizers:
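The `attribs = value ? (attribs | attrib) : (attribs & ~attrib)` idiom introduced by `_set_token_attrib` above sets one flag bit without disturbing the others, and clears it with the complement mask. A minimal standalone sketch of the same bit manipulation — illustrative names, not the actual llama.cpp symbols:

    #include <cassert>
    #include <cstdint>

    // Illustrative copies of two flag values from PATCH 01.
    constexpr uint32_t ATTRIB_CONTROL = 1 << 4;
    constexpr uint32_t ATTRIB_RSTRIP  = 1 << 9;

    // Same ternary as _set_token_attrib: set the bit when value is true,
    // clear it (leaving all other bits intact) when value is false.
    static uint32_t set_attrib(uint32_t attribs, uint32_t attrib, bool value) {
        return value ? (attribs | attrib) : (attribs & ~attrib);
    }

    int main() {
        uint32_t a = ATTRIB_CONTROL;
        a = set_attrib(a, ATTRIB_RSTRIP, true);   // now CONTROL | RSTRIP
        assert((a & ATTRIB_CONTROL) && (a & ATTRIB_RSTRIP));
        a = set_attrib(a, ATTRIB_RSTRIP, false);  // back to CONTROL only
        assert((a & ATTRIB_CONTROL) && !(a & ATTRIB_RSTRIP));
        return 0;
    }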
From 33de2479483cce4b260d84ac719457a53bbf3265 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 20:27:32 +0200
Subject: [PATCH 03/10] bugfix: assertions, wrong special token list

---
 llama.cpp | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0e77585b56c92..58e8ecc4c7c1b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4903,16 +4903,19 @@ static void llm_load_vocab(
                 return false;
             };

-            auto _set_token_attrib = [&vocab] (const std::string & token, llama_token_attrib attrib, bool value) {
-                llama_vocab::id id = vocab.token_to_id.at(token);
+            auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
                 uint32_t attribs = vocab.id_to_token[id].attribs;
                 attribs = value ? (attribs | attrib) : (attribs & ~attrib);
                 vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
             };

+            auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
+                _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+            };
+
             if (_contains_any({"phi-3", "phi3"})) {
-                for (auto token : vocab.cache_token_to_piece_special) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                for (auto id : vocab.cache_special_tokens) {
+                    _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
                 }
                 for (auto token : {"</s>"}) {
                     _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
@@ -13312,7 +13315,8 @@ struct fragment_buffer_variant {
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & special_token = vocab.id_to_token[special_id].text;
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;

         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13349,13 +13353,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     if (match > raw_text_base_offset) {
                         // left
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        const int64_t left_reminder_length = match - raw_text_base_offset;
-                        buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attribs & LLAMA_TOKEN_ATTRIB_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }

 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-                        it++;
                     }

                     // special token
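PATCH 03 also wires `LLAMA_TOKEN_ATTRIB_LSTRIP` into `tokenizer_st_partition`: whitespace to the left of a matched special token is folded into the token by shortening the left text fragment from its right end. A standalone sketch of that rule, assuming a plain `std::string` in place of the real fragment offset/length bookkeeping:

    #include <cctype>
    #include <cstdint>
    #include <iostream>
    #include <string>

    // Illustrative names only: the left fragment is raw_text[0, match); when
    // the matched special token carries LSTRIP, trailing whitespace of that
    // fragment is dropped so the token absorbs it.
    static std::string left_fragment(const std::string & raw_text, size_t match, bool lstrip) {
        int64_t length = (int64_t) match;
        if (lstrip) {
            while (length > 0 && isspace((unsigned char) raw_text[length - 1])) {
                length--;
            }
        }
        return raw_text.substr(0, (size_t) length);
    }

    int main() {
        const std::string text = "hello   <mask>";
        const size_t match = text.find("<mask>");
        std::cout << "'" << left_fragment(text, match, false) << "'\n";  // 'hello   '
        std::cout << "'" << left_fragment(text, match, true)  << "'\n";  // 'hello'
        return 0;
    }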
From ada961cec28cb36bb0781bb09b71c4a2f56beaad Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 20:30:42 +0200
Subject: [PATCH 04/10] Implement 'rstrip' properly

---
 llama.cpp | 43 ++++++++++++++-----------------------------
 1 file changed, 14 insertions(+), 29 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 58e8ecc4c7c1b..69f648a5027e7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13377,16 +13377,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     // right
                     if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                        const int64_t right_reminder_offset = match + special_token.length();
-                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-                        buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                        int64_t right_reminder_offset = match + special_token.length();
+                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+
+                        if (data.attribs & LLAMA_TOKEN_ATTRIB_RSTRIP) {
+                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                                right_reminder_offset++;
+                                right_reminder_length--;
+                            }
+                        }
+
+                        if (right_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                            it++;
+                        }

 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
-                        it++;
-
                         if (source == 0) {
                             buffer.erase_after(buffer.before_begin());
                         } else {
@@ -13432,9 +13441,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     // tokenizer.encode('', add_special_tokens=True)  returns [1]
     // tokenizer.encode('', add_special_tokens=False) returns []

-    static const bool rtrim = true;  //TODO: as param
     bool is_prev_special = false;
-    bool special_token_rtrim = false;

     if (add_special && vocab.special_add_bos != 0) {
         GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13444,25 +13451,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     for (const auto & fragment : fragment_buffer) {
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-            // without adding this leading whitespace, we do not get the same results as the original tokenizer
-
-            // TODO: It's likely possible to get rid of this string copy entirely
-            //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
-            //  and passing 'add space prefix' as bool argument
-            //
             auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

-            if (special_token_rtrim) {
-                size_t num_whitespaces = 0;
-                while (isspace(raw_text[num_whitespaces])) {
-                    num_whitespaces++;
-                }
-                if (num_whitespaces == raw_text.size()) {
-                    continue; // skip if all whitespaces
-                }
-                raw_text = raw_text.substr(num_whitespaces);
-            }
-
             if (vocab.add_space_prefix) {
                 if (!output.size() || is_prev_special) {  // prefix with space if first token
                     raw_text = " " + raw_text;
@@ -13478,11 +13468,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
             output.push_back(fragment.token);
             is_prev_special = true;
-            // phi-3 special tokens without rtrim, works fine for llama-spm too
-            special_token_rtrim = rtrim
-                && fragment.token != vocab.special_bos_id
-                && fragment.token != vocab.special_unk_id
-                && fragment.token != vocab.special_eos_id;
         }
     }
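The RSTRIP rule implemented above is the mirror image of LSTRIP: whitespace immediately after the match is consumed before the right fragment is emitted, which models what HuggingFace's `AddedToken(rstrip=True)` does for tokens like phi-3's specials. A standalone sketch under the same simplifying assumptions (plain strings, illustrative names):

    #include <cctype>
    #include <iostream>
    #include <string>

    // The right fragment starts after the matched token; with RSTRIP the
    // run of whitespace following the token is skipped as well.
    static std::string right_fragment(const std::string & raw_text, size_t match, size_t token_len, bool rstrip) {
        size_t offset = match + token_len;
        if (rstrip) {
            while (offset < raw_text.size() && isspace((unsigned char) raw_text[offset])) {
                offset++;
            }
        }
        return raw_text.substr(offset);
    }

    int main() {
        const std::string text = "<|end|>   next";
        std::cout << "'" << right_fragment(text, 0, 7, false) << "'\n";  // '   next'
        std::cout << "'" << right_fragment(text, 0, 7, true)  << "'\n";  // 'next'
        return 0;
    }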
From 01c9229186f6210186bcb44af4b5ed587e00895f Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 21:22:57 +0200
Subject: [PATCH 05/10] Refactor + add 'jina-v2' for testing 'lstrip'

---
 llama.cpp                      | 77 ++++++++++++++++++----------------
 tests/test-tokenizer-random.py |  2 +
 2 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 69f648a5027e7..c282bceb7c7dc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4872,9 +4872,29 @@ static void llm_load_vocab(
     //NOTE: Each model customizes per token attributes.
     //NOTE: Per token attributes are missing from the GGUF file.
     //TODO: Merge llama_token_type and llama_token_attrib.
+    //TODO: Extract attribs from GGUF file.
     {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+            for (auto substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
+            uint32_t attribs = vocab.id_to_token.at(id).attribs;
+            attribs = value ? (attribs | attrib) : (attribs & ~attrib);
+            vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
+        };
+
+        auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
+            _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+        };
+
         // convert token type as an attribute
-        for (auto data : vocab.id_to_token) {
+        for (auto &data : vocab.id_to_token) {
             uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED;
             attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN);
             attrib |= LLAMA_TOKEN_ATTRIB_UNUSED  * (data.type == LLAMA_TOKEN_TYPE_UNUSED);
@@ -4885,44 +4905,31 @@ static void llm_load_vocab(
             data.attribs = (llama_token_attrib) attrib;
         }

-        // set attributes by model name
         std::string model_name;
-        if (ml.get_key(LLM_KV_GENERAL_NAME, model_name, false)) {
-            std::transform(model_name.begin(), model_name.end(), model_name.begin(),
-                [] (const std::string::value_type x) {
-                    return std::tolower(x);
-                }
-            );
-
-            auto _contains_any = [&model_name] (const std::vector<std::string> & substrs) -> bool {
-                for (auto substr : substrs) {
-                    if (model_name.find(substr) < std::string::npos) {
-                        return true;
-                    }
-                }
-                return false;
-            };
+        std::string tokenizer_pre;

-            auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
-                uint32_t attribs = vocab.id_to_token[id].attribs;
-                attribs = value ? (attribs | attrib) : (attribs & ~attrib);
-                vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
-            };
+        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

-            auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
-                _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
-            };
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
+            }
+        );

-            if (_contains_any({"phi-3", "phi3"})) {
-                for (auto id : vocab.cache_special_tokens) {
-                    _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
-                }
-                for (auto token : {"</s>"}) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
-                }
-                for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
-                }
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+            _set_token_attrib("<mask>", LLAMA_TOKEN_ATTRIB_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : vocab.cache_special_tokens) {
+                _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+            }
+            for (auto token : {"</s>"}) {
+                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+            }
+            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
             }
         }
     }

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 14f544c4d58b9..9a84d9379cb27 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -156,6 +156,8 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         '<s>a',          # Phi-3 fail
         '<|endoftext|>', # Phi-3 fail
         'a\na',          # TODO: Bert fail
+        'a </s> b',      # rstrip phi-3
+        'a <mask> b',    # lstrip jina-v2
     ]
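Attribute selection now keys off `tokenizer_pre` and the lowercased model name. A standalone sketch of the `_contains_any` matching, keeping the patch's `find(...) < npos` comparison (equivalent to `!= npos`); the model name below is only an example:

    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <string>
    #include <vector>

    // find() returns a position strictly smaller than npos whenever the
    // substring occurs somewhere in str, and npos otherwise.
    static bool contains_any(const std::string & str, const std::vector<std::string> & substrs) {
        for (const auto & substr : substrs) {
            if (str.find(substr) < std::string::npos) {
                return true;
            }
        }
        return false;
    }

    int main() {
        std::string model_name = "Phi-3-mini-4k-instruct";  // example name, not from the patch
        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
            [] (unsigned char c) { return (char) std::tolower(c); });
        std::cout << contains_any(model_name, {"phi-3", "phi3"}) << "\n";            // 1
        std::cout << contains_any(model_name, {"jina-v2-es", "jina-v2-de"}) << "\n"; // 0
        return 0;
    }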
insertions(+), 0 deletions(-)

diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf
index f8022a385e4aa48ca10d40d0f079a25365a7be78..745be416a798a1e7a2effaa935bc903edaaf303d 100644
GIT binary patch
[two 576-byte base85 binary deltas omitted: the encoded data was garbled in this transcript and is not recoverable]

From: jaime-m-p <>
Date: Mon, 3 Jun 2024 00:51:48 +0200
Subject: [PATCH 07/10] Update brute force test: testing 'lstrip' and 'rstrip'

---
 tests/test-tokenizer-random.py | 45 +++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 9a84d9379cb27..f699af0228076 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -161,14 +161,34 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
     ]


-def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
-    special_tokens = set(tokenizer.all_special_tokens)
-    special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
-    special_tokens = list(sorted(special_tokens))
+def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
+    """Brute force check all vocab words"""
+    yield from vocab
+
+
+def generator_added_lr_strip(tokenizer) -> Iterator[str]:
+    WHITESPACES = ["", " ", "  ", "    "]
+    special_tokens = list(tokenizer.all_special_tokens)
+    added_tokens = list(tokenizer.added_tokens_encoder)
+    all_tokens = list(sorted(set(special_tokens + added_tokens)))
+    for token in all_tokens:
+        for lstrip in WHITESPACES:
+            for rstrip in WHITESPACES:
+                yield lstrip + token + rstrip
+                yield "a" + lstrip + token + rstrip
+                yield lstrip + token + rstrip + "z"
+                yield "a" + lstrip + token + rstrip + "z"
+
+
+def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
+    special_tokens = list(tokenizer.all_special_tokens)
+    added_tokens = list(tokenizer.added_tokens_encoder)
+    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
+    all_tokens = list(sorted(set(special_tokens + added_tokens + separations)))
     rand = random.Random()
     for m in range(iterations):
         rand.seed(m)
-        words = rand.choices(special_tokens, k=500)
+        words = rand.choices(all_tokens, k=500)
         if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
             while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                 words.pop(0)
@@ -276,8 +296,8 @@ def find_first_mismatch(ids1: list[int], ids2: list[int]):
         ids2 = func_tokenize2(text)
         if ids1 != ids2:
             i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
+            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
+            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
             logger.info(" TokenIDs: " + str(ids1))
             logger.info(" Expected: " + str(ids2))
             raise Exception()
@@ -311,8 +331,9 @@ def func_tokenize2(text: str):
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
+
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000)) @@ -324,16 +345,16 @@ def func_tokenize2(text: str): if __name__ == "__main__": # main() - path_tokenizers = "./models/tokenizers/" + path_tokenizers = "./models/tokenizers/" path_vocab_format = "./models/ggml-vocab-%s.gguf" # import os # tokenizers = os.listdir(path_tokenizers) tokenizers = [ - # "llama-spm", # SPM + "llama-spm", # SPM "phi-3", # SPM - # "jina-v2-en", # WPM - # "bert-bge", # WPM + "jina-v2-en", # WPM + "bert-bge", # WPM ] for tokenizer in tokenizers: From 54e9f23b8a17f75b2aafba6aa4044fff37776f21 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 3 Jun 2024 01:44:21 +0200 Subject: [PATCH 08/10] Fix previous commit --- tests/test-tokenizer-random.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index f699af0228076..52f589511e470 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -197,11 +197,6 @@ def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]: yield "".join(words) -def generator_vocab_words(vocab: list[str]) -> Iterator[str]: - """Brute force check all vocab words""" - yield from vocab - - def generator_random_chars(iterations=100) -> Iterator[str]: """Brute force random text with simple characters""" From ac40ff0e5049eb7f1674e44f571a791612d3735a Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 3 Jun 2024 01:48:07 +0200 Subject: [PATCH 09/10] Replace llama_token_type with llama_token_attribs --- llama.cpp | 69 +++++++++++++++++++++++++++---------------------------- llama.h | 6 ++--- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index c282bceb7c7dc..90feea14a82bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2147,16 +2147,15 @@ struct llama_control_vector { }; struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - using tattrib = llama_token_attrib; + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + using tattribs = llama_token_attribs; struct token_data { - token text; - float score; - ttype type; - tattrib attribs; + token text; + float score; + tattribs attribs; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4740,9 +4739,22 @@ static void llm_load_vocab( vocab.token_to_id[word] = i; auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores ? scores[i] : 0.0f; - token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; + token_data.text = std::move(word); + token_data.score = scores ? 
scores[i] : 0.0f;
+        token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL;
+
+        if (toktypes) { //TODO: remove, required until per token attribs are available from GGUF file
+            switch(toktypes[i]) {
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attribs = LLAMA_TOKEN_ATTRIB_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attribs = LLAMA_TOKEN_ATTRIB_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attribs = LLAMA_TOKEN_ATTRIB_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attribs = LLAMA_TOKEN_ATTRIB_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attribs = LLAMA_TOKEN_ATTRIB_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
+                default:                            token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
+            }
+        }
     }

     GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
@@ -4833,7 +4845,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+            if (!(vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL)) {
                 vocab.cache_special_tokens.push_back(id);
             }
         }
@@ -4871,7 +4883,6 @@ static void llm_load_vocab(
     // Handle per token attributes
     //NOTE: Each model customizes per token attributes.
     //NOTE: Per token attributes are missing from the GGUF file.
-    //TODO: Merge llama_token_type and llama_token_attrib.
     //TODO: Extract attribs from GGUF file.
     {
         auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
@@ -4883,28 +4894,16 @@ static void llm_load_vocab(
             return false;
         };

-        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
+        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attribs attrib, bool value) {
             uint32_t attribs = vocab.id_to_token.at(id).attribs;
             attribs = value ? 
(attribs | attrib) : (attribs & ~attrib); - vocab.id_to_token[id].attribs = (llama_token_attrib) attribs; + vocab.id_to_token[id].attribs = (llama_token_attribs) attribs; }; - auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) { + auto _set_token_attrib = [&] (const std::string & token, llama_token_attribs attrib, bool value) { _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value); }; - // convert token type as an attribute - for (auto &data : vocab.id_to_token) { - uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED; - attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN); - attrib |= LLAMA_TOKEN_ATTRIB_UNUSED * (data.type == LLAMA_TOKEN_TYPE_UNUSED); - attrib |= LLAMA_TOKEN_ATTRIB_NORMAL * (data.type == LLAMA_TOKEN_TYPE_NORMAL); - attrib |= LLAMA_TOKEN_ATTRIB_CONTROL * (data.type == LLAMA_TOKEN_TYPE_CONTROL); - attrib |= LLAMA_TOKEN_ATTRIB_USER_DEFINED * (data.type == LLAMA_TOKEN_TYPE_USER_DEFINED); - attrib |= LLAMA_TOKEN_ATTRIB_BYTE * (data.type == LLAMA_TOKEN_TYPE_BYTE); - data.attribs = (llama_token_attrib) attrib; - } - std::string model_name; std::string tokenizer_pre; @@ -12684,27 +12683,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) { static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL; } static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_UNKNOWN; } static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_CONTROL; } static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_BYTE; } static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_USER_DEFINED; } static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { @@ -18277,9 +18276,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token) return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { +llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token) { GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); - return model->vocab.id_to_token[token].type; + return model->vocab.id_to_token[token].attribs; } bool llama_token_is_eog(const struct llama_model * model, llama_token token) { diff --git a/llama.h b/llama.h index a9952d6e01ee8..1686b8cbeb5da 100644 --- a/llama.h +++ b/llama.h @@ -97,7 +97,7 @@ extern "C" { LLAMA_ROPE_TYPE_GLM = 4, }; - enum llama_token_type { + enum llama_token_type { //TODO: remove, required until per token attribs are available from GGUF 
file LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, LLAMA_TOKEN_TYPE_UNKNOWN = 2, @@ -107,7 +107,7 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; - enum llama_token_attrib { + enum llama_token_attribs { LLAMA_TOKEN_ATTRIB_UNDEFINED = 0, LLAMA_TOKEN_ATTRIB_UNKNOWN = 1 << 1, LLAMA_TOKEN_ATTRIB_UNUSED = 1 << 2, @@ -835,7 +835,7 @@ extern "C" { LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); + LLAMA_API enum llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token); // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); From 18f5fc766b6613d5ecc43c37192eea51db0d8cb0 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 4 Jun 2024 00:56:22 +0200 Subject: [PATCH 10/10] Rename token attributes --- llama.cpp | 79 +++++++++++++++++++++++++++---------------------------- llama.h | 28 ++++++++++---------- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/llama.cpp b/llama.cpp index 90feea14a82bd..4714c53dd248c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2147,15 +2147,14 @@ struct llama_control_vector { }; struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - using tattribs = llama_token_attribs; + using id = int32_t; + using token = std::string; + using tattr = llama_token_attr; struct token_data { - token text; - float score; - tattribs attribs; + token text; + float score; + tattr attr; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4739,20 +4738,20 @@ static void llm_load_vocab( vocab.token_to_id[word] = i; auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores ? scores[i] : 0.0f; - token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL; + token_data.text = std::move(word); + token_data.score = scores ? 
scores[i] : 0.0f;
+        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;

-        if (toktypes) { //TODO: remove, required until per token attribs are available from GGUF file
+        if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
             switch(toktypes[i]) {
-                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attribs = LLAMA_TOKEN_ATTRIB_UNKNOWN;      break;
-                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attribs = LLAMA_TOKEN_ATTRIB_UNUSED;       break;
-                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL;       break;
-                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attribs = LLAMA_TOKEN_ATTRIB_CONTROL;      break;
-                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attribs = LLAMA_TOKEN_ATTRIB_USER_DEFINED; break;
-                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attribs = LLAMA_TOKEN_ATTRIB_BYTE;         break;
-                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
-                default:                            token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
             }
         }
     }
@@ -4845,7 +4844,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (!(vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL)) {
+            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
                 vocab.cache_special_tokens.push_back(id);
             }
         }
@@ -4883,7 +4882,7 @@ static void llm_load_vocab(
     // Handle per token attributes
     //NOTE: Each model customizes per token attributes.
     //NOTE: Per token attributes are missing from the GGUF file.
-    //TODO: Extract attribs from GGUF file.
+    //TODO: Extract attributes from GGUF file.
     {
         auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
             for (auto substr : substrs) {
@@ -4894,14 +4893,14 @@ static void llm_load_vocab(
             return false;
         };

-        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attribs attrib, bool value) {
-            uint32_t attribs = vocab.id_to_token.at(id).attribs;
-            attribs = value ? (attribs | attrib) : (attribs & ~attrib);
-            vocab.id_to_token[id].attribs = (llama_token_attribs) attribs;
+        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+            uint32_t current = vocab.id_to_token.at(id).attr;
+            current = value ? (current | attr) : (current & ~attr);
+            vocab.id_to_token[id].attr = (llama_token_attr) current;
         };

-        auto _set_token_attrib = [&] (const std::string & token, llama_token_attribs attrib, bool value) {
-            _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+            _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
         };

         std::string model_name;
@@ -4919,16 +4918,16 @@ static void llm_load_vocab(
         // set attributes by model/tokenizer name
         if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
-            _set_token_attrib("<mask>", LLAMA_TOKEN_ATTRIB_LSTRIP, true);
+            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
         } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
             for (auto id : vocab.cache_special_tokens) {
-                _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
             }
             for (auto token : {"</s>"}) {
-                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
             }
             for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
-                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
             }
         }
     }
@@ -12683,27 +12682,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_UNKNOWN;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_CONTROL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
 }

 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_BYTE;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
 }

 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_USER_DEFINED;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
 }

 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -13361,7 +13360,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
                         int64_t left_reminder_length = match - raw_text_base_offset;

-                        if (data.attribs & LLAMA_TOKEN_ATTRIB_LSTRIP) {
+                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
                             while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
                                 left_reminder_length--;
                             }
@@ -13386,7 +13385,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         int64_t right_reminder_offset = match + special_token.length();
                         int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());

-                        if (data.attribs & 
LLAMA_TOKEN_ATTRIB_RSTRIP) { + if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) { while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) { right_reminder_offset++; right_reminder_length--; @@ -18276,9 +18275,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token) return model->vocab.id_to_token[token].score; } -llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token) { +llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) { GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); - return model->vocab.id_to_token[token].attribs; + return model->vocab.id_to_token[token].attr; } bool llama_token_is_eog(const struct llama_model * model, llama_token token) { diff --git a/llama.h b/llama.h index 1686b8cbeb5da..a78ccdaf557d0 100644 --- a/llama.h +++ b/llama.h @@ -97,7 +97,7 @@ extern "C" { LLAMA_ROPE_TYPE_GLM = 4, }; - enum llama_token_type { //TODO: remove, required until per token attribs are available from GGUF file + enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, LLAMA_TOKEN_TYPE_UNKNOWN = 2, @@ -107,18 +107,18 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; - enum llama_token_attribs { - LLAMA_TOKEN_ATTRIB_UNDEFINED = 0, - LLAMA_TOKEN_ATTRIB_UNKNOWN = 1 << 1, - LLAMA_TOKEN_ATTRIB_UNUSED = 1 << 2, - LLAMA_TOKEN_ATTRIB_NORMAL = 1 << 3, - LLAMA_TOKEN_ATTRIB_CONTROL = 1 << 4, // SPECIAL? - LLAMA_TOKEN_ATTRIB_USER_DEFINED = 1 << 5, - LLAMA_TOKEN_ATTRIB_BYTE = 1 << 6, - LLAMA_TOKEN_ATTRIB_NORMALIZED = 1 << 7, - LLAMA_TOKEN_ATTRIB_LSTRIP = 1 << 8, - LLAMA_TOKEN_ATTRIB_RSTRIP = 1 << 9, - LLAMA_TOKEN_ATTRIB_SINGLE_WORD = 1 << 10, + enum llama_token_attr { + LLAMA_TOKEN_ATTR_UNDEFINED = 0, + LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1, + LLAMA_TOKEN_ATTR_UNUSED = 1 << 2, + LLAMA_TOKEN_ATTR_NORMAL = 1 << 3, + LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL? + LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5, + LLAMA_TOKEN_ATTR_BYTE = 1 << 6, + LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7, + LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8, + LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9, + LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10, }; // model file types @@ -835,7 +835,7 @@ extern "C" { LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token); + LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
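With the final naming in place, downstream code can query per-token behavior through the public API added by this series. A hedged usage sketch — it assumes a `llama_model *` was already loaded with the usual llama.cpp loading functions, and the token id is arbitrary:

    #include <cstdio>
    #include "llama.h"

    // Sketch only: prints the behavioral flags of one token using the
    // llama_token_get_attr() accessor introduced in PATCH 09/10.
    static void print_token_attr(const struct llama_model * model, llama_token id) {
        enum llama_token_attr attr = llama_token_get_attr(model, id);
        if (attr & LLAMA_TOKEN_ATTR_CONTROL) { printf("token %d is a control token\n", id); }
        if (attr & LLAMA_TOKEN_ATTR_LSTRIP)  { printf("token %d consumes whitespace on its left\n", id); }
        if (attr & LLAMA_TOKEN_ATTR_RSTRIP)  { printf("token %d consumes whitespace on its right\n", id); }
    }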