From cec6a3bde95a74bb59f09f0b8ac2010a56b33d49 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 1 Jun 2024 19:42:21 +0200 Subject: [PATCH 01/10] Add per token attrib enum --- llama.cpp | 32 ++++++++++++++++++++++++++------ llama.h | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 841be1de7291e..02f7be2c1e43d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2147,14 +2147,16 @@ struct llama_control_vector { }; struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + using tattrib = llama_token_attrib; struct token_data { - token text; - float score; - ttype type; + token text; + float score; + ttype type; + tattrib attribs; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4865,6 +4867,24 @@ static void llm_load_vocab( LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); } + + // Handle per token attributes + //NOTE: Each model customizes per token attributes. + //NOTE: Per token attributes are missing from the GGUF file. + //TODO: Merge llama_token_type and llama_token_attrib. + { + // convert token type as an attribute + for (auto data : vocab.id_to_token) { + uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED; + attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN); + attrib |= LLAMA_TOKEN_ATTRIB_UNUSED * (data.type == LLAMA_TOKEN_TYPE_UNUSED); + attrib |= LLAMA_TOKEN_ATTRIB_NORMAL * (data.type == LLAMA_TOKEN_TYPE_NORMAL); + attrib |= LLAMA_TOKEN_ATTRIB_CONTROL * (data.type == LLAMA_TOKEN_TYPE_CONTROL); + attrib |= LLAMA_TOKEN_ATTRIB_USER_DEFINED * (data.type == LLAMA_TOKEN_TYPE_USER_DEFINED); + attrib |= LLAMA_TOKEN_ATTRIB_BYTE * (data.type == LLAMA_TOKEN_TYPE_BYTE); + data.attribs = (llama_token_attrib) attrib; + } + } } static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { diff --git a/llama.h b/llama.h index 95105c28e5e42..a9952d6e01ee8 100644 --- a/llama.h +++ b/llama.h @@ -107,6 +107,20 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; + enum llama_token_attrib { + LLAMA_TOKEN_ATTRIB_UNDEFINED = 0, + LLAMA_TOKEN_ATTRIB_UNKNOWN = 1 << 1, + LLAMA_TOKEN_ATTRIB_UNUSED = 1 << 2, + LLAMA_TOKEN_ATTRIB_NORMAL = 1 << 3, + LLAMA_TOKEN_ATTRIB_CONTROL = 1 << 4, // SPECIAL? 
+        LLAMA_TOKEN_ATTRIB_USER_DEFINED = 1 << 5,
+        LLAMA_TOKEN_ATTRIB_BYTE         = 1 << 6,
+        LLAMA_TOKEN_ATTRIB_NORMALIZED   = 1 << 7,
+        LLAMA_TOKEN_ATTRIB_LSTRIP       = 1 << 8,
+        LLAMA_TOKEN_ATTRIB_RSTRIP       = 1 << 9,
+        LLAMA_TOKEN_ATTRIB_SINGLE_WORD  = 1 << 10,
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,

From 3ead1b9757e417533408101e9287313c2965cdeb Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 19:45:14 +0200
Subject: [PATCH 02/10] Using phi-3 for testing 'rstrip'

---
 llama.cpp                      | 38 ++++++++++++++++++++++++++++++++++
 tests/test-tokenizer-random.py |  6 +++---
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 02f7be2c1e43d..0e77585b56c92 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4884,6 +4884,44 @@ static void llm_load_vocab(
             attrib |= LLAMA_TOKEN_ATTRIB_BYTE * (data.type == LLAMA_TOKEN_TYPE_BYTE);
             data.attribs = (llama_token_attrib) attrib;
         }
+
+        // set attributes by model name
+        std::string model_name;
+        if (ml.get_key(LLM_KV_GENERAL_NAME, model_name, false)) {
+            std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+                [] (const std::string::value_type x) {
+                    return std::tolower(x);
+                }
+            );
+
+            auto _contains_any = [&model_name] (const std::vector<std::string> & substrs) -> bool {
+                for (auto substr : substrs) {
+                    if (model_name.find(substr) < std::string::npos) {
+                        return true;
+                    }
+                }
+                return false;
+            };
+
+            auto _set_token_attrib = [&vocab] (const std::string & token, llama_token_attrib attrib, bool value) {
+                llama_vocab::id id = vocab.token_to_id.at(token);
+                uint32_t attribs = vocab.id_to_token[id].attribs;
+                attribs = value ? (attribs | attrib) : (attribs & ~attrib);
+                vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
+            };
+
+            if (_contains_any({"phi-3", "phi3"})) {
+                for (auto token : vocab.cache_token_to_piece_special) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                }
+                for (auto token : {"</s>"}) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                }
+                for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
+                }
+            }
+        }
     }
 }

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index ec1b2837cfab5..14f544c4d58b9 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -329,9 +329,9 @@ def func_tokenize2(text: str):
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
         # "llama-spm",   # SPM
-        # "phi-3",       # SPM
-        "jina-v2-en",    # WPM
-        "bert-bge",      # WPM
+        "phi-3",         # SPM
+        # "jina-v2-en",  # WPM
+        # "bert-bge",    # WPM
     ]

     for tokenizer in tokenizers:
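The `attribs = value ? (attribs | attrib) : (attribs & ~attrib)` idiom introduced by `_set_token_attrib` above sets one flag bit without disturbing the others, and clears it with the complement mask. A minimal standalone sketch of the same bit manipulation — illustrative names, not the actual llama.cpp symbols:

    #include <cassert>
    #include <cstdint>

    // Illustrative copies of two flag values from PATCH 01.
    constexpr uint32_t ATTRIB_CONTROL = 1 << 4;
    constexpr uint32_t ATTRIB_RSTRIP  = 1 << 9;

    // Same ternary as _set_token_attrib: set the bit when value is true,
    // clear it (leaving all other bits intact) when value is false.
    static uint32_t set_attrib(uint32_t attribs, uint32_t attrib, bool value) {
        return value ? (attribs | attrib) : (attribs & ~attrib);
    }

    int main() {
        uint32_t a = ATTRIB_CONTROL;
        a = set_attrib(a, ATTRIB_RSTRIP, true);   // now CONTROL | RSTRIP
        assert((a & ATTRIB_CONTROL) && (a & ATTRIB_RSTRIP));
        a = set_attrib(a, ATTRIB_RSTRIP, false);  // back to CONTROL only
        assert((a & ATTRIB_CONTROL) && !(a & ATTRIB_RSTRIP));
        return 0;
    }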
From 33de2479483cce4b260d84ac719457a53bbf3265 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 20:27:32 +0200
Subject: [PATCH 03/10] bugfix: assertions, wrong special token list

---
 llama.cpp | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0e77585b56c92..58e8ecc4c7c1b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4903,16 +4903,19 @@ static void llm_load_vocab(
                 return false;
             };

-            auto _set_token_attrib = [&vocab] (const std::string & token, llama_token_attrib attrib, bool value) {
-                llama_vocab::id id = vocab.token_to_id.at(token);
+            auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
                 uint32_t attribs = vocab.id_to_token[id].attribs;
                 attribs = value ? (attribs | attrib) : (attribs & ~attrib);
                 vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
             };

+            auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
+                _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+            };
+
             if (_contains_any({"phi-3", "phi3"})) {
-                for (auto token : vocab.cache_token_to_piece_special) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                for (auto id : vocab.cache_special_tokens) {
+                    _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
                 }
                 for (auto token : {"</s>"}) {
                     _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
@@ -13312,7 +13315,8 @@ struct fragment_buffer_variant {
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & special_token = vocab.id_to_token[special_id].text;
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;

         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13349,13 +13353,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     if (match > raw_text_base_offset) {
                         // left
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        const int64_t left_reminder_length = match - raw_text_base_offset;
-                        buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attribs & LLAMA_TOKEN_ATTRIB_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }

 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-                        it++;
                     }

                     // special token
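PATCH 03 also wires `LLAMA_TOKEN_ATTRIB_LSTRIP` into `tokenizer_st_partition`: whitespace to the left of a matched special token is folded into the token by shortening the left text fragment from its right end. A standalone sketch of that rule, assuming a plain `std::string` in place of the real fragment offset/length bookkeeping:

    #include <cctype>
    #include <cstdint>
    #include <iostream>
    #include <string>

    // Illustrative names only: the left fragment is raw_text[0, match); when
    // the matched special token carries LSTRIP, trailing whitespace of that
    // fragment is dropped so the token absorbs it.
    static std::string left_fragment(const std::string & raw_text, size_t match, bool lstrip) {
        int64_t length = (int64_t) match;
        if (lstrip) {
            while (length > 0 && isspace((unsigned char) raw_text[length - 1])) {
                length--;
            }
        }
        return raw_text.substr(0, (size_t) length);
    }

    int main() {
        const std::string text = "hello   <mask>";
        const size_t match = text.find("<mask>");
        std::cout << "'" << left_fragment(text, match, false) << "'\n";  // 'hello   '
        std::cout << "'" << left_fragment(text, match, true)  << "'\n";  // 'hello'
        return 0;
    }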
From ada961cec28cb36bb0781bb09b71c4a2f56beaad Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 20:30:42 +0200
Subject: [PATCH 04/10] Implement 'rstrip' properly

---
 llama.cpp | 43 ++++++++++++++-----------------------------
 1 file changed, 14 insertions(+), 29 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 58e8ecc4c7c1b..69f648a5027e7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13377,16 +13377,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     // right
                     if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                        const int64_t right_reminder_offset = match + special_token.length();
-                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-                        buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                        int64_t right_reminder_offset = match + special_token.length();
+                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+
+                        if (data.attribs & LLAMA_TOKEN_ATTRIB_RSTRIP) {
+                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                                right_reminder_offset++;
+                                right_reminder_length--;
+                            }
+                        }
+
+                        if (right_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                            it++;
+                        }

 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
-                        it++;
-
                         if (source == 0) {
                             buffer.erase_after(buffer.before_begin());
                         } else {
@@ -13432,9 +13441,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     // tokenizer.encode('', add_special_tokens=True)  returns [1]
     // tokenizer.encode('', add_special_tokens=False) returns []

-    static const bool rtrim = true;  //TODO: as param
     bool is_prev_special = false;
-    bool special_token_rtrim = false;

     if (add_special && vocab.special_add_bos != 0) {
         GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13444,25 +13451,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     for (const auto & fragment : fragment_buffer) {
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-            // without adding this leading whitespace, we do not get the same results as the original tokenizer
-
-            // TODO: It's likely possible to get rid of this string copy entirely
-            //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
-            //  and passing 'add space prefix' as bool argument
-            //
             auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

-            if (special_token_rtrim) {
-                size_t num_whitespaces = 0;
-                while (isspace(raw_text[num_whitespaces])) {
-                    num_whitespaces++;
-                }
-                if (num_whitespaces == raw_text.size()) {
-                    continue; // skip if all whitespaces
-                }
-                raw_text = raw_text.substr(num_whitespaces);
-            }
-
             if (vocab.add_space_prefix) {
                 if (!output.size() || is_prev_special) {  // prefix with space if first token
                     raw_text = " " + raw_text;
@@ -13478,11 +13468,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
             output.push_back(fragment.token);
             is_prev_special = true;
-            // phi-3 special tokens without rtrim, works fine for llama-spm too
-            special_token_rtrim = rtrim
-                && fragment.token != vocab.special_bos_id
-                && fragment.token != vocab.special_unk_id
-                && fragment.token != vocab.special_eos_id;
         }
     }
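The RSTRIP rule implemented above is the mirror image of LSTRIP: whitespace immediately after the match is consumed before the right fragment is emitted, which models what HuggingFace's `AddedToken(rstrip=True)` does for tokens like phi-3's specials. A standalone sketch under the same simplifying assumptions (plain strings, illustrative names):

    #include <cctype>
    #include <iostream>
    #include <string>

    // The right fragment starts after the matched token; with RSTRIP the
    // run of whitespace following the token is skipped as well.
    static std::string right_fragment(const std::string & raw_text, size_t match, size_t token_len, bool rstrip) {
        size_t offset = match + token_len;
        if (rstrip) {
            while (offset < raw_text.size() && isspace((unsigned char) raw_text[offset])) {
                offset++;
            }
        }
        return raw_text.substr(offset);
    }

    int main() {
        const std::string text = "<|end|>   next";
        std::cout << "'" << right_fragment(text, 0, 7, false) << "'\n";  // '   next'
        std::cout << "'" << right_fragment(text, 0, 7, true)  << "'\n";  // 'next'
        return 0;
    }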
From 01c9229186f6210186bcb44af4b5ed587e00895f Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 21:22:57 +0200
Subject: [PATCH 05/10] Refactor + add 'jina-v2' for testing 'lstrip'

---
 llama.cpp                      | 77 ++++++++++++++++++----------------
 tests/test-tokenizer-random.py |  2 +
 2 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 69f648a5027e7..c282bceb7c7dc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4872,9 +4872,29 @@ static void llm_load_vocab(
     //NOTE: Each model customizes per token attributes.
     //NOTE: Per token attributes are missing from the GGUF file.
     //TODO: Merge llama_token_type and llama_token_attrib.
+    //TODO: Extract attribs from GGUF file.
     {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+            for (auto substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
+            uint32_t attribs = vocab.id_to_token.at(id).attribs;
+            attribs = value ? (attribs | attrib) : (attribs & ~attrib);
+            vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
+        };
+
+        auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
+            _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+        };
+
         // convert token type as an attribute
-        for (auto data : vocab.id_to_token) {
+        for (auto &data : vocab.id_to_token) {
             uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED;
             attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN);
             attrib |= LLAMA_TOKEN_ATTRIB_UNUSED  * (data.type == LLAMA_TOKEN_TYPE_UNUSED);
@@ -4885,44 +4905,31 @@ static void llm_load_vocab(
             data.attribs = (llama_token_attrib) attrib;
         }

-        // set attributes by model name
         std::string model_name;
-        if (ml.get_key(LLM_KV_GENERAL_NAME, model_name, false)) {
-            std::transform(model_name.begin(), model_name.end(), model_name.begin(),
-                [] (const std::string::value_type x) {
-                    return std::tolower(x);
-                }
-            );
-
-            auto _contains_any = [&model_name] (const std::vector<std::string> & substrs) -> bool {
-                for (auto substr : substrs) {
-                    if (model_name.find(substr) < std::string::npos) {
-                        return true;
-                    }
-                }
-                return false;
-            };
+        std::string tokenizer_pre;

-            auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
-                uint32_t attribs = vocab.id_to_token[id].attribs;
-                attribs = value ? (attribs | attrib) : (attribs & ~attrib);
-                vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
-            };
+        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

-            auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
-                _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
-            };
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
+            }
+        );

-            if (_contains_any({"phi-3", "phi3"})) {
-                for (auto id : vocab.cache_special_tokens) {
-                    _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
-                }
-                for (auto token : {"</s>"}) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
-                }
-                for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
-                }
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+            _set_token_attrib("<mask>", LLAMA_TOKEN_ATTRIB_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : vocab.cache_special_tokens) {
+                _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+            }
+            for (auto token : {"</s>"}) {
+                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+            }
+            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
             }
         }
     }

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 14f544c4d58b9..9a84d9379cb27 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -156,6 +156,8 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         '<s>a',          # Phi-3 fail
         '<|endoftext|>', # Phi-3 fail
         'a\na',          # TODO: Bert fail
+        'a </s> b',      # rstrip phi-3
+        'a <mask> b',    # lstrip jina-v2
     ]
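Attribute selection now keys off `tokenizer_pre` and the lowercased model name. A standalone sketch of the `_contains_any` matching, keeping the patch's `find(...) < npos` comparison (equivalent to `!= npos`); the model name below is only an example:

    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <string>
    #include <vector>

    // find() returns a position strictly smaller than npos whenever the
    // substring occurs somewhere in str, and npos otherwise.
    static bool contains_any(const std::string & str, const std::vector<std::string> & substrs) {
        for (const auto & substr : substrs) {
            if (str.find(substr) < std::string::npos) {
                return true;
            }
        }
        return false;
    }

    int main() {
        std::string model_name = "Phi-3-mini-4k-instruct";  // example name, not from the patch
        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
            [] (unsigned char c) { return (char) std::tolower(c); });
        std::cout << contains_any(model_name, {"phi-3", "phi3"}) << "\n";            // 1
        std::cout << contains_any(model_name, {"jina-v2-es", "jina-v2-de"}) << "\n"; // 0
        return 0;
    }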
insertions(+), 0 deletions(-)

diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf
index f8022a385e4aa48ca10d40d0f079a25365a7be78..745be416a798a1e7a2effaa935bc903edaaf303d 100644
GIT binary patch
[two 576-byte base85 binary deltas omitted: the encoded data was garbled in this transcript and is not recoverable]

From: jaime-m-p <>
Date: Mon, 3 Jun 2024 00:51:48 +0200
Subject: [PATCH 07/10] Update brute force test: testing 'lstrip' and 'rstrip'

---
 tests/test-tokenizer-random.py | 45 +++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 9a84d9379cb27..f699af0228076 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -161,14 +161,34 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
     ]


-def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
-    special_tokens = set(tokenizer.all_special_tokens)
-    special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
-    special_tokens = list(sorted(special_tokens))
+def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
+    """Brute force check all vocab words"""
+    yield from vocab
+
+
+def generator_added_lr_strip(tokenizer) -> Iterator[str]:
+    WHITESPACES = ["", " ", "  ", "    "]
+    special_tokens = list(tokenizer.all_special_tokens)
+    added_tokens = list(tokenizer.added_tokens_encoder)
+    all_tokens = list(sorted(set(special_tokens + added_tokens)))
+    for token in all_tokens:
+        for lstrip in WHITESPACES:
+            for rstrip in WHITESPACES:
+                yield lstrip + token + rstrip
+                yield "a" + lstrip + token + rstrip
+                yield lstrip + token + rstrip + "z"
+                yield "a" + lstrip + token + rstrip + "z"
+
+
+def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
+    special_tokens = list(tokenizer.all_special_tokens)
+    added_tokens = list(tokenizer.added_tokens_encoder)
+    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
+    all_tokens = list(sorted(set(special_tokens + added_tokens + separations)))
     rand = random.Random()
     for m in range(iterations):
         rand.seed(m)
-        words = rand.choices(special_tokens, k=500)
+        words = rand.choices(all_tokens, k=500)
         if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
             while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                 words.pop(0)
@@ -276,8 +296,8 @@ def find_first_mismatch(ids1: list[int], ids2: list[int]):
         ids2 = func_tokenize2(text)
         if ids1 != ids2:
             i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
+            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
+            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
             logger.info(" TokenIDs: " + str(ids1))
             logger.info(" Expected: " + str(ids2))
             raise Exception()
@@ -311,8 +331,9 @@ def func_tokenize2(text: str):
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
+
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000)) @@ -324,16 +345,16 @@ def func_tokenize2(text: str): if __name__ == "__main__": # main() - path_tokenizers = "./models/tokenizers/" + path_tokenizers = "./models/tokenizers/" path_vocab_format = "./models/ggml-vocab-%s.gguf" # import os # tokenizers = os.listdir(path_tokenizers) tokenizers = [ - # "llama-spm", # SPM + "llama-spm", # SPM "phi-3", # SPM - # "jina-v2-en", # WPM - # "bert-bge", # WPM + "jina-v2-en", # WPM + "bert-bge", # WPM ] for tokenizer in tokenizers: From 54e9f23b8a17f75b2aafba6aa4044fff37776f21 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 3 Jun 2024 01:44:21 +0200 Subject: [PATCH 08/10] Fix previous commit --- tests/test-tokenizer-random.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index f699af0228076..52f589511e470 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -197,11 +197,6 @@ def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]: yield "".join(words) -def generator_vocab_words(vocab: list[str]) -> Iterator[str]: - """Brute force check all vocab words""" - yield from vocab - - def generator_random_chars(iterations=100) -> Iterator[str]: """Brute force random text with simple characters""" From ac40ff0e5049eb7f1674e44f571a791612d3735a Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 3 Jun 2024 01:48:07 +0200 Subject: [PATCH 09/10] Replace llama_token_type with llama_token_attribs --- llama.cpp | 69 +++++++++++++++++++++++++++---------------------------- llama.h | 6 ++--- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index c282bceb7c7dc..90feea14a82bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2147,16 +2147,15 @@ struct llama_control_vector { }; struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - using tattrib = llama_token_attrib; + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + using tattribs = llama_token_attribs; struct token_data { - token text; - float score; - ttype type; - tattrib attribs; + token text; + float score; + tattribs attribs; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4740,9 +4739,22 @@ static void llm_load_vocab( vocab.token_to_id[word] = i; auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores ? scores[i] : 0.0f; - token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; + token_data.text = std::move(word); + token_data.score = scores ? 
scores[i] : 0.0f;
+        token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL;
+
+        if (toktypes) { //TODO: remove, required until per token attribs are available from GGUF file
+            switch(toktypes[i]) {
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attribs = LLAMA_TOKEN_ATTRIB_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attribs = LLAMA_TOKEN_ATTRIB_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attribs = LLAMA_TOKEN_ATTRIB_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attribs = LLAMA_TOKEN_ATTRIB_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attribs = LLAMA_TOKEN_ATTRIB_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
+                default:                            token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
+            }
+        }
     }

     GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
@@ -4833,7 +4845,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+            if (!(vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL)) {
                 vocab.cache_special_tokens.push_back(id);
             }
         }
@@ -4871,7 +4883,6 @@ static void llm_load_vocab(
     // Handle per token attributes
     //NOTE: Each model customizes per token attributes.
     //NOTE: Per token attributes are missing from the GGUF file.
-    //TODO: Merge llama_token_type and llama_token_attrib.
     //TODO: Extract attribs from GGUF file.
     {
         auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
@@ -4883,28 +4894,16 @@ static void llm_load_vocab(
             return false;
         };

-        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
+        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attribs attrib, bool value) {
             uint32_t attribs = vocab.id_to_token.at(id).attribs;
             attribs = value ? 
(attribs | attrib) : (attribs & ~attrib); - vocab.id_to_token[id].attribs = (llama_token_attrib) attribs; + vocab.id_to_token[id].attribs = (llama_token_attribs) attribs; }; - auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) { + auto _set_token_attrib = [&] (const std::string & token, llama_token_attribs attrib, bool value) { _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value); }; - // convert token type as an attribute - for (auto &data : vocab.id_to_token) { - uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED; - attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN); - attrib |= LLAMA_TOKEN_ATTRIB_UNUSED * (data.type == LLAMA_TOKEN_TYPE_UNUSED); - attrib |= LLAMA_TOKEN_ATTRIB_NORMAL * (data.type == LLAMA_TOKEN_TYPE_NORMAL); - attrib |= LLAMA_TOKEN_ATTRIB_CONTROL * (data.type == LLAMA_TOKEN_TYPE_CONTROL); - attrib |= LLAMA_TOKEN_ATTRIB_USER_DEFINED * (data.type == LLAMA_TOKEN_TYPE_USER_DEFINED); - attrib |= LLAMA_TOKEN_ATTRIB_BYTE * (data.type == LLAMA_TOKEN_TYPE_BYTE); - data.attribs = (llama_token_attrib) attrib; - } - std::string model_name; std::string tokenizer_pre; @@ -12684,27 +12683,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) { static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL; } static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_UNKNOWN; } static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_CONTROL; } static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_BYTE; } static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; + return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_USER_DEFINED; } static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { @@ -18277,9 +18276,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token) return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { +llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token) { GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); - return model->vocab.id_to_token[token].type; + return model->vocab.id_to_token[token].attribs; } bool llama_token_is_eog(const struct llama_model * model, llama_token token) { diff --git a/llama.h b/llama.h index a9952d6e01ee8..1686b8cbeb5da 100644 --- a/llama.h +++ b/llama.h @@ -97,7 +97,7 @@ extern "C" { LLAMA_ROPE_TYPE_GLM = 4, }; - enum llama_token_type { + enum llama_token_type { //TODO: remove, required until per token attribs are available from GGUF 
file LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, LLAMA_TOKEN_TYPE_UNKNOWN = 2, @@ -107,7 +107,7 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; - enum llama_token_attrib { + enum llama_token_attribs { LLAMA_TOKEN_ATTRIB_UNDEFINED = 0, LLAMA_TOKEN_ATTRIB_UNKNOWN = 1 << 1, LLAMA_TOKEN_ATTRIB_UNUSED = 1 << 2, @@ -835,7 +835,7 @@ extern "C" { LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); + LLAMA_API enum llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token); // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); From 18f5fc766b6613d5ecc43c37192eea51db0d8cb0 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 4 Jun 2024 00:56:22 +0200 Subject: [PATCH 10/10] Rename token attributes --- llama.cpp | 79 +++++++++++++++++++++++++++---------------------------- llama.h | 28 ++++++++++---------- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/llama.cpp b/llama.cpp index 90feea14a82bd..4714c53dd248c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2147,15 +2147,14 @@ struct llama_control_vector { }; struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - using tattribs = llama_token_attribs; + using id = int32_t; + using token = std::string; + using tattr = llama_token_attr; struct token_data { - token text; - float score; - tattribs attribs; + token text; + float score; + tattr attr; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4739,20 +4738,20 @@ static void llm_load_vocab( vocab.token_to_id[word] = i; auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores ? scores[i] : 0.0f; - token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL; + token_data.text = std::move(word); + token_data.score = scores ? 
scores[i] : 0.0f;
+        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;

-        if (toktypes) { //TODO: remove, required until per token attribs are available from GGUF file
+        if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
             switch(toktypes[i]) {
-                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attribs = LLAMA_TOKEN_ATTRIB_UNKNOWN;      break;
-                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attribs = LLAMA_TOKEN_ATTRIB_UNUSED;       break;
-                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attribs = LLAMA_TOKEN_ATTRIB_NORMAL;       break;
-                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attribs = LLAMA_TOKEN_ATTRIB_CONTROL;      break;
-                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attribs = LLAMA_TOKEN_ATTRIB_USER_DEFINED; break;
-                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attribs = LLAMA_TOKEN_ATTRIB_BYTE;         break;
-                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
-                default:                            token_data.attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;    break;
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
             }
         }
     }
@@ -4845,7 +4844,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (!(vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL)) {
+            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
                 vocab.cache_special_tokens.push_back(id);
             }
         }
@@ -4883,7 +4882,7 @@ static void llm_load_vocab(
     // Handle per token attributes
     //NOTE: Each model customizes per token attributes.
     //NOTE: Per token attributes are missing from the GGUF file.
-    //TODO: Extract attribs from GGUF file.
+    //TODO: Extract attributes from GGUF file.
     {
         auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
             for (auto substr : substrs) {
@@ -4894,14 +4893,14 @@ static void llm_load_vocab(
             return false;
         };

-        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attribs attrib, bool value) {
-            uint32_t attribs = vocab.id_to_token.at(id).attribs;
-            attribs = value ? (attribs | attrib) : (attribs & ~attrib);
-            vocab.id_to_token[id].attribs = (llama_token_attribs) attribs;
+        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+            uint32_t current = vocab.id_to_token.at(id).attr;
+            current = value ? (current | attr) : (current & ~attr);
+            vocab.id_to_token[id].attr = (llama_token_attr) current;
         };

-        auto _set_token_attrib = [&] (const std::string & token, llama_token_attribs attrib, bool value) {
-            _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+            _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
         };

         std::string model_name;
@@ -4919,16 +4918,16 @@ static void llm_load_vocab(
         // set attributes by model/tokenizer name
         if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
-            _set_token_attrib("<mask>", LLAMA_TOKEN_ATTRIB_LSTRIP, true);
+            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
         } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
             for (auto id : vocab.cache_special_tokens) {
-                _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
             }
             for (auto token : {"</s>"}) {
-                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
             }
             for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
-                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
             }
         }
     }
@@ -12683,27 +12682,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_NORMAL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_UNKNOWN;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_CONTROL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
 }

 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_BYTE;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
 }

 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attribs & LLAMA_TOKEN_ATTRIB_USER_DEFINED;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
 }

 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -13361,7 +13360,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
                         int64_t left_reminder_length = match - raw_text_base_offset;

-                        if (data.attribs & LLAMA_TOKEN_ATTRIB_LSTRIP) {
+                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
                             while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
                                 left_reminder_length--;
                             }
@@ -13386,7 +13385,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         int64_t right_reminder_offset = match + special_token.length();
                         int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());

-                        if (data.attribs & 
LLAMA_TOKEN_ATTRIB_RSTRIP) { + if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) { while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) { right_reminder_offset++; right_reminder_length--; @@ -18276,9 +18275,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token) return model->vocab.id_to_token[token].score; } -llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token) { +llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) { GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); - return model->vocab.id_to_token[token].attribs; + return model->vocab.id_to_token[token].attr; } bool llama_token_is_eog(const struct llama_model * model, llama_token token) { diff --git a/llama.h b/llama.h index 1686b8cbeb5da..a78ccdaf557d0 100644 --- a/llama.h +++ b/llama.h @@ -97,7 +97,7 @@ extern "C" { LLAMA_ROPE_TYPE_GLM = 4, }; - enum llama_token_type { //TODO: remove, required until per token attribs are available from GGUF file + enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, LLAMA_TOKEN_TYPE_UNKNOWN = 2, @@ -107,18 +107,18 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; - enum llama_token_attribs { - LLAMA_TOKEN_ATTRIB_UNDEFINED = 0, - LLAMA_TOKEN_ATTRIB_UNKNOWN = 1 << 1, - LLAMA_TOKEN_ATTRIB_UNUSED = 1 << 2, - LLAMA_TOKEN_ATTRIB_NORMAL = 1 << 3, - LLAMA_TOKEN_ATTRIB_CONTROL = 1 << 4, // SPECIAL? - LLAMA_TOKEN_ATTRIB_USER_DEFINED = 1 << 5, - LLAMA_TOKEN_ATTRIB_BYTE = 1 << 6, - LLAMA_TOKEN_ATTRIB_NORMALIZED = 1 << 7, - LLAMA_TOKEN_ATTRIB_LSTRIP = 1 << 8, - LLAMA_TOKEN_ATTRIB_RSTRIP = 1 << 9, - LLAMA_TOKEN_ATTRIB_SINGLE_WORD = 1 << 10, + enum llama_token_attr { + LLAMA_TOKEN_ATTR_UNDEFINED = 0, + LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1, + LLAMA_TOKEN_ATTR_UNUSED = 1 << 2, + LLAMA_TOKEN_ATTR_NORMAL = 1 << 3, + LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL? + LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5, + LLAMA_TOKEN_ATTR_BYTE = 1 << 6, + LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7, + LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8, + LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9, + LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10, }; // model file types @@ -835,7 +835,7 @@ extern "C" { LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_attribs llama_token_get_attribs(const struct llama_model * model, llama_token token); + LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
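With the final naming in place, downstream code can query per-token behavior through the public API added by this series. A hedged usage sketch — it assumes a `llama_model *` was already loaded with the usual llama.cpp loading functions, and the token id is arbitrary:

    #include <cstdio>
    #include "llama.h"

    // Sketch only: prints the behavioral flags of one token using the
    // llama_token_get_attr() accessor introduced in PATCH 09/10.
    static void print_token_attr(const struct llama_model * model, llama_token id) {
        enum llama_token_attr attr = llama_token_get_attr(model, id);
        if (attr & LLAMA_TOKEN_ATTR_CONTROL) { printf("token %d is a control token\n", id); }
        if (attr & LLAMA_TOKEN_ATTR_LSTRIP)  { printf("token %d consumes whitespace on its left\n", id); }
        if (attr & LLAMA_TOKEN_ATTR_RSTRIP)  { printf("token %d consumes whitespace on its right\n", id); }
    }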