Skip to content

Commit 01c9229

Browse files
author
jaime-m-p
committed
Refactor + add 'jina-v2' for testing 'lstrip'
1 parent ada961c commit 01c9229

File tree

2 files changed

+44
-35
lines changed

2 files changed

+44
-35
lines changed

llama.cpp

Lines changed: 42 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -4872,9 +4872,29 @@ static void llm_load_vocab(
48724872
//NOTE: Each model customizes per token attributes.
48734873
//NOTE: Per token attributes are missing from the GGUF file.
48744874
//TODO: Merge llama_token_type and llama_token_attrib.
4875+
//TODO: Extract attribs from GGUF file.
48754876
{
4877+
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
4878+
for (auto substr : substrs) {
4879+
if (str.find(substr) < std::string::npos) {
4880+
return true;
4881+
}
4882+
}
4883+
return false;
4884+
};
4885+
4886+
auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
4887+
uint32_t attribs = vocab.id_to_token.at(id).attribs;
4888+
attribs = value ? (attribs | attrib) : (attribs & ~attrib);
4889+
vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
4890+
};
4891+
4892+
auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
4893+
_set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
4894+
};
4895+
48764896
// convert token type as an attribute
4877-
for (auto data : vocab.id_to_token) {
4897+
for (auto &data : vocab.id_to_token) {
48784898
uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED;
48794899
attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN);
48804900
attrib |= LLAMA_TOKEN_ATTRIB_UNUSED * (data.type == LLAMA_TOKEN_TYPE_UNUSED);
@@ -4885,44 +4905,31 @@ static void llm_load_vocab(
48854905
data.attribs = (llama_token_attrib) attrib;
48864906
}
48874907

4888-
// set attributes by model name
48894908
std::string model_name;
4890-
if (ml.get_key(LLM_KV_GENERAL_NAME, model_name, false)) {
4891-
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
4892-
[] (const std::string::value_type x) {
4893-
return std::tolower(x);
4894-
}
4895-
);
4896-
4897-
auto _contains_any = [&model_name] (const std::vector<std::string> &substrs) -> bool {
4898-
for (auto substr : substrs) {
4899-
if (model_name.find(substr) < std::string::npos) {
4900-
return true;
4901-
}
4902-
}
4903-
return false;
4904-
};
4909+
std::string tokenizer_pre;
49054910

4906-
auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
4907-
uint32_t attribs = vocab.id_to_token[id].attribs;
4908-
attribs = value ? (attribs | attrib) : (attribs & ~attrib);
4909-
vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
4910-
};
4911+
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
4912+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
49114913

4912-
auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
4913-
_set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
4914-
};
4914+
// model name to lowercase
4915+
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
4916+
[] (const std::string::value_type x) {
4917+
return std::tolower(x);
4918+
}
4919+
);
49154920

4916-
if (_contains_any({"phi-3", "phi3"})) {
4917-
for (auto id : vocab.cache_special_tokens) {
4918-
_set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
4919-
}
4920-
for (auto token : {"</s>"}) {
4921-
_set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
4922-
}
4923-
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
4924-
_set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
4925-
}
4921+
// set attributes by model/tokenizer name
4922+
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
4923+
_set_token_attrib("<mask>", LLAMA_TOKEN_ATTRIB_LSTRIP, true);
4924+
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
4925+
for (auto id : vocab.cache_special_tokens) {
4926+
_set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
4927+
}
4928+
for (auto token : {"</s>"}) {
4929+
_set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
4930+
}
4931+
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
4932+
_set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
49264933
}
49274934
}
49284935
}

tests/test-tokenizer-random.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -156,6 +156,8 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
156156
'<s>a', # Phi-3 fail
157157
'<unk><|endoftext|><s>', # Phi-3 fail
158158
'a\na', # TODO: Bert fail
159+
'a </s> b', # rstrip phi-3
160+
'a <mask> b', # lstrip jina-v2
159161
]
160162

161163

0 commit comments

Comments (0)