feat: add changes to handle jina v2 chinese code #7795

Open · wants to merge 57 commits into master
Commits (57)
86a5d96
feat: first things to do
Apr 11, 2024
747d17a
feat: create tensors for Jina architecture
Apr 12, 2024
a40156a
fix: use other tensors
Apr 12, 2024
b00d38b
feat: embedding gets results
Apr 16, 2024
cf1c144
fix: fix usage of ALIBI
Apr 22, 2024
63a1d7c
fix: clean prints
Apr 22, 2024
c229e48
fix: do some cleanup unused vars
Apr 22, 2024
e232370
fix: revert changes to Makefile and CMakeLists
Apr 22, 2024
795ff1d
fix: revert some changes
Apr 22, 2024
d6ac931
fix: fix small detail
Apr 22, 2024
db7e8ce
Merge branch 'master' into feat-jina-embeddings
JoanFM Apr 22, 2024
c1c0f4d
fix: fix convert formatting
Apr 22, 2024
64cd4b1
fix: fix linting and editor
Apr 22, 2024
71ff763
feat: set proper vocab settings
Apr 22, 2024
d7d6a4e
fix: JinaBertForMaskedLM registration
Apr 23, 2024
cde49b7
feat: support q_normalization and k_normalization in Jina arch
Apr 23, 2024
dd060a2
feat: handle gpt2 tokenizer with Jina architecture
Apr 24, 2024
dfa0676
feat: example comments in embedding
Apr 24, 2024
c3f4b1f
feat: rename Jina Bert to Jina Bert V2
Apr 24, 2024
603f18b
feat: small changes to allow jina embeddings ZH model
Apr 29, 2024
f8d1709
Merge branch 'master' into feat-jina-embeddings
JoanFM Apr 30, 2024
da96368
fix: add some changes as per review
Apr 30, 2024
2835441
Merge branch 'feat-jina-embeddings' of https://github.com/JoanFM/llam…
Apr 30, 2024
d9b8dd6
fix: add some changes as per review
Apr 30, 2024
e73ab4b
Merge branch 'feat-jina-embeddings' of https://github.com/JoanFM/llam…
Apr 30, 2024
14073a2
feat: proper KQ_pos for Jina embeddings
Apr 30, 2024
f6365b8
Merge branch 'feat-jina-embeddings' of https://github.com/JoanFM/llam…
May 2, 2024
14cd69a
feat: add pre tokenization
May 2, 2024
d5c3525
feat: first iteration NFC
May 6, 2024
76436c1
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
May 6, 2024
365af24
Merge branch 'feat-jina-embeddings' of https://github.com/JoanFM/llam…
May 6, 2024
3269efe
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
May 11, 2024
d0a99aa
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
May 13, 2024
8957cac
refactor: rename jina tokenizers to v2
May 13, 2024
0771b17
Merge branch 'refactor-jina-rename' of https://github.com/JoanFM/llam…
May 13, 2024
22a0113
fix: fix alignment
May 13, 2024
fb83012
refactor: keep refactoring non-breaking
May 13, 2024
ea0f7df
Merge branch 'refactor-jina-rename' of https://github.com/JoanFM/llam…
May 13, 2024
22b5f6b
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
May 13, 2024
cc0ac09
feat: add changes to handle jina v2 base code
May 28, 2024
21936dd
fix: do not complicate things
May 28, 2024
9a65c7a
fix: fix the usage of the code model
May 31, 2024
96a6f55
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
May 31, 2024
0fc775e
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
Jun 4, 2024
4bce30c
fix: fix comments
Jun 4, 2024
3b44f8f
fix: fix linting issues
Jun 5, 2024
05659d3
fix: remove ollama patches
Jun 5, 2024
7ab6023
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
Jun 5, 2024
d86efa6
fix: merge with code
Jun 5, 2024
a8a64fd
fix: fix preprocessing jina v2 zh
Jun 6, 2024
605a619
fix: merge issues
Jun 6, 2024
728e1b4
fix: lowercase unicode pt by unicode pt
Jun 7, 2024
841b9a5
Merge branch 'master' into feat-jina-embeddings-v2-zh
Jun 18, 2024
175391d
merge with master
Jul 8, 2024
0699a4c
Merge branch 'feat-jina-embeddings-v2-zh' of https://github.com/JoanF…
Jul 8, 2024
afd76e6
fix: handle default
Jul 8, 2024
201559d
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into fea…
Jul 26, 2024
3 changes: 3 additions & 0 deletions convert_hf_to_gguf.py
@@ -603,6 +603,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
 
         if res is None:
             logger.warning("\n")
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
@@ -94,6 +94,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
 ]
 
 
1 change: 1 addition & 0 deletions include/llama.h
@@ -95,6 +95,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TEKKEN     = 20,
         LLAMA_VOCAB_PRE_TYPE_SMOLLM     = 21,
         LLAMA_VOCAB_PRE_TYPE_CODESHELL  = 22,
+        LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 23,
     };
 
     // note: these values should be synchronized with ggml_rope
19 changes: 18 additions & 1 deletion src/llama-vocab.cpp
@@ -11,6 +11,7 @@
 #include <forward_list>
 #include <queue>
 #include <sstream>
+#include <regex>
 
 //
 // helpers
@@ -446,6 +447,9 @@ struct llm_tokenizer_bpe {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                regex_exprs = {"\\w+|[^\\w\\s]+"};
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -498,7 +502,20 @@ struct llm_tokenizer_bpe {
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
 
-        const auto word_collection = unicode_regex_split(text, regex_exprs);
+        std::vector<std::string> word_collection;
+        if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH) {
+
+            std::string lowercase_text = lowercase(text);
+            std::regex regexPattern(regex_exprs[0]);
+            std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
+            std::sregex_token_iterator end;
+
+            while (it != end) {
+                word_collection.push_back(*it++);
+            }
+        } else {
+            word_collection = unicode_regex_split(text, regex_exprs);
+        }
 
         symbols_final.clear();
 
25 changes: 18 additions & 7 deletions src/llama.cpp
@@ -5385,8 +5385,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-v2-de" ||
                 tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
-        } else if (
-                tokenizer_pre == "refact") {
+
+        } else if (tokenizer_pre == "refact") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
         } else if (
                 tokenizer_pre == "command-r") {
@@ -5436,6 +5436,9 @@ static void llm_load_vocab(
         } else if (
             tokenizer_pre == "codeshell") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+        } else if (
+            tokenizer_pre == "jina-v2-zh") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -5486,8 +5489,7 @@ static void llm_load_vocab(
 
         for (uint32_t i = 0; i < n_vocab; i++) {
             std::string word = gguf_get_arr_str(ctx, token_idx, i);
-            GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
-
+            //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); // check removed: some vocabs (e.g. jinaai/jina-embeddings-v2-base-zh) mistakenly contain a NULL entry (worse if it happens more than once)
             vocab.token_to_id[word] = i;
             vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
 
@@ -5560,9 +5562,18 @@ static void llm_load_vocab(
         } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
             vocab.linefeed_id = vocab.special_pad_id;
         } else {
-            const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-            GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-            vocab.linefeed_id = ids[0];
+            try {
+                const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
+                if (ids.empty()) {
+                    LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, llama_model_vocab_type_name(vocab.type), "\xC4\x8A");
+                    vocab.linefeed_id = vocab.special_pad_id;
+                } else {
+                    vocab.linefeed_id = ids[0];
+                }
+            } catch (const std::exception & e) {
+                LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, llama_model_vocab_type_name(vocab.type), e.what());
+                vocab.linefeed_id = vocab.special_pad_id;
+            }
         }
 
         // special tokens
11 changes: 11 additions & 0 deletions src/unicode.cpp
@@ -816,3 +816,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
 
     return unicode_byte_encoding_process(bpe_words);
 }
+
+std::string lowercase(const std::string & text) {
+    std::string result;
+    const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
+
+    for (const char32_t cpt : cpts) {
+        result += unicode_cpt_to_utf8(unicode_tolower(cpt)); // lowercase each codepoint and append its UTF-8 bytes
+    }
+
+    return result;
+}
2 changes: 2 additions & 0 deletions src/unicode.h
@@ -65,3 +65,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
 uint32_t unicode_tolower(uint32_t cp);
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
+
+std::string lowercase(const std::string & text);