tests : add fail test for llama-bpe

ggerganov · ggerganov · commit 12a7b696236f · 2024-05-09T10:29:39.000+03:00
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
@@ -257,6 +257,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     "3333333",
     "33333333",
     "333333333",
+    # "Cửa Việt", # llama-bpe fails on this
     chktxt,
 ]
 
diff --git a/unicode.cpp b/unicode.cpp
@@ -112,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
 static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
     std::unordered_map<uint32_t, int> cpt_types;
     for (auto p : unicode_ranges_number) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_NUMBER;
         }
     }
     for (auto p : unicode_ranges_letter) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_LETTER;
         }
     }
     for (auto p : unicode_ranges_separator) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
         }
     }
     for (auto p : unicode_ranges_accent_mark) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
         }
     }
     for (auto p : unicode_ranges_punctuation) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
         }
     }
@@ -142,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
         }
     }
     for (auto p : unicode_ranges_control) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_CONTROL;
         }
     }
@@ -629,7 +629,7 @@ bool unicode_cpt_is_whitespace(uint32_t cp) {
     static const std::unordered_set<uint32_t> is_whitespace = [] {
         std::unordered_set<uint32_t> is_whitespace;
         for (auto p : unicode_ranges_whitespace) {
-            for (auto i = p.first; i <= p.second; ++ i) {
+            for (auto i = p.first; i <= p.second; ++i) {
                 is_whitespace.insert(i);
             }
         }

Original file line number	Diff line number	Diff line change
`@@ -257,6 +257,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:`
`257`	`257`	`"3333333",`
`258`	`258`	`"33333333",`
`259`	`259`	`"333333333",`
	`260`	`+ # "Cửa Việt", # llama-bpe fails on this`
`260`	`261`	`chktxt,`
`261`	`262`	`]`
`262`	`263`
Original file line number	Diff line number	Diff line change
`@@ -112,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)`
`112`	`112`	`static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {`
`113`	`113`	`std::unordered_map<uint32_t, int> cpt_types;`
`114`	`114`	`for (auto p : unicode_ranges_number) {`
`115`		`- for (auto i = p.first; i <= p.second; ++ i) {`
	`115`	`+ for (auto i = p.first; i <= p.second; ++i) {`
`116`	`116`	`cpt_types[i] = CODEPOINT_TYPE_NUMBER;`
`117`	`117`	`}`
`118`	`118`	`}`
`119`	`119`	`for (auto p : unicode_ranges_letter) {`
`120`		`- for (auto i = p.first; i <= p.second; ++ i) {`
	`120`	`+ for (auto i = p.first; i <= p.second; ++i) {`
`121`	`121`	`cpt_types[i] = CODEPOINT_TYPE_LETTER;`
`122`	`122`	`}`
`123`	`123`	`}`
`124`	`124`	`for (auto p : unicode_ranges_separator) {`
`125`		`- for (auto i = p.first; i <= p.second; ++ i) {`
	`125`	`+ for (auto i = p.first; i <= p.second; ++i) {`
`126`	`126`	`cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;`
`127`	`127`	`}`
`128`	`128`	`}`
`129`	`129`	`for (auto p : unicode_ranges_accent_mark) {`
`130`		`- for (auto i = p.first; i <= p.second; ++ i) {`
	`130`	`+ for (auto i = p.first; i <= p.second; ++i) {`
`131`	`131`	`cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;`
`132`	`132`	`}`
`133`	`133`	`}`
`134`	`134`	`for (auto p : unicode_ranges_punctuation) {`
`135`		`- for (auto i = p.first; i <= p.second; ++ i) {`
	`135`	`+ for (auto i = p.first; i <= p.second; ++i) {`
`136`	`136`	`cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;`
`137`	`137`	`}`
`138`	`138`	`}`
`@@ -142,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {`
`142`	`142`	`}`
`143`	`143`	`}`
`144`	`144`	`for (auto p : unicode_ranges_control) {`
`145`		`- for (auto i = p.first; i <= p.second; ++ i) {`
	`145`	`+ for (auto i = p.first; i <= p.second; ++i) {`
`146`	`146`	`cpt_types[i] = CODEPOINT_TYPE_CONTROL;`
`147`	`147`	`}`
`148`	`148`	`}`
`@@ -629,7 +629,7 @@ bool unicode_cpt_is_whitespace(uint32_t cp) {`
`629`	`629`	`static const std::unordered_set<uint32_t> is_whitespace = [] {`
`630`	`630`	`std::unordered_set<uint32_t> is_whitespace;`
`631`	`631`	`for (auto p : unicode_ranges_whitespace) {`
`632`		`- for (auto i = p.first; i <= p.second; ++ i) {`
	`632`	`+ for (auto i = p.first; i <= p.second; ++i) {`
`633`	`633`	`is_whitespace.insert(i);`
`634`	`634`	`}`
`635`	`635`	`}`