Skip to content

Commit 12a7b69

Browse files
committed
tests : add fail test for llama-bpe
1 parent 8de8b6d · commit 12a7b69

File tree

2 files changed

+8
-7
lines changed

2 files changed

+8
-7
lines changed

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -257,6 +257,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
257257
"3333333",
258258
"33333333",
259259
"333333333",
260+
# "Cửa Việt", # llama-bpe fails on this
260261
chktxt,
261262
]
262263

unicode.cpp

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -112,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
112112
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
113113
std::unordered_map<uint32_t, int> cpt_types;
114114
for (auto p : unicode_ranges_number) {
115-
for (auto i = p.first; i <= p.second; ++ i) {
115+
for (auto i = p.first; i <= p.second; ++i) {
116116
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
117117
}
118118
}
119119
for (auto p : unicode_ranges_letter) {
120-
for (auto i = p.first; i <= p.second; ++ i) {
120+
for (auto i = p.first; i <= p.second; ++i) {
121121
cpt_types[i] = CODEPOINT_TYPE_LETTER;
122122
}
123123
}
124124
for (auto p : unicode_ranges_separator) {
125-
for (auto i = p.first; i <= p.second; ++ i) {
125+
for (auto i = p.first; i <= p.second; ++i) {
126126
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
127127
}
128128
}
129129
for (auto p : unicode_ranges_accent_mark) {
130-
for (auto i = p.first; i <= p.second; ++ i) {
130+
for (auto i = p.first; i <= p.second; ++i) {
131131
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
132132
}
133133
}
134134
for (auto p : unicode_ranges_punctuation) {
135-
for (auto i = p.first; i <= p.second; ++ i) {
135+
for (auto i = p.first; i <= p.second; ++i) {
136136
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
137137
}
138138
}
@@ -142,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
142142
}
143143
}
144144
for (auto p : unicode_ranges_control) {
145-
for (auto i = p.first; i <= p.second; ++ i) {
145+
for (auto i = p.first; i <= p.second; ++i) {
146146
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
147147
}
148148
}
@@ -629,7 +629,7 @@ bool unicode_cpt_is_whitespace(uint32_t cp) {
629629
static const std::unordered_set<uint32_t> is_whitespace = [] {
630630
std::unordered_set<uint32_t> is_whitespace;
631631
for (auto p : unicode_ranges_whitespace) {
632-
for (auto i = p.first; i <= p.second; ++ i) {
632+
for (auto i = p.first; i <= p.second; ++i) {
633633
is_whitespace.insert(i);
634634
}
635635
}

0 commit comments

Comments
 (0)