Skip to content

Commit 9600d59

Browse files
authored
unicode : switch to multimap based nfd_map (#5799)
* switch to multimap based nfd_map due to compile time issues
* simplify multimap keys
* don't construct new locale every time
1 parent 5cb02b4 commit 9600d59

File tree

2 files changed

+312
-265
lines changed

2 files changed

+312
-265
lines changed

llama.cpp

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
89478947
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
89488948
std::vector<uint32_t> nfd_codepoints;
89498949
for (uint32_t code : codepoints) {
8950-
auto it = nfd_map.find(code);
8951-
if (it != nfd_map.end()) {
8952-
for (uint32_t c : it->second) {
8953-
nfd_codepoints.push_back(c);
8950+
auto it = nfd_map.equal_range(code);
8951+
if (it.first != it.second) {
8952+
for (auto jt = it.first; jt != it.second; jt++) {
8953+
nfd_codepoints.push_back(jt->second);
89548954
}
89558955
} else {
89568956
nfd_codepoints.push_back(code);
@@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
90019001
}
90029002

90039003
uint32_t to_lower(uint32_t code) {
9004+
static const std::locale locale("en_US.UTF-8");
90049005
#if defined(_WIN32)
90059006
if (code > 0xFFFF) {
90069007
return code;
90079008
}
90089009
#endif
9009-
return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
9010+
return std::tolower(wchar_t(code), locale);
90109011
}
90119012

90129013
bool is_ascii_punct(uint32_t code) {

0 commit comments

Comments
 (0)