2 files changed: +312 / -265 lines.
Columns: original file line number | diff line number | diff line change
@@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
8947
8947
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
8948
8948
std::vector<uint32_t> nfd_codepoints;
8949
8949
for (uint32_t code : codepoints) {
8950
- auto it = nfd_map.find (code);
8951
- if (it != nfd_map.end() ) {
8952
- for (uint32_t c : it-> second) {
8953
- nfd_codepoints.push_back(c );
8950
+ auto it = nfd_map.equal_range (code);
8951
+ if (it.first != it.second ) {
8952
+ for (auto jt = it.first; jt != it. second; jt++ ) {
8953
+ nfd_codepoints.push_back(jt->second );
8954
8954
}
8955
8955
} else {
8956
8956
nfd_codepoints.push_back(code);
@@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
9001
9001
}
9002
9002
9003
9003
// Lower-cases a single Unicode code point.
//
// The conversion is delegated to std::tolower over wchar_t using the
// "en_US.UTF-8" locale. That locale is constructed once and cached, because
// building a std::locale by name is expensive. If the named locale is not
// installed on the host, constructing it throws; in that case the classic "C"
// locale is used instead, so the function never throws.
//
// @param code  code point to convert
// @return      the lower-case mapping reported by the selected locale, or
//              `code` unchanged when it cannot be represented (see below)
uint32_t to_lower(uint32_t code) {
    static const std::locale locale = []() -> std::locale {
        try {
            return std::locale("en_US.UTF-8");
        } catch (const std::exception &) {
            // Named locale unavailable: fall back instead of propagating the
            // exception out of every tokenizer call.
            return std::locale::classic();
        }
    }();
#if defined(_WIN32)
    // Code points above 0xFFFF are returned untouched on Windows rather than
    // being narrowed by the wchar_t cast below.
    if (code > 0xFFFF) {
        return code;
    }
#endif
    return std::tolower(wchar_t(code), locale);
}
9011
9012
9012
9013
bool is_ascii_punct(uint32_t code) {
You can’t perform that action at this time.
0 commit comments