Skip to content

Commit 668e0d9

Browse files
author
Joan Martinez
committed
feat: remove extra complexity in NFD
1 parent 043f298 commit 668e0d9

File tree

6 files changed

+15
-51
lines changed

6 files changed

+15
-51
lines changed

examples/server/tests/features/embeddings.feature

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,17 @@ Feature: llama.cpp server
1616
Then the server is starting
1717
Then the server is healthy
1818

19+
Scenario: Embedding
20+
When embeddings are computed for:
21+
"""
22+
What is the capital of Bulgaria ?
23+
"""
24+
Then embeddings are generated
25+
1926
Scenario: Tokenize / Detokenize complex
2027
When tokenizing:
2128
"""
22-
España is your's mine's l'heure èspciâl café über naïve résumé cañón élite cañas Barça 例子 東京 こんにちは 你好 中国
29+
北京的清晨,空氣清新而寧靜,一个年轻的旅行者在长城上漫步,他从自己的故乡—서울에서 출발하여 아시아의 다양한 문화를 탐험하고자 하는 꿈을 품고 떠났다。彼は日本の古都、京都を訪れ、そこで美しい桜の花が満開の下で古典音楽のコンサートに参加しました。祭りの夜、彼は色とりどりの灯籠が空に浮かぶのを見て、その美しさに感動しました。その後、彼は印度のバラナシに到着し、गंगा की घाटों पर आध्यात्मिक शांति की खोज में जुट गया। वहाँ उसने दिवाली के उत्सव में हिस्सा लिया, जहां लाखों दीये जलाकर समृद्धि और खुशहाली की कामना की गई थी।この旅は彼にとって非常に啓発的であり、多くの異なる文化から新しいことを学び、新しい友達を作る機会を与えました。彼はこの経験を通じて、 異なる文化の間の共通点と相違点を理解するようになりました。España is your's mine's l'heure èspciâl café über naïve résumé cañón élite cañas Barça 例子 東京 こんにちは 你好 中国
2330
"""
2431
Then tokens can be detokenize and is equivalent False
2532

llama.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12456,8 +12456,7 @@ struct llm_tokenizer_wpm {
1245612456
}
1245712457

1245812458
std::vector<std::string> preprocess(const std::string & text) {
12459-
auto unicode_cpts = unicode_cpts_from_utf8(text);
12460-
std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts);
12459+
std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
1246112460

1246212461
// strip accents, strip control, uniformize whitespace,
1246312462
// to lowercase, pad chinese characters, pad punctuation

unicode-data.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1691,8 +1691,3 @@ const std::map<char32_t, char32_t> unicode_map_lowercase = {
16911691
{0x1E917, 0x1E939}, {0x1E918, 0x1E93A}, {0x1E919, 0x1E93B}, {0x1E91A, 0x1E93C}, {0x1E91B, 0x1E93D}, {0x1E91C, 0x1E93E},
16921692
{0x1E91D, 0x1E93F}, {0x1E91E, 0x1E940}, {0x1E91F, 0x1E941}, {0x1E920, 0x1E942}, {0x1E921, 0x1E943},
16931693
};
1694-
1695-
1696-
const std::map<uint32_t, uint32_t> unicode_canonical_class = {
1697-
{42613, 230}, {824, 1}, {811, 220}, {2292, 230}, {2030, 230}, {65065, 220}, {773, 230}, {784, 230}, {2281, 220}, {43233, 230}, {43244, 230}, {6964, 7}, {1434, 222}, {1752, 230}, {7646, 230}, {7657, 230}, {8401, 230}, {8412, 230}, {65057, 230}, {70197, 9}, {6841, 220}, {11764, 230}, {1847, 220}, {42654, 230}, {119143, 1}, {841, 220}, {43699, 230}, {68325, 230}, {1459, 13}, {7397, 1}, {92914, 1}, {7083, 9}, {1558, 230}, {69890, 230}, {119152, 216}, {7619, 230}, {1859, 230}, {1441, 230}, {1452, 230}, {1473, 24}, {42616, 230}, {2090, 230}, {2876, 7}, {792, 220}, {803, 220}, {795, 216}, {856, 232}, {1453, 222}, {42655, 230}, {1771, 230}, {1474, 25}, {122894, 230}, {776, 230}, {7626, 220}, {119212, 230}, {43204, 9}, {1559, 230}, {2303, 230}, {43309, 220}, {125253, 230}, {7638, 230}, {65060, 230}, {1469, 22}, {7387, 230}, {1627, 230}, {12330, 218}, {119174, 230}, {2265, 230}, {7019, 230}, {874, 230}, {6779, 230}, {7668, 230}, {122913, 230}, {3975, 230}, {11775, 230}, {1433, 230}, {2071, 230}, {2082, 230}, {2093, 230}, {6836, 230}, {43235, 230}, {43246, 230}, {836, 230}, {43766, 9}, {92980, 230}, {125136, 220}, {1458, 12}, {6680, 220}, {122886, 230}, {3963, 130}, {2765, 9}, {11748, 230}, {852, 220}, {2284, 230}, {1155, 230}, {2295, 230}, {2033, 230}, {65068, 220}, {787, 230}, {2139, 220}, {43236, 230}, {1619, 230}, {7629, 234}, {1850, 230}, {1861, 230}, {2381, 9}, {7022, 230}, {844, 230}, {855, 230}, {1755, 230}, {7649, 230}, {7660, 230}, {122916, 230}, {6459, 220}, {11756, 230}, {11767, 230}, {43443, 7}, {11505, 230}, {71104, 7}, {8431, 220}, {43014, 9}, {1425, 220}, {2492, 7}, {7622, 230}, {1455, 230}, {12334, 224}, {42619, 230}, {1613, 29}, {2276, 230}, {7142, 7}, {7154, 9}, {8423, 230}, {768, 230}, {779, 230}, {7417, 230}, {1858, 220}, {1763, 220}, {43710, 230}, {4237, 220}, {1842, 230}, {1444, 220}, {7641, 230}, {7652, 230}, {66045, 220}, {122897, 230}, {1630, 230}, {11759, 230}, {119177, 230}, {125138, 220}, {70090, 7}, {825, 220}, {125256, 230}, {1465, 19}, {1553, 230}, {1843, 230}, {1436, 230}, {43247, 230}, {70512, 230}, {3158, 91}, {113822, 1}, {2085, 230}, {2620, 7}, {12442, 8}, {2268, 230}, {3768, 118}, {65071, 230}, {7024, 230}, {1158, 230}, {43713, 230}, {7400, 1}, {1612, 28}, {66424, 230}, {7633, 230}, {1454, 228}, {12333, 222}, {806, 220}, {817, 220}, {828, 220}, {2260, 230}, {7025, 230}, {122908, 230}, {122919, 230}, {3970, 230}, {11770, 230}, {2287, 220}, {1428, 230}, {1439, 230}, {70504, 230}, {70515, 230}, {7679, 220}, {2077, 230}, {42737, 230}, {64286, 26}, {119362, 230}, {122881, 230}, {71103, 9}, {2385, 230}, {3893, 220}, {2279, 230}, {2028, 230}, {877, 230}, {7390, 220}, {119166, 220}, {794, 232}, {782, 230}, {771, 230}, {8426, 1}, {1845, 230}, {1856, 230}, {3787, 122}, {850, 230}, {1447, 220}, {7644, 230}, {4957, 230}, {122889, 230}, {122900, 230}, {6839, 220}, {11751, 230}, {11762, 230}, {11773, 230}, {2288, 27}, {1556, 230}, {69888, 230}, {92978, 230}, {2298, 220}, {4154, 9}, {42614, 230}, {790, 220}, {4958, 230}, {2289, 28}, {2271, 230}, {869, 230}, {1477, 220}, {6774, 230}, {3658, 107}, {7382, 220}, {7663, 230}, {819, 220}, {774, 230}, {7412, 230}, {8407, 230}, {70080, 9}, {119150, 216}, {1864, 220}, {858, 220}, {43307, 220}, {831, 230}, {3864, 220}, {7625, 230}, {7636, 230}, {1865, 230}, {7647, 230}, {6842, 220}, {2748, 7}, {11754, 230}, {822, 1}, {11503, 230}, {4038, 220}, {65063, 220}, {122911, 230}, {6458, 230}, {1431, 230}, {43242, 230}, {70507, 230}, {119168, 220}, {119179, 220}, {6834, 230}, {1750, 230}, {7655, 230}, {7666, 230}, {8410, 1}, {2388, 230}, {839, 220}, {2282, 230}, {2031, 230}, {7395, 1}, {92912, 1}, {7617, 230}, {7628, 230}, {801, 202}, {1857, 230}, {7377, 230}, {8400, 230}, {812, 220}, {861, 234}, {1479, 18}, {842, 230}, {1450, 220}, {1460, 14}, {72767, 9}, {71231, 9}, {119144, 1}, {125142, 220}, {1471, 23}, {122892, 230}, {122903, 230}, {11765, 230}, {122883, 230}, {119210, 230}, {2301, 230}, {43347, 9}, {8429, 220}, {2072, 230}, {92981, 230}, {65058, 230}, {3964, 130}, {1625, 230}, {70851, 7}, {2263, 230}, {872, 230}, {6777, 230}, {7385, 220}, {1761, 230}, {1772, 230}, {3897, 216}, {789, 232}, {8421, 1}, {119142, 216}, {66422, 230}, {70377, 7}, {119153, 216}, {122922, 230}, {2080, 230}, {2091, 230}, {1840, 230}, {6980, 9}, {12331, 228}, {834, 230}, {1442, 220}, {2364, 7}, {7639, 230}, {122884, 230}, {122895, 230}, {11746, 230}, {11757, 230}, {2293, 230}, {65066, 220}, {125254, 230}, {785, 230}, {860, 233}, {2137, 220}, {837, 240}, {3405, 9}, {43245, 230}, {1615, 31}, {2266, 230}, {3530, 9}, {70722, 9}, {7155, 9}, {70726, 7}, {1753, 230}, {1764, 230}, {7658, 230}, {814, 220}, {7669, 230}, {8402, 1}, {11647, 9}, {1848, 220}, {853, 220}, {7398, 1}, {92915, 1}, {7620, 230}, {6837, 220}, {42617, 230}, {804, 220}, {815, 220}, {69818, 7}, {8432, 230}, {777, 230}, {788, 230}, {43456, 9}, {2285, 220}, {119213, 230}, {1426, 230}, {43237, 230}, {70502, 230}, {119163, 220}, {5908, 9}, {7677, 220}, {1614, 30}, {1467, 20}, {864, 234}, {42618, 230}, {3656, 107}, {7650, 230}, {8405, 230}, {65061, 230}, {69939, 9}, {3784, 122}, {6845, 220}, {119175, 230}, {845, 220}, {2277, 230}, {43703, 230}, {875, 230}, {6780, 230}, {2290, 29}, {68153, 1}, {66425, 230}, {7676, 233}, {807, 202}, {12335, 224}, {2083, 230}, {796, 220}, {3785, 122}, {125137, 220}, {122898, 230}, {11749, 230}, {2296, 230}, {1156, 230}, {43711, 230}, {8424, 220}, {65069, 220}, {125257, 230}, {863, 233}, {70003, 7}, {43700, 220}, {43248, 230}, {1620, 230}, {8425, 230}, {826, 220}, {2269, 230}, {7023, 230}, {7631, 220}, {867, 230}, {878, 230}, {1756, 230}, {7661, 230}, {1466, 19}, {11768, 230}, {1851, 220}, {68154, 220}, {1437, 230}, {2034, 220}, {70513, 230}, {2075, 230}, {2086, 230}, {43239, 230}, {829, 230}, {7627, 230}, {7623, 230}, {122890, 230}, {42620, 230}, {820, 1}, {70850, 9}, {2299, 230}, {7388, 220}, {1628, 220}, {780, 230}, {866, 233}, {1429, 230}, {43240, 230}, {70505, 230}, {2261, 230}, {848, 230}, {859, 230}, {1445, 220}, {1759, 230}, {3659, 107}, {1560, 30}, {798, 220}, {809, 220}, {3640, 103}, {7642, 230}, {7653, 230}, {3971, 230}, {7664, 230}, {8408, 1}, {11760, 230}, {11771, 230}, {122909, 230}, {125139, 220}, {70378, 9}, {1554, 230}, {92976, 230}, {1855, 230}, {1448, 230}, {69759, 9}, {42612, 230}, {799, 220}, {2280, 230}, {3769, 118}, {2029, 230}, {1767, 230}, {772, 230}, {783, 230}, {8427, 1}, {1862, 220}, {43232, 230}, {1622, 220}, {1555, 230}, {1159, 230}, {6783, 220}, {6679, 230}, {119169, 220}, {7634, 230}, {7645, 230}, {65056, 230}, {71351, 7}, {1623, 230}, {6840, 220}, {11752, 230}, {818, 220}, {7026, 230}, {870, 230}, {6775, 230}, {122920, 230}, {1461, 15}, {1440, 230}, {1451, 230}, {70516, 230}, {2078, 230}, {2089, 230}, {791, 220}, {1617, 33}, {3642, 9}, {6832, 230}, {6843, 230}, {44013, 9}, {832, 230}, {119363, 230}, {1462, 16}, {70477, 9}, {122882, 230}, {11744, 230}, {1854, 220}, {69702, 9}, {2291, 230}, {2302, 230}, {65064, 220}, {7391, 220}, {43308, 220}, {2386, 220}, {119178, 220}, {4151, 7}, {66272, 220}, {810, 220}, {1846, 230}, {1751, 230}, {7656, 230}, {6109, 230}, {122901, 230}, {122912, 230}, {3974, 230}, {11763, 230}, {11774, 230}, {1432, 230}, {70508, 230}, {2070, 230}, {2081, 230}, {6835, 230}, {7630, 214}, {3157, 84}, {68111, 230}, {43234, 230}, {1557, 230}, {69889, 230}, {92979, 230}, {3277, 9}, {1809, 36}, {3149, 9}, {3962, 130}, {42615, 230}, {840, 220}, {2272, 230}, {2283, 230}, {2032, 230}, {7396, 1}, {7383, 220}, {7675, 230}, {119170, 220}, {786, 230}, {775, 230}, {8428, 220}, {3972, 9}, {119151, 216}, {119211, 230}, {802, 202}, {862, 234}, {843, 230}, {68159, 9}, {7637, 230}, {7648, 230}, {793, 220}, {8403, 1}, {7386, 230}, {122893, 230}, {1626, 230}, {11755, 230}, {11766, 230}, {119173, 230}, {823, 1}, {11504, 230}, {125252, 230}, {43243, 230}, {42607, 230}, {2264, 230}, {873, 230}, {6778, 230}, {3895, 220}, {1762, 230}, {7667, 230}, {7678, 230}, {8411, 230}, {1468, 21}, {7416, 230}, {8422, 1}, {122885, 230}, {851, 220}, {43698, 230}, {7405, 220}, {92913, 1}, {1616, 32}, {7640, 230}, {7378, 230}, {11747, 230}, {813, 220}, {68152, 230}, {7021, 230}, {7618, 220}, {7380, 1}, {119145, 1}, {122904, 230}, {122915, 230}, {2138, 220}, {3953, 129}, {1770, 220}, {8430, 220}, {2073, 230}, {71467, 9}, {92982, 230}, {1754, 230}, {7659, 230}, {65059, 230}, {65070, 230}, {3954, 130}, {3965, 130}, {71350, 9}, {1849, 220}, {854, 220}, {2035, 230}, {7399, 1}, {12332, 232}, {778, 230}, {119154, 216}, {66423, 230}, {7621, 230}, {125258, 7}, {2092, 230}, {8404, 230}, {805, 220}, {835, 230}, {1443, 220}, {1456, 10}, {122896, 230}, {122907, 230}, {11758, 230}, {11769, 230}, {70503, 230}, {1773, 220}, {769, 230}, {65067, 220}, {1457, 11}, {7020, 220}, {125255, 230}, {1552, 230}, {2294, 220}, {69940, 9}, {1629, 230}, {2893, 9}, {119176, 230}, {68326, 220}, {2267, 230}, {2027, 230}, {876, 230}, {7389, 220}, {119165, 220}, {770, 230}, {1463, 17}, {1860, 220}, {92916, 1}, {12441, 8}, {1464, 18}, {838, 230}, {1435, 220}, {6313, 228}, {7643, 230}, {1476, 230}, {70198, 7}, {122888, 230}, {6838, 220}, {11750, 230}, {125140, 220}, {119149, 226}, {816, 220}, {5940, 9}, {2275, 220}, {2286, 220}, {1427, 230}, {1438, 230}, {43238, 230}, {43249, 230}, {70514, 230}, {119164, 220}, {42736, 230}, {1863, 230}, {865, 234}, {6752, 9}, {6773, 230}, {3657, 107}, {3021, 9}, {7651, 230}, {7662, 230}, {3968, 130}, {6457, 222}, {8406, 230}, {8417, 230}, {65062, 230}, {1841, 220}, {1852, 220}, {69817, 9}, {846, 220}, {857, 220}, {43704, 230}, {66426, 230}, {7624, 230}, {808, 202}, {1561, 31}, {1853, 230}, {42621, 230}, {821, 1}, {797, 220}, {3786, 122}, {849, 230}, {1446, 220}, {70460, 7}, {1562, 32}, {1648, 35}, {122899, 230}, {122910, 230}, {781, 230}, {11761, 230}, {2278, 220}, {70506, 230}, {1631, 220}, {1157, 230}, {119167, 220}, {6833, 230}, {7223, 7}, {92977, 230}, {2509, 9}, {4153, 9}, {7654, 230}, {7392, 230}, {2387, 230}, {7632, 202}, {827, 220}, {2270, 230}, {1611, 27}, {43696, 230}, {868, 230}, {879, 230}, {7381, 220}, {1768, 230}, {1621, 220}, {7394, 1}, {122918, 230}, {2297, 220}, {1449, 230}, {7082, 9}, {2076, 230}, {2087, 230}, {800, 220}, {6098, 9}, {830, 230}, {125141, 220}, {3260, 7}, {7635, 230}, {4959, 230}, {122880, 230}, {122891, 230}, {122902, 230}, {11753, 230}, {2300, 230}, {1618, 34}, {43241, 230}, {1624, 230}, {3865, 220}, {2637, 9}, {2262, 230}, {1866, 230}, {2273, 230}, {7027, 230}, {871, 230}, {6776, 230}, {7384, 220}, {1760, 230}, {7665, 230}, {3641, 103}, {8409, 1}, {122921, 230}, {119141, 216}, {11772, 230}, {1844, 220}, {2079, 230}, {6844, 230}, {119364, 230}, {833, 230}, {1430, 220}, {7616, 230}, {3956, 132}, {7376, 230}, {68109, 220}, {11745, 230}
1698-
};

unicode-data.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,3 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
1414
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
1515
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
1616
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
17-
extern const std::map<uint32_t, uint32_t> unicode_canonical_class;

unicode.cpp

Lines changed: 5 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
#include <vector>
1414
#include <locale>
1515
#include <codecvt>
16-
#include <algorithm>
1716

1817
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
1918
std::string result;
@@ -470,54 +469,21 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
470469
throw std::invalid_argument("invalid codepoint");
471470
}
472471

473-
auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
474-
auto cc_a_it = unicode_canonical_class.find(a);
475-
if (cc_a_it != unicode_canonical_class.end()) {
476-
auto cc_b_it = unicode_canonical_class.find(b);
477-
if (cc_b_it != unicode_canonical_class.end()) {
478-
return cc_a_it->second < cc_b_it->second;
479-
}
480-
481-
}
482-
return false;
483-
};
484-
485-
// Function to sort subsequences based on canonical class
486-
std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts) {
487-
// Sort the sequence using the custom comparator function
488-
sort(cpts.begin(), cpts.end(), compareByCanonicalClass);
489-
return cpts;
490-
}
491-
492-
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, uint32_t starting_offset) {
472+
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
493473
std::vector<uint32_t> result;
494-
for (auto i = starting_offset; i < cpts.size(); i++) {
495-
const auto& it = unicode_map_nfd.equal_range(cpts[i]);
474+
for (uint32_t cpt : cpts) {
475+
auto it = unicode_map_nfd.equal_range(cpt);
496476
if (it.first != it.second) {
497-
uint offset = 0;
498477
for (auto jt = it.first; jt != it.second; jt++) {
499-
if (offset == 0) {
500-
cpts[i] = jt->second;
501-
} else {
502-
cpts.emplace(cpts.begin() + i + offset, jt->second);
503-
}
504-
offset++;
478+
result.push_back(jt->second);
505479
}
506-
const auto & inner_result = canonical_decomposition_cpts(cpts, i);
507-
result.insert(result.end(), inner_result.begin(), inner_result.end());
508-
break;
509480
} else {
510-
result.push_back(cpts[i]);
481+
result.push_back(cpt);
511482
}
512483
}
513484
return result;
514485
}
515486

516-
std::vector<uint32_t> unicode_cpts_normalize_nfd(std::vector<uint32_t> & cpts) {
517-
auto result = canonical_decomposition_cpts(cpts, 0);
518-
return sort_by_canonical_class(result);
519-
}
520-
521487
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
522488
std::vector<uint32_t> result;
523489
size_t offset = 0;

unicode.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
std::string unicode_cpt_to_utf8(uint32_t cp);
1717
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
1818

19-
std::vector<uint32_t> unicode_cpts_normalize_nfd(std::vector<uint32_t> & cpts);
20-
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, uint32_t starting_offset);
21-
std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts);
19+
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
2220

2321
int unicode_cpt_type(uint32_t cp);
2422
int unicode_cpt_type(const std::string & utf8);

0 commit comments

Comments
 (0)