Skip to content

Commit d6edc62

Browse files
author
Joan Martinez
committed
fix: add real values
1 parent 88e943f commit d6edc62

File tree

3 files changed

+7
-34
lines changed

3 files changed

+7
-34
lines changed

unicode-data.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,8 +1694,5 @@ const std::map<char32_t, char32_t> unicode_map_lowercase = {
16941694

16951695

16961696
const std::map<uint32_t, uint32_t> unicode_canonical_class = {
1697-
{65, 0}, // Example: Unicode point A has canonical class 0
1698-
{769, 1}, // Example: Combining acute accent has canonical class 1
1699-
{99, 0}, // Example: Unicode point c has canonical class 0
1700-
{807, 1} // Example: Combining cedilla has canonical class 1
1697+
{42613, 230}, {824, 1}, {811, 220}, {2292, 230}, {2030, 230}, {65065, 220}, {773, 230}, {784, 230}, {2281, 220}, {43233, 230}, {43244, 230}, {6964, 7}, {1434, 222}, {1752, 230}, {7646, 230}, {7657, 230}, {8401, 230}, {8412, 230}, {65057, 230}, {70197, 9}, {6841, 220}, {11764, 230}, {1847, 220}, {42654, 230}, {119143, 1}, {841, 220}, {43699, 230}, {68325, 230}, {1459, 13}, {7397, 1}, {92914, 1}, {7083, 9}, {1558, 230}, {69890, 230}, {119152, 216}, {7619, 230}, {1859, 230}, {1441, 230}, {1452, 230}, {1473, 24}, {42616, 230}, {2090, 230}, {2876, 7}, {792, 220}, {803, 220}, {795, 216}, {856, 232}, {1453, 222}, {42655, 230}, {1771, 230}, {1474, 25}, {122894, 230}, {776, 230}, {7626, 220}, {119212, 230}, {43204, 9}, {1559, 230}, {2303, 230}, {43309, 220}, {125253, 230}, {7638, 230}, {65060, 230}, {1469, 22}, {7387, 230}, {1627, 230}, {12330, 218}, {119174, 230}, {2265, 230}, {7019, 230}, {874, 230}, {6779, 230}, {7668, 230}, {122913, 230}, {3975, 230}, {11775, 230}, {1433, 230}, {2071, 230}, {2082, 230}, {2093, 230}, {6836, 230}, {43235, 230}, {43246, 230}, {836, 230}, {43766, 9}, {92980, 230}, {125136, 220}, {1458, 12}, {6680, 220}, {122886, 230}, {3963, 130}, {2765, 9}, {11748, 230}, {852, 220}, {2284, 230}, {1155, 230}, {2295, 230}, {2033, 230}, {65068, 220}, {787, 230}, {2139, 220}, {43236, 230}, {1619, 230}, {7629, 234}, {1850, 230}, {1861, 230}, {2381, 9}, {7022, 230}, {844, 230}, {855, 230}, {1755, 230}, {7649, 230}, {7660, 230}, {122916, 230}, {6459, 220}, {11756, 230}, {11767, 230}, {43443, 7}, {11505, 230}, {71104, 7}, {8431, 220}, {43014, 9}, {1425, 220}, {2492, 7}, {7622, 230}, {1455, 230}, {12334, 224}, {42619, 230}, {1613, 29}, {2276, 230}, {7142, 7}, {7154, 9}, {8423, 230}, {768, 230}, {779, 230}, {7417, 230}, {1858, 220}, {1763, 220}, {43710, 230}, {4237, 220}, {1842, 230}, {1444, 220}, {7641, 230}, {7652, 230}, {66045, 220}, {122897, 230}, {1630, 230}, {11759, 230}, {119177, 230}, {125138, 220}, {70090, 7}, {825, 220}, {125256, 230}, {1465, 19}, {1553, 
230}, {1843, 230}, {1436, 230}, {43247, 230}, {70512, 230}, {3158, 91}, {113822, 1}, {2085, 230}, {2620, 7}, {12442, 8}, {2268, 230}, {3768, 118}, {65071, 230}, {7024, 230}, {1158, 230}, {43713, 230}, {7400, 1}, {1612, 28}, {66424, 230}, {7633, 230}, {1454, 228}, {12333, 222}, {806, 220}, {817, 220}, {828, 220}, {2260, 230}, {7025, 230}, {122908, 230}, {122919, 230}, {3970, 230}, {11770, 230}, {2287, 220}, {1428, 230}, {1439, 230}, {70504, 230}, {70515, 230}, {7679, 220}, {2077, 230}, {42737, 230}, {64286, 26}, {119362, 230}, {122881, 230}, {71103, 9}, {2385, 230}, {3893, 220}, {2279, 230}, {2028, 230}, {877, 230}, {7390, 220}, {119166, 220}, {794, 232}, {782, 230}, {771, 230}, {8426, 1}, {1845, 230}, {1856, 230}, {3787, 122}, {850, 230}, {1447, 220}, {7644, 230}, {4957, 230}, {122889, 230}, {122900, 230}, {6839, 220}, {11751, 230}, {11762, 230}, {11773, 230}, {2288, 27}, {1556, 230}, {69888, 230}, {92978, 230}, {2298, 220}, {4154, 9}, {42614, 230}, {790, 220}, {4958, 230}, {2289, 28}, {2271, 230}, {869, 230}, {1477, 220}, {6774, 230}, {3658, 107}, {7382, 220}, {7663, 230}, {819, 220}, {774, 230}, {7412, 230}, {8407, 230}, {70080, 9}, {119150, 216}, {1864, 220}, {858, 220}, {43307, 220}, {831, 230}, {3864, 220}, {7625, 230}, {7636, 230}, {1865, 230}, {7647, 230}, {6842, 220}, {2748, 7}, {11754, 230}, {822, 1}, {11503, 230}, {4038, 220}, {65063, 220}, {122911, 230}, {6458, 230}, {1431, 230}, {43242, 230}, {70507, 230}, {119168, 220}, {119179, 220}, {6834, 230}, {1750, 230}, {7655, 230}, {7666, 230}, {8410, 1}, {2388, 230}, {839, 220}, {2282, 230}, {2031, 230}, {7395, 1}, {92912, 1}, {7617, 230}, {7628, 230}, {801, 202}, {1857, 230}, {7377, 230}, {8400, 230}, {812, 220}, {861, 234}, {1479, 18}, {842, 230}, {1450, 220}, {1460, 14}, {72767, 9}, {71231, 9}, {119144, 1}, {125142, 220}, {1471, 23}, {122892, 230}, {122903, 230}, {11765, 230}, {122883, 230}, {119210, 230}, {2301, 230}, {43347, 9}, {8429, 220}, {2072, 230}, {92981, 230}, {65058, 230}, {3964, 130}, {1625, 
230}, {70851, 7}, {2263, 230}, {872, 230}, {6777, 230}, {7385, 220}, {1761, 230}, {1772, 230}, {3897, 216}, {789, 232}, {8421, 1}, {119142, 216}, {66422, 230}, {70377, 7}, {119153, 216}, {122922, 230}, {2080, 230}, {2091, 230}, {1840, 230}, {6980, 9}, {12331, 228}, {834, 230}, {1442, 220}, {2364, 7}, {7639, 230}, {122884, 230}, {122895, 230}, {11746, 230}, {11757, 230}, {2293, 230}, {65066, 220}, {125254, 230}, {785, 230}, {860, 233}, {2137, 220}, {837, 240}, {3405, 9}, {43245, 230}, {1615, 31}, {2266, 230}, {3530, 9}, {70722, 9}, {7155, 9}, {70726, 7}, {1753, 230}, {1764, 230}, {7658, 230}, {814, 220}, {7669, 230}, {8402, 1}, {11647, 9}, {1848, 220}, {853, 220}, {7398, 1}, {92915, 1}, {7620, 230}, {6837, 220}, {42617, 230}, {804, 220}, {815, 220}, {69818, 7}, {8432, 230}, {777, 230}, {788, 230}, {43456, 9}, {2285, 220}, {119213, 230}, {1426, 230}, {43237, 230}, {70502, 230}, {119163, 220}, {5908, 9}, {7677, 220}, {1614, 30}, {1467, 20}, {864, 234}, {42618, 230}, {3656, 107}, {7650, 230}, {8405, 230}, {65061, 230}, {69939, 9}, {3784, 122}, {6845, 220}, {119175, 230}, {845, 220}, {2277, 230}, {43703, 230}, {875, 230}, {6780, 230}, {2290, 29}, {68153, 1}, {66425, 230}, {7676, 233}, {807, 202}, {12335, 224}, {2083, 230}, {796, 220}, {3785, 122}, {125137, 220}, {122898, 230}, {11749, 230}, {2296, 230}, {1156, 230}, {43711, 230}, {8424, 220}, {65069, 220}, {125257, 230}, {863, 233}, {70003, 7}, {43700, 220}, {43248, 230}, {1620, 230}, {8425, 230}, {826, 220}, {2269, 230}, {7023, 230}, {7631, 220}, {867, 230}, {878, 230}, {1756, 230}, {7661, 230}, {1466, 19}, {11768, 230}, {1851, 220}, {68154, 220}, {1437, 230}, {2034, 220}, {70513, 230}, {2075, 230}, {2086, 230}, {43239, 230}, {829, 230}, {7627, 230}, {7623, 230}, {122890, 230}, {42620, 230}, {820, 1}, {70850, 9}, {2299, 230}, {7388, 220}, {1628, 220}, {780, 230}, {866, 233}, {1429, 230}, {43240, 230}, {70505, 230}, {2261, 230}, {848, 230}, {859, 230}, {1445, 220}, {1759, 230}, {3659, 107}, {1560, 30}, {798, 220}, {809, 
220}, {3640, 103}, {7642, 230}, {7653, 230}, {3971, 230}, {7664, 230}, {8408, 1}, {11760, 230}, {11771, 230}, {122909, 230}, {125139, 220}, {70378, 9}, {1554, 230}, {92976, 230}, {1855, 230}, {1448, 230}, {69759, 9}, {42612, 230}, {799, 220}, {2280, 230}, {3769, 118}, {2029, 230}, {1767, 230}, {772, 230}, {783, 230}, {8427, 1}, {1862, 220}, {43232, 230}, {1622, 220}, {1555, 230}, {1159, 230}, {6783, 220}, {6679, 230}, {119169, 220}, {7634, 230}, {7645, 230}, {65056, 230}, {71351, 7}, {1623, 230}, {6840, 220}, {11752, 230}, {818, 220}, {7026, 230}, {870, 230}, {6775, 230}, {122920, 230}, {1461, 15}, {1440, 230}, {1451, 230}, {70516, 230}, {2078, 230}, {2089, 230}, {791, 220}, {1617, 33}, {3642, 9}, {6832, 230}, {6843, 230}, {44013, 9}, {832, 230}, {119363, 230}, {1462, 16}, {70477, 9}, {122882, 230}, {11744, 230}, {1854, 220}, {69702, 9}, {2291, 230}, {2302, 230}, {65064, 220}, {7391, 220}, {43308, 220}, {2386, 220}, {119178, 220}, {4151, 7}, {66272, 220}, {810, 220}, {1846, 230}, {1751, 230}, {7656, 230}, {6109, 230}, {122901, 230}, {122912, 230}, {3974, 230}, {11763, 230}, {11774, 230}, {1432, 230}, {70508, 230}, {2070, 230}, {2081, 230}, {6835, 230}, {7630, 214}, {3157, 84}, {68111, 230}, {43234, 230}, {1557, 230}, {69889, 230}, {92979, 230}, {3277, 9}, {1809, 36}, {3149, 9}, {3962, 130}, {42615, 230}, {840, 220}, {2272, 230}, {2283, 230}, {2032, 230}, {7396, 1}, {7383, 220}, {7675, 230}, {119170, 220}, {786, 230}, {775, 230}, {8428, 220}, {3972, 9}, {119151, 216}, {119211, 230}, {802, 202}, {862, 234}, {843, 230}, {68159, 9}, {7637, 230}, {7648, 230}, {793, 220}, {8403, 1}, {7386, 230}, {122893, 230}, {1626, 230}, {11755, 230}, {11766, 230}, {119173, 230}, {823, 1}, {11504, 230}, {125252, 230}, {43243, 230}, {42607, 230}, {2264, 230}, {873, 230}, {6778, 230}, {3895, 220}, {1762, 230}, {7667, 230}, {7678, 230}, {8411, 230}, {1468, 21}, {7416, 230}, {8422, 1}, {122885, 230}, {851, 220}, {43698, 230}, {7405, 220}, {92913, 1}, {1616, 32}, {7640, 230}, {7378, 230}, 
{11747, 230}, {813, 220}, {68152, 230}, {7021, 230}, {7618, 220}, {7380, 1}, {119145, 1}, {122904, 230}, {122915, 230}, {2138, 220}, {3953, 129}, {1770, 220}, {8430, 220}, {2073, 230}, {71467, 9}, {92982, 230}, {1754, 230}, {7659, 230}, {65059, 230}, {65070, 230}, {3954, 130}, {3965, 130}, {71350, 9}, {1849, 220}, {854, 220}, {2035, 230}, {7399, 1}, {12332, 232}, {778, 230}, {119154, 216}, {66423, 230}, {7621, 230}, {125258, 7}, {2092, 230}, {8404, 230}, {805, 220}, {835, 230}, {1443, 220}, {1456, 10}, {122896, 230}, {122907, 230}, {11758, 230}, {11769, 230}, {70503, 230}, {1773, 220}, {769, 230}, {65067, 220}, {1457, 11}, {7020, 220}, {125255, 230}, {1552, 230}, {2294, 220}, {69940, 9}, {1629, 230}, {2893, 9}, {119176, 230}, {68326, 220}, {2267, 230}, {2027, 230}, {876, 230}, {7389, 220}, {119165, 220}, {770, 230}, {1463, 17}, {1860, 220}, {92916, 1}, {12441, 8}, {1464, 18}, {838, 230}, {1435, 220}, {6313, 228}, {7643, 230}, {1476, 230}, {70198, 7}, {122888, 230}, {6838, 220}, {11750, 230}, {125140, 220}, {119149, 226}, {816, 220}, {5940, 9}, {2275, 220}, {2286, 220}, {1427, 230}, {1438, 230}, {43238, 230}, {43249, 230}, {70514, 230}, {119164, 220}, {42736, 230}, {1863, 230}, {865, 234}, {6752, 9}, {6773, 230}, {3657, 107}, {3021, 9}, {7651, 230}, {7662, 230}, {3968, 130}, {6457, 222}, {8406, 230}, {8417, 230}, {65062, 230}, {1841, 220}, {1852, 220}, {69817, 9}, {846, 220}, {857, 220}, {43704, 230}, {66426, 230}, {7624, 230}, {808, 202}, {1561, 31}, {1853, 230}, {42621, 230}, {821, 1}, {797, 220}, {3786, 122}, {849, 230}, {1446, 220}, {70460, 7}, {1562, 32}, {1648, 35}, {122899, 230}, {122910, 230}, {781, 230}, {11761, 230}, {2278, 220}, {70506, 230}, {1631, 220}, {1157, 230}, {119167, 220}, {6833, 230}, {7223, 7}, {92977, 230}, {2509, 9}, {4153, 9}, {7654, 230}, {7392, 230}, {2387, 230}, {7632, 202}, {827, 220}, {2270, 230}, {1611, 27}, {43696, 230}, {868, 230}, {879, 230}, {7381, 220}, {1768, 230}, {1621, 220}, {7394, 1}, {122918, 230}, {2297, 220}, {1449, 230}, 
{7082, 9}, {2076, 230}, {2087, 230}, {800, 220}, {6098, 9}, {830, 230}, {125141, 220}, {3260, 7}, {7635, 230}, {4959, 230}, {122880, 230}, {122891, 230}, {122902, 230}, {11753, 230}, {2300, 230}, {1618, 34}, {43241, 230}, {1624, 230}, {3865, 220}, {2637, 9}, {2262, 230}, {1866, 230}, {2273, 230}, {7027, 230}, {871, 230}, {6776, 230}, {7384, 220}, {1760, 230}, {7665, 230}, {3641, 103}, {8409, 1}, {122921, 230}, {119141, 216}, {11772, 230}, {1844, 220}, {2079, 230}, {6844, 230}, {119364, 230}, {833, 230}, {1430, 220}, {7616, 230}, {3956, 132}, {7376, 230}, {68109, 220}, {11745, 230}
17011698
};

unicode.cpp

Lines changed: 5 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <vector>
1414
#include <locale>
1515
#include <codecvt>
16+
#include <algorithm>
1617

1718
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
1819
std::string result;
@@ -470,9 +471,7 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
470471
}
471472

472473
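// Function to sort code points by canonical class.
// NOTE(review): the whole sequence is now sorted in one pass rather than one run of non-zero-class code points at a time — confirm this cannot reorder code points across base characters.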
473-
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts) {
474-
std::vector<uint32_t> subsequence;
475-
std::vector<uint32_t> result;
474+
std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts) {
476475
auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
477476
auto cc_a_it = unicode_canonical_class.find(a);
478477
if (cc_a_it != unicode_canonical_class.end()) {
@@ -485,33 +484,9 @@ std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts
485484
return false;
486485
};
487486

488-
for (const auto& cpt : cpts) {
489-
auto it = unicode_canonical_class.find(cpt);
490-
if (it != unicode_canonical_class.end()) {
491-
if (it->second > 0) {
492-
subsequence.push_back(cpt);
493-
} else {
494-
if (!subsequence.empty()) {
495-
sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
496-
for (const auto& codepoint : subsequence) {
497-
result.push_back(codepoint);
498-
}
499-
subsequence.clear();
500-
}
501-
502-
result.push_back(cpt);
503-
}
504-
}
505-
}
506-
507-
if (!subsequence.empty()) {
508-
sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
509-
for (const auto& codepoint : subsequence) {
510-
result.push_back(codepoint);
511-
}
512-
}
513-
514-
return result;
487+
// Sort the sequence using the custom comparator function
488+
sort(cpts.begin(), cpts.end(), compareByCanonicalClass);
489+
return cpts;
515490
}
516491

517492
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, const std::vector<uint32_t>::iterator& cpt_begin, const std::vector<uint32_t>::iterator& cpt_end) {

unicode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
1818

1919
std::vector<uint32_t> unicode_cpts_normalize_nfd(std::vector<uint32_t> & cpts);
2020
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, const std::vector<uint32_t>::iterator& cpt_begin, const std::vector<uint32_t>::iterator& cpt_end);
21+
// Sorts `cpts` by canonical class and returns the sorted sequence.
// The parameter is a non-const reference so that this declaration matches the
// definition in unicode.cpp, which sorts the caller's vector in place and then
// returns a copy of it. A `const &` declaration would name a different,
// never-defined overload.
std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts);
2122

2223
int unicode_cpt_type(uint32_t cp);
2324
int unicode_cpt_type(const std::string & utf8);

0 commit comments

Comments
 (0)