Skip to content

Commit eb5a0e1

Browse files
author
Joan Martinez
committed
fix: do not insert in the middle of iteration
1 parent d6edc62 commit eb5a0e1

File tree

2 files changed

+21
-21
lines changed

2 files changed

+21
-21
lines changed

unicode.cpp

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -470,47 +470,47 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
470470
throw std::invalid_argument("invalid codepoint");
471471
}
472472

473-
// Function to sort subsequences based on canonical class
474-
std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts) {
475-
auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
476-
auto cc_a_it = unicode_canonical_class.find(a);
477-
if (cc_a_it != unicode_canonical_class.end()) {
478-
auto cc_b_it = unicode_canonical_class.find(b);
479-
if (cc_b_it != unicode_canonical_class.end()) {
480-
return cc_a_it->second < cc_b_it->second;
481-
}
482-
473+
auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
474+
auto cc_a_it = unicode_canonical_class.find(a);
475+
if (cc_a_it != unicode_canonical_class.end()) {
476+
auto cc_b_it = unicode_canonical_class.find(b);
477+
if (cc_b_it != unicode_canonical_class.end()) {
478+
return cc_a_it->second < cc_b_it->second;
483479
}
484-
return false;
485-
};
486480

481+
}
482+
return false;
483+
};
484+
485+
// Function to sort subsequences based on canonical class
486+
std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts) {
487487
// Sort the sequence using the custom comparator function
488488
sort(cpts.begin(), cpts.end(), compareByCanonicalClass);
489489
return cpts;
490490
}
491491

492-
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, const std::vector<uint32_t>::iterator& cpt_begin, const std::vector<uint32_t>::iterator& cpt_end) {
492+
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, uint32_t starting_offset) {
493493
std::vector<uint32_t> result;
494-
for (auto cpt_it = cpt_begin; cpt_it != cpt_end; ++cpt_it) {
495-
auto it = unicode_map_nfd.equal_range(*cpt_it);
494+
for (auto i = starting_offset; i < cpts.size(); i++) {
495+
auto it = unicode_map_nfd.equal_range(cpts[i]);
496496
if (it.first != it.second) {
497497
uint offset = 0;
498498
for (auto jt = it.first; jt != it.second; jt++) {
499-
cpts.insert(cpt_it + offset, jt->second);
499+
cpts.emplace(cpts.begin() + i + offset, jt->second);
500500
offset++;
501501
}
502-
const auto & inner_result = canonical_decomposition_cpts(cpts, cpt_it, cpt_end);
502+
const auto & inner_result = canonical_decomposition_cpts(cpts, i);
503503
result.insert(result.end(), inner_result.begin(), inner_result.end());
504504
break;
505505
} else {
506-
result.push_back(*cpt_it);
506+
result.push_back(cpts[i]);
507507
}
508508
}
509509
return result;
510510
}
511511

512512
std::vector<uint32_t> unicode_cpts_normalize_nfd(std::vector<uint32_t> & cpts) {
513-
auto result = canonical_decomposition_cpts(cpts, cpts.begin(), cpts.end());
513+
auto result = canonical_decomposition_cpts(cpts, 0);
514514
return sort_by_canonical_class(result);
515515
}
516516

unicode.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ std::string unicode_cpt_to_utf8(uint32_t cp);
1717
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
1818

1919
std::vector<uint32_t> unicode_cpts_normalize_nfd(std::vector<uint32_t> & cpts);
20-
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, const std::vector<uint32_t>::iterator& cpt_begin, const std::vector<uint32_t>::iterator& cpt_end);
21-
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts);
20+
std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, uint32_t starting_offset);
21+
std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts);
2222

2323
int unicode_cpt_type(uint32_t cp);
2424
int unicode_cpt_type(const std::string & utf8);

0 commit comments

Comments
 (0)