@@ -470,47 +470,47 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
470
470
throw std::invalid_argument (" invalid codepoint" );
471
471
}
472
472
473
- // Function to sort subsequences based on canonical class
474
- std::vector<uint32_t > sort_by_canonical_class (std::vector<uint32_t > & cpts) {
475
- auto compareByCanonicalClass = [&](const uint32_t & a, const uint32_t & b) {
476
- auto cc_a_it = unicode_canonical_class.find (a);
477
- if (cc_a_it != unicode_canonical_class.end ()) {
478
- auto cc_b_it = unicode_canonical_class.find (b);
479
- if (cc_b_it != unicode_canonical_class.end ()) {
480
- return cc_a_it->second < cc_b_it->second ;
481
- }
482
-
473
+ auto compareByCanonicalClass = [&](const uint32_t & a, const uint32_t & b) {
474
+ auto cc_a_it = unicode_canonical_class.find (a);
475
+ if (cc_a_it != unicode_canonical_class.end ()) {
476
+ auto cc_b_it = unicode_canonical_class.find (b);
477
+ if (cc_b_it != unicode_canonical_class.end ()) {
478
+ return cc_a_it->second < cc_b_it->second ;
483
479
}
484
- return false ;
485
- };
486
480
481
+ }
482
+ return false ;
483
+ };
484
+
485
+ // Function to sort subsequences based on canonical class
486
+ std::vector<uint32_t > sort_by_canonical_class (std::vector<uint32_t > & cpts) {
487
487
// Sort the sequence using the custom comparator function
488
488
sort (cpts.begin (), cpts.end (), compareByCanonicalClass);
489
489
return cpts;
490
490
}
491
491
492
- std::vector<uint32_t > canonical_decomposition_cpts (std::vector<uint32_t > & cpts, const std::vector< uint32_t >::iterator& cpt_begin, const std::vector< uint32_t >::iterator& cpt_end ) {
492
+ std::vector<uint32_t > canonical_decomposition_cpts (std::vector<uint32_t > & cpts, uint32_t starting_offset ) {
493
493
std::vector<uint32_t > result;
494
- for (auto cpt_it = cpt_begin; cpt_it != cpt_end; ++cpt_it ) {
495
- auto it = unicode_map_nfd.equal_range (*cpt_it );
494
+ for (auto i = starting_offset; i < cpts. size (); i++ ) {
495
+ auto it = unicode_map_nfd.equal_range (cpts[i] );
496
496
if (it.first != it.second ) {
497
497
uint offset = 0 ;
498
498
for (auto jt = it.first ; jt != it.second ; jt++) {
499
- cpts.insert (cpt_it + offset, jt->second );
499
+ cpts.emplace (cpts. begin () + i + offset, jt->second );
500
500
offset++;
501
501
}
502
- const auto & inner_result = canonical_decomposition_cpts (cpts, cpt_it, cpt_end );
502
+ const auto & inner_result = canonical_decomposition_cpts (cpts, i );
503
503
result.insert (result.end (), inner_result.begin (), inner_result.end ());
504
504
break ;
505
505
} else {
506
- result.push_back (*cpt_it );
506
+ result.push_back (cpts[i] );
507
507
}
508
508
}
509
509
return result;
510
510
}
511
511
512
512
std::vector<uint32_t > unicode_cpts_normalize_nfd (std::vector<uint32_t > & cpts) {
513
- auto result = canonical_decomposition_cpts (cpts, cpts. begin (), cpts. end () );
513
+ auto result = canonical_decomposition_cpts (cpts, 0 );
514
514
return sort_by_canonical_class (result);
515
515
}
516
516
0 commit comments