From 0c4d7b380bf4fab7e5f7d2639b9ab5c3d5534a8e Mon Sep 17 00:00:00 2001 From: "Lasse R.H. Nielsen" Date: Mon, 26 May 2025 14:54:48 +0200 Subject: [PATCH 1/6] Optimize surrogate decoding. Use `char ^ 0xD800 <= 0x3FF` to check if a char code is a lead surrogate. That avoids doing a later `& 0x3FF` to get rid of the top bits. Similar for tail surrogate. This ensures that the `high` function gets values without high bits. Also optimize that function to reduce dependency depth and try to hit `base + (something < small)` expressions that can be optimized into a single x64 address computation. Gives a ~7% increase on backwards traversal and 38% increase for forward traversal, based on tool/benchmark.dart compiled with `dart compile exe`. --- pkgs/characters/CHANGELOG.md | 5 +- pkgs/characters/lib/src/characters_impl.dart | 7 +-- .../lib/src/grapheme_clusters/breaks.dart | 53 ++++++++++--------- .../lib/src/grapheme_clusters/table.dart | 8 +-- pkgs/characters/pubspec.yaml | 2 +- pkgs/characters/test/src/unicode_tests.dart | 2 +- pkgs/characters/tool/bin/generate_tables.dart | 24 ++++++--- .../tool/src/string_literal_writer.dart | 20 +++---- 8 files changed, 71 insertions(+), 50 deletions(-) diff --git a/pkgs/characters/CHANGELOG.md b/pkgs/characters/CHANGELOG.md index 199244602..f8782bf82 100644 --- a/pkgs/characters/CHANGELOG.md +++ b/pkgs/characters/CHANGELOG.md @@ -1,6 +1,7 @@ -## 1.4.1-wip +## 1.4.1 -- Run `dart format` with the new style. +* Run `dart format` with the new style. +* Performance improvement for non-BMP characters. 
## 1.4.0 diff --git a/pkgs/characters/lib/src/characters_impl.dart b/pkgs/characters/lib/src/characters_impl.dart index e8694339f..47578a987 100644 --- a/pkgs/characters/lib/src/characters_impl.dart +++ b/pkgs/characters/lib/src/characters_impl.dart @@ -509,13 +509,14 @@ class StringCharacterRange implements CharacterRange { var index = _end; while (index < _string.length) { var char = _string.codeUnitAt(index); + var surrogateBits = char ^ 0xD800; var category = categoryControl; var nextIndex = index + 1; - if (char & 0xFC00 != 0xD800) { + if (surrogateBits >= 0x3FF) { category = low(char); } else if (nextIndex < _string.length) { - var nextChar = _string.codeUnitAt(nextIndex); - if (nextChar & 0xFC00 == 0xDC00) { + var nextChar = _string.codeUnitAt(nextIndex) ^ 0xDC00; + if (nextChar <= 0x3FF) { nextIndex += 1; category = high(char, nextChar); } diff --git a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart index 4e8c9e8d6..ff767b36e 100644 --- a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart +++ b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart @@ -76,16 +76,17 @@ class Breaks { void step() { assert(cursor < end); var char = base.codeUnitAt(cursor++); - if (char & 0xFC00 != 0xD800) { + var surrogate = char ^ 0xD800; + if (surrogate >= 0x3FF) { state = move(state, low(char)); return; } // The category of an unpaired lead surrogate is Control. 
int category; - int nextChar; + int nextSurrogate; if (cursor < end && - (nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) { - category = high(char, nextChar); + (nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { + category = high(surrogate, nextSurrogate); cursor++; } else { category = categoryControl; @@ -112,27 +113,28 @@ class Breaks { } var cursorBefore = cursor - 1; var prevChar = base.codeUnitAt(cursorBefore); + var prevSurrogate = prevChar ^ 0xD800; int prevCategory; - if (prevChar & 0xF800 != 0xD800) { + if (prevSurrogate > 0x7FF) { // Not surrogate. prevCategory = low(prevChar); - } else if (prevChar & 0xFC00 == 0xD800) { + } else if (prevSurrogate <= 0x3FF) { // Lead surrogate. Check for a following tail surrogate. - int tailChar; + int tailSurrogate; if (cursor < end && - (tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) { + (tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { cursor += 1; - prevCategory = high(prevChar, tailChar); + prevCategory = high(prevSurrogate, tailSurrogate); } else { prevCategory = categoryControl; } } else { // Tail surrogate, check for prior lead surrogate. - int leadChar; + int leadSurrogate; var leadIndex = cursorBefore - 1; if (leadIndex >= start && - (leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) { - prevCategory = high(leadChar, prevChar); + (leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) { + prevCategory = high(leadSurrogate, prevSurrogate); cursorBefore = leadIndex; } else { prevCategory = categoryControl; @@ -206,7 +208,8 @@ class BackBreaks { void step() { assert(cursor > start); var char = base.codeUnitAt(--cursor); - if (char & 0xFC00 != 0xDC00) { + var surrogate = char ^ 0xDC00; + if (surrogate > 0x3FF) { var category = low(char); state = moveBack(state, category); return; @@ -214,10 +217,10 @@ class BackBreaks { // Found tail surrogate, check for prior lead surrogate. // The category of an unpaired tail surrogate is Control. 
int category; - int prevChar; + int prevSurrogate; if (cursor >= start && - (prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) { - category = high(prevChar, char); + (prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) { + category = high(prevSurrogate, surrogate); } else { category = categoryControl; cursor++; @@ -342,21 +345,23 @@ int previousBreak(String text, int start, int end, int index) { if (start < index && index < end) { var cursorBefore = index; var nextChar = text.codeUnitAt(index); + var nextSurrogate = nextChar ^ 0xD800; var category = categoryControl; - if (nextChar & 0xF800 != 0xD800) { + if (nextSurrogate > 0x7FF) { category = low(nextChar); - } else if (nextChar & 0xFC00 == 0xD800) { + } else if (nextSurrogate <= 0x3FF) { var indexAfter = index + 1; if (indexAfter < end) { - var secondChar = text.codeUnitAt(indexAfter); - if (secondChar & 0xFC00 == 0xDC00) { - category = high(nextChar, secondChar); + var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00; + if (secondSurrogate <= 0x3FF) { + category = high(nextChar, secondSurrogate); } } } else { - var prevChar = text.codeUnitAt(index - 1); - if (prevChar & 0xFC00 == 0xD800) { - category = high(prevChar, nextChar); + var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800; + nextSurrogate &= 0x3FF; + if (prevSurrogate <= 0x3FF) { + category = high(prevSurrogate, nextSurrogate); cursorBefore -= 1; } } diff --git a/pkgs/characters/lib/src/grapheme_clusters/table.dart b/pkgs/characters/lib/src/grapheme_clusters/table.dart index fce9c85a2..46da8f703 100644 --- a/pkgs/characters/lib/src/grapheme_clusters/table.dart +++ b/pkgs/characters/lib/src/grapheme_clusters/table.dart @@ -1139,10 +1139,10 @@ int low(int codeUnit) { @pragma('vm:prefer-inline') @pragma('wasm:prefer-inline') int high(int lead, int tail) { - var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8); - var chunkStart = _start.codeUnitAt(offset >> 8); - var index = chunkStart + (tail & 255); - return 
_data.codeUnitAt(index); + var offset = (tail >> 8) + (lead << 2); + tail &= 255; + var chunkStart = _start.codeUnitAt(2048 + offset); + return _data.codeUnitAt(chunkStart + tail); } const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey' diff --git a/pkgs/characters/pubspec.yaml b/pkgs/characters/pubspec.yaml index 0ab15fbec..183fbcd4d 100644 --- a/pkgs/characters/pubspec.yaml +++ b/pkgs/characters/pubspec.yaml @@ -1,5 +1,5 @@ name: characters -version: 1.4.1-wip +version: 1.4.1 description: >- String replacement with operations that are Unicode/grapheme cluster aware. repository: https://github.com/dart-lang/core/tree/main/pkgs/characters diff --git a/pkgs/characters/test/src/unicode_tests.dart b/pkgs/characters/test/src/unicode_tests.dart index 5c01dda8f..3fc50ee2f 100644 --- a/pkgs/characters/test/src/unicode_tests.dart +++ b/pkgs/characters/test/src/unicode_tests.dart @@ -31,7 +31,7 @@ String testDescription(List expected) { int categoryOf(int codePoint) { if (codePoint < 0x10000) return low(codePoint); var nonBmpOffset = codePoint - 0x10000; - return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff)); + return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff); } String partCategories(List parts) { diff --git a/pkgs/characters/tool/bin/generate_tables.dart b/pkgs/characters/tool/bin/generate_tables.dart index f348276cf..96822ccd8 100644 --- a/pkgs/characters/tool/bin/generate_tables.dart +++ b/pkgs/characters/tool/bin/generate_tables.dart @@ -331,16 +331,28 @@ int $name(int lead, int tail) { '''; } var shift = chunkSize.bitLength - 1; - var indexVar = chunkSize < 1024 ? 
'tail' : 'offset'; - return ''' + assert(shift <= 10); + if (shift < 10) { + return ''' $preferInline int $name(int lead, int tail) { - var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift); - var chunkStart = $startName.codeUnitAt(offset >> $shift); - var index = chunkStart + ($indexVar & ${chunkSize - 1}); - return $dataName.codeUnitAt(index); + var offset = (tail >> $shift) + (lead << ${10 - shift}); + tail &= ${chunkSize - 1}; + var chunkStart = $startName.codeUnitAt($startOffset + offset); + return $dataName.codeUnitAt(chunkStart + tail); +} +'''; + } else { + assert(shift == 10); + return ''' +$preferInline +int $name(int lead, int tail) { + var chunkStart = $startName.codeUnitAt($startOffset + lead); + return $dataName.codeUnitAt(chunkStart + tail); } '''; + } + // Add code if shift > 10 ever becomes optimal for table size. } // ----------------------------------------------------------------------------- diff --git a/pkgs/characters/tool/src/string_literal_writer.dart b/pkgs/characters/tool/src/string_literal_writer.dart index 9259b4eac..96ca93ff3 100644 --- a/pkgs/characters/tool/src/string_literal_writer.dart +++ b/pkgs/characters/tool/src/string_literal_writer.dart @@ -43,12 +43,12 @@ class StringLiteralWriter { /// Adds a single UTF-16 code unit. void add(int codeUnit) { // Always escape: `\n`, `\r`, `'`, `$` and `\`, plus anything the user wants. - if (_escape(codeUnit) || - codeUnit == 0x24 || - codeUnit == 0x27 || - codeUnit == 0x5c || - codeUnit == 0x0a || - codeUnit == 0x0d) { + if (_escape(codeUnit) || // Anything the user wants encoded. + codeUnit == 0x24 /* $ */ || + codeUnit == 0x27 /* ' */ || + codeUnit == 0x5c /* \ */ || + codeUnit == 0x0a /* \n */ || + codeUnit == 0x0d /* \r */) { _writeEscape(codeUnit); return; } @@ -59,6 +59,9 @@ class StringLiteralWriter { buffer.writeCharCode(codeUnit); } + /// Writes an escape for the [codeUnit]. + /// + /// Is only called for characters that need escaping. 
void _writeEscape(int codeUnit) { var replacement = _escapeCache[codeUnit]; if (replacement == null) { @@ -83,9 +86,8 @@ class StringLiteralWriter { replacement = r'\$'; } else if (codeUnit == "'".codeUnitAt(0)) { replacement = r"\'"; - } - if (codeUnit == r''.codeUnitAt(0)) { - replacement = r'\'; + } else if (codeUnit == r'\'.codeUnitAt(0)) { + replacement = r'\\'; } else { replacement = r'\x' + codeUnit.toRadixString(16); } From 22df6eb5843ae63f73d8000c353d9380e3976818 Mon Sep 17 00:00:00 2001 From: "Lasse R.H. Nielsen" Date: Mon, 26 May 2025 16:47:49 +0200 Subject: [PATCH 2/6] Now without bug. --- pkgs/characters/lib/src/characters_impl.dart | 10 +++--- .../lib/src/grapheme_clusters/breaks.dart | 34 +++++++++++-------- pkgs/characters/tool/bin/generate_tables.dart | 22 ++++-------- 3 files changed, 31 insertions(+), 35 deletions(-) diff --git a/pkgs/characters/lib/src/characters_impl.dart b/pkgs/characters/lib/src/characters_impl.dart index 47578a987..a901b5175 100644 --- a/pkgs/characters/lib/src/characters_impl.dart +++ b/pkgs/characters/lib/src/characters_impl.dart @@ -509,16 +509,16 @@ class StringCharacterRange implements CharacterRange { var index = _end; while (index < _string.length) { var char = _string.codeUnitAt(index); - var surrogateBits = char ^ 0xD800; + var surrogate = char ^ 0xD800; var category = categoryControl; var nextIndex = index + 1; - if (surrogateBits >= 0x3FF) { + if (surrogate > 0x3FF) { category = low(char); } else if (nextIndex < _string.length) { - var nextChar = _string.codeUnitAt(nextIndex) ^ 0xDC00; - if (nextChar <= 0x3FF) { + var nextSurrogate = _string.codeUnitAt(nextIndex) ^ 0xDC00; + if (nextSurrogate <= 0x3FF) { nextIndex += 1; - category = high(char, nextChar); + category = high(surrogate, nextSurrogate); } } state = move(state, category); diff --git a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart index ff767b36e..ec373d70e 100644 --- 
a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart +++ b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart @@ -77,7 +77,7 @@ class Breaks { assert(cursor < end); var char = base.codeUnitAt(cursor++); var surrogate = char ^ 0xD800; - if (surrogate >= 0x3FF) { + if (surrogate > 0x3FF) { state = move(state, low(char)); return; } @@ -114,24 +114,18 @@ class Breaks { var cursorBefore = cursor - 1; var prevChar = base.codeUnitAt(cursorBefore); var prevSurrogate = prevChar ^ 0xD800; - int prevCategory; if (prevSurrogate > 0x7FF) { // Not surrogate. - prevCategory = low(prevChar); - } else if (prevSurrogate <= 0x3FF) { - // Lead surrogate. Check for a following tail surrogate. - int tailSurrogate; - if (cursor < end && - (tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { - cursor += 1; - prevCategory = high(prevSurrogate, tailSurrogate); - } else { - prevCategory = categoryControl; - } - } else { + var prevCategory = low(prevChar); + state = move(stateCAny, prevCategory); + return cursorBefore; + } + int prevCategory; + if (prevSurrogate > 0x3FF) { // Tail surrogate, check for prior lead surrogate. int leadSurrogate; var leadIndex = cursorBefore - 1; + prevSurrogate &= 0x3FF; if (leadIndex >= start && (leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) { prevCategory = high(leadSurrogate, prevSurrogate); @@ -139,6 +133,16 @@ class Breaks { } else { prevCategory = categoryControl; } + } else { + // Lead surrogate. Check for a following tail surrogate. 
+ int tailSurrogate; + if (cursor < end && + (tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { + cursor += 1; + prevCategory = high(prevSurrogate, tailSurrogate); + } else { + prevCategory = categoryControl; + } } state = move(stateCAny, prevCategory); return cursorBefore; @@ -354,7 +358,7 @@ int previousBreak(String text, int start, int end, int index) { if (indexAfter < end) { var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00; if (secondSurrogate <= 0x3FF) { - category = high(nextChar, secondSurrogate); + category = high(nextSurrogate, secondSurrogate); } } } else { diff --git a/pkgs/characters/tool/bin/generate_tables.dart b/pkgs/characters/tool/bin/generate_tables.dart index 96822ccd8..cc91eebe8 100644 --- a/pkgs/characters/tool/bin/generate_tables.dart +++ b/pkgs/characters/tool/bin/generate_tables.dart @@ -320,18 +320,17 @@ String _lookupSurrogatesMethod( int startOffset, int chunkSize, ) { - if (chunkSize == 1024) { + var shift = chunkSize.bitLength - 1; + assert(shift <= 10); + if (shift == 10) { return ''' $preferInline int $name(int lead, int tail) { - var chunkStart = $startName.codeUnitAt($startOffset + (0x3ff & lead)); - var index = chunkStart + (0x3ff & tail); - return $dataName.codeUnitAt(index); + var chunkStart = $startName.codeUnitAt($startOffset + lead); + return $dataName.codeUnitAt(chunkStart + tail); } '''; } - var shift = chunkSize.bitLength - 1; - assert(shift <= 10); if (shift < 10) { return ''' $preferInline @@ -341,18 +340,11 @@ int $name(int lead, int tail) { var chunkStart = $startName.codeUnitAt($startOffset + offset); return $dataName.codeUnitAt(chunkStart + tail); } -'''; - } else { - assert(shift == 10); - return ''' -$preferInline -int $name(int lead, int tail) { - var chunkStart = $startName.codeUnitAt($startOffset + lead); - return $dataName.codeUnitAt(chunkStart + tail); -} '''; } // Add code if shift > 10 ever becomes optimal for table size. 
+ // Example code: + throw UnimplementedError('No code for chunk sizes > 10 bits'); } // ----------------------------------------------------------------------------- From 232d7713fe927e1e0aa0564e5073d82fb4d88cad Mon Sep 17 00:00:00 2001 From: "Lasse R.H. Nielsen" Date: Wed, 18 Jun 2025 10:53:08 +0200 Subject: [PATCH 3/6] Address comments. --- .../lib/src/grapheme_clusters/table.dart | 10 ++++++---- pkgs/characters/pubspec.yaml | 6 +++--- pkgs/characters/test/characters_test.dart | 17 +++++++++-------- pkgs/characters/tool/benchmark.dart | 18 +++++++++--------- pkgs/characters/tool/bin/generate_tables.dart | 14 +++++++++----- .../tool/src/grapheme_category_loader.dart | 3 --- 6 files changed, 36 insertions(+), 32 deletions(-) diff --git a/pkgs/characters/lib/src/grapheme_clusters/table.dart b/pkgs/characters/lib/src/grapheme_clusters/table.dart index 46da8f703..79d336505 100644 --- a/pkgs/characters/lib/src/grapheme_clusters/table.dart +++ b/pkgs/characters/lib/src/grapheme_clusters/table.dart @@ -1130,6 +1130,7 @@ const String _start = '\u1132\u166c\u166c\u206f\u11c0\u13fb\u166c\u166c\u166c' @pragma('vm:prefer-inline') @pragma('wasm:prefer-inline') int low(int codeUnit) { + assert(codeUnit <= 0xFFFF); var chunkStart = _start.codeUnitAt(codeUnit >> 5); var index = chunkStart + (codeUnit & 31); return _data.codeUnitAt(index); @@ -1139,10 +1140,11 @@ int low(int codeUnit) { @pragma('vm:prefer-inline') @pragma('wasm:prefer-inline') int high(int lead, int tail) { - var offset = (tail >> 8) + (lead << 2); - tail &= 255; - var chunkStart = _start.codeUnitAt(2048 + offset); - return _data.codeUnitAt(chunkStart + tail); + assert(lead <= 0x3FF && tail <= 0x3FF); + var chunkIndex = (tail >> 8) + (lead << 2); + var byteIndex = tail & 255; + var chunkStart = _start.codeUnitAt(2048 + chunkIndex); + return _data.codeUnitAt(chunkStart + byteIndex); } const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey' diff --git a/pkgs/characters/pubspec.yaml 
b/pkgs/characters/pubspec.yaml index 183fbcd4d..7f6782654 100644 --- a/pkgs/characters/pubspec.yaml +++ b/pkgs/characters/pubspec.yaml @@ -10,8 +10,8 @@ topics: - unicode environment: - sdk: ^3.4.0 + sdk: ^3.8.0 dev_dependencies: - dart_flutter_team_lints: ^3.1.0 - test: ^1.16.6 + dart_flutter_team_lints: ^3.5.0 + test: ^1.26.0 diff --git a/pkgs/characters/test/characters_test.dart b/pkgs/characters/test/characters_test.dart index d259cbcab..9f39961b6 100644 --- a/pkgs/characters/test/characters_test.dart +++ b/pkgs/characters/test/characters_test.dart @@ -115,28 +115,29 @@ void main([List? args]) { var zwj = '\u200d'; // U+200D, ZWJ var rainbow = '\u{1f308}'; // U+1F308, Rainbow. Category Pictogram - var rbflag = '$flag$white$zwj$rainbow'; - var string = '-$rbflag-'; + var rainbowFlag = '$flag$white$zwj$rainbow'; + var string = '-$rainbowFlag-'; var range = CharacterRange.at(string, 1); expect(range.isEmpty, true); expect(range.moveNext(), true); - expect(range.current, rbflag); + expect(range.current, rainbowFlag); range = range = CharacterRange.at(string, 2); expect(range.isEmpty, false); - expect(range.current, rbflag); + expect(range.current, rainbowFlag); range = range = CharacterRange.at(string, 0, 2); expect(range.isEmpty, false); - expect(range.current, '-$rbflag'); + expect(range.current, '-$rainbowFlag'); range = range = CharacterRange.at(string, 0, 2); expect(range.isEmpty, false); - expect(range.current, '-$rbflag'); + expect(range.current, '-$rainbowFlag'); - range = range = CharacterRange.at(string, 2, '-$rbflag'.length - 1); + range = + range = CharacterRange.at(string, 2, '-$rainbowFlag'.length - 1); expect(range.isEmpty, false); - expect(range.current, rbflag); + expect(range.current, rainbowFlag); expect(range.stringBeforeLength, 1); range = range = CharacterRange.at(string, 0, string.length); diff --git a/pkgs/characters/tool/benchmark.dart b/pkgs/characters/tool/benchmark.dart index a77b811e5..a72f99373 100644 --- 
a/pkgs/characters/tool/benchmark.dart +++ b/pkgs/characters/tool/benchmark.dart @@ -16,8 +16,8 @@ void main(List args) { if (args.isNotEmpty) { count = int.parse(args[0]); } - var gcsf = 0; - var gcsb = 0; + var gcSumForward = 0; + var gcSumBackwards = 0; var text = genesis + hangul + @@ -29,19 +29,19 @@ void main(List args) { var codeUnits = text.length; var codePoints = text.runes.length; for (var i = 0; i < count; i++) { - gcsf = benchForward(text, i, codePoints, codeUnits); - gcsb = benchBackward(text, i, codePoints, codeUnits); + gcSumForward = benchForward(text, i, codePoints, codeUnits); + gcSumBackwards = benchBackward(text, i, codePoints, codeUnits); } print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.'); - if (gcsf != gcsb) { + if (gcSumForward != gcSumBackwards) { print( 'ERROR: Did not count the same number of grapheme clusters: ' - '$gcsf forward vs. $gcsb backward.', + '$gcSumForward forward vs. $gcSumBackwards backward.', ); } else { - print('Total: $gcsf gc, $codePoints cp, $codeUnits cu'); - print('Avg ${(codePoints / gcsf).toStringAsFixed(3)} cp/gc'); - print('Avg ${(codeUnits / gcsf).toStringAsFixed(3)} cu/gc'); + print('Total: $gcSumForward gc, $codePoints cp, $codeUnits cu'); + print('Avg ${(codePoints / gcSumForward).toStringAsFixed(3)} cp/gc'); + print('Avg ${(codeUnits / gcSumForward).toStringAsFixed(3)} cu/gc'); } } diff --git a/pkgs/characters/tool/bin/generate_tables.dart b/pkgs/characters/tool/bin/generate_tables.dart index cc91eebe8..a82924672 100644 --- a/pkgs/characters/tool/bin/generate_tables.dart +++ b/pkgs/characters/tool/bin/generate_tables.dart @@ -307,6 +307,7 @@ String _lookupMethod( ''' $preferInline int $name(int codeUnit) { + assert(codeUnit <= 0xFFFF); var chunkStart = $startName.codeUnitAt(codeUnit >> ${chunkSize.bitLength - 1}); var index = chunkStart + (codeUnit & ${chunkSize - 1}); return $dataName.codeUnitAt(index); @@ -326,6 +327,7 @@ String _lookupSurrogatesMethod( return ''' $preferInline int $name(int 
lead, int tail) { + assert(lead <= 0x3FF && tail <= 0x3FF); var chunkStart = $startName.codeUnitAt($startOffset + lead); return $dataName.codeUnitAt(chunkStart + tail); } @@ -335,15 +337,17 @@ int $name(int lead, int tail) { return ''' $preferInline int $name(int lead, int tail) { - var offset = (tail >> $shift) + (lead << ${10 - shift}); - tail &= ${chunkSize - 1}; - var chunkStart = $startName.codeUnitAt($startOffset + offset); - return $dataName.codeUnitAt(chunkStart + tail); + assert(lead <= 0x3FF && tail <= 0x3FF); + var chunkIndex = (tail >> $shift) + (lead << ${10 - shift}); + var byteIndex = tail & ${chunkSize - 1}; + var chunkStart = $startName.codeUnitAt($startOffset + chunkIndex); + return $dataName.codeUnitAt(chunkStart + byteIndex); } '''; } // Add code if shift > 10 ever becomes optimal for table size. - // Example code: + // Fx: chunkIndex = lead >> ${20 - shift}; + // byteIndex = tail + ((lead & ${(chunkSize >> 10) - 1}) << 10); throw UnimplementedError('No code for chunk sizes > 10 bits'); } diff --git a/pkgs/characters/tool/src/grapheme_category_loader.dart b/pkgs/characters/tool/src/grapheme_category_loader.dart index 1f436f80f..2b5c1777e 100644 --- a/pkgs/characters/tool/src/grapheme_category_loader.dart +++ b/pkgs/characters/tool/src/grapheme_category_loader.dart @@ -291,9 +291,6 @@ Uint8List _parseInCBCategories(String file, {required bool verbose}) { } // -------------------------------------------------------------------- -// TODO: Use a sparse table? -// Likely not worth it. - /// Fixed length table for Unicode properties. class UnicodePropertyTable { static const int _unicodeCodePoints = 0x110000; From 3b1142ec493d824a6abb1aa1b4e75767b31180c8 Mon Sep 17 00:00:00 2001 From: "Lasse R.H. Nielsen" Date: Wed, 18 Jun 2025 12:01:39 +0200 Subject: [PATCH 4/6] Tweak low-level benchmark. 
--- pkgs/characters/tool/benchmark.dart | 90 +++++++++++++++++++---------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/pkgs/characters/tool/benchmark.dart b/pkgs/characters/tool/benchmark.dart index a72f99373..42d5f9010 100644 --- a/pkgs/characters/tool/benchmark.dart +++ b/pkgs/characters/tool/benchmark.dart @@ -10,16 +10,16 @@ import '../test/src/unicode_grapheme_tests.dart'; import '../test/src/various_tests.dart'; // Low-level benchmark of the grapheme cluster step functions. +// Use ../benchmark/benchmark.dart for the more high-level `Characters` +// methods. void main(List args) { var count = 5; if (args.isNotEmpty) { count = int.parse(args[0]); } - var gcSumForward = 0; - var gcSumBackwards = 0; - - var text = genesis + + var text = + genesis + hangul + genesis + diacretics + @@ -28,9 +28,20 @@ void main(List args) { recJoin(zalgo); var codeUnits = text.length; var codePoints = text.runes.length; + // Warmup. + var gcSumForward = benchForward(text, -1, codePoints, codeUnits, 150); + var gcSumBackwards = benchBackward(text, -1, codePoints, codeUnits, 150); + if (gcSumForward != gcSumBackwards) { + print( + 'ERROR: Did not count the same number of grapheme clusters: ' + '$gcSumForward forward vs. $gcSumBackwards backward.', + ); + return; + } + for (var i = 0; i < count; i++) { - gcSumForward = benchForward(text, i, codePoints, codeUnits); - gcSumBackwards = benchBackward(text, i, codePoints, codeUnits); + gcSumForward = benchForward(text, i, codePoints, codeUnits, 1500); + gcSumBackwards = benchBackward(text, i, codePoints, codeUnits, 1500); } print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.'); if (gcSumForward != gcSumBackwards) { @@ -39,7 +50,12 @@ void main(List args) { '$gcSumForward forward vs. 
$gcSumBackwards backward.', ); } else { - print('Total: $gcSumForward gc, $codePoints cp, $codeUnits cu'); + var surrogates = codeUnits - codePoints; + print( + 'Total: $gcSumForward gc, $codePoints cp, $codeUnits cu, ' + '$surrogates surrogates ' + '(${(surrogates / codePoints * 100).toStringAsFixed(3)}%)', + ); print('Avg ${(codePoints / gcSumForward).toStringAsFixed(3)} cp/gc'); print('Avg ${(codeUnits / gcSumForward).toStringAsFixed(3)} cu/gc'); } @@ -48,46 +64,58 @@ void main(List args) { String recJoin(Iterable> texts) => texts.map((x) => x.join('')).join('\n'); -int benchForward(String text, int i, int cp, int cu) { +int benchForward(String text, int round, int cp, int cu, int limit) { var n = 0; + var step = 10; var gc = 0; var e = 0; var sw = Stopwatch()..start(); do { - var breaks = Breaks(text, 0, text.length, stateSoTNoBreak); - while (breaks.nextBreak() >= 0) { - gc++; + for (var i = 0; i < step; i++) { + var breaks = Breaks(text, 0, text.length, stateSoTNoBreak); + while (breaks.nextBreak() >= 0) { + gc++; + } } e = sw.elapsedMilliseconds; - n++; - } while (e < 2000); - print( - 'Forward #$i: ${(gc / e).round()} gc/ms, ' - '${(n * cp / e).round()} cp/ms, ' - '${(n * cu / e).round()} cu/ms, ' - '$n rounds', - ); + n += step; + step += step; + } while (e < limit); + if (limit > 500) { + print( + 'Forward #$round: ${(gc / e).round()} gc/ms, ' + '${(n * cp / e).round()} cp/ms, ' + '${(n * cu / e).round()} cu/ms, ' + '$n rounds in $e ms', + ); + } return gc ~/ n; } -int benchBackward(String text, int i, int cp, int cu) { +int benchBackward(String text, int round, int cp, int cu, int limit) { var n = 0; + var step = 10; var gc = 0; var e = 0; var sw = Stopwatch()..start(); do { - var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak); - while (breaks.nextBreak() >= 0) { - gc++; + for (var i = 0; i < step; i++) { + var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak); + while (breaks.nextBreak() >= 0) { + gc++; + } } e = 
sw.elapsedMilliseconds; - n++; - } while (e < 2000); - print( - 'Backward #$i: ${(gc / e).round()} gc/ms, ' - '${(n * cp / e).round()} cp/ms, ' - '${(n * cu / e).round()} cu/ms, ' - '$n rounds', - ); + n += step; + step += step; + } while (e < limit); + if (limit > 500) { + print( + 'Backward #$round: ${(gc / e).round()} gc/ms, ' + '${(n * cp / e).round()} cp/ms, ' + '${(n * cu / e).round()} cu/ms, ' + '$n rounds in $e ms', + ); + } return gc ~/ n; } From dc9ae68bea7821090a83a8ff05a41e907a87354f Mon Sep 17 00:00:00 2001 From: "Lasse R.H. Nielsen" Date: Wed, 18 Jun 2025 12:23:11 +0200 Subject: [PATCH 5/6] Don't tweak the SDK version. --- pkgs/characters/pubspec.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkgs/characters/pubspec.yaml b/pkgs/characters/pubspec.yaml index 7f6782654..3d36c1729 100644 --- a/pkgs/characters/pubspec.yaml +++ b/pkgs/characters/pubspec.yaml @@ -10,8 +10,8 @@ topics: - unicode environment: - sdk: ^3.8.0 + sdk: ^3.4.0 dev_dependencies: - dart_flutter_team_lints: ^3.5.0 - test: ^1.26.0 + dart_flutter_team_lints: ^3.1.0 + test: ^1.16.0 From 01ad611c83fac1ff482abbcca1187effb13eccea Mon Sep 17 00:00:00 2001 From: "Lasse R.H. Nielsen" Date: Wed, 18 Jun 2025 12:24:37 +0200 Subject: [PATCH 6/6] And undo the 3.8 formatting. --- pkgs/characters/tool/benchmark.dart | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkgs/characters/tool/benchmark.dart b/pkgs/characters/tool/benchmark.dart index 42d5f9010..2cf28cdeb 100644 --- a/pkgs/characters/tool/benchmark.dart +++ b/pkgs/characters/tool/benchmark.dart @@ -18,8 +18,7 @@ void main(List args) { if (args.isNotEmpty) { count = int.parse(args[0]); } - var text = - genesis + + var text = genesis + hangul + genesis + diacretics +