diff --git a/pkgs/characters/CHANGELOG.md b/pkgs/characters/CHANGELOG.md index 19924460..f8782bf8 100644 --- a/pkgs/characters/CHANGELOG.md +++ b/pkgs/characters/CHANGELOG.md @@ -1,6 +1,7 @@ -## 1.4.1-wip +## 1.4.1 -- Run `dart format` with the new style. +* Run `dart format` with the new style. +* Performance improvement for non-BMP characters. ## 1.4.0 diff --git a/pkgs/characters/lib/src/characters_impl.dart b/pkgs/characters/lib/src/characters_impl.dart index e8694339..a901b517 100644 --- a/pkgs/characters/lib/src/characters_impl.dart +++ b/pkgs/characters/lib/src/characters_impl.dart @@ -509,15 +509,16 @@ class StringCharacterRange implements CharacterRange { var index = _end; while (index < _string.length) { var char = _string.codeUnitAt(index); + var surrogate = char ^ 0xD800; var category = categoryControl; var nextIndex = index + 1; - if (char & 0xFC00 != 0xD800) { + if (surrogate > 0x3FF) { category = low(char); } else if (nextIndex < _string.length) { - var nextChar = _string.codeUnitAt(nextIndex); - if (nextChar & 0xFC00 == 0xDC00) { + var nextSurrogate = _string.codeUnitAt(nextIndex) ^ 0xDC00; + if (nextSurrogate <= 0x3FF) { nextIndex += 1; - category = high(char, nextChar); + category = high(surrogate, nextSurrogate); } } state = move(state, category); diff --git a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart index 4e8c9e8d..ec373d70 100644 --- a/pkgs/characters/lib/src/grapheme_clusters/breaks.dart +++ b/pkgs/characters/lib/src/grapheme_clusters/breaks.dart @@ -76,16 +76,17 @@ class Breaks { void step() { assert(cursor < end); var char = base.codeUnitAt(cursor++); - if (char & 0xFC00 != 0xD800) { + var surrogate = char ^ 0xD800; + if (surrogate > 0x3FF) { state = move(state, low(char)); return; } // The category of an unpaired lead surrogate is Control. int category; - int nextChar; + int nextSurrogate; if (cursor < end && - (nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) { - category = high(char, nextChar); + (nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { + category = high(surrogate, nextSurrogate); cursor++; } else { category = categoryControl; @@ -112,31 +113,36 @@ class Breaks { } var cursorBefore = cursor - 1; var prevChar = base.codeUnitAt(cursorBefore); - int prevCategory; - if (prevChar & 0xF800 != 0xD800) { + var prevSurrogate = prevChar ^ 0xD800; + if (prevSurrogate > 0x7FF) { // Not surrogate. - prevCategory = low(prevChar); - } else if (prevChar & 0xFC00 == 0xD800) { - // Lead surrogate. Check for a following tail surrogate. - int tailChar; - if (cursor < end && - (tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) { - cursor += 1; - prevCategory = high(prevChar, tailChar); - } else { - prevCategory = categoryControl; - } - } else { + var prevCategory = low(prevChar); + state = move(stateCAny, prevCategory); + return cursorBefore; + } + int prevCategory; + if (prevSurrogate > 0x3FF) { // Tail surrogate, check for prior lead surrogate. - int leadChar; + int leadSurrogate; var leadIndex = cursorBefore - 1; + prevSurrogate &= 0x3FF; if (leadIndex >= start && - (leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) { - prevCategory = high(leadChar, prevChar); + (leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) { + prevCategory = high(leadSurrogate, prevSurrogate); cursorBefore = leadIndex; } else { prevCategory = categoryControl; } + } else { + // Lead surrogate. Check for a following tail surrogate. + int tailSurrogate; + if (cursor < end && + (tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) { + cursor += 1; + prevCategory = high(prevSurrogate, tailSurrogate); + } else { + prevCategory = categoryControl; + } } state = move(stateCAny, prevCategory); return cursorBefore; @@ -206,7 +212,8 @@ class BackBreaks { void step() { assert(cursor > start); var char = base.codeUnitAt(--cursor); - if (char & 0xFC00 != 0xDC00) { + var surrogate = char ^ 0xDC00; + if (surrogate > 0x3FF) { var category = low(char); state = moveBack(state, category); return; @@ -214,10 +221,10 @@ class BackBreaks { // Found tail surrogate, check for prior lead surrogate. // The category of an unpaired tail surrogate is Control. int category; - int prevChar; + int prevSurrogate; if (cursor >= start && - (prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) { - category = high(prevChar, char); + (prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) { + category = high(prevSurrogate, surrogate); } else { category = categoryControl; cursor++; @@ -342,21 +349,23 @@ int previousBreak(String text, int start, int end, int index) { if (start < index && index < end) { var cursorBefore = index; var nextChar = text.codeUnitAt(index); + var nextSurrogate = nextChar ^ 0xD800; var category = categoryControl; - if (nextChar & 0xF800 != 0xD800) { + if (nextSurrogate > 0x7FF) { category = low(nextChar); - } else if (nextChar & 0xFC00 == 0xD800) { + } else if (nextSurrogate <= 0x3FF) { var indexAfter = index + 1; if (indexAfter < end) { - var secondChar = text.codeUnitAt(indexAfter); - if (secondChar & 0xFC00 == 0xDC00) { - category = high(nextChar, secondChar); + var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00; + if (secondSurrogate <= 0x3FF) { + category = high(nextSurrogate, secondSurrogate); } } } else { - var prevChar = text.codeUnitAt(index - 1); - if (prevChar & 0xFC00 == 0xD800) { - category = high(prevChar, nextChar); + var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800; + nextSurrogate &= 0x3FF; + if (prevSurrogate <= 0x3FF) { + category = high(prevSurrogate, nextSurrogate); cursorBefore -= 1; } } diff --git a/pkgs/characters/lib/src/grapheme_clusters/table.dart b/pkgs/characters/lib/src/grapheme_clusters/table.dart index fce9c85a..79d33650 100644 --- a/pkgs/characters/lib/src/grapheme_clusters/table.dart +++ b/pkgs/characters/lib/src/grapheme_clusters/table.dart @@ -1130,6 +1130,7 @@ const String _start = '\u1132\u166c\u166c\u206f\u11c0\u13fb\u166c\u166c\u166c' @pragma('vm:prefer-inline') @pragma('wasm:prefer-inline') int low(int codeUnit) { + assert(codeUnit <= 0xFFFF); var chunkStart = _start.codeUnitAt(codeUnit >> 5); var index = chunkStart + (codeUnit & 31); return _data.codeUnitAt(index); @@ -1139,10 +1140,11 @@ int low(int codeUnit) { @pragma('vm:prefer-inline') @pragma('wasm:prefer-inline') int high(int lead, int tail) { - var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8); - var chunkStart = _start.codeUnitAt(offset >> 8); - var index = chunkStart + (tail & 255); - return _data.codeUnitAt(index); + assert(lead <= 0x3FF && tail <= 0x3FF); + var chunkIndex = (tail >> 8) + (lead << 2); + var byteIndex = tail & 255; + var chunkStart = _start.codeUnitAt(2048 + chunkIndex); + return _data.codeUnitAt(chunkStart + byteIndex); } const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey' diff --git a/pkgs/characters/pubspec.yaml b/pkgs/characters/pubspec.yaml index 0ab15fbe..3d36c172 100644 --- a/pkgs/characters/pubspec.yaml +++ b/pkgs/characters/pubspec.yaml @@ -1,5 +1,5 @@ name: characters -version: 1.4.1-wip +version: 1.4.1 description: >- String replacement with operations that are Unicode/grapheme cluster aware. repository: https://github.com/dart-lang/core/tree/main/pkgs/characters @@ -14,4 +14,4 @@ environment: dev_dependencies: dart_flutter_team_lints: ^3.1.0 - test: ^1.16.6 + test: ^1.16.0 diff --git a/pkgs/characters/test/characters_test.dart b/pkgs/characters/test/characters_test.dart index d259cbca..9f39961b 100644 --- a/pkgs/characters/test/characters_test.dart +++ b/pkgs/characters/test/characters_test.dart @@ -115,28 +115,29 @@ void main([List? args]) { var zwj = '\u200d'; // U+200D, ZWJ var rainbow = '\u{1f308}'; // U+1F308, Rainbow. Category Pictogram - var rbflag = '$flag$white$zwj$rainbow'; - var string = '-$rbflag-'; + var rainbowFlag = '$flag$white$zwj$rainbow'; + var string = '-$rainbowFlag-'; var range = CharacterRange.at(string, 1); expect(range.isEmpty, true); expect(range.moveNext(), true); - expect(range.current, rbflag); + expect(range.current, rainbowFlag); range = range = CharacterRange.at(string, 2); expect(range.isEmpty, false); - expect(range.current, rbflag); + expect(range.current, rainbowFlag); range = range = CharacterRange.at(string, 0, 2); expect(range.isEmpty, false); - expect(range.current, '-$rbflag'); + expect(range.current, '-$rainbowFlag'); range = range = CharacterRange.at(string, 0, 2); expect(range.isEmpty, false); - expect(range.current, '-$rbflag'); + expect(range.current, '-$rainbowFlag'); - range = range = CharacterRange.at(string, 2, '-$rbflag'.length - 1); + range = + range = CharacterRange.at(string, 2, '-$rainbowFlag'.length - 1); expect(range.isEmpty, false); - expect(range.current, rbflag); + expect(range.current, rainbowFlag); expect(range.stringBeforeLength, 1); range = range = CharacterRange.at(string, 0, string.length); diff --git a/pkgs/characters/test/src/unicode_tests.dart b/pkgs/characters/test/src/unicode_tests.dart index 5c01dda8..3fc50ee2 100644 --- a/pkgs/characters/test/src/unicode_tests.dart +++ b/pkgs/characters/test/src/unicode_tests.dart @@ -31,7 +31,7 @@ String testDescription(List expected) { int categoryOf(int codePoint) { if (codePoint < 0x10000) return low(codePoint); var nonBmpOffset = codePoint - 0x10000; - return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff)); + return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff); } String partCategories(List parts) { diff --git a/pkgs/characters/tool/benchmark.dart b/pkgs/characters/tool/benchmark.dart index a77b811e..2cf28cde 100644 --- a/pkgs/characters/tool/benchmark.dart +++ b/pkgs/characters/tool/benchmark.dart @@ -10,15 +10,14 @@ import '../test/src/unicode_grapheme_tests.dart'; import '../test/src/various_tests.dart'; // Low-level benchmark of the grapheme cluster step functions. +// Use ../benchmark/benchmark.dart for the more high-level `Characters` +// methods. void main(List args) { var count = 5; if (args.isNotEmpty) { count = int.parse(args[0]); } - var gcsf = 0; - var gcsb = 0; - var text = genesis + hangul + genesis + @@ -28,66 +27,94 @@ void main(List args) { recJoin(zalgo); var codeUnits = text.length; var codePoints = text.runes.length; + // Warmup. + var gcSumForward = benchForward(text, -1, codePoints, codeUnits, 150); + var gcSumBackwards = benchBackward(text, -1, codePoints, codeUnits, 150); + if (gcSumForward != gcSumBackwards) { + print( + 'ERROR: Did not count the same number of grapheme clusters: ' + '$gcSumForward forward vs. $gcSumBackwards backward.', + ); + return; + } + for (var i = 0; i < count; i++) { - gcsf = benchForward(text, i, codePoints, codeUnits); - gcsb = benchBackward(text, i, codePoints, codeUnits); + gcSumForward = benchForward(text, i, codePoints, codeUnits, 1500); + gcSumBackwards = benchBackward(text, i, codePoints, codeUnits, 1500); } print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.'); - if (gcsf != gcsb) { + if (gcSumForward != gcSumBackwards) { print( 'ERROR: Did not count the same number of grapheme clusters: ' - '$gcsf forward vs. $gcsb backward.', + '$gcSumForward forward vs. $gcSumBackwards backward.', ); } else { - print('Total: $gcsf gc, $codePoints cp, $codeUnits cu'); - print('Avg ${(codePoints / gcsf).toStringAsFixed(3)} cp/gc'); - print('Avg ${(codeUnits / gcsf).toStringAsFixed(3)} cu/gc'); + var surrogates = codeUnits - codePoints; + print( + 'Total: $gcSumForward gc, $codePoints cp, $codeUnits cu, ' + '$surrogates surrogates ' + '(${(surrogates / codePoints * 100).toStringAsFixed(3)}%)', + ); + print('Avg ${(codePoints / gcSumForward).toStringAsFixed(3)} cp/gc'); + print('Avg ${(codeUnits / gcSumForward).toStringAsFixed(3)} cu/gc'); } } String recJoin(Iterable> texts) => texts.map((x) => x.join('')).join('\n'); -int benchForward(String text, int i, int cp, int cu) { +int benchForward(String text, int round, int cp, int cu, int limit) { var n = 0; + var step = 10; var gc = 0; var e = 0; var sw = Stopwatch()..start(); do { - var breaks = Breaks(text, 0, text.length, stateSoTNoBreak); - while (breaks.nextBreak() >= 0) { - gc++; + for (var i = 0; i < step; i++) { + var breaks = Breaks(text, 0, text.length, stateSoTNoBreak); + while (breaks.nextBreak() >= 0) { + gc++; + } } e = sw.elapsedMilliseconds; - n++; - } while (e < 2000); - print( - 'Forward #$i: ${(gc / e).round()} gc/ms, ' - '${(n * cp / e).round()} cp/ms, ' - '${(n * cu / e).round()} cu/ms, ' - '$n rounds', - ); + n += step; + step += step; + } while (e < limit); + if (limit > 500) { + print( + 'Forward #$round: ${(gc / e).round()} gc/ms, ' + '${(n * cp / e).round()} cp/ms, ' + '${(n * cu / e).round()} cu/ms, ' + '$n rounds in $e ms', + ); + } return gc ~/ n; } -int benchBackward(String text, int i, int cp, int cu) { +int benchBackward(String text, int round, int cp, int cu, int limit) { var n = 0; + var step = 10; var gc = 0; var e = 0; var sw = Stopwatch()..start(); do { - var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak); - while (breaks.nextBreak() >= 0) { - gc++; + for (var i = 0; i < step; i++) { + var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak); + while (breaks.nextBreak() >= 0) { + gc++; + } } e = sw.elapsedMilliseconds; - n++; - } while (e < 2000); - print( - 'Backward #$i: ${(gc / e).round()} gc/ms, ' - '${(n * cp / e).round()} cp/ms, ' - '${(n * cu / e).round()} cu/ms, ' - '$n rounds', - ); + n += step; + step += step; + } while (e < limit); + if (limit > 500) { + print( + 'Backward #$round: ${(gc / e).round()} gc/ms, ' + '${(n * cp / e).round()} cp/ms, ' + '${(n * cu / e).round()} cu/ms, ' + '$n rounds in $e ms', + ); + } return gc ~/ n; } diff --git a/pkgs/characters/tool/bin/generate_tables.dart b/pkgs/characters/tool/bin/generate_tables.dart index f348276c..a8292467 100644 --- a/pkgs/characters/tool/bin/generate_tables.dart +++ b/pkgs/characters/tool/bin/generate_tables.dart @@ -307,6 +307,7 @@ String _lookupMethod( ''' $preferInline int $name(int codeUnit) { + assert(codeUnit <= 0xFFFF); var chunkStart = $startName.codeUnitAt(codeUnit >> ${chunkSize.bitLength - 1}); var index = chunkStart + (codeUnit & ${chunkSize - 1}); return $dataName.codeUnitAt(index); @@ -320,27 +321,34 @@ String _lookupSurrogatesMethod( int startOffset, int chunkSize, ) { - if (chunkSize == 1024) { + var shift = chunkSize.bitLength - 1; + assert(shift <= 10); + if (shift == 10) { return ''' $preferInline int $name(int lead, int tail) { - var chunkStart = $startName.codeUnitAt($startOffset + (0x3ff & lead)); - var index = chunkStart + (0x3ff & tail); - return $dataName.codeUnitAt(index); + assert(lead <= 0x3FF && tail <= 0x3FF); + var chunkStart = $startName.codeUnitAt($startOffset + lead); + return $dataName.codeUnitAt(chunkStart + tail); } '''; } - var shift = chunkSize.bitLength - 1; - var indexVar = chunkSize < 1024 ? 'tail' : 'offset'; - return ''' + if (shift < 10) { + return ''' $preferInline int $name(int lead, int tail) { - var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift); - var chunkStart = $startName.codeUnitAt(offset >> $shift); - var index = chunkStart + ($indexVar & ${chunkSize - 1}); - return $dataName.codeUnitAt(index); + assert(lead <= 0x3FF && tail <= 0x3FF); + var chunkIndex = (tail >> $shift) + (lead << ${10 - shift}); + var byteIndex = tail & ${chunkSize - 1}; + var chunkStart = $startName.codeUnitAt($startOffset + chunkIndex); + return $dataName.codeUnitAt(chunkStart + byteIndex); } '''; + } + // Add code if shift > 10 ever becomes optimal for table size. + // Fx: chunkIndex = lead >> ${20 - shift}; + // byteIndex = tail + ((lead & ${(chunkSize >> 10) - 1}) << 10); + throw UnimplementedError('No code for chunk sizes > 10 bits'); } // ----------------------------------------------------------------------------- diff --git a/pkgs/characters/tool/src/grapheme_category_loader.dart b/pkgs/characters/tool/src/grapheme_category_loader.dart index 1f436f80..2b5c1777 100644 --- a/pkgs/characters/tool/src/grapheme_category_loader.dart +++ b/pkgs/characters/tool/src/grapheme_category_loader.dart @@ -291,9 +291,6 @@ Uint8List _parseInCBCategories(String file, {required bool verbose}) { } // -------------------------------------------------------------------- -// TODO: Use a sparse table? -// Likely not worth it. - /// Fixed length table for Unicode properties. class UnicodePropertyTable { static const int _unicodeCodePoints = 0x110000; diff --git a/pkgs/characters/tool/src/string_literal_writer.dart b/pkgs/characters/tool/src/string_literal_writer.dart index 9259b4ea..96ca93ff 100644 --- a/pkgs/characters/tool/src/string_literal_writer.dart +++ b/pkgs/characters/tool/src/string_literal_writer.dart @@ -43,12 +43,12 @@ class StringLiteralWriter { /// Adds a single UTF-16 code unit. void add(int codeUnit) { // Always escape: `\n`, `\r`, `'`, `$` and `\`, plus anything the user wants. - if (_escape(codeUnit) || - codeUnit == 0x24 || - codeUnit == 0x27 || - codeUnit == 0x5c || - codeUnit == 0x0a || - codeUnit == 0x0d) { + if (_escape(codeUnit) || // Anything the user wants encoded. + codeUnit == 0x24 /* $ */ || + codeUnit == 0x27 /* ' */ || + codeUnit == 0x5c /* \ */ || + codeUnit == 0x0a /* \n */ || + codeUnit == 0x0d /* \r */) { _writeEscape(codeUnit); return; } @@ -59,6 +59,9 @@ class StringLiteralWriter { buffer.writeCharCode(codeUnit); } + /// Writes an escape for the [codeUnit]. + /// + /// Is only called for characters that need escaping. void _writeEscape(int codeUnit) { var replacement = _escapeCache[codeUnit]; if (replacement == null) { @@ -83,9 +86,8 @@ class StringLiteralWriter { replacement = r'\$'; } else if (codeUnit == "'".codeUnitAt(0)) { replacement = r"\'"; - } - if (codeUnit == r''.codeUnitAt(0)) { - replacement = r'\'; + } else if (codeUnit == r'\'.codeUnitAt(0)) { + replacement = r'\\'; } else { replacement = r'\x' + codeUnit.toRadixString(16); }