Skip to content

Optimize surrogate decoding. #894

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pkgs/characters/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 1.4.1-wip
## 1.4.1

- Run `dart format` with the new style.
* Run `dart format` with the new style.
* Performance improvement for non-BMP characters.

## 1.4.0

Expand Down
9 changes: 5 additions & 4 deletions pkgs/characters/lib/src/characters_impl.dart
Original file line number Diff line number Diff line change
Expand Up @@ -509,15 +509,16 @@ class StringCharacterRange implements CharacterRange {
var index = _end;
while (index < _string.length) {
var char = _string.codeUnitAt(index);
var surrogate = char ^ 0xD800;
var category = categoryControl;
var nextIndex = index + 1;
if (char & 0xFC00 != 0xD800) {
if (surrogate > 0x3FF) {
category = low(char);
} else if (nextIndex < _string.length) {
var nextChar = _string.codeUnitAt(nextIndex);
if (nextChar & 0xFC00 == 0xDC00) {
var nextSurrogate = _string.codeUnitAt(nextIndex) ^ 0xDC00;
if (nextSurrogate <= 0x3FF) {
nextIndex += 1;
category = high(char, nextChar);
category = high(surrogate, nextSurrogate);
}
}
state = move(state, category);
Expand Down
75 changes: 42 additions & 33 deletions pkgs/characters/lib/src/grapheme_clusters/breaks.dart
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,17 @@ class Breaks {
void step() {
assert(cursor < end);
var char = base.codeUnitAt(cursor++);
if (char & 0xFC00 != 0xD800) {
var surrogate = char ^ 0xD800;
if (surrogate > 0x3FF) {
state = move(state, low(char));
return;
}
// The category of an unpaired lead surrogate is Control.
int category;
int nextChar;
int nextSurrogate;
if (cursor < end &&
(nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
category = high(char, nextChar);
(nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
category = high(surrogate, nextSurrogate);
cursor++;
} else {
category = categoryControl;
Expand All @@ -112,31 +113,36 @@ class Breaks {
}
var cursorBefore = cursor - 1;
var prevChar = base.codeUnitAt(cursorBefore);
int prevCategory;
if (prevChar & 0xF800 != 0xD800) {
var prevSurrogate = prevChar ^ 0xD800;
if (prevSurrogate > 0x7FF) {
// Not surrogate.
prevCategory = low(prevChar);
} else if (prevChar & 0xFC00 == 0xD800) {
// Lead surrogate. Check for a following tail surrogate.
int tailChar;
if (cursor < end &&
(tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
cursor += 1;
prevCategory = high(prevChar, tailChar);
} else {
prevCategory = categoryControl;
}
} else {
var prevCategory = low(prevChar);
state = move(stateCAny, prevCategory);
return cursorBefore;
}
int prevCategory;
if (prevSurrogate > 0x3FF) {
// Tail surrogate, check for prior lead surrogate.
int leadChar;
int leadSurrogate;
var leadIndex = cursorBefore - 1;
prevSurrogate &= 0x3FF;
if (leadIndex >= start &&
(leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) {
prevCategory = high(leadChar, prevChar);
(leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) {
prevCategory = high(leadSurrogate, prevSurrogate);
cursorBefore = leadIndex;
} else {
prevCategory = categoryControl;
}
} else {
// Lead surrogate. Check for a following tail surrogate.
int tailSurrogate;
if (cursor < end &&
(tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
cursor += 1;
prevCategory = high(prevSurrogate, tailSurrogate);
} else {
prevCategory = categoryControl;
}
}
state = move(stateCAny, prevCategory);
return cursorBefore;
Expand Down Expand Up @@ -206,18 +212,19 @@ class BackBreaks {
void step() {
assert(cursor > start);
var char = base.codeUnitAt(--cursor);
if (char & 0xFC00 != 0xDC00) {
var surrogate = char ^ 0xDC00;
if (surrogate > 0x3FF) {
var category = low(char);
state = moveBack(state, category);
return;
}
// Found tail surrogate, check for prior lead surrogate.
// The category of an unpaired tail surrogate is Control.
int category;
int prevChar;
int prevSurrogate;
if (cursor >= start &&
(prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) {
category = high(prevChar, char);
(prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) {
category = high(prevSurrogate, surrogate);
} else {
category = categoryControl;
cursor++;
Expand Down Expand Up @@ -342,21 +349,23 @@ int previousBreak(String text, int start, int end, int index) {
if (start < index && index < end) {
var cursorBefore = index;
var nextChar = text.codeUnitAt(index);
var nextSurrogate = nextChar ^ 0xD800;
var category = categoryControl;
if (nextChar & 0xF800 != 0xD800) {
if (nextSurrogate > 0x7FF) {
category = low(nextChar);
} else if (nextChar & 0xFC00 == 0xD800) {
} else if (nextSurrogate <= 0x3FF) {
var indexAfter = index + 1;
if (indexAfter < end) {
var secondChar = text.codeUnitAt(indexAfter);
if (secondChar & 0xFC00 == 0xDC00) {
category = high(nextChar, secondChar);
var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00;
if (secondSurrogate <= 0x3FF) {
category = high(nextSurrogate, secondSurrogate);
}
}
} else {
var prevChar = text.codeUnitAt(index - 1);
if (prevChar & 0xFC00 == 0xD800) {
category = high(prevChar, nextChar);
var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800;
nextSurrogate &= 0x3FF;
if (prevSurrogate <= 0x3FF) {
category = high(prevSurrogate, nextSurrogate);
cursorBefore -= 1;
}
}
Expand Down
10 changes: 6 additions & 4 deletions pkgs/characters/lib/src/grapheme_clusters/table.dart
Original file line number Diff line number Diff line change
Expand Up @@ -1130,6 +1130,7 @@ const String _start = '\u1132\u166c\u166c\u206f\u11c0\u13fb\u166c\u166c\u166c'
@pragma('vm:prefer-inline')
@pragma('wasm:prefer-inline')
int low(int codeUnit) {
assert(codeUnit <= 0xFFFF);
var chunkStart = _start.codeUnitAt(codeUnit >> 5);
var index = chunkStart + (codeUnit & 31);
return _data.codeUnitAt(index);
Expand All @@ -1139,10 +1140,11 @@ int low(int codeUnit) {
@pragma('vm:prefer-inline')
@pragma('wasm:prefer-inline')
int high(int lead, int tail) {
var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8);
var chunkStart = _start.codeUnitAt(offset >> 8);
var index = chunkStart + (tail & 255);
return _data.codeUnitAt(index);
assert(lead <= 0x3FF && tail <= 0x3FF);
var chunkIndex = (tail >> 8) + (lead << 2);
var byteIndex = tail & 255;
var chunkStart = _start.codeUnitAt(2048 + chunkIndex);
return _data.codeUnitAt(chunkStart + byteIndex);
}

const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey'
Expand Down
4 changes: 2 additions & 2 deletions pkgs/characters/pubspec.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: characters
version: 1.4.1-wip
version: 1.4.1
description: >-
String replacement with operations that are Unicode/grapheme cluster aware.
repository: https://github.com/dart-lang/core/tree/main/pkgs/characters
Expand All @@ -14,4 +14,4 @@ environment:

dev_dependencies:
dart_flutter_team_lints: ^3.1.0
test: ^1.16.6
test: ^1.16.0
17 changes: 9 additions & 8 deletions pkgs/characters/test/characters_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -115,28 +115,29 @@ void main([List<String>? args]) {
var zwj = '\u200d'; // U+200D, ZWJ
var rainbow = '\u{1f308}'; // U+1F308, Rainbow. Category Pictogram

var rbflag = '$flag$white$zwj$rainbow';
var string = '-$rbflag-';
var rainbowFlag = '$flag$white$zwj$rainbow';
var string = '-$rainbowFlag-';
var range = CharacterRange.at(string, 1);
expect(range.isEmpty, true);
expect(range.moveNext(), true);
expect(range.current, rbflag);
expect(range.current, rainbowFlag);

range = range = CharacterRange.at(string, 2);
expect(range.isEmpty, false);
expect(range.current, rbflag);
expect(range.current, rainbowFlag);

range = range = CharacterRange.at(string, 0, 2);
expect(range.isEmpty, false);
expect(range.current, '-$rbflag');
expect(range.current, '-$rainbowFlag');

range = range = CharacterRange.at(string, 0, 2);
expect(range.isEmpty, false);
expect(range.current, '-$rbflag');
expect(range.current, '-$rainbowFlag');

range = range = CharacterRange.at(string, 2, '-$rbflag'.length - 1);
range =
range = CharacterRange.at(string, 2, '-$rainbowFlag'.length - 1);
expect(range.isEmpty, false);
expect(range.current, rbflag);
expect(range.current, rainbowFlag);
expect(range.stringBeforeLength, 1);

range = range = CharacterRange.at(string, 0, string.length);
Expand Down
2 changes: 1 addition & 1 deletion pkgs/characters/test/src/unicode_tests.dart
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ String testDescription(List<String> expected) {
int categoryOf(int codePoint) {
if (codePoint < 0x10000) return low(codePoint);
var nonBmpOffset = codePoint - 0x10000;
return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff));
return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff);
}

String partCategories(List<String> parts) {
Expand Down
95 changes: 61 additions & 34 deletions pkgs/characters/tool/benchmark.dart
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,14 @@ import '../test/src/unicode_grapheme_tests.dart';
import '../test/src/various_tests.dart';

// Low-level benchmark of the grapheme cluster step functions.
// Use ../benchmark/benchmark.dart for the more high-level `Characters`
// methods.

void main(List<String> args) {
var count = 5;
if (args.isNotEmpty) {
count = int.parse(args[0]);
}
var gcsf = 0;
var gcsb = 0;

var text = genesis +
hangul +
genesis +
Expand All @@ -28,66 +27,94 @@ void main(List<String> args) {
recJoin(zalgo);
var codeUnits = text.length;
var codePoints = text.runes.length;
// Warmup.
var gcSumForward = benchForward(text, -1, codePoints, codeUnits, 150);
var gcSumBackwards = benchBackward(text, -1, codePoints, codeUnits, 150);
if (gcSumForward != gcSumBackwards) {
print(
'ERROR: Did not count the same number of grapheme clusters: '
'$gcSumForward forward vs. $gcSumBackwards backward.',
);
return;
}

for (var i = 0; i < count; i++) {
gcsf = benchForward(text, i, codePoints, codeUnits);
gcsb = benchBackward(text, i, codePoints, codeUnits);
gcSumForward = benchForward(text, i, codePoints, codeUnits, 1500);
gcSumBackwards = benchBackward(text, i, codePoints, codeUnits, 1500);
}
print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.');
if (gcsf != gcsb) {
if (gcSumForward != gcSumBackwards) {
print(
'ERROR: Did not count the same number of grapheme clusters: '
'$gcsf forward vs. $gcsb backward.',
'$gcSumForward forward vs. $gcSumBackwards backward.',
);
} else {
print('Total: $gcsf gc, $codePoints cp, $codeUnits cu');
print('Avg ${(codePoints / gcsf).toStringAsFixed(3)} cp/gc');
print('Avg ${(codeUnits / gcsf).toStringAsFixed(3)} cu/gc');
var surrogates = codeUnits - codePoints;
print(
'Total: $gcSumForward gc, $codePoints cp, $codeUnits cu, '
'$surrogates surrogates '
'(${(surrogates / codePoints * 100).toStringAsFixed(3)}%)',
);
print('Avg ${(codePoints / gcSumForward).toStringAsFixed(3)} cp/gc');
print('Avg ${(codeUnits / gcSumForward).toStringAsFixed(3)} cu/gc');
}
}

/// Flattens [texts] into one string.
///
/// The strings of each inner list are concatenated with no separator, and
/// the resulting pieces are then joined with a single newline between them.
String recJoin(Iterable<List<String>> texts) {
  var lines = [for (var parts in texts) parts.join()];
  return lines.join('\n');
}

int benchForward(String text, int i, int cp, int cu) {
int benchForward(String text, int round, int cp, int cu, int limit) {
var n = 0;
var step = 10;
var gc = 0;
var e = 0;
var sw = Stopwatch()..start();
do {
var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
while (breaks.nextBreak() >= 0) {
gc++;
for (var i = 0; i < step; i++) {
var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
while (breaks.nextBreak() >= 0) {
gc++;
}
}
e = sw.elapsedMilliseconds;
n++;
} while (e < 2000);
print(
'Forward #$i: ${(gc / e).round()} gc/ms, '
'${(n * cp / e).round()} cp/ms, '
'${(n * cu / e).round()} cu/ms, '
'$n rounds',
);
n += step;
step += step;
} while (e < limit);
if (limit > 500) {
print(
'Forward #$round: ${(gc / e).round()} gc/ms, '
'${(n * cp / e).round()} cp/ms, '
'${(n * cu / e).round()} cu/ms, '
'$n rounds in $e ms',
);
}
return gc ~/ n;
}

int benchBackward(String text, int i, int cp, int cu) {
int benchBackward(String text, int round, int cp, int cu, int limit) {
var n = 0;
var step = 10;
var gc = 0;
var e = 0;
var sw = Stopwatch()..start();
do {
var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
while (breaks.nextBreak() >= 0) {
gc++;
for (var i = 0; i < step; i++) {
var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
while (breaks.nextBreak() >= 0) {
gc++;
}
}
e = sw.elapsedMilliseconds;
n++;
} while (e < 2000);
print(
'Backward #$i: ${(gc / e).round()} gc/ms, '
'${(n * cp / e).round()} cp/ms, '
'${(n * cu / e).round()} cu/ms, '
'$n rounds',
);
n += step;
step += step;
} while (e < limit);
if (limit > 500) {
print(
'Backward #$round: ${(gc / e).round()} gc/ms, '
'${(n * cp / e).round()} cp/ms, '
'${(n * cu / e).round()} cu/ms, '
'$n rounds in $e ms',
);
}
return gc ~/ n;
}
Loading