Skip to content

Commit fb93992

Browse files
committed
Optimize surrogate decoding.
Use `char ^ 0xD800 <= 0x3FF` to check if a char code is a lead surrogate. That avoids doing a later `& 0x3FF` to get rid of the top bits. Similarly for tail surrogates. This ensures that the `high` function gets values without high bits. Also optimize that function to reduce dependency depth and try to hit `base + (something < small)` expressions that can be optimized into a single x64 address computation. Gives a ~7% increase on backwards traversal and a 38% increase for forward traversal, based on tool/benchmark.dart compiled with `dart compile exe`.
1 parent af37fe5 commit fb93992

File tree

8 files changed

+72
-48
lines changed

8 files changed

+72
-48
lines changed

pkgs/characters/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.4.1
2+
3+
* Performance improvement for non-BMP characters.
4+
15
## 1.4.0
26

37
* Updated to use Unicode 16.0.0.

pkgs/characters/lib/src/characters_impl.dart

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -489,13 +489,14 @@ class StringCharacterRange implements CharacterRange {
489489
var index = _end;
490490
while (index < _string.length) {
491491
var char = _string.codeUnitAt(index);
492+
var surrogateBits = char ^ 0xD800;
492493
var category = categoryControl;
493494
var nextIndex = index + 1;
494-
if (char & 0xFC00 != 0xD800) {
495+
if (surrogateBits >= 0x3FF) {
495496
category = low(char);
496497
} else if (nextIndex < _string.length) {
497-
var nextChar = _string.codeUnitAt(nextIndex);
498-
if (nextChar & 0xFC00 == 0xDC00) {
498+
var nextChar = _string.codeUnitAt(nextIndex) ^ 0xDC00;
499+
if (nextChar <= 0x3FF) {
499500
nextIndex += 1;
500501
category = high(char, nextChar);
501502
}

pkgs/characters/lib/src/grapheme_clusters/breaks.dart

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,17 @@ class Breaks {
7676
void step() {
7777
assert(cursor < end);
7878
var char = base.codeUnitAt(cursor++);
79-
if (char & 0xFC00 != 0xD800) {
79+
var surrogate = char ^ 0xD800;
80+
if (surrogate >= 0x3FF) {
8081
state = move(state, low(char));
8182
return;
8283
}
8384
// The category of an unpaired lead surrogate is Control.
8485
int category;
85-
int nextChar;
86+
int nextSurrogate;
8687
if (cursor < end &&
87-
(nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
88-
category = high(char, nextChar);
88+
(nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
89+
category = high(surrogate, nextSurrogate);
8990
cursor++;
9091
} else {
9192
category = categoryControl;
@@ -112,27 +113,28 @@ class Breaks {
112113
}
113114
var cursorBefore = cursor - 1;
114115
var prevChar = base.codeUnitAt(cursorBefore);
116+
var prevSurrogate = prevChar ^ 0xD800;
115117
int prevCategory;
116-
if (prevChar & 0xF800 != 0xD800) {
118+
if (prevSurrogate > 0x7FF) {
117119
// Not surrogate.
118120
prevCategory = low(prevChar);
119-
} else if (prevChar & 0xFC00 == 0xD800) {
121+
} else if (prevSurrogate <= 0x3FF) {
120122
// Lead surrogate. Check for a following tail surrogate.
121-
int tailChar;
123+
int tailSurrogate;
122124
if (cursor < end &&
123-
(tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
125+
(tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
124126
cursor += 1;
125-
prevCategory = high(prevChar, tailChar);
127+
prevCategory = high(prevSurrogate, tailSurrogate);
126128
} else {
127129
prevCategory = categoryControl;
128130
}
129131
} else {
130132
// Tail surrogate, check for prior lead surrogate.
131-
int leadChar;
133+
int leadSurrogate;
132134
var leadIndex = cursorBefore - 1;
133135
if (leadIndex >= start &&
134-
(leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) {
135-
prevCategory = high(leadChar, prevChar);
136+
(leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) {
137+
prevCategory = high(leadSurrogate, prevSurrogate);
136138
cursorBefore = leadIndex;
137139
} else {
138140
prevCategory = categoryControl;
@@ -206,18 +208,19 @@ class BackBreaks {
206208
void step() {
207209
assert(cursor > start);
208210
var char = base.codeUnitAt(--cursor);
209-
if (char & 0xFC00 != 0xDC00) {
211+
var surrogate = char ^ 0xDC00;
212+
if (surrogate > 0x3FF) {
210213
var category = low(char);
211214
state = moveBack(state, category);
212215
return;
213216
}
214217
// Found tail surrogate, check for prior lead surrogate.
215218
// The category of an unpaired tail surrogate is Control.
216219
int category;
217-
int prevChar;
220+
int prevSurrogate;
218221
if (cursor >= start &&
219-
(prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) {
220-
category = high(prevChar, char);
222+
(prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) {
223+
category = high(prevSurrogate, surrogate);
221224
} else {
222225
category = categoryControl;
223226
cursor++;
@@ -339,21 +342,23 @@ int previousBreak(String text, int start, int end, int index) {
339342
if (start < index && index < end) {
340343
var cursorBefore = index;
341344
var nextChar = text.codeUnitAt(index);
345+
var nextSurrogate = nextChar ^ 0xD800;
342346
var category = categoryControl;
343-
if (nextChar & 0xF800 != 0xD800) {
347+
if (nextSurrogate > 0x7FF) {
344348
category = low(nextChar);
345-
} else if (nextChar & 0xFC00 == 0xD800) {
349+
} else if (nextSurrogate <= 0x3FF) {
346350
var indexAfter = index + 1;
347351
if (indexAfter < end) {
348-
var secondChar = text.codeUnitAt(indexAfter);
349-
if (secondChar & 0xFC00 == 0xDC00) {
350-
category = high(nextChar, secondChar);
352+
var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00;
353+
if (secondSurrogate <= 0x3FF) {
354+
category = high(nextChar, secondSurrogate);
351355
}
352356
}
353357
} else {
354-
var prevChar = text.codeUnitAt(index - 1);
355-
if (prevChar & 0xFC00 == 0xD800) {
356-
category = high(prevChar, nextChar);
358+
var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800;
359+
nextSurrogate &= 0x3FF;
360+
if (prevSurrogate <= 0x3FF) {
361+
category = high(prevSurrogate, nextSurrogate);
357362
cursorBefore -= 1;
358363
}
359364
}

pkgs/characters/lib/src/grapheme_clusters/table.dart

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,10 +1139,10 @@ int low(int codeUnit) {
11391139
@pragma('vm:prefer-inline')
11401140
@pragma('wasm:prefer-inline')
11411141
int high(int lead, int tail) {
1142-
var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8);
1143-
var chunkStart = _start.codeUnitAt(offset >> 8);
1144-
var index = chunkStart + (tail & 255);
1145-
return _data.codeUnitAt(index);
1142+
var offset = (tail >> 8) + (lead << 2);
1143+
tail &= 255;
1144+
var chunkStart = _start.codeUnitAt(2048 + offset);
1145+
return _data.codeUnitAt(chunkStart + tail);
11461146
}
11471147

11481148
const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey'

pkgs/characters/pubspec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: characters
2-
version: 1.4.0
2+
version: 1.4.1
33
description: >-
44
String replacement with operations that are Unicode/grapheme cluster aware.
55
repository: https://github.com/dart-lang/core/tree/main/pkgs/characters

pkgs/characters/test/src/unicode_tests.dart

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ String testDescription(List<String> expected) {
2929
int categoryOf(int codePoint) {
3030
if (codePoint < 0x10000) return low(codePoint);
3131
var nonBmpOffset = codePoint - 0x10000;
32-
return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff));
32+
return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff);
3333
}
3434

3535
String partCategories(List<String> parts) {

pkgs/characters/tool/bin/generate_tables.dart

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -265,16 +265,28 @@ int $name(int lead, int tail) {
265265
''';
266266
}
267267
var shift = chunkSize.bitLength - 1;
268-
var indexVar = chunkSize < 1024 ? 'tail' : 'offset';
269-
return '''
268+
assert(shift <= 10);
269+
if (shift < 10) {
270+
return '''
270271
$preferInline
271272
int $name(int lead, int tail) {
272-
var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift);
273-
var chunkStart = $startName.codeUnitAt(offset >> $shift);
274-
var index = chunkStart + ($indexVar & ${chunkSize - 1});
275-
return $dataName.codeUnitAt(index);
273+
var offset = (tail >> $shift) + (lead << ${10 - shift});
274+
tail &= ${chunkSize - 1};
275+
var chunkStart = $startName.codeUnitAt($startOffset + offset);
276+
return $dataName.codeUnitAt(chunkStart + tail);
277+
}
278+
''';
279+
} else {
280+
assert(shift == 10);
281+
return '''
282+
$preferInline
283+
int $name(int lead, int tail) {
284+
var chunkStart = $startName.codeUnitAt($startOffset + lead);
285+
return $dataName.codeUnitAt(chunkStart + tail);
276286
}
277287
''';
288+
}
289+
// Add code if shift > 10 ever becomes optimal for table size.
278290
}
279291

280292
// -----------------------------------------------------------------------------

pkgs/characters/tool/src/string_literal_writer.dart

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,12 @@ class StringLiteralWriter {
4040
/// Adds a single UTF-16 code unit.
4141
void add(int codeUnit) {
4242
// Always escape: `\n`, `\r`, `'`, `$` and `\`, plus anything the user wants.
43-
if (_escape(codeUnit) ||
44-
codeUnit == 0x24 ||
45-
codeUnit == 0x27 ||
46-
codeUnit == 0x5c ||
47-
codeUnit == 0x0a ||
48-
codeUnit == 0x0d) {
43+
if (_escape(codeUnit) || // Anything the user wants encoded.
44+
codeUnit == 0x24 /* $ */ ||
45+
codeUnit == 0x27 /* ' */ ||
46+
codeUnit == 0x5c /* \ */ ||
47+
codeUnit == 0x0a /* \n */ ||
48+
codeUnit == 0x0d /* \r */) {
4949
_writeEscape(codeUnit);
5050
return;
5151
}
@@ -56,6 +56,9 @@ class StringLiteralWriter {
5656
buffer.writeCharCode(codeUnit);
5757
}
5858

59+
/// Writes an escape for the [codeUnit].
60+
///
61+
/// Is only called for characters that need escaping.
5962
void _writeEscape(int codeUnit) {
6063
var replacement = _escapeCache[codeUnit];
6164
if (replacement == null) {
@@ -80,9 +83,8 @@ class StringLiteralWriter {
8083
replacement = r'\$';
8184
} else if (codeUnit == "'".codeUnitAt(0)) {
8285
replacement = r"\'";
83-
}
84-
if (codeUnit == r''.codeUnitAt(0)) {
85-
replacement = r'\';
86+
} else if (codeUnit == r'\'.codeUnitAt(0)) {
87+
replacement = r'\\';
8688
} else {
8789
replacement = r'\x' + codeUnit.toRadixString(16);
8890
}

0 commit comments

Comments
 (0)