Skip to content

Commit 0c4d7b3

Browse files
committed
Optimize surrogate decoding.
Use `char ^ 0xD800 <= 0x3FF` to check whether a char code is a lead surrogate. That avoids doing a later `& 0x3FF` to get rid of the top bits. Similarly for tail surrogates, using `char ^ 0xDC00 <= 0x3FF`. This ensures that the `high` function gets values without high bits. Also optimize that function to reduce dependency depth and to try to hit `base + (something << small)` expressions that can be optimized into a single x64 address computation. Gives a ~7% speed increase for backwards traversal and a 38% speed increase for forward traversal, based on tool/benchmark.dart compiled with `dart compile exe`.
1 parent dc97530 commit 0c4d7b3

File tree

8 files changed

+71
-50
lines changed

8 files changed

+71
-50
lines changed

pkgs/characters/CHANGELOG.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 1.4.1-wip
1+
## 1.4.1
22

3-
- Run `dart format` with the new style.
3+
* Run `dart format` with the new style.
4+
* Performance improvement for non-BMP characters.
45

56
## 1.4.0
67

pkgs/characters/lib/src/characters_impl.dart

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -509,13 +509,14 @@ class StringCharacterRange implements CharacterRange {
509509
var index = _end;
510510
while (index < _string.length) {
511511
var char = _string.codeUnitAt(index);
512+
var surrogateBits = char ^ 0xD800;
512513
var category = categoryControl;
513514
var nextIndex = index + 1;
514-
if (char & 0xFC00 != 0xD800) {
515+
if (surrogateBits >= 0x3FF) {
515516
category = low(char);
516517
} else if (nextIndex < _string.length) {
517-
var nextChar = _string.codeUnitAt(nextIndex);
518-
if (nextChar & 0xFC00 == 0xDC00) {
518+
var nextChar = _string.codeUnitAt(nextIndex) ^ 0xDC00;
519+
if (nextChar <= 0x3FF) {
519520
nextIndex += 1;
520521
category = high(char, nextChar);
521522
}

pkgs/characters/lib/src/grapheme_clusters/breaks.dart

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,17 @@ class Breaks {
7676
void step() {
7777
assert(cursor < end);
7878
var char = base.codeUnitAt(cursor++);
79-
if (char & 0xFC00 != 0xD800) {
79+
var surrogate = char ^ 0xD800;
80+
if (surrogate >= 0x3FF) {
8081
state = move(state, low(char));
8182
return;
8283
}
8384
// The category of an unpaired lead surrogate is Control.
8485
int category;
85-
int nextChar;
86+
int nextSurrogate;
8687
if (cursor < end &&
87-
(nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
88-
category = high(char, nextChar);
88+
(nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
89+
category = high(surrogate, nextSurrogate);
8990
cursor++;
9091
} else {
9192
category = categoryControl;
@@ -112,27 +113,28 @@ class Breaks {
112113
}
113114
var cursorBefore = cursor - 1;
114115
var prevChar = base.codeUnitAt(cursorBefore);
116+
var prevSurrogate = prevChar ^ 0xD800;
115117
int prevCategory;
116-
if (prevChar & 0xF800 != 0xD800) {
118+
if (prevSurrogate > 0x7FF) {
117119
// Not surrogate.
118120
prevCategory = low(prevChar);
119-
} else if (prevChar & 0xFC00 == 0xD800) {
121+
} else if (prevSurrogate <= 0x3FF) {
120122
// Lead surrogate. Check for a following tail surrogate.
121-
int tailChar;
123+
int tailSurrogate;
122124
if (cursor < end &&
123-
(tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
125+
(tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
124126
cursor += 1;
125-
prevCategory = high(prevChar, tailChar);
127+
prevCategory = high(prevSurrogate, tailSurrogate);
126128
} else {
127129
prevCategory = categoryControl;
128130
}
129131
} else {
130132
// Tail surrogate, check for prior lead surrogate.
131-
int leadChar;
133+
int leadSurrogate;
132134
var leadIndex = cursorBefore - 1;
133135
if (leadIndex >= start &&
134-
(leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) {
135-
prevCategory = high(leadChar, prevChar);
136+
(leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) {
137+
prevCategory = high(leadSurrogate, prevSurrogate);
136138
cursorBefore = leadIndex;
137139
} else {
138140
prevCategory = categoryControl;
@@ -206,18 +208,19 @@ class BackBreaks {
206208
void step() {
207209
assert(cursor > start);
208210
var char = base.codeUnitAt(--cursor);
209-
if (char & 0xFC00 != 0xDC00) {
211+
var surrogate = char ^ 0xDC00;
212+
if (surrogate > 0x3FF) {
210213
var category = low(char);
211214
state = moveBack(state, category);
212215
return;
213216
}
214217
// Found tail surrogate, check for prior lead surrogate.
215218
// The category of an unpaired tail surrogate is Control.
216219
int category;
217-
int prevChar;
220+
int prevSurrogate;
218221
if (cursor >= start &&
219-
(prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) {
220-
category = high(prevChar, char);
222+
(prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) {
223+
category = high(prevSurrogate, surrogate);
221224
} else {
222225
category = categoryControl;
223226
cursor++;
@@ -342,21 +345,23 @@ int previousBreak(String text, int start, int end, int index) {
342345
if (start < index && index < end) {
343346
var cursorBefore = index;
344347
var nextChar = text.codeUnitAt(index);
348+
var nextSurrogate = nextChar ^ 0xD800;
345349
var category = categoryControl;
346-
if (nextChar & 0xF800 != 0xD800) {
350+
if (nextSurrogate > 0x7FF) {
347351
category = low(nextChar);
348-
} else if (nextChar & 0xFC00 == 0xD800) {
352+
} else if (nextSurrogate <= 0x3FF) {
349353
var indexAfter = index + 1;
350354
if (indexAfter < end) {
351-
var secondChar = text.codeUnitAt(indexAfter);
352-
if (secondChar & 0xFC00 == 0xDC00) {
353-
category = high(nextChar, secondChar);
355+
var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00;
356+
if (secondSurrogate <= 0x3FF) {
357+
category = high(nextChar, secondSurrogate);
354358
}
355359
}
356360
} else {
357-
var prevChar = text.codeUnitAt(index - 1);
358-
if (prevChar & 0xFC00 == 0xD800) {
359-
category = high(prevChar, nextChar);
361+
var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800;
362+
nextSurrogate &= 0x3FF;
363+
if (prevSurrogate <= 0x3FF) {
364+
category = high(prevSurrogate, nextSurrogate);
360365
cursorBefore -= 1;
361366
}
362367
}

pkgs/characters/lib/src/grapheme_clusters/table.dart

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,10 +1139,10 @@ int low(int codeUnit) {
11391139
@pragma('vm:prefer-inline')
11401140
@pragma('wasm:prefer-inline')
11411141
int high(int lead, int tail) {
1142-
var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8);
1143-
var chunkStart = _start.codeUnitAt(offset >> 8);
1144-
var index = chunkStart + (tail & 255);
1145-
return _data.codeUnitAt(index);
1142+
var offset = (tail >> 8) + (lead << 2);
1143+
tail &= 255;
1144+
var chunkStart = _start.codeUnitAt(2048 + offset);
1145+
return _data.codeUnitAt(chunkStart + tail);
11461146
}
11471147

11481148
const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey'

pkgs/characters/pubspec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: characters
2-
version: 1.4.1-wip
2+
version: 1.4.1
33
description: >-
44
String replacement with operations that are Unicode/grapheme cluster aware.
55
repository: https://github.com/dart-lang/core/tree/main/pkgs/characters

pkgs/characters/test/src/unicode_tests.dart

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ String testDescription(List<String> expected) {
3131
int categoryOf(int codePoint) {
3232
if (codePoint < 0x10000) return low(codePoint);
3333
var nonBmpOffset = codePoint - 0x10000;
34-
return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff));
34+
return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff);
3535
}
3636

3737
String partCategories(List<String> parts) {

pkgs/characters/tool/bin/generate_tables.dart

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -331,16 +331,28 @@ int $name(int lead, int tail) {
331331
''';
332332
}
333333
var shift = chunkSize.bitLength - 1;
334-
var indexVar = chunkSize < 1024 ? 'tail' : 'offset';
335-
return '''
334+
assert(shift <= 10);
335+
if (shift < 10) {
336+
return '''
336337
$preferInline
337338
int $name(int lead, int tail) {
338-
var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + ($startOffset << $shift);
339-
var chunkStart = $startName.codeUnitAt(offset >> $shift);
340-
var index = chunkStart + ($indexVar & ${chunkSize - 1});
341-
return $dataName.codeUnitAt(index);
339+
var offset = (tail >> $shift) + (lead << ${10 - shift});
340+
tail &= ${chunkSize - 1};
341+
var chunkStart = $startName.codeUnitAt($startOffset + offset);
342+
return $dataName.codeUnitAt(chunkStart + tail);
343+
}
344+
''';
345+
} else {
346+
assert(shift == 10);
347+
return '''
348+
$preferInline
349+
int $name(int lead, int tail) {
350+
var chunkStart = $startName.codeUnitAt($startOffset + lead);
351+
return $dataName.codeUnitAt(chunkStart + tail);
342352
}
343353
''';
354+
}
355+
// Add code if shift > 10 ever becomes optimal for table size.
344356
}
345357

346358
// -----------------------------------------------------------------------------

pkgs/characters/tool/src/string_literal_writer.dart

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,12 @@ class StringLiteralWriter {
4343
/// Adds a single UTF-16 code unit.
4444
void add(int codeUnit) {
4545
// Always escape: `\n`, `\r`, `'`, `$` and `\`, plus anything the user wants.
46-
if (_escape(codeUnit) ||
47-
codeUnit == 0x24 ||
48-
codeUnit == 0x27 ||
49-
codeUnit == 0x5c ||
50-
codeUnit == 0x0a ||
51-
codeUnit == 0x0d) {
46+
if (_escape(codeUnit) || // Anything the user wants encoded.
47+
codeUnit == 0x24 /* $ */ ||
48+
codeUnit == 0x27 /* ' */ ||
49+
codeUnit == 0x5c /* \ */ ||
50+
codeUnit == 0x0a /* \n */ ||
51+
codeUnit == 0x0d /* \r */) {
5252
_writeEscape(codeUnit);
5353
return;
5454
}
@@ -59,6 +59,9 @@ class StringLiteralWriter {
5959
buffer.writeCharCode(codeUnit);
6060
}
6161

62+
/// Writes an escape for the [codeUnit].
63+
///
64+
/// Is only called for characters that need escaping.
6265
void _writeEscape(int codeUnit) {
6366
var replacement = _escapeCache[codeUnit];
6467
if (replacement == null) {
@@ -83,9 +86,8 @@ class StringLiteralWriter {
8386
replacement = r'\$';
8487
} else if (codeUnit == "'".codeUnitAt(0)) {
8588
replacement = r"\'";
86-
}
87-
if (codeUnit == r''.codeUnitAt(0)) {
88-
replacement = r'\';
89+
} else if (codeUnit == r'\'.codeUnitAt(0)) {
90+
replacement = r'\\';
8991
} else {
9092
replacement = r'\x' + codeUnit.toRadixString(16);
9193
}

0 commit comments

Comments
 (0)