Skip to content

Commit cd35c99

Browse files
Lexer: use standard JS functions to handle Unicode (#3322)
1 parent 7da9cd3 commit cd35c99

File tree

1 file changed

+12
-35
lines changed

1 file changed

+12
-35
lines changed

src/language/lexer.ts

Lines changed: 12 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -144,17 +144,6 @@ function isTrailingSurrogate(code: number): boolean {
144144
return code >= 0xdc00 && code <= 0xdfff;
145145
}
146146

147-
function encodeSurrogatePair(point: number): string {
148-
return String.fromCharCode(
149-
0xd800 | ((point - 0x10000) >> 10), // Leading Surrogate
150-
0xdc00 | ((point - 0x10000) & 0x3ff), // Trailing Surrogate
151-
);
152-
}
153-
154-
function decodeSurrogatePair(leading: number, trailing: number): number {
155-
return 0x10000 + (((leading & 0x03ff) << 10) | (trailing & 0x03ff));
156-
}
157-
158147
/**
159148
* Prints the code point (or end of file reference) at a given location in a
160149
* source for use in error messages.
@@ -163,22 +152,18 @@ function decodeSurrogatePair(leading: number, trailing: number): number {
163152
* code point form (ie. U+1234).
164153
*/
165154
function printCodePointAt(lexer: Lexer, location: number): string {
166-
const body = lexer.source.body;
167-
if (location >= body.length) {
155+
const code = lexer.source.body.codePointAt(location);
156+
157+
if (code === undefined) {
168158
return TokenKind.EOF;
159+
} else if (code >= 0x0020 && code <= 0x007e) {
160+
// Printable ASCII
161+
const char = String.fromCodePoint(code);
162+
return char === '"' ? "'\"'" : `"${char}"`;
169163
}
170-
const code = body.charCodeAt(location);
171-
// Printable ASCII
172-
if (code >= 0x0020 && code <= 0x007e) {
173-
return code === 0x0022 ? "'\"'" : `"${body[location]}"`;
174-
}
164+
175165
// Unicode code point
176-
const point = isSupplementaryCodePoint(body, location)
177-
? decodeSurrogatePair(code, body.charCodeAt(location + 1))
178-
: code;
179-
const zeroPad =
180-
point > 0xfff ? '' : point > 0xff ? '0' : point > 0xf ? '00' : '000';
181-
return `U+${zeroPad}${point.toString(16).toUpperCase()}`;
166+
return 'U+' + code.toString(16).toUpperCase().padStart(4, '0');
182167
}
183168

184169
/**
@@ -596,15 +581,7 @@ function readEscapedUnicodeVariableWidth(
596581
if (size < 5 || !isUnicodeScalarValue(point)) {
597582
break;
598583
}
599-
// JavaScript defines strings as a sequence of UTF-16 code units and
600-
// encodes Unicode code points above U+FFFF using a surrogate pair.
601-
return {
602-
value:
603-
point <= 0xffff
604-
? String.fromCharCode(point)
605-
: encodeSurrogatePair(point),
606-
size,
607-
};
584+
return { value: String.fromCodePoint(point), size };
608585
}
609586
// Append this hex digit to the code point.
610587
point = (point << 4) | readHexDigit(code);
@@ -631,7 +608,7 @@ function readEscapedUnicodeFixedWidth(
631608
const code = read16BitHexCode(body, position + 2);
632609

633610
if (isUnicodeScalarValue(code)) {
634-
return { value: String.fromCharCode(code), size: 6 };
611+
return { value: String.fromCodePoint(code), size: 6 };
635612
}
636613

637614
// GraphQL allows JSON-style surrogate pair escape sequences, but only when
@@ -650,7 +627,7 @@ function readEscapedUnicodeFixedWidth(
650627
// include both codes into the JavaScript string value. Had JavaScript
651628
// not been internally based on UTF-16, then this surrogate pair would
652629
// be decoded to retrieve the supplementary code point.
653-
return { value: String.fromCharCode(code, trailingCode), size: 12 };
630+
return { value: String.fromCodePoint(code, trailingCode), size: 12 };
654631
}
655632
}
656633
}

0 commit comments

Comments
 (0)