From 0725a3a74ebb92701db1942e07e988c34799121d Mon Sep 17 00:00:00 2001 From: graphemecluster Date: Wed, 22 May 2024 11:48:34 +0800 Subject: [PATCH 1/3] Correct flags scanning for non-BMP characters (cherry picked from commit e67692acb3fdd068b4e577dc9ad9fa350f2e4ca8) --- src/compiler/scanner.ts | 38 ++++++++++--------- ...egularExpressionWithNonBMPFlags.errors.txt | 23 +++++++++++ .../regularExpressionWithNonBMPFlags.js | 8 ++++ .../regularExpressionWithNonBMPFlags.symbols | 6 +++ .../regularExpressionWithNonBMPFlags.types | 9 +++++ .../regularExpressionWithNonBMPFlags.ts | 3 ++ 6 files changed, 69 insertions(+), 18 deletions(-) create mode 100644 tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt create mode 100644 tests/baselines/reference/regularExpressionWithNonBMPFlags.js create mode 100644 tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols create mode 100644 tests/baselines/reference/regularExpressionWithNonBMPFlags.types create mode 100644 tests/cases/compiler/regularExpressionWithNonBMPFlags.ts diff --git a/src/compiler/scanner.ts b/src/compiler/scanner.ts index 5d93e57ebe660..0e9666dedddfe 100644 --- a/src/compiler/scanner.ts +++ b/src/compiler/scanner.ts @@ -2474,28 +2474,29 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean const isUnterminated = !!(tokenFlags & TokenFlags.Unterminated); const endOfBody = p - (isUnterminated ? 
0 : 1); let regExpFlags = RegularExpressionFlags.None; - while (p < end) { - const ch = charCodeUnchecked(p); - if (!isIdentifierPart(ch, languageVersion)) { + while (true) { + const ch = codePointChecked(p); + if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) { break; } + const size = charSize(ch); if (reportErrors) { - const flag = characterToRegularExpressionFlag(String.fromCharCode(ch)); + const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch)); if (flag === undefined) { - error(Diagnostics.Unknown_regular_expression_flag, p, 1); + error(Diagnostics.Unknown_regular_expression_flag, p, size); } else if (regExpFlags & flag) { - error(Diagnostics.Duplicate_regular_expression_flag, p, 1); + error(Diagnostics.Duplicate_regular_expression_flag, p, size); } else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) { - error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, p, 1); + error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, p, size); } else { regExpFlags |= flag; - checkRegularExpressionFlagAvailable(flag, p); + checkRegularExpressionFlagAvailability(flag, p, size); } } - p++; + p += size; } pos = p; if (reportErrors) { @@ -2763,25 +2764,26 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags { while (true) { - const ch = charCodeChecked(pos); + const ch = codePointChecked(pos); if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) { break; } - const flag = characterToRegularExpressionFlag(String.fromCharCode(ch)); + const size = charSize(ch); + const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch)); if (flag === undefined) { - error(Diagnostics.Unknown_regular_expression_flag, pos, 1); + error(Diagnostics.Unknown_regular_expression_flag, 
pos, size); } else if (currFlags & flag) { - error(Diagnostics.Duplicate_regular_expression_flag, pos, 1); + error(Diagnostics.Duplicate_regular_expression_flag, pos, size); } else if (!(flag & RegularExpressionFlags.Modifiers)) { - error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1); + error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, size); } else { currFlags |= flag; - checkRegularExpressionFlagAvailable(flag, pos); + checkRegularExpressionFlagAvailability(flag, pos, size); } - pos++; + pos += size; } return currFlags; } @@ -3494,10 +3496,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean }); } - function checkRegularExpressionFlagAvailable(flag: RegularExpressionFlags, pos: number) { + function checkRegularExpressionFlagAvailability(flag: RegularExpressionFlags, pos: number, size: number) { const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag) as ScriptTarget | undefined; if (availableFrom && languageVersion < availableFrom) { - error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom)); + error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, size, getNameOfScriptTarget(availableFrom)); } } diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt b/tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt new file mode 100644 index 0000000000000..b91d0d9c12fd1 --- /dev/null +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt @@ -0,0 +1,23 @@ +regularExpressionWithNonBMPFlags.ts(1,23): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(1,25): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(1,28): error TS1499: Unknown regular expression flag. 
+regularExpressionWithNonBMPFlags.ts(1,41): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(1,43): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(1,45): error TS1499: Unknown regular expression flag. + + +==== regularExpressionWithNonBMPFlags.ts (6 errors) ==== + const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; + ~~ +!!! error TS1499: Unknown regular expression flag. + ~~ +!!! error TS1499: Unknown regular expression flag. + ~~ +!!! error TS1499: Unknown regular expression flag. + ~~ +!!! error TS1499: Unknown regular expression flag. + ~~ +!!! error TS1499: Unknown regular expression flag. + ~~ +!!! error TS1499: Unknown regular expression flag. + \ No newline at end of file diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.js b/tests/baselines/reference/regularExpressionWithNonBMPFlags.js new file mode 100644 index 0000000000000..847b74684b459 --- /dev/null +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.js @@ -0,0 +1,8 @@ +//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] //// + +//// [regularExpressionWithNonBMPFlags.ts] +const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; + + +//// [regularExpressionWithNonBMPFlags.js] +const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols b/tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols new file mode 100644 index 0000000000000..29c7a53335550 --- /dev/null +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols @@ -0,0 +1,6 @@ +//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] //// + +=== regularExpressionWithNonBMPFlags.ts === +const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; +>𝘳𝘦𝘨𝘦𝘹 : Symbol(𝘳𝘦𝘨𝘦𝘹, Decl(regularExpressionWithNonBMPFlags.ts, 0, 5)) + diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.types b/tests/baselines/reference/regularExpressionWithNonBMPFlags.types new file mode 100644 index 
0000000000000..5f385d608df02 --- /dev/null +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.types @@ -0,0 +1,9 @@ +//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] //// + +=== regularExpressionWithNonBMPFlags.ts === +const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; +>𝘳𝘦𝘨𝘦𝘹 : RegExp +> : ^^^^^^ +>/(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶 : RegExp +> : ^^^^^^ + diff --git a/tests/cases/compiler/regularExpressionWithNonBMPFlags.ts b/tests/cases/compiler/regularExpressionWithNonBMPFlags.ts new file mode 100644 index 0000000000000..5f85755c0c5c1 --- /dev/null +++ b/tests/cases/compiler/regularExpressionWithNonBMPFlags.ts @@ -0,0 +1,3 @@ +// @target: esnext + +const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; From 406468ca6242f9f168c37de22b80351b1eed78b2 Mon Sep 17 00:00:00 2001 From: graphemecluster Date: Fri, 31 May 2024 01:10:33 +0800 Subject: [PATCH 2/3] Optimization: Lookup by `CharacterCodes` --- src/compiler/scanner.ts | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/compiler/scanner.ts b/src/compiler/scanner.ts index 0de8c10dea9e0..32b519058cc59 100644 --- a/src/compiler/scanner.ts +++ b/src/compiler/scanner.ts @@ -282,16 +282,16 @@ const textToToken = new Map(Object.entries({ "`": SyntaxKind.BacktickToken, })); -const charToRegExpFlag = new Map(Object.entries({ - d: RegularExpressionFlags.HasIndices, - g: RegularExpressionFlags.Global, - i: RegularExpressionFlags.IgnoreCase, - m: RegularExpressionFlags.Multiline, - s: RegularExpressionFlags.DotAll, - u: RegularExpressionFlags.Unicode, - v: RegularExpressionFlags.UnicodeSets, - y: RegularExpressionFlags.Sticky, -})); +const charCodeToRegExpFlag = new Map([ + [CharacterCodes.d, RegularExpressionFlags.HasIndices], + [CharacterCodes.g, RegularExpressionFlags.Global], + [CharacterCodes.i, RegularExpressionFlags.IgnoreCase], + [CharacterCodes.m, RegularExpressionFlags.Multiline], + [CharacterCodes.s, RegularExpressionFlags.DotAll], + [CharacterCodes.u, 
RegularExpressionFlags.Unicode], + [CharacterCodes.v, RegularExpressionFlags.UnicodeSets], + [CharacterCodes.y, RegularExpressionFlags.Sticky], +]); const regExpFlagToFirstAvailableLanguageVersion = new Map([ [RegularExpressionFlags.HasIndices, LanguageFeatureMinimumTarget.RegularExpressionFlagsHasIndices], @@ -394,8 +394,8 @@ function isUnicodeIdentifierPart(code: number, languageVersion: ScriptTarget | u lookupInUnicodeMap(code, unicodeES5IdentifierPart); } -function makeReverseMap(source: Map<string, number>): string[] { - const result: string[] = []; +function makeReverseMap<T>(source: Map<T, number>): T[] { + const result: T[] = []; source.forEach((value, name) => { result[value] = name; }); @@ -416,16 +416,16 @@ export function stringToToken(s: string): SyntaxKind | undefined { return textToToken.get(s); } -const regExpFlagChars = makeReverseMap(charToRegExpFlag); +const regExpFlagCharCodes = makeReverseMap(charCodeToRegExpFlag); /** @internal */ -export function regularExpressionFlagToCharacter(f: RegularExpressionFlags): string | undefined { - return regExpFlagChars[f]; +export function regularExpressionFlagToCharacterCode(f: RegularExpressionFlags): CharacterCodes | undefined { + return regExpFlagCharCodes[f]; } /** @internal */ -export function characterToRegularExpressionFlag(c: string): RegularExpressionFlags | undefined { - return charToRegExpFlag.get(c); +export function characterCodeToRegularExpressionFlag(ch: CharacterCodes): RegularExpressionFlags | undefined { + return charCodeToRegExpFlag.get(ch); } /** @internal */ @@ -2564,7 +2564,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean } const size = charSize(ch); if (reportErrors) { - const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch)); + const flag = characterCodeToRegularExpressionFlag(ch); if (flag === undefined) { error(Diagnostics.Unknown_regular_expression_flag, pos, size); } @@ -2849,7 +2849,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: 
boolean break; } const size = charSize(ch); - const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch)); + const flag = characterCodeToRegularExpressionFlag(ch); if (flag === undefined) { error(Diagnostics.Unknown_regular_expression_flag, pos, size); } From 723b1d68c6c7409ddac5a9267efee9089324d283 Mon Sep 17 00:00:00 2001 From: Ron Buckton Date: Sat, 1 Jun 2024 00:49:57 +0800 Subject: [PATCH 3/3] Add remarks to test case file --- ...regularExpressionWithNonBMPFlags.errors.txt | 18 ++++++++++++------ .../regularExpressionWithNonBMPFlags.js | 12 ++++++++++++ .../regularExpressionWithNonBMPFlags.symbols | 8 +++++++- .../regularExpressionWithNonBMPFlags.types | 6 ++++++ .../regularExpressionWithNonBMPFlags.ts | 6 ++++++ 5 files changed, 43 insertions(+), 7 deletions(-) diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt b/tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt index b91d0d9c12fd1..9ad9c43caa0b7 100644 --- a/tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.errors.txt @@ -1,12 +1,18 @@ -regularExpressionWithNonBMPFlags.ts(1,23): error TS1499: Unknown regular expression flag. -regularExpressionWithNonBMPFlags.ts(1,25): error TS1499: Unknown regular expression flag. -regularExpressionWithNonBMPFlags.ts(1,28): error TS1499: Unknown regular expression flag. -regularExpressionWithNonBMPFlags.ts(1,41): error TS1499: Unknown regular expression flag. -regularExpressionWithNonBMPFlags.ts(1,43): error TS1499: Unknown regular expression flag. -regularExpressionWithNonBMPFlags.ts(1,45): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(7,23): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(7,25): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(7,28): error TS1499: Unknown regular expression flag. 
+regularExpressionWithNonBMPFlags.ts(7,41): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(7,43): error TS1499: Unknown regular expression flag. +regularExpressionWithNonBMPFlags.ts(7,45): error TS1499: Unknown regular expression flag. ==== regularExpressionWithNonBMPFlags.ts (6 errors) ==== + // The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including: + // - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S) + // - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I) + // - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M) + // + // See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; ~~ !!! error TS1499: Unknown regular expression flag. diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.js b/tests/baselines/reference/regularExpressionWithNonBMPFlags.js index 847b74684b459..a6581efa2827e 100644 --- a/tests/baselines/reference/regularExpressionWithNonBMPFlags.js +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.js @@ -1,8 +1,20 @@ //// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] //// //// [regularExpressionWithNonBMPFlags.ts] +// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including: +// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S) +// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I) +// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M) +// +// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; //// [regularExpressionWithNonBMPFlags.js] +// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including: +// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S) +// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I) +// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M) +// +// See 
https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols b/tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols index 29c7a53335550..af3ec681f3252 100644 --- a/tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols @@ -1,6 +1,12 @@ //// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] //// === regularExpressionWithNonBMPFlags.ts === +// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including: +// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S) +// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I) +// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M) +// +// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; ->𝘳𝘦𝘨𝘦𝘹 : Symbol(𝘳𝘦𝘨𝘦𝘹, Decl(regularExpressionWithNonBMPFlags.ts, 0, 5)) +>𝘳𝘦𝘨𝘦𝘹 : Symbol(𝘳𝘦𝘨𝘦𝘹, Decl(regularExpressionWithNonBMPFlags.ts, 6, 5)) diff --git a/tests/baselines/reference/regularExpressionWithNonBMPFlags.types b/tests/baselines/reference/regularExpressionWithNonBMPFlags.types index 5f385d608df02..cffacb8589261 100644 --- a/tests/baselines/reference/regularExpressionWithNonBMPFlags.types +++ b/tests/baselines/reference/regularExpressionWithNonBMPFlags.types @@ -1,6 +1,12 @@ //// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] //// === regularExpressionWithNonBMPFlags.ts === +// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including: +// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S) +// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I) +// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M) +// +// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶; >𝘳𝘦𝘨𝘦𝘹 : RegExp > : ^^^^^^ diff --git 
a/tests/cases/compiler/regularExpressionWithNonBMPFlags.ts b/tests/cases/compiler/regularExpressionWithNonBMPFlags.ts index 5f85755c0c5c1..85ffde8d8fcb5 100644 --- a/tests/cases/compiler/regularExpressionWithNonBMPFlags.ts +++ b/tests/cases/compiler/regularExpressionWithNonBMPFlags.ts @@ -1,3 +1,9 @@ // @target: esnext +// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including: +// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S) +// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I) +// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M) +// +// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;