From ee80947612d93105f2251005ac1d0772e232223a Mon Sep 17 00:00:00 2001 From: Nathan Shively-Sanders <293473+sandersn@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:32:30 -0800 Subject: [PATCH 1/4] More regex error recovery in reScanSlashToken This still does NOT include the ability to log diagnostics from https://github.com/microsoft/TypeScript/pull/55600. But it does match tsc's smarter regex-end detection for unterminated regexes (which you get a lot when parsing JSX as JS). --- internal/scanner/scanner.go | 57 +++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index bdf5429b9b..e381997e13 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -805,6 +805,7 @@ func (s *Scanner) ReScanTemplateToken(isTaggedTemplate bool) ast.Kind { func (s *Scanner) ReScanSlashToken() ast.Kind { if s.token == ast.KindSlashToken || s.token == ast.KindSlashEqualsToken { s.pos = s.tokenStart + 1 + startOfRegExpBody := s.pos inEscape := false inCharacterClass := false loop: @@ -824,7 +825,6 @@ func (s *Scanner) ReScanSlashToken() ast.Kind { case ch == '/' && !inCharacterClass: // A slash within a character class is permissible, // but in general it signals the end of the regexp literal. - s.pos++ break loop case ch == '[': inCharacterClass = true @@ -835,12 +835,57 @@ func (s *Scanner) ReScanSlashToken() ast.Kind { } s.pos += size } - for { - ch, size := s.charAndSize() - if size == 0 || !isIdentifierPart(ch, s.languageVersion) { - break + if s.tokenFlags&ast.TokenFlagsUnterminated != 0 { + // Search for the nearest unbalanced bracket for better recovery. Since the expression is + // invalid anyways, we take nested square brackets into consideration for the best guess. 
+ endOfRegExpBody := s.pos + s.pos = startOfRegExpBody + inEscape = false + characterClassDepth := 0 + inDecimalQuantifier := false + groupDepth := 0 + for s.pos < endOfRegExpBody { + ch, size := s.charAndSize() + if inEscape { + inEscape = false + } else if ch == '\\' { + inEscape = true + } else if ch == '[' { + characterClassDepth++ + } else if ch == ']' && characterClassDepth != 0 { + characterClassDepth-- + } else if !(characterClassDepth != 0) { + if ch == '{' { + inDecimalQuantifier = true + } else if ch == '}' && inDecimalQuantifier { + inDecimalQuantifier = false + } else if !inDecimalQuantifier { + if ch == '(' { + groupDepth++ + } else if ch == ')' && groupDepth != 0 { + groupDepth-- + } else if ch == ')' || ch == ']' || ch == '}' { + // We encountered an unbalanced bracket outside a character class. Treat this position as the end of regex. + break + } + } + } + s.pos += size + } + // Whitespaces and semicolons at the end are not likely to be part of the regex + for stringutil.IsWhiteSpaceLike(s.charAt(-1)) || s.charAt(-1) == ';' { + s.pos-- + } + } else { + // Consume the slash character + s.pos++ + for { + ch, size := s.charAndSize() + if size == 0 || !isIdentifierPart(ch, s.languageVersion) { + break + } + s.pos += size } - s.pos += size } s.tokenValue = s.text[s.tokenStart:s.pos] s.token = ast.KindRegularExpressionLiteral From c5e94125cb00adb957c7fba5b7c97bc025ba7540 Mon Sep 17 00:00:00 2001 From: Nathan Shively-Sanders <293473+sandersn@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:26:00 -0800 Subject: [PATCH 2/4] account for 2-byte whitespace --- internal/scanner/scanner.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index e381997e13..623ea44dd5 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -873,8 +873,13 @@ func (s *Scanner) ReScanSlashToken() ast.Kind { s.pos += size } // Whitespaces and semicolons at the end are not 
likely to be part of the regex - for stringutil.IsWhiteSpaceLike(s.charAt(-1)) || s.charAt(-1) == ';' { - s.pos-- + for { + ch, size := utf8.DecodeLastRuneInString(s.text[:s.pos]) + if stringutil.IsWhiteSpaceLike(ch) || ch == ';' { + s.pos -= size + } else { + break + } } } else { // Consume the slash character From a7b3c84f4c095b1fc40134b178d351c1294d9ea8 Mon Sep 17 00:00:00 2001 From: Nathan Shively-Sanders <293473+sandersn@users.noreply.github.com> Date: Mon, 16 Dec 2024 09:46:15 -0800 Subject: [PATCH 3/4] Address PR comments 1. Flip bad ts-to-go conversion of truthiness check. 2. Re-add simple "Unterminated regular expression literal" error. --- internal/scanner/scanner.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index 623ea44dd5..9ce0ea475a 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -854,7 +854,7 @@ func (s *Scanner) ReScanSlashToken() ast.Kind { characterClassDepth++ } else if ch == ']' && characterClassDepth != 0 { characterClassDepth-- - } else if !(characterClassDepth != 0) { + } else if characterClassDepth == 0 { if ch == '{' { inDecimalQuantifier = true } else if ch == '}' && inDecimalQuantifier { @@ -881,6 +881,7 @@ func (s *Scanner) ReScanSlashToken() ast.Kind { break } } + s.errorAt(diagnostics.Unterminated_regular_expression_literal, s.tokenStart, s.pos - s.tokenStart); } else { // Consume the slash character s.pos++ From 4e003d3fad03679de1039ee59a8834f50ac140a3 Mon Sep 17 00:00:00 2001 From: Nathan Shively-Sanders <293473+sandersn@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:15:20 -0800 Subject: [PATCH 4/4] hereby format --- internal/scanner/scanner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index 9ce0ea475a..9a7a074b53 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -881,7 +881,7 @@ func (s *Scanner) 
ReScanSlashToken() ast.Kind { break } } - s.errorAt(diagnostics.Unterminated_regular_expression_literal, s.tokenStart, s.pos - s.tokenStart); + s.errorAt(diagnostics.Unterminated_regular_expression_literal, s.tokenStart, s.pos-s.tokenStart) } else { // Consume the slash character s.pos++