From 1f964c3056b9e63c0db2acc18e9bd42508106756 Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Thu, 31 Jul 2025 11:16:57 +0200 Subject: [PATCH 1/2] fix: Adjust regex for escaped unicode character literals --- grammar.js | 4 ++-- test/corpus/literals.txt | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/grammar.js b/grammar.js index 33ae72b4..4eac196f 100644 --- a/grammar.js +++ b/grammar.js @@ -1561,7 +1561,7 @@ module.exports = grammar({ seq('\\', choice( /[^xu]/, /u[0-9a-fA-F]{4}/, - /u\{[0-9a-fA-F]+\}/, + /u\{[0-9a-fA-F_]+\}/, /x[0-9a-fA-F]{2}/, )), /[^\\']/, @@ -1574,7 +1574,7 @@ module.exports = grammar({ choice( /[^xu]/, /u[0-9a-fA-F]{4}/, - /u\{[0-9a-fA-F]+\}/, + /u\{[0-9a-fA-F_]+\}/, /x[0-9a-fA-F]{2}/, ), )), diff --git a/test/corpus/literals.txt b/test/corpus/literals.txt index f6fa74de..be9f890b 100644 --- a/test/corpus/literals.txt +++ b/test/corpus/literals.txt @@ -81,6 +81,7 @@ b"foo\nbar"; "/* foo bar */ foo bar"; "foo\x42\x43bar"; "foo \x42 \x43 bar"; +"\u{10__FFFF}"; -------------------------------------------------------------------------------- @@ -123,7 +124,10 @@ b"foo\nbar"; (escape_sequence) (string_content) (escape_sequence) - (string_content)))) + (string_content))) + (expression_statement + (string_literal + (escape_sequence)))) ================================================================================ Raw string literals @@ -198,6 +202,7 @@ b'x'; '\t'; '\xff'; '\\'; +'\u{10__FFFF}'; -------------------------------------------------------------------------------- @@ -214,6 +219,8 @@ b'x'; (char_literal)) (expression_statement (char_literal)) + (expression_statement + (char_literal)) (expression_statement (char_literal))) From bb723fa5db73847d77d33891db2e51fb4baea5a2 Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Wed, 1 Oct 2025 09:24:07 +0200 Subject: [PATCH 2/2] tree-sitter generate --- src/grammar.json | 4 ++-- src/parser.c | 32 ++++++++++++++++++-------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/grammar.json b/src/grammar.json index f345735b..f3882c6c 100644 --- a/src/grammar.json +++ b/src/grammar.json @@ -9154,7 +9154,7 @@ }, { "type": "PATTERN", - "value": "u\\{[0-9a-fA-F]+\\}" + "value": "u\\{[0-9a-fA-F_]+\\}" }, { "type": "PATTERN", @@ -9204,7 +9204,7 @@ }, { "type": "PATTERN", - "value": "u\\{[0-9a-fA-F]+\\}" + "value": "u\\{[0-9a-fA-F_]+\\}" }, { "type": "PATTERN", diff --git a/src/parser.c b/src/parser.c index 533552b4..2e888a16 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1,4 +1,4 @@ -/* Automatically @generated by tree-sitter v0.25.10 */ +/* Automatically @generated by tree-sitter v0.25.9 */ #include "tree_sitter/parser.h" @@ -9021,39 +9021,41 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { END_STATE(); case 50: if (lookahead == 'u') ADVANCE(53); - if (lookahead == 'x') ADVANCE(64); + if (lookahead == 'x') ADVANCE(62); if (lookahead != 0) ADVANCE(161); END_STATE(); case 51: if (lookahead == 'u') ADVANCE(54); - if (lookahead == 'x') ADVANCE(65); + if (lookahead == 'x') ADVANCE(63); if (lookahead != 0) ADVANCE(29); END_STATE(); case 52: if (lookahead == 'z') ADVANCE(48); END_STATE(); case 53: - if (lookahead == '{') ADVANCE(62); + if (lookahead == '{') ADVANCE(66); if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || ('a' <= lookahead && lookahead <= 'f')) ADVANCE(60); END_STATE(); case 54: - if (lookahead == '{') ADVANCE(63); + if (lookahead == '{') ADVANCE(67); if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(66); + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(64); END_STATE(); case 55: if (lookahead == '}') ADVANCE(29); if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || + lookahead == '_' || ('a' <= lookahead && lookahead <= 'f')) ADVANCE(55); END_STATE(); case 56: if (lookahead == '}') ADVANCE(161); if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || + lookahead == '_' || ('a' <= lookahead && lookahead <= 'f')) ADVANCE(56); END_STATE(); case 57: @@ -9073,7 +9075,7 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { case 60: if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(64); + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(62); END_STATE(); case 61: if (('0' <= lookahead && lookahead <= '9') || @@ -9083,33 +9085,35 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { case 62: if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(56); + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(61); END_STATE(); case 63: if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(55); + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(59); END_STATE(); case 64: if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(61); + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(63); END_STATE(); case 65: if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(59); + lookahead == '_' || + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(157); END_STATE(); case 66: if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(65); + lookahead == '_' || + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(56); END_STATE(); case 67: if (('0' <= lookahead && lookahead <= '9') || ('A' <= lookahead && lookahead <= 'F') || lookahead == '_' || - ('a' <= lookahead && lookahead <= 'f')) ADVANCE(157); + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(55); END_STATE(); case 68: if (('A' <= lookahead && lookahead <= 'Z') || @@ -9594,7 +9598,7 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { if (lookahead == 'i') ADVANCE(38); if (lookahead == 'o') ADVANCE(58); if (lookahead == 'u') ADVANCE(38); - if (lookahead == 'x') ADVANCE(67); + if (lookahead == 'x') ADVANCE(65); if (('0' <= lookahead && lookahead <= '9') || lookahead == '_') ADVANCE(156); END_STATE();