Skip to content

Commit d3c11a3

Browse files
committed
Decode unicode group names to canonical forms
1 parent fa5737f commit d3c11a3

File tree

6 files changed

+141
-28
lines changed

6 files changed

+141
-28
lines changed

index.d.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ declare module 'regexp-tree/ast' {
5656
capturing: true;
5757
number: number;
5858
name?: string;
59+
canonicalName?: string;
5960
expression: Expression | null;
6061
}
6162

@@ -76,6 +77,7 @@ declare module 'regexp-tree/ast' {
7677
kind: 'name';
7778
number: number;
7879
reference: string;
80+
canonicalReference: string;
7981
}
8082

8183
export type Backreference = NumericBackreference | NamedBackreference;

src/generator/__tests__/generator-basic-test.js

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ function test(re) {
1414
}
1515

1616
describe('generator-basic', () => {
17-
1817
it('simple char', () => {
1918
test(/a/);
2019
});
@@ -81,11 +80,11 @@ describe('generator-basic', () => {
8180
});
8281

8382
it('named group', () => {
84-
test('/(?<foo\\u003B\\u{003B}>bar)/');
83+
test('/(?<foo\\u003B\\u{003B}>bar)/u');
8584
});
8685

8786
it('empty named group', () => {
88-
test('/(?<foo\\u003B\\u{003B}>)/');
87+
test('/(?<foo\\u003B\\u{003B}>)/u');
8988
});
9089

9190
it('empty non-capturing group', () => {
@@ -97,7 +96,7 @@ describe('generator-basic', () => {
9796
});
9897

9998
it('named backreference', () => {
100-
test('/(?<foo\\u003B\\u{003B}>)\\k<foo\\u003B\\u{003B}>/');
99+
test('/(?<foo\\u003B\\u{003B}>)\\k<foo\\u003B\\u{003B}>/u');
101100
});
102101

103102
it('basic-assertion', () => {
@@ -179,5 +178,4 @@ describe('generator-basic', () => {
179178
test(/a{1,}?/);
180179
test(/a{1,3}?/);
181180
});
182-
183-
});
181+
});

src/parser/__tests__/parser-extended-test.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ describe('extended', () => {
2525
{
2626
type: 'Group',
2727
name: 'year',
28+
canonicalName: 'year',
2829
number: 1,
2930
capturing: true,
3031
expression: {
@@ -55,6 +56,7 @@ describe('extended', () => {
5556
{
5657
type: 'Group',
5758
name: 'month',
59+
canonicalName: 'month',
5860
number: 2,
5961
capturing: true,
6062
expression: {
@@ -85,6 +87,7 @@ describe('extended', () => {
8587
{
8688
type: 'Group',
8789
name: 'day',
90+
canonicalName: 'day',
8891
number: 3,
8992
capturing: true,
9093
expression: {
@@ -263,6 +266,7 @@ describe('extended', () => {
263266
type: 'Group',
264267
capturing: true,
265268
name: 'c',
269+
canonicalName: 'c',
266270
number: 1,
267271
expression: {
268272
type: 'Alternative',
@@ -271,6 +275,7 @@ describe('extended', () => {
271275
type: 'Group',
272276
capturing: true,
273277
name: 'b',
278+
canonicalName: 'b',
274279
number: 2,
275280
expression: {
276281
type: 'Alternative',
@@ -279,6 +284,7 @@ describe('extended', () => {
279284
type: 'Group',
280285
capturing: true,
281286
name: 'a',
287+
canonicalName: 'a',
282288
number: 3,
283289
expression: {
284290
type: 'Char',
@@ -312,6 +318,7 @@ describe('extended', () => {
312318
type: 'Group',
313319
capturing: true,
314320
name: 'd',
321+
canonicalName: 'd',
315322
number: 4,
316323
expression: {
317324
type: 'Char',
@@ -325,6 +332,7 @@ describe('extended', () => {
325332
type: 'Group',
326333
capturing: true,
327334
name: 'e',
335+
canonicalName: 'e',
328336
number: 5,
329337
expression: {
330338
type: 'Char',

src/parser/__tests__/parser-test262-test.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ describe('test262', () => {
138138
it('unicode group names', () => {
139139
valid('/(?<π>a)/u');
140140
valid('/(?<\\u{03C0}>a)/u');
141+
invalid('/(?<π>a)(?<\\u{03C0}>a)/u', 'Duplicate of the named group');
141142
valid('/(?<$𐒤>a)/u');
142143
valid('/(?<_\\u200C>a)/u');
143144
valid('/(?<_\\u200D>a)/u');

src/parser/generated/regexp-tree.js

Lines changed: 63 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -228,18 +228,19 @@ const productions = [[-1,1,(_1,_1loc) => { __loc = yyloc(_1loc, _1loc);__ = _1 }
228228
[14,1,(_1,_1loc) => { __loc = yyloc(_1loc, _1loc);__ = _1 }],
229229
[14,1,(_1,_1loc) => { __loc = yyloc(_1loc, _1loc);__ = _1 }],
230230
[15,3,(_1,_2,_3,_1loc,_2loc,_3loc) => { __loc = yyloc(_1loc, _3loc);
231-
if (namedGroups.hasOwnProperty(_1)) {
232-
throw new SyntaxError(`Duplicate of the named group "${_1}".`);
233-
}
234-
235231
const name = String(_1);
232+
const decodedName = decodeUnicodeGroupName(name);
233+
if (namedGroups.hasOwnProperty(decodedName)) {
234+
throw new SyntaxError(`Duplicate of the named group "${decodedName}".`);
235+
}
236236

237-
namedGroups[name] = _1.groupNumber;
237+
namedGroups[decodedName] = _1.groupNumber;
238238

239239
__ = Node({
240240
type: 'Group',
241241
capturing: true,
242242
name,
243+
canonicalName: decodedName,
243244
number: _1.groupNumber,
244245
expression: _2,
245246
}, __loc);
@@ -1104,19 +1105,68 @@ function GroupRefOrDecChar(text, textLoc) {
11041105
/**
11051106
* Unicode names.
11061107
*/
1107-
const uRe = /^\\u[0-9a-fA-F]{4}/;
1108-
const ucpRe = /^\\u\{[0-9a-fA-F]{1,}\}/;
1108+
const uReStart = /^\\u[0-9a-fA-F]{4}/; // only matches start of string
1109+
const ucpReStart = /^\\u\{[0-9a-fA-F]{1,}\}/; // only matches start of string
1110+
const ucpReAnywhere = /\\u\{[0-9a-fA-F]{1,}\}/; // matches anywhere in string
11091111

11101112
/**
11111113
* Validates Unicode group name.
11121114
*/
11131115
function validateUnicodeGroupName(name, state) {
  // Names without a `\u{...}` code point escape need no further checking.
  if (!ucpReAnywhere.test(name)) {
    return name;
  }

  // A `\u{...}` escape is accepted only in these states (presumably the
  // lexer states that are active under the `u` flag - confirm against the
  // lexer rules); any other state is rejected with a SyntaxError.
  switch (state) {
    case 'u':
    case 'xu':
    case 'u_class':
      return name;
    default:
      throw new SyntaxError(`invalid group Unicode name "${name}", use \`u\` flag.`);
  }
}
1125+
1126+
// Matches the following production: https://tc39.es/ecma262/#prod-RegExpUnicodeEscapeSequence
1127+
//
1128+
// RegExpUnicodeEscapeSequence ::
1129+
// `u` LeadSurrogate `\u` TrailSurrogate # as 'leadSurrogate', 'trailSurrogate'
1130+
// `u` LeadSurrogate # as 'leadSurrogateOnly'
1131+
// `u` TrailSurrogate # as 'trailSurrogateOnly'
1132+
// `u` NonSurrogate # as 'nonSurrogate'
1133+
// `u` `{` CodePoint `}` # as 'codePoint'
1134+
//
1135+
// LeadSurrogate ::
1136+
// Hex4Digits but only if the SV of Hex4Digits is in the inclusive range 0xD800 to 0xDBFF # [dD][89aAbB][0-9a-fA-F]{2}
1137+
//
1138+
// TrailSurrogate ::
1139+
// Hex4Digits but only if the SV of Hex4Digits is in the inclusive range 0xDC00 to 0xDFFF # [dD][c-fC-F][0-9a-fA-F]{2}
1140+
//
1141+
// NonSurrogate ::
1142+
// Hex4Digits but only if the SV of Hex4Digits is not in the inclusive range 0xD800 to 0xDFFF # [0-9a-ce-fA-CE-F][0-9a-fA-F]{3}|[dD][0-7][0-9a-fA-F]{2}
1143+
//
1144+
// CodePoint ::
1145+
// HexDigits but only if MV of HexDigits ≤ 0x10FFFF # 0*(?:[0-9a-fA-F]{1,5}|10[0-9a-fA-F]{4})
1146+
//
1147+
const uidRe = /\\u(?:([dD][89aAbB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})|([dD][89aAbB][0-9a-fA-F]{2})|([dD][c-fC-F][0-9a-fA-F]{2})|([0-9a-ce-fA-CE-F][0-9a-fA-F]{3}|[dD][0-7][0-9a-fA-F]{2})|\{(0*(?:[0-9a-fA-F]{1,5}|10[0-9a-fA-F]{4}))\})/;

// Global twin of `uidRe`, compiled once instead of on every decode call.
// Sharing it is safe: `String.prototype.replace` resets `lastIndex` of a
// global regexp before it starts scanning.
const uidReGlobal = new RegExp(uidRe.source, 'g');

/**
 * Decodes every Unicode escape (`\uXXXX`, `\uXXXX\uXXXX` surrogate pair, or
 * `\u{X...}`) in a group name into the character(s) it denotes, so that
 * differently spelled but equal names compare equal.
 *
 * Text that is not matched by `uidRe` (including `\u{...}` values above
 * 0x10FFFF) is left untouched.
 *
 * @param {string} name - Group name as written in the pattern source.
 * @returns {string} The name with all recognised escapes decoded.
 */
function decodeUnicodeGroupName(name) {
  return name.replace(
    uidReGlobal,
    (escape, leadSurrogate, trailSurrogate, leadSurrogateOnly, trailSurrogateOnly, nonSurrogate, codePoint) => {
      if (leadSurrogate) {
        // `\uD8xx\uDCxx`: both halves of a surrogate pair in one match.
        return String.fromCodePoint(parseInt(leadSurrogate, 16), parseInt(trailSurrogate, 16));
      }

      // Every remaining alternative carries exactly one hex value. Lone
      // surrogates are emitted as single UTF-16 code units; since a JS
      // string is a sequence of code units, which lead surrogate a trail
      // surrogate is "associated" with cannot change the decoded string.
      // The pair alternative is tried first at each position while scanning
      // left to right, so a trail surrogate still pairs with the lead
      // surrogate immediately preceding it.
      const hex = leadSurrogateOnly || trailSurrogateOnly || nonSurrogate || codePoint;

      return hex ? String.fromCodePoint(parseInt(hex, 16)) : escape;
    }
  );
}
11211171

11221172
/**
@@ -1126,13 +1176,15 @@ function validateUnicodeGroupName(name, state) {
11261176
*/
11271177
function NamedGroupRefOrChars(text, textLoc) {
11281178
const groupName = text.slice(3, -1);
1179+
const decodedName = decodeUnicodeGroupName(groupName);
11291180

1130-
if (namedGroups.hasOwnProperty(groupName)) {
1181+
if (namedGroups.hasOwnProperty(decodedName)) {
11311182
return Node({
11321183
type: 'Backreference',
11331184
kind: 'name',
1134-
number: namedGroups[groupName],
1185+
number: namedGroups[decodedName],
11351186
reference: groupName,
1187+
canonicalReference: decodedName,
11361188
}, textLoc);
11371189
}
11381190

@@ -1182,7 +1234,7 @@ function NamedGroupRefOrChars(text, textLoc) {
11821234
let matched = null;
11831235

11841236
// Unicode, \u003B or \u{003B}
1185-
if ((matched = text.match(uRe)) || (matched = text.match(ucpRe))) {
1237+
if ((matched = text.match(uReStart)) || (matched = text.match(ucpReStart))) {
11861238
if (startOffset) {
11871239
loc = {
11881240
startLine,

src/parser/regexp.bnf

Lines changed: 63 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -443,19 +443,68 @@ function GroupRefOrDecChar(text, textLoc) {
443443
/**
444444
* Unicode names.
445445
*/
446-
const uRe = /^\\u[0-9a-fA-F]{4}/;
447-
const ucpRe = /^\\u\{[0-9a-fA-F]{1,}\}/;
446+
const uReStart = /^\\u[0-9a-fA-F]{4}/; // only matches start of string
447+
const ucpReStart = /^\\u\{[0-9a-fA-F]{1,}\}/; // only matches start of string
448+
const ucpReAnywhere = /\\u\{[0-9a-fA-F]{1,}\}/; // matches anywhere in string
448449

449450
/**
450451
* Validates Unicode group name.
451452
*/
452453
function validateUnicodeGroupName(name, state) {
  // Names without a `\u{...}` code point escape need no further checking.
  if (!ucpReAnywhere.test(name)) {
    return name;
  }

  // A `\u{...}` escape is accepted only in these states (presumably the
  // lexer states that are active under the `u` flag - confirm against the
  // lexer rules); any other state is rejected with a SyntaxError.
  switch (state) {
    case 'u':
    case 'xu':
    case 'u_class':
      return name;
    default:
      throw new SyntaxError(`invalid group Unicode name "${name}", use \`u\` flag.`);
  }
}
463+
464+
// Matches the following production: https://tc39.es/ecma262/#prod-RegExpUnicodeEscapeSequence
465+
//
466+
// RegExpUnicodeEscapeSequence ::
467+
// `u` LeadSurrogate `\u` TrailSurrogate # as 'leadSurrogate', 'trailSurrogate'
468+
// `u` LeadSurrogate # as 'leadSurrogateOnly'
469+
// `u` TrailSurrogate # as 'trailSurrogateOnly'
470+
// `u` NonSurrogate # as 'nonSurrogate'
471+
// `u` `{` CodePoint `}` # as 'codePoint'
472+
//
473+
// LeadSurrogate ::
474+
// Hex4Digits but only if the SV of Hex4Digits is in the inclusive range 0xD800 to 0xDBFF # [dD][89aAbB][0-9a-fA-F]{2}
475+
//
476+
// TrailSurrogate ::
477+
// Hex4Digits but only if the SV of Hex4Digits is in the inclusive range 0xDC00 to 0xDFFF # [dD][c-fC-F][0-9a-fA-F]{2}
478+
//
479+
// NonSurrogate ::
480+
// Hex4Digits but only if the SV of Hex4Digits is not in the inclusive range 0xD800 to 0xDFFF # [0-9a-ce-fA-CE-F][0-9a-fA-F]{3}|[dD][0-7][0-9a-fA-F]{2}
481+
//
482+
// CodePoint ::
483+
// HexDigits but only if MV of HexDigits ≤ 0x10FFFF # 0*(?:[0-9a-fA-F]{1,5}|10[0-9a-fA-F]{4})
484+
//
485+
const uidRe = /\\u(?:([dD][89aAbB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})|([dD][89aAbB][0-9a-fA-F]{2})|([dD][c-fC-F][0-9a-fA-F]{2})|([0-9a-ce-fA-CE-F][0-9a-fA-F]{3}|[dD][0-7][0-9a-fA-F]{2})|\{(0*(?:[0-9a-fA-F]{1,5}|10[0-9a-fA-F]{4}))\})/;

// Global twin of `uidRe`, compiled once instead of on every decode call.
// Sharing it is safe: `String.prototype.replace` resets `lastIndex` of a
// global regexp before it starts scanning.
const uidReGlobal = new RegExp(uidRe.source, 'g');

/**
 * Decodes every Unicode escape (`\uXXXX`, `\uXXXX\uXXXX` surrogate pair, or
 * `\u{X...}`) in a group name into the character(s) it denotes, so that
 * differently spelled but equal names compare equal.
 *
 * Text that is not matched by `uidRe` (including `\u{...}` values above
 * 0x10FFFF) is left untouched.
 *
 * @param {string} name - Group name as written in the pattern source.
 * @returns {string} The name with all recognised escapes decoded.
 */
function decodeUnicodeGroupName(name) {
  return name.replace(
    uidReGlobal,
    (escape, leadSurrogate, trailSurrogate, leadSurrogateOnly, trailSurrogateOnly, nonSurrogate, codePoint) => {
      if (leadSurrogate) {
        // `\uD8xx\uDCxx`: both halves of a surrogate pair in one match.
        return String.fromCodePoint(parseInt(leadSurrogate, 16), parseInt(trailSurrogate, 16));
      }

      // Every remaining alternative carries exactly one hex value. Lone
      // surrogates are emitted as single UTF-16 code units; since a JS
      // string is a sequence of code units, which lead surrogate a trail
      // surrogate is "associated" with cannot change the decoded string.
      // The pair alternative is tried first at each position while scanning
      // left to right, so a trail surrogate still pairs with the lead
      // surrogate immediately preceding it.
      const hex = leadSurrogateOnly || trailSurrogateOnly || nonSurrogate || codePoint;

      return hex ? String.fromCodePoint(parseInt(hex, 16)) : escape;
    }
  );
}
460509

461510
/**
@@ -465,13 +514,15 @@ function validateUnicodeGroupName(name, state) {
465514
*/
466515
function NamedGroupRefOrChars(text, textLoc) {
467516
const groupName = text.slice(3, -1);
517+
const decodedName = decodeUnicodeGroupName(groupName);
468518

469-
if (namedGroups.hasOwnProperty(groupName)) {
519+
if (namedGroups.hasOwnProperty(decodedName)) {
470520
return Node({
471521
type: 'Backreference',
472522
kind: 'name',
473-
number: namedGroups[groupName],
523+
number: namedGroups[decodedName],
474524
reference: groupName,
525+
canonicalReference: decodedName,
475526
}, textLoc);
476527
}
477528

@@ -521,7 +572,7 @@ function NamedGroupRefOrChars(text, textLoc) {
521572
let matched = null;
522573

523574
// Unicode, \u003B or \u{003B}
524-
if ((matched = text.match(uRe)) || (matched = text.match(ucpRe))) {
575+
if ((matched = text.match(uReStart)) || (matched = text.match(ucpReStart))) {
525576
if (startOffset) {
526577
loc = {
527578
startLine,
@@ -861,18 +912,19 @@ Group
861912
CapturingGroup
862913
: NAMED_CAPTURE_GROUP Disjunction R_PAREN
863914
{
864-
if (namedGroups.hasOwnProperty($1)) {
865-
throw new SyntaxError(`Duplicate of the named group "${$1}".`);
866-
}
867-
868915
const name = String($1);
916+
const decodedName = decodeUnicodeGroupName(name);
917+
if (namedGroups.hasOwnProperty(decodedName)) {
918+
throw new SyntaxError(`Duplicate of the named group "${decodedName}".`);
919+
}
869920

870-
namedGroups[name] = $1.groupNumber;
921+
namedGroups[decodedName] = $1.groupNumber;
871922

872923
$$ = Node({
873924
type: 'Group',
874925
capturing: true,
875926
name,
927+
canonicalName: decodedName,
876928
number: $1.groupNumber,
877929
expression: $2,
878930
}, @$);

0 commit comments

Comments
 (0)