Skip to content

Commit 6cc88a5

Browse files
committed
Decode unicode group names to canonical forms
1 parent fa5737f commit 6cc88a5

File tree

10 files changed

+227
-37
lines changed

10 files changed

+227
-37
lines changed

README.md

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1336,6 +1336,34 @@ We have the following node (the `name` property with value `foo` is added):
13361336
type: 'Group',
13371337
capturing: true,
13381338
name: 'foo',
1339+
nameRaw: 'foo',
1340+
number: 1,
1341+
expression: {
1342+
type: 'Char',
1343+
value: 'x',
1344+
symbol: 'x',
1345+
kind: 'simple',
1346+
codePoint: 120
1347+
}
1348+
}
1349+
```
1350+
1351+
Note: The `nameRaw` property represents the name *as parsed from the original source*, including escape sequences. The `name` property represents the canonical decoded form of the name.
1352+
1353+
For example, given the `/u` flag and the following group:
1354+
1355+
```regexp
1356+
(?<\u{03C0}>x)
1357+
```
1358+
1359+
We would have the following node:
1360+
1361+
```js
1362+
{
1363+
type: 'Group',
1364+
capturing: true,
1365+
name: 'π',
1366+
nameRaw: '\\u{03C0}',
13391367
number: 1,
13401368
expression: {
13411369
type: 'Char',
@@ -1465,6 +1493,7 @@ A node:
14651493
type: 'Group',
14661494
capturing: true,
14671495
name: 'foo',
1496+
nameRaw: 'foo',
14681497
number: 1,
14691498
expression: {
14701499
type: 'Char',
@@ -1478,7 +1507,8 @@ A node:
14781507
type: 'Backreference',
14791508
kind: 'name',
14801509
number: 1,
1481-
reference: 'foo'
1510+
reference: 'foo',
1511+
referenceRaw: 'foo'
14821512
},
14831513
{
14841514
type: 'Backreference',
@@ -1490,6 +1520,52 @@ A node:
14901520
}
14911521
```
14921522
1523+
Note: The `referenceRaw` property represents the reference *as parsed from the original source*, including escape sequences. The `reference` property represents the canonical decoded form of the reference.
1524+
1525+
For example, given the `/u` flag and the following pattern (matches `www`):
1526+
1527+
```regexp
1528+
(?<π>w)\k<\u{03C0}>\1
1529+
```
1530+
1531+
We would have the following node:
1532+
1533+
```js
1534+
{
1535+
type: 'Alternative',
1536+
expressions: [
1537+
{
1538+
type: 'Group',
1539+
capturing: true,
1540+
name: 'π',
1541+
nameRaw: 'π',
1542+
number: 1,
1543+
expression: {
1544+
type: 'Char',
1545+
value: 'w',
1546+
symbol: 'w',
1547+
kind: 'simple',
1548+
codePoint: 119
1549+
}
1550+
},
1551+
{
1552+
type: 'Backreference',
1553+
kind: 'name',
1554+
number: 1,
1555+
reference: 'π',
1556+
referenceRaw: '\\u{03C0}'
1557+
},
1558+
{
1559+
type: 'Backreference',
1560+
kind: 'number',
1561+
number: 1,
1562+
reference: 1
1563+
}
1564+
]
1565+
}
1566+
```
1567+
1568+
14931569
#### Quantifiers
14941570
14951571
Quantifiers specify _repetition_ of a regular expression (or of a part of it). Below are the quantifiers which _wrap_ a parsed expression into a `Repetition` node. The quantifier itself can be of different _kinds_, and has the `Quantifier` node type.

index.d.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ declare module 'regexp-tree/ast' {
5656
capturing: true;
5757
number: number;
5858
name?: string;
59+
nameRaw?: string;
5960
expression: Expression | null;
6061
}
6162

@@ -76,6 +77,7 @@ declare module 'regexp-tree/ast' {
7677
kind: 'name';
7778
number: number;
7879
reference: string;
80+
referenceRaw: string;
7981
}
8082

8183
export type Backreference = NumericBackreference | NamedBackreference;

scripts/generate-unicode-id-parts.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ const isContinue = c => /\p{ID_Continue}/u.test(c);
137137
let idStartWriter = new Writer();
138138
let idContinueWriter = new Writer();
139139

140-
for (let cp = 0; cp < MAX_UNICODE_CODEPOINT; cp++) {
140+
for (let cp = 0; cp <= MAX_UNICODE_CODEPOINT; cp++) {
141141
const ch = String.fromCodePoint(cp);
142142
if (isStart(ch)) {
143143
idStartWriter.push(cp);
@@ -151,7 +151,7 @@ console.log(`/**
151151
* Generated by scripts/generate-unicode-id-parts.js on node ${
152152
process.version
153153
} with unicode ${process.versions.unicode}
154-
* based on http://www.unicode.org/reports/tr31/ and https://www.ecma-international.org/ecma-262/6.0/#sec-names-and-keywords
154+
* based on http://www.unicode.org/reports/tr31/ and https://tc39.es/ecma262/#sec-names-and-keywords
155155
* U_ID_START corresponds to the ID_Start property, and U_ID_CONTINUE corresponds to ID_Continue property.
156156
*/`);
157157
console.log('U_ID_START ' + idStartWriter.toString());

src/compat-transpiler/transforms/compat-named-capturing-groups-transform.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
* A regexp-tree plugin to translate `/(?<name>a)\k<name>/` to `/(a)\1/`.
1010
*/
1111
module.exports = {
12-
1312
// To track the names of the groups, and return them
1413
// in the transform result state.
1514
//
@@ -41,6 +40,7 @@ module.exports = {
4140
this._groupNames[node.name] = node.number;
4241

4342
delete node.name;
43+
delete node.nameRaw;
4444
},
4545

4646
Backreference(path) {
@@ -52,5 +52,6 @@ module.exports = {
5252

5353
node.kind = 'number';
5454
node.reference = node.number;
55+
delete node.referenceRaw;
5556
},
56-
};
57+
};

src/generator/__tests__/generator-basic-test.js

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ function test(re) {
1414
}
1515

1616
describe('generator-basic', () => {
17-
1817
it('simple char', () => {
1918
test(/a/);
2019
});
@@ -81,11 +80,11 @@ describe('generator-basic', () => {
8180
});
8281

8382
it('named group', () => {
84-
test('/(?<foo\\u003B\\u{003B}>bar)/');
83+
test('/(?<foo\\u003B\\u{003B}>bar)/u');
8584
});
8685

8786
it('empty named group', () => {
88-
test('/(?<foo\\u003B\\u{003B}>)/');
87+
test('/(?<foo\\u003B\\u{003B}>)/u');
8988
});
9089

9190
it('empty non-capturing group', () => {
@@ -97,7 +96,7 @@ describe('generator-basic', () => {
9796
});
9897

9998
it('named backreference', () => {
100-
test('/(?<foo\\u003B\\u{003B}>)\\k<foo\\u003B\\u{003B}>/');
99+
test('/(?<foo\\u003B\\u{003B}>)\\k<foo\\u003B\\u{003B}>/u');
101100
});
102101

103102
it('basic-assertion', () => {
@@ -179,5 +178,4 @@ describe('generator-basic', () => {
179178
test(/a{1,}?/);
180179
test(/a{1,3}?/);
181180
});
182-
183-
});
181+
});

src/generator/index.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ const generator = {
3434
if (node.capturing) {
3535
// A named group.
3636
if (node.name) {
37-
return `(?<${node.name}>${expression})`;
37+
return `(?<${node.nameRaw || node.name}>${expression})`;
3838
}
3939

4040
return `(${expression})`;
@@ -48,7 +48,7 @@ const generator = {
4848
case 'number':
4949
return `\\${node.reference}`;
5050
case 'name':
51-
return `\\k<${node.reference}>`;
51+
return `\\k<${node.referenceRaw || node.reference}>`;
5252
default:
5353
throw new TypeError(`Unknown Backreference kind: ${node.kind}`);
5454
}

src/parser/__tests__/parser-extended-test.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ describe('extended', () => {
2525
{
2626
type: 'Group',
2727
name: 'year',
28+
nameRaw: 'year',
2829
number: 1,
2930
capturing: true,
3031
expression: {
@@ -55,6 +56,7 @@ describe('extended', () => {
5556
{
5657
type: 'Group',
5758
name: 'month',
59+
nameRaw: 'month',
5860
number: 2,
5961
capturing: true,
6062
expression: {
@@ -85,6 +87,7 @@ describe('extended', () => {
8587
{
8688
type: 'Group',
8789
name: 'day',
90+
nameRaw: 'day',
8891
number: 3,
8992
capturing: true,
9093
expression: {
@@ -263,6 +266,7 @@ describe('extended', () => {
263266
type: 'Group',
264267
capturing: true,
265268
name: 'c',
269+
nameRaw: 'c',
266270
number: 1,
267271
expression: {
268272
type: 'Alternative',
@@ -271,6 +275,7 @@ describe('extended', () => {
271275
type: 'Group',
272276
capturing: true,
273277
name: 'b',
278+
nameRaw: 'b',
274279
number: 2,
275280
expression: {
276281
type: 'Alternative',
@@ -279,6 +284,7 @@ describe('extended', () => {
279284
type: 'Group',
280285
capturing: true,
281286
name: 'a',
287+
nameRaw: 'a',
282288
number: 3,
283289
expression: {
284290
type: 'Char',
@@ -312,6 +318,7 @@ describe('extended', () => {
312318
type: 'Group',
313319
capturing: true,
314320
name: 'd',
321+
nameRaw: 'd',
315322
number: 4,
316323
expression: {
317324
type: 'Char',
@@ -325,6 +332,7 @@ describe('extended', () => {
325332
type: 'Group',
326333
capturing: true,
327334
name: 'e',
335+
nameRaw: 'e',
328336
number: 5,
329337
expression: {
330338
type: 'Char',

src/parser/__tests__/parser-test262-test.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ describe('test262', () => {
138138
it('unicode group names', () => {
139139
valid('/(?<π>a)/u');
140140
valid('/(?<\\u{03C0}>a)/u');
141+
invalid('/(?<π>a)(?<\\u{03C0}>a)/u', 'Duplicate of the named group');
141142
valid('/(?<$𐒤>a)/u');
142143
valid('/(?<_\\u200C>a)/u');
143144
valid('/(?<_\\u200D>a)/u');

0 commit comments

Comments
 (0)