Skip to content

Commit 77d10df

Browse files
Merge pull request #184 from rbuckton/fixUnicodeGroupNames
Improve parser support for regexp group names
2 parents da6fd06 + 6cc88a5 commit 77d10df

File tree

10 files changed

+410
-40
lines changed

10 files changed

+410
-40
lines changed

README.md

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1336,6 +1336,34 @@ We have the following node (the `name` property with value `foo` is added):
13361336
type: 'Group',
13371337
capturing: true,
13381338
name: 'foo',
1339+
nameRaw: 'foo',
1340+
number: 1,
1341+
expression: {
1342+
type: 'Char',
1343+
value: 'x',
1344+
symbol: 'x',
1345+
kind: 'simple',
1346+
codePoint: 120
1347+
}
1348+
}
1349+
```
1350+
1351+
Note: The `nameRaw` property represents the name *as parsed from the original source*, including escape sequences. The `name` property represents the canonical decoded form of the name.
1352+
1353+
For example, given the `/u` flag and the following group:
1354+
1355+
```regexp
1356+
(?<\u{03C0}>x)
1357+
```
1358+
1359+
We would have the following node:
1360+
1361+
```js
1362+
{
1363+
type: 'Group',
1364+
capturing: true,
1365+
name: 'π',
1366+
nameRaw: '\\u{03C0}',
13391367
number: 1,
13401368
expression: {
13411369
type: 'Char',
@@ -1465,6 +1493,7 @@ A node:
14651493
type: 'Group',
14661494
capturing: true,
14671495
name: 'foo',
1496+
nameRaw: 'foo',
14681497
number: 1,
14691498
expression: {
14701499
type: 'Char',
@@ -1478,7 +1507,8 @@ A node:
14781507
type: 'Backreference',
14791508
kind: 'name',
14801509
number: 1,
1481-
reference: 'foo'
1510+
reference: 'foo',
1511+
referenceRaw: 'foo'
14821512
},
14831513
{
14841514
type: 'Backreference',
@@ -1490,6 +1520,52 @@ A node:
14901520
}
14911521
```
14921522
1523+
Note: The `referenceRaw` property represents the reference *as parsed from the original source*, including escape sequences. The `reference` property represents the canonical decoded form of the reference.
1524+
1525+
For example, given the `/u` flag and the following pattern (matches `www`):
1526+
1527+
```regexp
1528+
(?<π>w)\k<\u{03C0}>\1
1529+
```
1530+
1531+
We would have the following node:
1532+
1533+
```js
1534+
{
1535+
type: 'Alternative',
1536+
expressions: [
1537+
{
1538+
type: 'Group',
1539+
capturing: true,
1540+
name: 'π',
1541+
nameRaw: 'π',
1542+
number: 1,
1543+
expression: {
1544+
type: 'Char',
1545+
value: 'w',
1546+
symbol: 'w',
1547+
kind: 'simple',
1548+
codePoint: 119
1549+
}
1550+
},
1551+
{
1552+
type: 'Backreference',
1553+
kind: 'name',
1554+
number: 1,
1555+
reference: 'π',
1556+
referenceRaw: '\\u{03C0}'
1557+
},
1558+
{
1559+
type: 'Backreference',
1560+
kind: 'number',
1561+
number: 1,
1562+
reference: 1
1563+
}
1564+
]
1565+
}
1566+
```
1567+
1568+
14931569
#### Quantifiers
14941570
14951571
Quantifiers specify _repetition_ of a regular expression (or of a part of it). Below are the quantifiers which _wrap_ a parsed expression into a `Repetition` node. The quantifier itself can be of different _kinds_, and has the `Quantifier` node type.

index.d.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ declare module 'regexp-tree/ast' {
6262
capturing: true;
6363
number: number;
6464
name?: string;
65+
nameRaw?: string;
6566
expression: Expression | null;
6667
}
6768

@@ -84,6 +85,7 @@ declare module 'regexp-tree/ast' {
8485
kind: 'name';
8586
number: number;
8687
reference: string;
88+
referenceRaw: string;
8789
}
8890

8991
export type Backreference =

scripts/generate-unicode-id-parts.js

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
// based on https://github.com/microsoft/TypeScript/tree/master/scripts/regenerate-unicode-identifier-parts.js
2+
3+
/** @param {number} i */
4+
function toHex4Digits(i) {
5+
let s = i.toString(16);
6+
while (s.length < 4) {
7+
s = '0' + s;
8+
}
9+
if (s.length > 4) throw new Error('Invalid Hex4Digits value');
10+
return s;
11+
}
12+
13+
class NonSurrogateRange {
14+
/** @param {number} codePoint */
15+
constructor(codePoint) {
16+
this.firstCodePoint = codePoint;
17+
this.lastCodePoint = codePoint;
18+
}
19+
toString() {
20+
let text = '\\u' + toHex4Digits(this.firstCodePoint);
21+
if (this.lastCodePoint !== this.firstCodePoint) {
22+
text += '-\\u' + toHex4Digits(this.lastCodePoint);
23+
}
24+
return text;
25+
}
26+
}
27+
28+
class LeadSurrogateRange {
29+
/** @param {number} leadSurrogate */
30+
constructor(leadSurrogate) {
31+
this.leadSurrogate = leadSurrogate;
32+
/** @type {TrailSurrogateRange[]} */
33+
this.ranges = [];
34+
}
35+
36+
toString() {
37+
return (
38+
'\\u' +
39+
toHex4Digits(this.leadSurrogate) +
40+
'[' +
41+
this.ranges.join('') +
42+
']'
43+
);
44+
}
45+
}
46+
47+
class TrailSurrogateRange {
48+
/** @param {number} trailSurrogate */
49+
constructor(trailSurrogate) {
50+
this.firstTrailSurrogate = trailSurrogate;
51+
this.lastTrailSurrogate = trailSurrogate;
52+
}
53+
toString() {
54+
let text = '\\u' + toHex4Digits(this.firstTrailSurrogate);
55+
if (this.lastTrailSurrogate !== this.firstTrailSurrogate) {
56+
text += '-\\u' + toHex4Digits(this.lastTrailSurrogate);
57+
}
58+
return text;
59+
}
60+
}
61+
62+
class Writer {
63+
constructor() {
64+
/** @type {number} */
65+
this.lastCodePoint = -1;
66+
/** @type {NonSurrogateRange[]} */
67+
this.nonSurrogateRanges = [];
68+
/** @type {LeadSurrogateRange[]} */
69+
this.surrogateRanges = [];
70+
/** @type {NonSurrogateRange} */
71+
this.nonSurrogateRange;
72+
/** @type {LeadSurrogateRange} */
73+
this.leadSurrogateRange;
74+
/** @type {TrailSurrogateRange} */
75+
this.trailSurrogateRange;
76+
}
77+
78+
/** @param {number} codePoint */
79+
push(codePoint) {
80+
if (codePoint <= this.lastCodePoint)
81+
throw new Error('Code points must be added in order.');
82+
this.lastCodePoint = codePoint;
83+
84+
if (codePoint < MAX_UNICODE_NON_SURROGATE) {
85+
if (
86+
this.nonSurrogateRange &&
87+
this.nonSurrogateRange.lastCodePoint === codePoint - 1
88+
) {
89+
this.nonSurrogateRange.lastCodePoint = codePoint;
90+
return;
91+
}
92+
this.nonSurrogateRange = new NonSurrogateRange(codePoint);
93+
this.nonSurrogateRanges.push(this.nonSurrogateRange);
94+
} else {
95+
const leadSurrogate = Math.floor((codePoint - 0x10000) / 0x400) + 0xd800;
96+
const trailSurrogate = ((codePoint - 0x10000) % 0x400) + 0xdc00;
97+
if (
98+
!this.leadSurrogateRange ||
99+
this.leadSurrogateRange.leadSurrogate !== leadSurrogate
100+
) {
101+
this.trailSurrogateRange = undefined;
102+
this.leadSurrogateRange = new LeadSurrogateRange(leadSurrogate);
103+
this.surrogateRanges.push(this.leadSurrogateRange);
104+
}
105+
106+
if (
107+
this.trailSurrogateRange &&
108+
this.trailSurrogateRange.lastTrailSurrogate === trailSurrogate - 1
109+
) {
110+
this.trailSurrogateRange.lastTrailSurrogate = trailSurrogate;
111+
return;
112+
}
113+
114+
this.trailSurrogateRange = new TrailSurrogateRange(trailSurrogate);
115+
this.leadSurrogateRange.ranges.push(this.trailSurrogateRange);
116+
}
117+
}
118+
119+
toString() {
120+
let first = this.nonSurrogateRanges.join('');
121+
let second = this.surrogateRanges.join('|');
122+
return first && second
123+
? `([${first}]|${second})`
124+
: first
125+
? `[${first}]`
126+
: second
127+
? `(${second})`
128+
: '';
129+
}
130+
}
131+
132+
const MAX_UNICODE_NON_SURROGATE = 0xffff;
133+
const MAX_UNICODE_CODEPOINT = 0x10ffff;
134+
const isStart = c => /\p{ID_Start}/u.test(c);
135+
const isContinue = c => /\p{ID_Continue}/u.test(c);
136+
137+
let idStartWriter = new Writer();
138+
let idContinueWriter = new Writer();
139+
140+
for (let cp = 0; cp <= MAX_UNICODE_CODEPOINT; cp++) {
141+
const ch = String.fromCodePoint(cp);
142+
if (isStart(ch)) {
143+
idStartWriter.push(cp);
144+
}
145+
if (isContinue(ch)) {
146+
idContinueWriter.push(cp);
147+
}
148+
}
149+
150+
console.log(`/**
151+
* Generated by scripts/generate-unicode-id-parts.js on node ${
152+
process.version
153+
} with unicode ${process.versions.unicode}
154+
* based on http://www.unicode.org/reports/tr31/ and https://tc39.es/ecma262/#sec-names-and-keywords
155+
* U_ID_START corresponds to the ID_Start property, and U_ID_CONTINUE corresponds to ID_Continue property.
156+
*/`);
157+
console.log('U_ID_START ' + idStartWriter.toString());
158+
console.log('U_ID_CONTINUE ' + idContinueWriter.toString());

src/compat-transpiler/transforms/compat-named-capturing-groups-transform.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
* A regexp-tree plugin to translate `/(?<name>a)\k<name>/` to `/(a)\1/`.
1010
*/
1111
module.exports = {
12-
1312
// To track the names of the groups, and return them
1413
// in the transform result state.
1514
//
@@ -41,6 +40,7 @@ module.exports = {
4140
this._groupNames[node.name] = node.number;
4241

4342
delete node.name;
43+
delete node.nameRaw;
4444
},
4545

4646
Backreference(path) {
@@ -52,5 +52,6 @@ module.exports = {
5252

5353
node.kind = 'number';
5454
node.reference = node.number;
55+
delete node.referenceRaw;
5556
},
56-
};
57+
};

src/generator/__tests__/generator-basic-test.js

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ function test(re) {
1414
}
1515

1616
describe('generator-basic', () => {
17-
1817
it('simple char', () => {
1918
test(/a/);
2019
});
@@ -81,11 +80,11 @@ describe('generator-basic', () => {
8180
});
8281

8382
it('named group', () => {
84-
test('/(?<foo\\u003B\\u{003B}>bar)/');
83+
test('/(?<foo\\u003B\\u{003B}>bar)/u');
8584
});
8685

8786
it('empty named group', () => {
88-
test('/(?<foo\\u003B\\u{003B}>)/');
87+
test('/(?<foo\\u003B\\u{003B}>)/u');
8988
});
9089

9190
it('empty non-capturing group', () => {
@@ -97,7 +96,7 @@ describe('generator-basic', () => {
9796
});
9897

9998
it('named backreference', () => {
100-
test('/(?<foo\\u003B\\u{003B}>)\\k<foo\\u003B\\u{003B}>/');
99+
test('/(?<foo\\u003B\\u{003B}>)\\k<foo\\u003B\\u{003B}>/u');
101100
});
102101

103102
it('basic-assertion', () => {
@@ -179,5 +178,4 @@ describe('generator-basic', () => {
179178
test(/a{1,}?/);
180179
test(/a{1,3}?/);
181180
});
182-
183-
});
181+
});

src/generator/index.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ const generator = {
3434
if (node.capturing) {
3535
// A named group.
3636
if (node.name) {
37-
return `(?<${node.name}>${expression})`;
37+
return `(?<${node.nameRaw || node.name}>${expression})`;
3838
}
3939

4040
return `(${expression})`;
@@ -48,7 +48,7 @@ const generator = {
4848
case 'number':
4949
return `\\${node.reference}`;
5050
case 'name':
51-
return `\\k<${node.reference}>`;
51+
return `\\k<${node.referenceRaw || node.reference}>`;
5252
default:
5353
throw new TypeError(`Unknown Backreference kind: ${node.kind}`);
5454
}

0 commit comments

Comments
 (0)