Skip to content

Commit 3511044

Browse files
authored
Merge pull request dlclark#52 from mstoykov/ecmascriptUnicodeEscape
Support \u{HEX} syntax with ECMAScript with Unicode flag
2 parents 304ee33 + e8de5ea commit 3511044

File tree

4 files changed

+34
-1
lines changed

4 files changed

+34
-1
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ if isMatch, _ := re.MatchString(`Something to match`); isMatch {
9292

9393
This feature is a work in progress and I'm open to ideas for more things to put here (maybe more relaxed character escaping rules?).
9494

95+
## ECMAScript compatibility mode
96+
In this mode the engine provides compatibility with the [regex engine](https://tc39.es/ecma262/multipage/text-processing.html#sec-regexp-regular-expression-objects) described in the ECMAScript specification.
97+
98+
Additionally, a Unicode mode is provided which allows parsing of the `\u{CodePoint}` syntax; this syntax is only recognized when both the ECMAScript and Unicode options are set.
9599

96100
## Library features that I'm still working on
97101
- Regex split

regexp.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ const (
121121
Debug = 0x0080 // "d"
122122
ECMAScript = 0x0100 // "e"
123123
RE2 = 0x0200 // RE2 (regexp package) compatibility mode
124+
Unicode = 0x0400 // "u"
124125
)
125126

126127
func (re *Regexp) RightToLeft() bool {

regexp_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -858,6 +858,20 @@ func TestECMAScriptXCurlyBraceEscape(t *testing.T) {
858858
}
859859
}
860860

861+
func TestEcmaScriptUnicodeRange(t *testing.T) {
862+
r, err := Compile(`([\u{001a}-\u{ffff}]+)`, ECMAScript|Unicode)
863+
if err != nil {
864+
panic(err)
865+
}
866+
m, err := r.FindStringMatch("qqqq")
867+
if err != nil {
868+
panic(err)
869+
}
870+
if m == nil {
871+
t.Fatal("Expected non-nil, got nil")
872+
}
873+
}
874+
861875
func TestNegateRange(t *testing.T) {
862876
re := MustCompile(`[\D]`, 0)
863877
if m, err := re.MatchString("A"); err != nil {

syntax/parser.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const (
2222
Debug = 0x0080 // "d"
2323
ECMAScript = 0x0100 // "e"
2424
RE2 = 0x0200 // RE2 compat mode
25+
Unicode = 0x0400 // "u"
2526
)
2627

2728
func optionFromCode(ch rune) RegexOptions {
@@ -43,6 +44,8 @@ func optionFromCode(ch rune) RegexOptions {
4344
return Debug
4445
case 'e', 'E':
4546
return ECMAScript
47+
case 'u', 'U':
48+
return Unicode
4649
default:
4750
return 0
4851
}
@@ -1695,7 +1698,13 @@ func (p *parser) scanCharEscape() (r rune, err error) {
16951698
r, err = p.scanHex(2)
16961699
}
16971700
case 'u':
1698-
r, err = p.scanHex(4)
1701+
// ECMAScript supports \u{HEX} only if `u` is also set
1702+
if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
1703+
p.moveRight(1)
1704+
return p.scanHexUntilBrace()
1705+
} else {
1706+
r, err = p.scanHex(4)
1707+
}
16991708
case 'a':
17001709
return '\u0007', nil
17011710
case 'b':
@@ -1972,6 +1981,11 @@ func (p *parser) useRE2() bool {
19721981
return (p.options & RE2) != 0
19731982
}
19741983

1984+
// True if the U option, which enables ECMAScript's Unicode behavior, is on.
1985+
func (p *parser) useOptionU() bool {
1986+
return (p.options & Unicode) != 0
1987+
}
1988+
19751989
// True if options stack is empty.
19761990
func (p *parser) emptyOptionsStack() bool {
19771991
return len(p.optionsStack) == 0

0 commit comments

Comments
 (0)