@@ -91,15 +91,49 @@ const (
91
91
// until we've allocated at least maxHeight Regexp structures.
92
92
const maxHeight = 1000
93
93
94
// maxSize is the maximum size of a compiled regexp in Insts.
// It too is somewhat arbitrarily chosen, but the idea is to be large enough
// to allow significant regexps while at the same time small enough that
// the compiled form will not take up too much memory.
// 128 MB is enough for about 3.3 million Inst structures, which roughly
// corresponds to a 3.3 MB regexp.
const (
	instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
	maxSize  = 128 << 20 / instSize
)
104
+
105
// maxRunes is the maximum number of runes allowed in a regexp tree,
// counting the runes in all the nodes.
// Ignoring character classes, p.numRunes is always less than the length
// of the regexp. Character classes can make it much larger: each \pL adds
// 1292 runes. 128 MB is enough for 32M runes, which is roughly 26k \pL
// instances.
//
// Note that repetitions do not make copies of the rune slices,
// so \pL{1000} is only one rune slice, not 1000.
//
// We could keep a cache of character classes we've seen, so that all the
// \pL we see use the same rune list, but that doesn't remove the problem
// entirely: consider something like
// [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
// And because the Rune slice is exposed directly in the Regexp, there is
// no opportunity to change the representation to allow partial sharing
// between different character classes.
// So the limit is the best we can do.
const (
	runeSize = 4 // rune is int32
	maxRunes = 128 << 20 / runeSize
)
124
+
94
125
type parser struct {
95
126
flags Flags // parse mode flags
96
127
stack []* Regexp // stack of parsed expressions
97
128
free * Regexp
98
129
numCap int // number of capturing groups seen
99
130
wholeRegexp string
100
- tmpClass []rune // temporary char class work space
101
- numRegexp int // number of regexps allocated
102
- height map [* Regexp ]int // regexp height for height limit check
131
+ tmpClass []rune // temporary char class work space
132
+ numRegexp int // number of regexps allocated
133
+ numRunes int // number of runes in char classes
134
+ repeats int64 // product of all repetitions seen
135
+ height map [* Regexp ]int // regexp height, for height limit check
136
+ size map [* Regexp ]int64 // regexp compiled size, for size limit check
103
137
}
104
138
105
139
func (p * parser ) newRegexp (op Op ) * Regexp {
@@ -123,6 +157,104 @@ func (p *parser) reuse(re *Regexp) {
123
157
p .free = re
124
158
}
125
159
160
+ func (p * parser ) checkLimits (re * Regexp ) {
161
+ if p .numRunes > maxRunes {
162
+ panic (ErrInternalError )
163
+ }
164
+ p .checkSize (re )
165
+ p .checkHeight (re )
166
+ }
167
+
168
+ func (p * parser ) checkSize (re * Regexp ) {
169
+ if p .size == nil {
170
+ // We haven't started tracking size yet.
171
+ // Do a relatively cheap check to see if we need to start.
172
+ // Maintain the product of all the repeats we've seen
173
+ // and don't track if the total number of regexp nodes
174
+ // we've seen times the repeat product is in budget.
175
+ if p .repeats == 0 {
176
+ p .repeats = 1
177
+ }
178
+ if re .Op == OpRepeat {
179
+ n := re .Max
180
+ if n == - 1 {
181
+ n = re .Min
182
+ }
183
+ if n <= 0 {
184
+ n = 1
185
+ }
186
+ if int64 (n ) > maxSize / p .repeats {
187
+ p .repeats = maxSize
188
+ } else {
189
+ p .repeats *= int64 (n )
190
+ }
191
+ }
192
+ if int64 (p .numRegexp ) < maxSize / p .repeats {
193
+ return
194
+ }
195
+
196
+ // We need to start tracking size.
197
+ // Make the map and belatedly populate it
198
+ // with info about everything we've constructed so far.
199
+ p .size = make (map [* Regexp ]int64 )
200
+ for _ , re := range p .stack {
201
+ p .checkSize (re )
202
+ }
203
+ }
204
+
205
+ if p .calcSize (re , true ) > maxSize {
206
+ panic (ErrInternalError )
207
+ }
208
+ }
209
+
210
+ func (p * parser ) calcSize (re * Regexp , force bool ) int64 {
211
+ if ! force {
212
+ if size , ok := p .size [re ]; ok {
213
+ return size
214
+ }
215
+ }
216
+
217
+ var size int64
218
+ switch re .Op {
219
+ case OpLiteral :
220
+ size = int64 (len (re .Rune ))
221
+ case OpCapture , OpStar :
222
+ // star can be 1+ or 2+; assume 2 pessimistically
223
+ size = 2 + p .calcSize (re .Sub [0 ], false )
224
+ case OpPlus , OpQuest :
225
+ size = 1 + p .calcSize (re .Sub [0 ], false )
226
+ case OpConcat :
227
+ for _ , sub := range re .Sub {
228
+ size += p .calcSize (sub , false )
229
+ }
230
+ case OpAlternate :
231
+ for _ , sub := range re .Sub {
232
+ size += p .calcSize (sub , false )
233
+ }
234
+ if len (re .Sub ) > 1 {
235
+ size += int64 (len (re .Sub )) - 1
236
+ }
237
+ case OpRepeat :
238
+ sub := p .calcSize (re .Sub [0 ], false )
239
+ if re .Max == - 1 {
240
+ if re .Min == 0 {
241
+ size = 2 + sub // x*
242
+ } else {
243
+ size = 1 + int64 (re .Min )* sub // xxx+
244
+ }
245
+ break
246
+ }
247
+ // x{2,5} = xx(x(x(x)?)?)?
248
+ size = int64 (re .Max )* sub + int64 (re .Max - re .Min )
249
+ }
250
+
251
+ if size < 1 {
252
+ size = 1
253
+ }
254
+ p .size [re ] = size
255
+ return size
256
+ }
257
+
126
258
func (p * parser ) checkHeight (re * Regexp ) {
127
259
if p .numRegexp < maxHeight {
128
260
return
@@ -159,6 +291,7 @@ func (p *parser) calcHeight(re *Regexp, force bool) int {
159
291
160
292
// push pushes the regexp re onto the parse stack and returns the regexp.
161
293
func (p * parser ) push (re * Regexp ) * Regexp {
294
+ p .numRunes += len (re .Rune )
162
295
if re .Op == OpCharClass && len (re .Rune ) == 2 && re .Rune [0 ] == re .Rune [1 ] {
163
296
// Single rune.
164
297
if p .maybeConcat (re .Rune [0 ], p .flags &^FoldCase ) {
@@ -190,7 +323,7 @@ func (p *parser) push(re *Regexp) *Regexp {
190
323
}
191
324
192
325
p .stack = append (p .stack , re )
193
- p .checkHeight (re )
326
+ p .checkLimits (re )
194
327
return re
195
328
}
196
329
@@ -300,7 +433,7 @@ func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (
300
433
re .Sub = re .Sub0 [:1 ]
301
434
re .Sub [0 ] = sub
302
435
p .stack [n - 1 ] = re
303
- p .checkHeight (re )
436
+ p .checkLimits (re )
304
437
305
438
if op == OpRepeat && (min >= 2 || max >= 2 ) && ! repeatIsValid (re , 1000 ) {
306
439
return "" , & Error {ErrInvalidRepeatSize , before [:len (before )- len (after )]}
@@ -508,6 +641,7 @@ func (p *parser) factor(sub []*Regexp) []*Regexp {
508
641
509
642
for j := start ; j < i ; j ++ {
510
643
sub [j ] = p .removeLeadingString (sub [j ], len (str ))
644
+ p .checkLimits (sub [j ])
511
645
}
512
646
suffix := p .collapse (sub [start :i ], OpAlternate ) // recurse
513
647
@@ -565,6 +699,7 @@ func (p *parser) factor(sub []*Regexp) []*Regexp {
565
699
for j := start ; j < i ; j ++ {
566
700
reuse := j != start // prefix came from sub[start]
567
701
sub [j ] = p .removeLeadingRegexp (sub [j ], reuse )
702
+ p .checkLimits (sub [j ])
568
703
}
569
704
suffix := p .collapse (sub [start :i ], OpAlternate ) // recurse
570
705
@@ -762,6 +897,8 @@ func parse(s string, flags Flags) (_ *Regexp, err error) {
762
897
panic (r )
763
898
case nil :
764
899
// ok
900
+ case ErrInternalError : // too big
901
+ err = & Error {Code : ErrInternalError , Expr : s }
765
902
case ErrNestingDepth :
766
903
err = & Error {Code : ErrNestingDepth , Expr : s }
767
904
}
0 commit comments