@@ -90,15 +90,49 @@ const (
90
90
// until we've allocated at least maxHeight Regexp structures.
91
91
const maxHeight = 1000
92
92
93
// maxSize bounds the size of a compiled regexp, measured in Insts.
// Like maxHeight, the value is a somewhat arbitrary choice: it should be
// generous enough to admit substantial regexps, yet small enough that the
// compiled form does not consume too much memory.
// A 128 MB budget holds about 3.3 million Inst structures, which roughly
// corresponds to a 3.3 MB regexp.
const (
	instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
	maxSize  = (128 << 20) / instSize
)
103
+
104
// maxRunes caps the total number of runes in a regexp tree, summed over
// the rune slices of all of its nodes.
//
// Setting character classes aside, p.numRunes is always smaller than the
// length of the regexp text. Character classes can inflate it considerably:
// every \pL contributes 1292 runes. A 128 MB budget covers 32M runes,
// which is nearly 26k instances of \pL.
//
// Repetitions do not copy rune slices, so \pL{1000} counts as a single
// slice rather than 1000 of them.
//
// Caching the character classes already seen, so that every \pL shares one
// rune list, would not eliminate the problem: consider
// [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. And since the Rune slice is
// exposed directly in the Regexp, the representation cannot be changed to
// let distinct character classes share part of their storage.
// A hard limit is therefore the best available option.
const (
	runeSize = 4 // rune is int32
	maxRunes = (128 << 20) / runeSize
)
123
+
93
124
type parser struct {
94
125
flags Flags // parse mode flags
95
126
stack []* Regexp // stack of parsed expressions
96
127
free * Regexp
97
128
numCap int // number of capturing groups seen
98
129
wholeRegexp string
99
- tmpClass []rune // temporary char class work space
100
- numRegexp int // number of regexps allocated
101
- height map [* Regexp ]int // regexp height for height limit check
130
+ tmpClass []rune // temporary char class work space
131
+ numRegexp int // number of regexps allocated
132
+ numRunes int // number of runes in char classes
133
+ repeats int64 // product of all repetitions seen
134
+ height map [* Regexp ]int // regexp height, for height limit check
135
+ size map [* Regexp ]int64 // regexp compiled size, for size limit check
102
136
}
103
137
104
138
func (p * parser ) newRegexp (op Op ) * Regexp {
@@ -122,6 +156,104 @@ func (p *parser) reuse(re *Regexp) {
122
156
p .free = re
123
157
}
124
158
159
+ func (p * parser ) checkLimits (re * Regexp ) {
160
+ if p .numRunes > maxRunes {
161
+ panic (ErrInternalError )
162
+ }
163
+ p .checkSize (re )
164
+ p .checkHeight (re )
165
+ }
166
+
167
+ func (p * parser ) checkSize (re * Regexp ) {
168
+ if p .size == nil {
169
+ // We haven't started tracking size yet.
170
+ // Do a relatively cheap check to see if we need to start.
171
+ // Maintain the product of all the repeats we've seen
172
+ // and don't track if the total number of regexp nodes
173
+ // we've seen times the repeat product is in budget.
174
+ if p .repeats == 0 {
175
+ p .repeats = 1
176
+ }
177
+ if re .Op == OpRepeat {
178
+ n := re .Max
179
+ if n == - 1 {
180
+ n = re .Min
181
+ }
182
+ if n <= 0 {
183
+ n = 1
184
+ }
185
+ if int64 (n ) > maxSize / p .repeats {
186
+ p .repeats = maxSize
187
+ } else {
188
+ p .repeats *= int64 (n )
189
+ }
190
+ }
191
+ if int64 (p .numRegexp ) < maxSize / p .repeats {
192
+ return
193
+ }
194
+
195
+ // We need to start tracking size.
196
+ // Make the map and belatedly populate it
197
+ // with info about everything we've constructed so far.
198
+ p .size = make (map [* Regexp ]int64 )
199
+ for _ , re := range p .stack {
200
+ p .checkSize (re )
201
+ }
202
+ }
203
+
204
+ if p .calcSize (re , true ) > maxSize {
205
+ panic (ErrInternalError )
206
+ }
207
+ }
208
+
209
+ func (p * parser ) calcSize (re * Regexp , force bool ) int64 {
210
+ if ! force {
211
+ if size , ok := p .size [re ]; ok {
212
+ return size
213
+ }
214
+ }
215
+
216
+ var size int64
217
+ switch re .Op {
218
+ case OpLiteral :
219
+ size = int64 (len (re .Rune ))
220
+ case OpCapture , OpStar :
221
+ // star can be 1+ or 2+; assume 2 pessimistically
222
+ size = 2 + p .calcSize (re .Sub [0 ], false )
223
+ case OpPlus , OpQuest :
224
+ size = 1 + p .calcSize (re .Sub [0 ], false )
225
+ case OpConcat :
226
+ for _ , sub := range re .Sub {
227
+ size += p .calcSize (sub , false )
228
+ }
229
+ case OpAlternate :
230
+ for _ , sub := range re .Sub {
231
+ size += p .calcSize (sub , false )
232
+ }
233
+ if len (re .Sub ) > 1 {
234
+ size += int64 (len (re .Sub )) - 1
235
+ }
236
+ case OpRepeat :
237
+ sub := p .calcSize (re .Sub [0 ], false )
238
+ if re .Max == - 1 {
239
+ if re .Min == 0 {
240
+ size = 2 + sub // x*
241
+ } else {
242
+ size = 1 + int64 (re .Min )* sub // xxx+
243
+ }
244
+ break
245
+ }
246
+ // x{2,5} = xx(x(x(x)?)?)?
247
+ size = int64 (re .Max )* sub + int64 (re .Max - re .Min )
248
+ }
249
+
250
+ if size < 1 {
251
+ size = 1
252
+ }
253
+ p .size [re ] = size
254
+ return size
255
+ }
256
+
125
257
func (p * parser ) checkHeight (re * Regexp ) {
126
258
if p .numRegexp < maxHeight {
127
259
return
@@ -158,6 +290,7 @@ func (p *parser) calcHeight(re *Regexp, force bool) int {
158
290
159
291
// push pushes the regexp re onto the parse stack and returns the regexp.
160
292
func (p * parser ) push (re * Regexp ) * Regexp {
293
+ p .numRunes += len (re .Rune )
161
294
if re .Op == OpCharClass && len (re .Rune ) == 2 && re .Rune [0 ] == re .Rune [1 ] {
162
295
// Single rune.
163
296
if p .maybeConcat (re .Rune [0 ], p .flags &^FoldCase ) {
@@ -189,7 +322,7 @@ func (p *parser) push(re *Regexp) *Regexp {
189
322
}
190
323
191
324
p .stack = append (p .stack , re )
192
- p .checkHeight (re )
325
+ p .checkLimits (re )
193
326
return re
194
327
}
195
328
@@ -299,7 +432,7 @@ func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (
299
432
re .Sub = re .Sub0 [:1 ]
300
433
re .Sub [0 ] = sub
301
434
p .stack [n - 1 ] = re
302
- p .checkHeight (re )
435
+ p .checkLimits (re )
303
436
304
437
if op == OpRepeat && (min >= 2 || max >= 2 ) && ! repeatIsValid (re , 1000 ) {
305
438
return "" , & Error {ErrInvalidRepeatSize , before [:len (before )- len (after )]}
@@ -503,6 +636,7 @@ func (p *parser) factor(sub []*Regexp) []*Regexp {
503
636
504
637
for j := start ; j < i ; j ++ {
505
638
sub [j ] = p .removeLeadingString (sub [j ], len (str ))
639
+ p .checkLimits (sub [j ])
506
640
}
507
641
suffix := p .collapse (sub [start :i ], OpAlternate ) // recurse
508
642
@@ -560,6 +694,7 @@ func (p *parser) factor(sub []*Regexp) []*Regexp {
560
694
for j := start ; j < i ; j ++ {
561
695
reuse := j != start // prefix came from sub[start]
562
696
sub [j ] = p .removeLeadingRegexp (sub [j ], reuse )
697
+ p .checkLimits (sub [j ])
563
698
}
564
699
suffix := p .collapse (sub [start :i ], OpAlternate ) // recurse
565
700
0 commit comments