Skip to content

Commit 63d3b2f

Browse files
committed
libhtml: Fix edge case in entity parsing
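When matching of a named entity is aborted partway through, we need to backtrack to the longest already-matched prefix that is itself a complete entity not requiring a semicolon, emit that entity's expansion, and then emit the remaining consumed characters literally.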
1 parent 4ad5be4 commit 63d3b2f

File tree

1 file changed

+30
-14
lines changed

1 file changed

+30
-14
lines changed

src/libhtml/escape.rs

Lines changed: 30 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,7 @@ pub struct UnescapeWriter<W> {
140140
enum UnescapeState {
141141
CharData,
142142
Begin,
143-
Named(uint, uint), // index into ENTITIES, and prefix len
143+
Named(uint, uint, uint), // index into ENTITIES, prefix len, last non-semi index
144144
HexStart(bool), // boolean indicates if x is lower or upper case
145145
Hex(u32),
146146
DecStart,
@@ -209,10 +209,14 @@ impl<W: Writer> UnescapeWriter<W> {
209209
Begin => {
210210
try!(self.inner.get_mut_ref().write_str("&"));
211211
}
212-
Named(cursor, plen) => {
212+
Named(cursor, plen, lastcur) => {
213213
let (name, chars, needs_semi) = ENTITIES[cursor];
214214
if !needs_semi && name.len() == plen {
215215
try!(self.inner.get_mut_ref().write_str(chars));
216+
} else if lastcur != -1 {
217+
let (lastname, chars, _) = ENTITIES[lastcur];
218+
try!(self.inner.get_mut_ref().write_str(chars));
219+
try!(self.inner.get_mut_ref().write_str(name.slice(lastname.len(), plen)));
216220
} else {
217221
try!(self.inner.get_mut_ref().write_str(name.slice_to(plen)));
218222
}
@@ -349,13 +353,13 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
349353
// with our character as a prefix.
350354
// There's at least one entity that starts with every letter, so we don't
351355
// have to worry about not finding one.
352-
self.state = Named(base, 2); // plen is 2 to include &
356+
self.state = Named(base, 2, -1); // plen is 2 to include &
353357
}
354-
(Named(cursor, plen), ';') => {
358+
(Named(cursor, plen, _), ';') => {
359+
it.next(); // consume ;
355360
let (name, chars, _) = ENTITIES[cursor];
356361
if name.len() == plen {
357362
// valid entity
358-
it.next(); // consume ;
359363
try!(self.inner_write_str(chars));
360364
self.state = CharData;
361365
cdata = i+1;
@@ -365,25 +369,32 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
365369
cdata = i;
366370
}
367371
}
368-
(Named(cursor, plen), 'a'..'z') |
369-
(Named(cursor, plen), 'A'..'Z') |
370-
(Named(cursor, plen), '0'..'9') => {
372+
(Named(cursor, plen, lastcur), 'a'..'z') |
373+
(Named(cursor, plen, lastcur), 'A'..'Z') |
374+
(Named(cursor, plen, lastcur), '0'..'9') => {
371375
let mut cursor = cursor;
372-
let (name, _, _) = ENTITIES[cursor];
376+
it.next(); // consume character
377+
let (mut name, _, mut needs_semi) = ENTITIES[cursor];
373378
if name.len() > plen && name[plen] == b {
374379
// existing cursor is still a match
375380
} else {
376381
// search forward to find the next entity with our prefix
377382
let prefix = name.slice_to(plen);
378383
for ix in range(cursor+1, ENTITIES.len()) {
379-
let (name, _, _) = ENTITIES[ix];
380-
if !name.starts_with(prefix) {
384+
let (name_, _, needs_semi_) = ENTITIES[ix];
385+
if !name_.starts_with(prefix) {
381386
// no match
382387
cursor = -1;
383388
break;
384389
}
385-
if name.len() > plen && name[plen] == b {
390+
if name_.len() > plen && name_[plen] == b {
386391
cursor = ix;
392+
name = name_;
393+
needs_semi = needs_semi_;
394+
if name_.len() == plen+1 {
395+
name = name_;
396+
needs_semi = needs_semi_;
397+
}
387398
break;
388399
}
389400
}
@@ -394,8 +405,13 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
394405
self.state = CharData;
395406
cdata = i;
396407
} else {
397-
it.next(); // consume character
398-
self.state = Named(cursor, plen+1);
408+
let plen = plen+1;
409+
let lastcur = if !needs_semi && name.len() == plen {
410+
cursor
411+
} else {
412+
lastcur
413+
};
414+
self.state = Named(cursor, plen, lastcur);
399415
}
400416
}
401417
(HexStart(_), 'a'..'f')|(HexStart(_), 'A'..'F')|(HexStart(_), '0'..'9') => {

0 commit comments

Comments
 (0)