diff --git a/src/parse.js b/src/parse.js index 7d0b8e7..4c0fcd7 100644 --- a/src/parse.js +++ b/src/parse.js @@ -102,9 +102,17 @@ export default function parse(html, options) { // calculate correct end of the content slice in case there's // no tag after the text node. const end = html.indexOf('<', start) - const content = html.slice(start, end === -1 ? undefined : end) - // if a node is nothing but whitespace, no need to add it. - if (!/^\s*$/.test(content)) { + let content = html.slice(start, end === -1 ? undefined : end) + // if a node is nothing but whitespace, collapse it as the spec states: + // https://www.w3.org/TR/html4/struct/text.html#h-9.1 + if (/^\s*$/.test(content)) { + content = ' '; + } + // don't add whitespace-only text nodes if they would be trailing text nodes + // or if they would be leading whitespace-only text nodes: + // * end > -1 indicates this is not a trailing text node + // * leading node is when level is -1 and parent has length 0 + if ((options.respectWhitespace && end > -1 && level + parent.length >= 0) || content !== ' ') { parent.push({ type: 'text', content: content, diff --git a/test/parse.js b/test/parse.js index 8f710fa..4c18531 100644 --- a/test/parse.js +++ b/test/parse.js @@ -881,3 +881,79 @@ test('ReDoS vulnerability reported by Sam Sanoop of Snyk', function (t) { t.ok(duration < 100, 'should not hang') t.end() }) + +test('whitespace', function (t) { + let html = '
\n' + let parsed = HTML.parse(html) + t.deepEqual(parsed, [{ + type: 'tag', + name: 'div', + attrs: {}, + voidElement: false, + children: [] + }], 'should not explode on trailing whitespace') + + html = '