diff --git a/src/parse.js b/src/parse.js index 7d0b8e7..e395905 100644 --- a/src/parse.js +++ b/src/parse.js @@ -102,9 +102,17 @@ export default function parse(html, options) { // calculate correct end of the content slice in case there's // no tag after the text node. const end = html.indexOf('<', start) - const content = html.slice(start, end === -1 ? undefined : end) - // if a node is nothing but whitespace, no need to add it. - if (!/^\s*$/.test(content)) { + let content = html.slice(start, end === -1 ? undefined : end) + // if a node is nothing but whitespace, collapse it as the spec states: + // https://www.w3.org/TR/html4/struct/text.html#h-9.1 + if (/^\s*$/.test(content)) { + content = ' '; + } + // don't add whitespace-only text nodes if they would be trailing text nodes + // or if they would be leading whitespace-only text nodes: + // * end > -1 indicates this is not a trailing text node + // * leading node is when level is -1 and parent has length 0 + if ((end > -1 && level + parent.length >= 0) || content !== ' ') { parent.push({ type: 'text', content: content, diff --git a/test/parse.js b/test/parse.js index 8f710fa..9a49ab8 100644 --- a/test/parse.js +++ b/test/parse.js @@ -659,6 +659,10 @@ test('parse', function (t) { voidElement: false, children: [{ type: 'text', content: 'something' }], }, + { + type: 'text', + content: ' ', + }, { type: 'tag', name: 'a', @@ -686,6 +690,7 @@ test('parse', function (t) { voidElement: false, children: [{ type: 'text', content: 'Hi' }], }, + { type: 'text', content: ' ' }, { type: 'tag', name: 'span', @@ -881,3 +886,53 @@ test('ReDoS vulnerability reported by Sam Sanoop of Snyk', function (t) { t.ok(duration < 100, 'should not hang') t.end() }) + +test('whitespace', function (t) { + let html = '
\n' + let parsed = HTML.parse(html) + t.deepEqual(parsed, [{ + type: 'tag', + name: 'div', + attrs: {}, + voidElement: false, + children: [] + }], 'should not explode on trailing whitespace') + + html = '