From 95883700e321ed1e444fb5358555021a3d3e784b Mon Sep 17 00:00:00 2001 From: Adriano Raiano Date: Sun, 11 Apr 2021 17:53:56 +0200 Subject: [PATCH] MAJOR: respect whitespaces --- src/parse.js | 14 ++++++++++--- test/parse.js | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/parse.js b/src/parse.js index 7d0b8e7..e395905 100644 --- a/src/parse.js +++ b/src/parse.js @@ -102,9 +102,17 @@ export default function parse(html, options) { // calculate correct end of the content slice in case there's // no tag after the text node. const end = html.indexOf('<', start) - const content = html.slice(start, end === -1 ? undefined : end) - // if a node is nothing but whitespace, no need to add it. - if (!/^\s*$/.test(content)) { + let content = html.slice(start, end === -1 ? undefined : end) + // if a node is nothing but whitespace, collapse it as the spec states: + // https://www.w3.org/TR/html4/struct/text.html#h-9.1 + if (/^\s*$/.test(content)) { + content = ' '; + } + // don't add whitespace-only text nodes if they would be trailing text nodes + // or if they would be leading whitespace-only text nodes: + // * end > -1 indicates this is not a trailing text node + // * leading node is when level is -1 and parent has length 0 + if ((end > -1 && level + parent.length >= 0) || content !== ' ') { parent.push({ type: 'text', content: content, diff --git a/test/parse.js b/test/parse.js index 8f710fa..9a49ab8 100644 --- a/test/parse.js +++ b/test/parse.js @@ -659,6 +659,10 @@ test('parse', function (t) { voidElement: false, children: [{ type: 'text', content: 'something' }], }, + { + type: 'text', + content: ' ', + }, { type: 'tag', name: 'a', @@ -686,6 +690,7 @@ test('parse', function (t) { voidElement: false, children: [{ type: 'text', content: 'Hi' }], }, + { type: 'text', content: ' ' }, { type: 'tag', name: 'span', @@ -881,3 +886,53 @@ test('ReDoS vulnerability reported by Sam Sanoop of Snyk', function (t) 
{ t.ok(duration < 100, 'should not hang') t.end() }) + +test('whitespace', function (t) { + let html = '<div></div>\n' + let parsed = HTML.parse(html) + t.deepEqual(parsed, [{ + type: 'tag', + name: 'div', + attrs: {}, + voidElement: false, + children: [] + }], 'should not explode on trailing whitespace') + + html = '<div>Hi</div>\n\n <span>There</span> \t <div> </div>' + parsed = HTML.parse(html) + t.deepEqual(parsed, [{ + type: 'tag', + name: 'div', + attrs: {}, + voidElement: false, + children: [ + { type: 'text', content: 'Hi' } + ] + },{ + type: 'text', + content: ' ' + }, + { + type: 'tag', + name: 'span', + attrs: {}, + voidElement: false, + children: [ + { type: 'text', content: 'There' } + ] + },{ + type: 'text', + content: ' ' + },{ + type: 'tag', + name: 'div', + attrs: {}, + voidElement: false, + children: [ + { type: 'text', content: ' ' } + ] + }], 'should collapse whitespace') + // See https://www.w3.org/TR/html4/struct/text.html#h-9.1 + + t.end() +}) \ No newline at end of file