Fix links being parsed as markdown links improperly

Dariusz Niemczyk · Dariusz Niemczyk · commit 6cf505640d92 · 2021-11-25T14:43:45.000+01:00
Fixes #4674
diff --git a/src/Markdown.ts b/src/Markdown.ts
@@ -17,6 +17,8 @@ limitations under the License.
 
 import * as commonmark from 'commonmark';
 import { escape } from "lodash";
+import { logger } from 'matrix-js-sdk/src/logger';
+import * as linkify from 'linkifyjs';
 
 const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
 
@@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
     link: (node: commonmark.Node, entering: boolean) => void;
     html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
     html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
+    text: (node: commonmark.Node) => void;
+    out: (text: string) => void;
+    emph: (node: commonmark.Node) => void;
 }
 
 function isAllowedHtmlTag(node: commonmark.Node): boolean {
@@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
     return par.firstChild != par.lastChild;
 }
 
+function getTextUntilEndOrLinebreak(node: commonmark.Node) {
+    let currentNode = node;
+    let text = '';
+    while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') {
+        const { literal, type } = currentNode;
+        if (type === 'text' && literal) {
+            let n = 0;
+            let char = literal[n];
+            while (char !== ' ' && char !== null && n <= literal.length) {
+                if (char === ' ') {
+                    break;
+                }
+                if (char) {
+                    text += char;
+                }
+                n += 1;
+                char = literal[n];
+            }
+            if (char === ' ') {
+                break;
+            }
+        }
+        currentNode = currentNode.next;
+    }
+    return text;
+}
+
 /**
  * Class that wraps commonmark, adding the ability to see whether
  * a given message actually uses any markdown syntax or whether
@@ -70,11 +102,87 @@ export default class Markdown {
     private input: string;
     private parsed: commonmark.Node;
 
-    constructor(input) {
+    constructor(input: string) {
         this.input = input;
 
         const parser = new commonmark.Parser();
         this.parsed = parser.parse(this.input);
+        this.parsed = this.escapeLinks(this.parsed);
+    }
+
+    private escapeLinks(parsed: commonmark.Node) {
+        const walker = parsed.walker();
+        let event: commonmark.NodeWalkingStep = null;
+        let text = '';
+        let isInPara = false;
+        let previousNode: commonmark.Node | null = null;
+        while ((event = walker.next())) {
+            const { node } = event;
+            if (node.type === 'paragraph') {
+                if (event.entering) {
+                    isInPara = true;
+                } else {
+                    isInPara = false;
+                }
+            }
+            if (isInPara) {
+                // Clear saved string when line ends
+                if (
+                    node.type === 'softbreak' ||
+                    node.type === 'linebreak' ||
+                    // Also start calculating the text from the beginning on any spaces
+                    (node.type === 'text' && node.literal === ' ')
+                ) {
+                    text = '';
+                }
+                if (node.type === 'text') {
+                    text += node.literal;
+                }
+                // We should not do this if previous node was not a textnode, as we can't combine it then.
+                if (node.type === 'emph' && previousNode.type === 'text') {
+                    if (event.entering) {
+                        const foundLinks = linkify.find(text);
+                        for (const { value } of foundLinks) {
+                            if (node.firstChild.literal) {
+                                /**
+                                 * NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
+                                 * but this solution seems to work well and is hopefully slightly easier to understand too
+                                 */
+                                const nonEmphasizedText = `_${node.firstChild.literal}_`;
+                                const f = getTextUntilEndOrLinebreak(node);
+                                const newText = value + nonEmphasizedText + f;
+                                const newLinks = linkify.find(newText);
+                                // Should always find only one link here, if it finds more it means that the algorithm is broken
+                                if (newLinks.length === 1) {
+                                    const emphasisTextNode = new commonmark.Node('text');
+                                    emphasisTextNode.literal = nonEmphasizedText;
+                                    previousNode.insertAfter(emphasisTextNode);
+                                    node.firstChild.literal = '';
+                                    // Empty string is a sign that we should ignore it in HTML rendering
+                                    node.literal = '';
+                                } else {
+                                    logger.error(
+                                        "Markdown links escaping found too many links for following text: ",
+                                        text,
+                                    );
+                                    logger.error(
+                                        "Markdown links escaping found too many links for modified text: ",
+                                        newText,
+                                    );
+                                }
+                            }
+                        }
+                    } else {
+                        node.firstChild.literal = '';
+                        // Empty string is a sign that we should ignore it in HTML rendering
+                        node.literal = '';
+                        // node.unlink();
+                    }
+                }
+            }
+            previousNode = node;
+        }
+        return parsed;
     }
 
     isPlainText(): boolean {
@@ -120,9 +228,7 @@ export default class Markdown {
         // you can nest them.
         //
         // Let's try sending with <p/>s anyway for now, though.
-
         const realParagraph = renderer.paragraph;
-
         renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
             // If there is only one top level node, just return the
             // bare text: it's a single line of text and so should be
@@ -133,6 +239,9 @@ export default class Markdown {
                 realParagraph.call(this, node, entering);
             }
         };
+        renderer.text = (node: commonmark.Node) => {
+            renderer.out(node.literal);
+        };
 
         renderer.link = function(node, entering) {
             const attrs = this.attrs(node);
@@ -165,7 +274,7 @@ export default class Markdown {
         renderer.html_block = function(node: commonmark.Node) {
             /*
             // as with `paragraph`, we only insert line breaks
-            // if there are multiple lines in the markdown.
+            // if there are multiple lines in the markdownode.
             const isMultiLine = is_multi_line(node);
             if (isMultiLine) this.cr();
             */
@@ -175,6 +284,17 @@ export default class Markdown {
             */
         };
 
+        const realEmph = renderer.emph;
+
+        renderer.emph = function(node: commonmark.Node) {
+            // We're escaping links with emphasis in the middle of it to act like a real link
+            // This empty string check here is to verify that we have modified the emphasis node properly
+            if (node.type === 'emph' && node.literal === "") {
+                return;
+            }
+            return realEmph(node);
+        };
+
         return renderer.render(this.parsed);
     }
 
@@ -184,7 +304,7 @@ export default class Markdown {
      * markdown syntax
      * (to fix https://github.com/vector-im/element-web/issues/2870).
      *
-     * N.B. this does **NOT** render arbitrary MD to plain text - only MD
+     * node.B. this does **NOT** render arbitrary MD to plain text - only MD
      * which has no formatting.  Otherwise it emits HTML(!).
      */
     toPlaintext(): string {
@@ -205,6 +325,16 @@ export default class Markdown {
             if (isMultiLine(node) && node.next) this.lit('\n\n');
         };
 
+        const realEmph = renderer.emph;
+        renderer.emph = function(node: commonmark.Node) {
+            // We're escaping links with emphasis in the middle of it to act like a real link
+            // This empty string check here is to verify that we have modified the emphasis node properly
+            if (node.type === 'emph' && node.literal === "") {
+                return;
+            }
+            return realEmph(node);
+        };
+
         return renderer.render(this.parsed);
     }
 }
diff --git a/test/Markdown-test.ts b/test/Markdown-test.ts
@@ -0,0 +1,109 @@
+/**
+ * @jest-environment jsdom
+ */
+import Markdown from "../src/Markdown";
+
+describe("Markdown parser test", () => {
+    describe("autolinks", () => {
+        const testString = [
+            "Test1:",
+            "#_foonetic_xkcd:matrix.org",
+            "http://google.com/_thing_",
+            "https://matrix.org/_matrix/client/foo/123_",
+            "#_foonetic_xkcd:matrix.org",
+            "",
+            "Test1A:",
+            "#_foonetic_xkcd:matrix.org",
+            "http://google.com/_thing_",
+            "https://matrix.org/_matrix/client/foo/123_",
+            "#_foonetic_xkcd:matrix.org",
+            "",
+            "Test2:",
+            "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
+            "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
+            "",
+            "Test3:",
+            "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
+            "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
+        ].join("\n");
+
+        it('tests that links are getting properly HTML formatted', () => {
+            /* eslint-disable max-len */
+            const expectedResult = [
+                "<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
+                "<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
+                "<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>",
+                "<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>",
+                "",
+            ].join("\n");
+            /* eslint-enable max-len */
+            const md = new Markdown(testString);
+            expect(md.toHTML()).toEqual(expectedResult);
+        });
+        it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
+            const test = [
+                "Test1:",
+                "<#_foonetic_xkcd:matrix.org>",
+                "<http://google.com/_thing_>",
+                "<https://matrix.org/_matrix/client/foo/123_>",
+                "<#_foonetic_xkcd:matrix.org>",
+                "",
+                "Test1A:",
+                "<#_foonetic_xkcd:matrix.org>",
+                "<http://google.com/_thing_>",
+                "<https://matrix.org/_matrix/client/foo/123_>",
+                "<#_foonetic_xkcd:matrix.org>",
+                "",
+                "Test2:",
+                "<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
+                "<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
+                "",
+                "Test3:",
+                "<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
+                "<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
+            ].join("\n");
+            /* eslint-disable max-len */
+            /**
+             * NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org
+             * but it seems to be actually working properly
+             */
+            const expectedResult = [
+                "<p>Test1:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
+                "<p>Test1A:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
+                "<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>",
+                "<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>",
+                "",
+            ].join("\n");
+            /* eslint-enable max-len */
+            const md = new Markdown(test);
+            expect(md.toHTML()).toEqual(expectedResult);
+        });
+
+        it('expects that links in codeblock are not modified', () => {
+            const expectedResult = [
+                '<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org',
+                'http://google.com/_thing_',
+                'https://matrix.org/_matrix/client/foo/123_',
+                '#_foonetic_xkcd:matrix.org',
+                '',
+                'Test1A:',
+                '#_foonetic_xkcd:matrix.org',
+                'http://google.com/_thing_',
+                'https://matrix.org/_matrix/client/foo/123_',
+                '#_foonetic_xkcd:matrix.org',
+                '',
+                'Test2:',
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+                '',
+                'Test3:',
+                'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
+                'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
+                '</code></pre>',
+                '',
+            ].join('\n');
+            const md = new Markdown("```" + testString + "```");
+            expect(md.toHTML()).toEqual(expectedResult);
+        });
+    });
+});