Skip to content
This repository was archived by the owner on Sep 11, 2024. It is now read-only.

Commit 6cf5056

Browse files
author
Dariusz Niemczyk
committed
Fix links being parsed as markdown links improperly
Fixes #4674
1 parent ea97c41 commit 6cf5056

File tree

2 files changed

+244
-5
lines changed

2 files changed

+244
-5
lines changed

src/Markdown.ts

Lines changed: 135 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717

1818
import * as commonmark from 'commonmark';
1919
import { escape } from "lodash";
20+
import { logger } from 'matrix-js-sdk/src/logger';
21+
import * as linkify from 'linkifyjs';
2022

2123
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
2224

@@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
2931
link: (node: commonmark.Node, entering: boolean) => void;
3032
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
3133
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
34+
text: (node: commonmark.Node) => void;
35+
out: (text: string) => void;
36+
emph: (node: commonmark.Node) => void;
3237
}
3338

3439
function isAllowedHtmlTag(node: commonmark.Node): boolean {
@@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
6166
return par.firstChild != par.lastChild;
6267
}
6368

69+
function getTextUntilEndOrLinebreak(node: commonmark.Node) {
70+
let currentNode = node;
71+
let text = '';
72+
while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') {
73+
const { literal, type } = currentNode;
74+
if (type === 'text' && literal) {
75+
let n = 0;
76+
let char = literal[n];
77+
while (char !== ' ' && char !== null && n <= literal.length) {
78+
if (char === ' ') {
79+
break;
80+
}
81+
if (char) {
82+
text += char;
83+
}
84+
n += 1;
85+
char = literal[n];
86+
}
87+
if (char === ' ') {
88+
break;
89+
}
90+
}
91+
currentNode = currentNode.next;
92+
}
93+
return text;
94+
}
95+
6496
/**
6597
* Class that wraps commonmark, adding the ability to see whether
6698
* a given message actually uses any markdown syntax or whether
@@ -70,11 +102,87 @@ export default class Markdown {
70102
private input: string;
71103
private parsed: commonmark.Node;
72104

73-
constructor(input) {
105+
constructor(input: string) {
74106
this.input = input;
75107

76108
const parser = new commonmark.Parser();
77109
this.parsed = parser.parse(this.input);
110+
this.parsed = this.escapeLinks(this.parsed);
111+
}
112+
113+
private escapeLinks(parsed: commonmark.Node) {
114+
const walker = parsed.walker();
115+
let event: commonmark.NodeWalkingStep = null;
116+
let text = '';
117+
let isInPara = false;
118+
let previousNode: commonmark.Node | null = null;
119+
while ((event = walker.next())) {
120+
const { node } = event;
121+
if (node.type === 'paragraph') {
122+
if (event.entering) {
123+
isInPara = true;
124+
} else {
125+
isInPara = false;
126+
}
127+
}
128+
if (isInPara) {
129+
// Clear saved string when line ends
130+
if (
131+
node.type === 'softbreak' ||
132+
node.type === 'linebreak' ||
133+
// Also start calculating the text from the beginning on any spaces
134+
(node.type === 'text' && node.literal === ' ')
135+
) {
136+
text = '';
137+
}
138+
if (node.type === 'text') {
139+
text += node.literal;
140+
}
141+
// We should not do this if previous node was not a textnode, as we can't combine it then.
142+
if (node.type === 'emph' && previousNode.type === 'text') {
143+
if (event.entering) {
144+
const foundLinks = linkify.find(text);
145+
for (const { value } of foundLinks) {
146+
if (node.firstChild.literal) {
147+
/**
148+
* NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
149+
* but this solution seems to work well and is hopefully slightly easier to understand too
150+
*/
151+
const nonEmphasizedText = `_${node.firstChild.literal}_`;
152+
const f = getTextUntilEndOrLinebreak(node);
153+
const newText = value + nonEmphasizedText + f;
154+
const newLinks = linkify.find(newText);
155+
// Should always find only one link here, if it finds more it means that the algorithm is broken
156+
if (newLinks.length === 1) {
157+
const emphasisTextNode = new commonmark.Node('text');
158+
emphasisTextNode.literal = nonEmphasizedText;
159+
previousNode.insertAfter(emphasisTextNode);
160+
node.firstChild.literal = '';
161+
// Empty string is a sign that we should ignore it in HTML rendering
162+
node.literal = '';
163+
} else {
164+
logger.error(
165+
"Markdown links escaping found too many links for following text: ",
166+
text,
167+
);
168+
logger.error(
169+
"Markdown links escaping found too many links for modified text: ",
170+
newText,
171+
);
172+
}
173+
}
174+
}
175+
} else {
176+
node.firstChild.literal = '';
177+
// Empty string is a sign that we should ignore it in HTML rendering
178+
node.literal = '';
179+
// node.unlink();
180+
}
181+
}
182+
}
183+
previousNode = node;
184+
}
185+
return parsed;
78186
}
79187

80188
isPlainText(): boolean {
@@ -120,9 +228,7 @@ export default class Markdown {
120228
// you can nest them.
121229
//
122230
// Let's try sending with <p/>s anyway for now, though.
123-
124231
const realParagraph = renderer.paragraph;
125-
126232
renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
127233
// If there is only one top level node, just return the
128234
// bare text: it's a single line of text and so should be
@@ -133,6 +239,9 @@ export default class Markdown {
133239
realParagraph.call(this, node, entering);
134240
}
135241
};
242+
renderer.text = (node: commonmark.Node) => {
243+
renderer.out(node.literal);
244+
};
136245

137246
renderer.link = function(node, entering) {
138247
const attrs = this.attrs(node);
@@ -165,7 +274,7 @@ export default class Markdown {
165274
renderer.html_block = function(node: commonmark.Node) {
166275
/*
167276
// as with `paragraph`, we only insert line breaks
168-
// if there are multiple lines in the markdown.
277+
// if there are multiple lines in the markdownode.
169278
const isMultiLine = is_multi_line(node);
170279
if (isMultiLine) this.cr();
171280
*/
@@ -175,6 +284,17 @@ export default class Markdown {
175284
*/
176285
};
177286

287+
const realEmph = renderer.emph;
288+
289+
renderer.emph = function(node: commonmark.Node) {
290+
// We're escaping links with emphasis in the middle of it to act like a real link
291+
// This empty string check here is to verify that we have modified the emphasis node properly
292+
if (node.type === 'emph' && node.literal === "") {
293+
return;
294+
}
295+
return realEmph(node);
296+
};
297+
178298
return renderer.render(this.parsed);
179299
}
180300

@@ -184,7 +304,7 @@ export default class Markdown {
184304
* markdown syntax
185305
* (to fix https://github.com/vector-im/element-web/issues/2870).
186306
*
187-
* N.B. this does **NOT** render arbitrary MD to plain text - only MD
307+
* node.B. this does **NOT** render arbitrary MD to plain text - only MD
188308
* which has no formatting. Otherwise it emits HTML(!).
189309
*/
190310
toPlaintext(): string {
@@ -205,6 +325,16 @@ export default class Markdown {
205325
if (isMultiLine(node) && node.next) this.lit('\n\n');
206326
};
207327

328+
const realEmph = renderer.emph;
329+
renderer.emph = function(node: commonmark.Node) {
330+
// We're escaping links with emphasis in the middle of it to act like a real link
331+
// This empty string check here is to verify that we have modified the emphasis node properly
332+
if (node.type === 'emph' && node.literal === "") {
333+
return;
334+
}
335+
return realEmph(node);
336+
};
337+
208338
return renderer.render(this.parsed);
209339
}
210340
}

test/Markdown-test.ts

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/**
2+
* @jest-environment jsdom
3+
*/
4+
import Markdown from "../src/Markdown";
5+
6+
describe("Markdown parser test", () => {
7+
describe("autolinks", () => {
8+
const testString = [
9+
"Test1:",
10+
"#_foonetic_xkcd:matrix.org",
11+
"http://google.com/_thing_",
12+
"https://matrix.org/_matrix/client/foo/123_",
13+
"#_foonetic_xkcd:matrix.org",
14+
"",
15+
"Test1A:",
16+
"#_foonetic_xkcd:matrix.org",
17+
"http://google.com/_thing_",
18+
"https://matrix.org/_matrix/client/foo/123_",
19+
"#_foonetic_xkcd:matrix.org",
20+
"",
21+
"Test2:",
22+
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
23+
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
24+
"",
25+
"Test3:",
26+
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
27+
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
28+
].join("\n");
29+
30+
it('tests that links are getting properly HTML formatted', () => {
31+
/* eslint-disable max-len */
32+
const expectedResult = [
33+
"<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
34+
"<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
35+
"<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>",
36+
"<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>",
37+
"",
38+
].join("\n");
39+
/* eslint-enable max-len */
40+
const md = new Markdown(testString);
41+
expect(md.toHTML()).toEqual(expectedResult);
42+
});
43+
it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
44+
const test = [
45+
"Test1:",
46+
"<#_foonetic_xkcd:matrix.org>",
47+
"<http://google.com/_thing_>",
48+
"<https://matrix.org/_matrix/client/foo/123_>",
49+
"<#_foonetic_xkcd:matrix.org>",
50+
"",
51+
"Test1A:",
52+
"<#_foonetic_xkcd:matrix.org>",
53+
"<http://google.com/_thing_>",
54+
"<https://matrix.org/_matrix/client/foo/123_>",
55+
"<#_foonetic_xkcd:matrix.org>",
56+
"",
57+
"Test2:",
58+
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
59+
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
60+
"",
61+
"Test3:",
62+
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
63+
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
64+
].join("\n");
65+
/* eslint-disable max-len */
66+
/**
67+
* NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org
68+
* but it seems to be actually working properly
69+
*/
70+
const expectedResult = [
71+
"<p>Test1:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
72+
"<p>Test1A:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
73+
"<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>",
74+
"<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>",
75+
"",
76+
].join("\n");
77+
/* eslint-enable max-len */
78+
const md = new Markdown(test);
79+
expect(md.toHTML()).toEqual(expectedResult);
80+
});
81+
82+
it('expects that links in codeblock are not modified', () => {
83+
const expectedResult = [
84+
'<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org',
85+
'http://google.com/_thing_',
86+
'https://matrix.org/_matrix/client/foo/123_',
87+
'#_foonetic_xkcd:matrix.org',
88+
'',
89+
'Test1A:',
90+
'#_foonetic_xkcd:matrix.org',
91+
'http://google.com/_thing_',
92+
'https://matrix.org/_matrix/client/foo/123_',
93+
'#_foonetic_xkcd:matrix.org',
94+
'',
95+
'Test2:',
96+
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
97+
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
98+
'',
99+
'Test3:',
100+
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
101+
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
102+
'</code></pre>',
103+
'',
104+
].join('\n');
105+
const md = new Markdown("```" + testString + "```");
106+
expect(md.toHTML()).toEqual(expectedResult);
107+
});
108+
});
109+
});

0 commit comments

Comments
 (0)