From 853c89dfdcbf283db26a1f2d822c82289b56045d Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 2 Feb 2017 14:17:07 +0000 Subject: [PATCH] Fix spurious html tags like * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 --- src/Markdown.js | 117 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 39 deletions(-) diff --git a/src/Markdown.js b/src/Markdown.js index af103b7ad6..d6dc979a5a 100644 --- a/src/Markdown.js +++ b/src/Markdown.js @@ -15,6 +15,45 @@ limitations under the License. */ import commonmark from 'commonmark'; +import escape from 'lodash/escape'; + +const ALLOWED_HTML_TAGS = ['del']; + +// These types of node are definitely text +const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document']; + +function is_allowed_html_tag(node) { + // Regex won't work for tags with attrs, but we only + // allow anyway. + const matches = /^<\/?(.*)>$/.exec(node.literal); + if (matches && matches.length == 2) { + const tag = matches[1]; + return ALLOWED_HTML_TAGS.indexOf(tag) > -1; + } + return false; +} + +function html_if_tag_allowed(node) { + if (is_allowed_html_tag(node)) { + this.lit(node.literal); + return; + } else { + this.lit(escape(node.literal)); + } +} + +/* + * Returns true if the parse output containing the node + * comprises multiple block level elements (ie. lines), + * or false if it is only a single line. + */ +function is_multi_line(node) { + var par = node; + while (par.parent) { + par = par.parent; + } + return par.firstChild != par.lastChild; +} /** * Class that wraps commonmark, adding the ability to see whether @@ -30,29 +69,26 @@ export default class Markdown { } isPlainText() { - // we determine if the message requires markdown by - // running the parser on the tokens with a dummy - // rendered and seeing if any of the renderer's - // functions are called other than those noted below. - // TODO: can't we just examine the output of the parser? - let is_plain = true; + const walker = this.parsed.walker(); - function setNotPlain() { - is_plain = false; + let ev; + while ( (ev = walker.next()) ) { + const node = ev.node; + if (TEXT_NODES.indexOf(node.type) > -1) { + // definitely text + continue; + } else if (node.type == 'html_inline' || node.type == 'html_block') { + // if it's an allowed html tag, we need to render it and therefore + // we will need to use HTML. If it's not allowed, it's not HTML since + // we'll just be treating it as text. + if (is_allowed_html_tag(node)) { + return false; + } + } else { + return false; + } } - - const dummy_renderer = new commonmark.HtmlRenderer(); - for (const k of Object.keys(commonmark.HtmlRenderer.prototype)) { - dummy_renderer[k] = setNotPlain; - } - // text and paragraph are just text - dummy_renderer.text = function(t) { return t; }; - dummy_renderer.softbreak = function(t) { return t; }; - dummy_renderer.paragraph = function(t) { return t; }; - - dummy_renderer.render(this.parsed); - - return is_plain; + return true; } toHTML() { @@ -65,20 +101,27 @@ export default class Markdown { // 'inline', rather than unnecessarily wrapped in its own // p tag. If, however, we have multiple nodes, each gets // its own p tag to keep them as separate paragraphs. - var par = node; - while (par.parent) { - par = par.parent; - } - if (par.firstChild != par.lastChild) { + if (is_multi_line(node)) { real_paragraph.call(this, node, entering); } }; + renderer.html_inline = html_if_tag_allowed; + renderer.html_block = function(node) { + // as with `paragraph`, we only insert line breaks + // if there are multiple lines in the markdown. + const isMultiLine = is_multi_line(node); + + if (isMultiLine) this.cr(); + html_if_tag_allowed.call(this, node); + if (isMultiLine) this.cr(); + } + return renderer.render(this.parsed); } /* - * Render the mrkdown message to plain text. That is, essentially + * Render the markdown message to plain text. That is, essentially * just remove any backslashes escaping what would otherwise be * markdown syntax * (to fix https://github.com/vector-im/riot-web/issues/2870) @@ -96,22 +139,18 @@ export default class Markdown { }; renderer.paragraph = function(node, entering) { - // If there is only one top level node, just return the - // bare text: it's a single line of text and so should be - // 'inline', rather than unnecessarily wrapped in its own - // p tag. If, however, we have multiple nodes, each gets - // its own p tag to keep them as separate paragraphs. - var par = node; - while (par.parent) { - node = par; - par = par.parent; - } - if (node != par.lastChild) { - if (!entering) { + // as with toHTML, only append lines to paragraphs if there are + // multiple paragraphs + if (is_multi_line(node)) { + if (!entering && node.next) { this.lit('\n\n'); } } }; + renderer.html_block = function(node) { + this.lit(node.literal); + if (is_multi_line(node) && node.next) this.lit('\n\n'); + } return renderer.render(this.parsed); }