From 853c89dfdcbf283db26a1f2d822c82289b56045d Mon Sep 17 00:00:00 2001
From: David Baker <dave@matrix.org>
Date: Thu, 2 Feb 2017 14:17:07 +0000
Subject: [PATCH] Fix spurious html tags like <shrug>

 * Only render HTML tags in markdown if they're del tags
 * Consider non-allowed HTML tags as plain text nodes, so
   a message of just '<shrug>' doesn't need to be sent as
   HTML
 * Consequently rewrite isPlaintext to just look at the parse
   tree rather than making and gutting a renderer to walk
   the tree (now we're using a library that actually produces
   a meaningfgul parse tree).
 * Tweak when we put \n on text output to avoid putting \n on
   the end of messages.

Fixes https://github.com/vector-im/riot-web/issues/3065
---
 src/Markdown.js | 117 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 78 insertions(+), 39 deletions(-)
diff --git a/src/Markdown.js b/src/Markdown.js
index af103b7ad6..d6dc979a5a 100644
--- a/src/Markdown.js
+++ b/src/Markdown.js
@@ -15,6 +15,45 @@ limitations under the License.
 */
 
 import commonmark from 'commonmark';
+import escape from 'lodash/escape';
+
+const ALLOWED_HTML_TAGS = ['del'];
+
+// These types of node are definitely text
+const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document'];
+
+function is_allowed_html_tag(node) {
+    // Regex won't work for tags with attrs, but we only
+    // allow <del> anyway.
+    const matches = /^<\/?(.*)>$/.exec(node.literal);
+    if (matches && matches.length == 2) {
+        const tag = matches[1];
+        return ALLOWED_HTML_TAGS.indexOf(tag) > -1;
+    }
+    return false;
+}
+
+function html_if_tag_allowed(node) {
+    if (is_allowed_html_tag(node)) {
+        this.lit(node.literal);
+        return;
+    } else {
+        this.lit(escape(node.literal));
+    }
+}
+
+/*
+ * Returns true if the parse output containing the node
+ * comprises multiple block level elements (ie. lines),
+ * or false if it is only a single line.
+ */
+function is_multi_line(node) {
+    var par = node;
+    while (par.parent) {
+        par = par.parent;
+    }
+    return par.firstChild != par.lastChild;
+}
 
 /**
  * Class that wraps commonmark, adding the ability to see whether
@@ -30,29 +69,26 @@ export default class Markdown {
     }
 
     isPlainText() {
-        // we determine if the message requires markdown by
-        // running the parser on the tokens with a dummy
-        // rendered and seeing if any of the renderer's
-        // functions are called other than those noted below.
-        // TODO: can't we just examine the output of the parser?
-        let is_plain = true;
+        const walker = this.parsed.walker();
 
-        function setNotPlain() {
-            is_plain = false;
+        let ev;
+        while ( (ev = walker.next()) ) {
+            const node = ev.node;
+            if (TEXT_NODES.indexOf(node.type) > -1) {
+                // definitely text
+                continue;
+            } else if (node.type == 'html_inline' || node.type == 'html_block') {
+                // if it's an allowed html tag, we need to render it and therefore
+                // we will need to use HTML. If it's not allowed, it's not HTML since
+                // we'll just be treating it as text.
+                if (is_allowed_html_tag(node)) {
+                    return false;
+                }
+            } else {
+                return false;
+            }
         }
-
-        const dummy_renderer = new commonmark.HtmlRenderer();
-        for (const k of Object.keys(commonmark.HtmlRenderer.prototype)) {
-            dummy_renderer[k] = setNotPlain;
-        }
-        // text and paragraph are just text
-        dummy_renderer.text = function(t) { return t; };
-        dummy_renderer.softbreak = function(t) { return t; };
-        dummy_renderer.paragraph = function(t) { return t; };
-
-        dummy_renderer.render(this.parsed);
-
-        return is_plain;
+        return true;
     }
 
     toHTML() {
@@ -65,20 +101,27 @@ export default class Markdown {
             // 'inline', rather than unnecessarily wrapped in its own
             // p tag. If, however, we have multiple nodes, each gets
             // its own p tag to keep them as separate paragraphs.
-            var par = node;
-            while (par.parent) {
-                par = par.parent;
-            }
-            if (par.firstChild != par.lastChild) {
+            if (is_multi_line(node)) {
                 real_paragraph.call(this, node, entering);
             }
         };
 
+        renderer.html_inline = html_if_tag_allowed;
+        renderer.html_block = function(node) {
+            // as with `paragraph`, we only insert line breaks
+            // if there are multiple lines in the markdown.
+            const isMultiLine = is_multi_line(node);
+
+            if (isMultiLine) this.cr();
+            html_if_tag_allowed.call(this, node);
+            if (isMultiLine) this.cr();
+        }
+
         return renderer.render(this.parsed);
     }
 
     /*
-     * Render the mrkdown message to plain text. That is, essentially
+     * Render the markdown message to plain text. That is, essentially
      * just remove any backslashes escaping what would otherwise be
      * markdown syntax
      * (to fix https://github.com/vector-im/riot-web/issues/2870)
@@ -96,22 +139,18 @@ export default class Markdown {
         };
 
         renderer.paragraph = function(node, entering) {
-            // If there is only one top level node, just return the
-            // bare text: it's a single line of text and so should be
-            // 'inline', rather than unnecessarily wrapped in its own
-            // p tag. If, however, we have multiple nodes, each gets
-            // its own p tag to keep them as separate paragraphs.
-            var par = node;
-            while (par.parent) {
-                node = par;
-                par = par.parent;
-            }
-            if (node != par.lastChild) {
-                if (!entering) {
+            // as with toHTML, only append lines to paragraphs if there are
+            // multiple paragraphs
+            if (is_multi_line(node)) {
+                if (!entering && node.next) {
                     this.lit('\n\n');
                 }
             }
         };
+        renderer.html_block = function(node) {
+            this.lit(node.literal);
+            if (is_multi_line(node) && node.next) this.lit('\n\n');
+        }
 
         return renderer.render(this.parsed);
     }