element-web/src/Markdown.js

/*
Copyright 2016 OpenMarket Ltd

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

import commonmark from 'commonmark';
import escape from 'lodash/escape';

// Names of the raw HTML tags that is_allowed_html_tag() accepts. Raw HTML
// whose tag is not in this list is escaped by html_if_tag_allowed() and so
// ends up displayed as text rather than rendered.
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];

// These types of node are definitely text
const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document'];

/*
 * Returns true if the raw HTML held by the node is exactly one opening
 * or closing tag whose name appears in ALLOWED_HTML_TAGS.
 *
 * Everything between the angle brackets is taken as the tag name, so a
 * tag carrying attributes never matches; none of the allowed tags needs
 * attributes.
 */
function is_allowed_html_tag(node) {
    const found = /^<\/?(.*)>$/.exec(node.literal);
    if (found === null) {
        return false;
    }
    const tagName = found[1];
    return ALLOWED_HTML_TAGS.indexOf(tagName) !== -1;
}

/*
 * Renderer callback for raw HTML nodes; must be invoked with the
 * renderer as `this`. Allowed tags are written to the output verbatim,
 * anything else is HTML-escaped first so it shows up as literal text.
 */
function html_if_tag_allowed(node) {
    const markup = is_allowed_html_tag(node) ? node.literal : escape(node.literal);
    this.lit(markup);
}

/*
 * Climbs from the given node to the root of its parse tree and reports
 * whether that root has more than one child, i.e. whether the output
 * comprises multiple block level elements (lines) rather than just one.
 */
function is_multi_line(node) {
    let root = node;
    for (let up = node.parent; up; up = up.parent) {
        root = up;
    }
    return root.firstChild != root.lastChild;
}

import linkifyMatrix from './linkify-matrix';
import * as linkify from 'linkifyjs';
// Module-level side effect: hands the shared linkify instance to the
// project's linkify-matrix module. Presumably this registers the
// Matrix-specific link types (room aliases, user IDs) that
// linkifyMarkdown() relies on -- confirm against ./linkify-matrix.
linkifyMatrix(linkify);

// Thieved from draft-js-export-markdown
// Prefixes every `*`, `_` and backtick in the string with a backslash.
function escapeMarkdown(s) {
    const specials = /[*_`]/g;
    return s.replace(specials, (ch) => '\\' + ch);
}

/*
 * Replace URLs, room aliases and user IDs with md-escaped versions, so
 * that markdown characters inside them do not get treated as formatting
 * when the message is parsed.
 *
 * Takes the raw message text and returns it with each link found by
 * linkify run through escapeMarkdown().
 */
function linkifyMarkdown(s) {
    let escaped = s;
    for (const link of linkify.find(s)) {
        // A string pattern only replaces the first remaining occurrence,
        // so a link that appears several times is handled once per entry
        // returned by linkify. The replacement is supplied as a function
        // on purpose: given as a plain string, sequences such as `$&` or
        // `$'` inside a URL would be expanded as replacement patterns and
        // corrupt the message.
        escaped = escaped.replace(link.value, () => escapeMarkdown(link.value));
    }
    return escaped;
}

/**
 * Class that wraps commonmark, adding the ability to see whether
 * a given message actually uses any markdown syntax or whether
 * it's plain text.
 */
export default class Markdown {
    /**
     * @param {string} input The raw message text. Links in it are
     *     markdown-escaped before the text is handed to the parser.
     */
    constructor(input) {
        this.input = linkifyMarkdown(input);

        // Parse once up front; every method below re-uses this tree.
        const parser = new commonmark.Parser();
        this.parsed = parser.parse(this.input);
    }

    /**
     * Walks the parse tree looking for anything that would need HTML to
     * be represented.
     *
     * @returns {boolean} false as soon as a node is found that is neither
     *     a plain-text node type nor a disallowed (and therefore escaped)
     *     piece of raw HTML; true if no such node exists.
     */
    isPlainText() {
        const walker = this.parsed.walker();

        let ev;
        while ( (ev = walker.next()) ) {
            const node = ev.node;
            if (TEXT_NODES.indexOf(node.type) > -1) {
                // definitely text
                continue;
            }

            const isRawHtml = node.type === 'html_inline' || node.type === 'html_block';
            // if it's an allowed html tag, we need to render it and therefore
            // we will need to use HTML. If it's not allowed, it's not HTML since
            // we'll just be treating it as text. Any other node type is
            // markdown formatting.
            if (!isRawHtml || is_allowed_html_tag(node)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Renders the parsed message to HTML.
     *
     * @returns {string} the output of the commonmark HTML renderer, with
     *     paragraph and raw-HTML handling overridden as described inline.
     */
    toHTML() {
        const renderer = new commonmark.HtmlRenderer({
            safe: false,

            // Set soft breaks to hard HTML breaks: commonmark
            // puts softbreaks in for multiple lines in a blockquote,
            // so if these are just newline characters then the
            // block quote ends up all on one line
            // (https://github.com/vector-im/riot-web/issues/3154)
            softbreak: '<br />',
        });
        const real_paragraph = renderer.paragraph;

        renderer.paragraph = function(node, entering) {
            // If there is only one top level node, just return the
            // bare text: it's a single line of text and so should be
            // 'inline', rather than unnecessarily wrapped in its own
            // p tag. If, however, we have multiple nodes, each gets
            // its own p tag to keep them as separate paragraphs.
            if (is_multi_line(node)) {
                real_paragraph.call(this, node, entering);
            }
        };

        renderer.html_inline = html_if_tag_allowed;
        renderer.html_block = function(node) {
            // as with `paragraph`, we only insert line breaks
            // if there are multiple lines in the markdown.
            const isMultiLine = is_multi_line(node);

            if (isMultiLine) this.cr();
            html_if_tag_allowed.call(this, node);
            if (isMultiLine) this.cr();
        };

        return renderer.render(this.parsed);
    }

    /**
     * Render the markdown message to plain text. That is, essentially
     * just remove any backslashes escaping what would otherwise be
     * markdown syntax
     * (to fix https://github.com/vector-im/riot-web/issues/2870)
     *
     * @returns {string} the rendered text, without entity encoding.
     */
    toPlaintext() {
        const renderer = new commonmark.HtmlRenderer({safe: false});

        // The default `out` function only sends the input through an XML
        // escaping function, which causes messages to be entity encoded,
        // which we don't want in this case.
        renderer.out = function(s) {
            // The `lit` function adds a string literal to the output buffer.
            this.lit(s);
        };

        renderer.paragraph = function(node, entering) {
            // as with toHTML, only append lines to paragraphs if there are
            // multiple paragraphs, and never after the last one so the
            // message does not end in blank lines.
            if (is_multi_line(node) && !entering && node.next) {
                this.lit('\n\n');
            }
        };
        renderer.html_block = function(node) {
            this.lit(node.literal);
            if (is_multi_line(node) && node.next) this.lit('\n\n');
        };

        return renderer.render(this.parsed);
    }
}
Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00			`/*`
			`Copyright 2016 OpenMarket Ltd`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`*/`

Replace marked with commonmark Marked has some annoying bugs, and the author is inactive, so replace it with commonmark.js, which is the reference JavaScript implementation of CommonMark. CommonMark is also preferable since it has a specification, and a conformance test suite to make sure that parsers are correct. Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2016-11-29 19:56:48 +00:00			`import commonmark from 'commonmark';`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningful parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`import escape from 'lodash/escape';`

allow sending sub,sup and whitelist them on receive Signed-off-by: Michael Telatynski <7t3chguy@gmail.com> 2017-09-13 11:04:46 +00:00			`const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00
			`// These types of node are definitely text`
			`const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document'];`

			`function is_allowed_html_tag(node) {`
			`// Regex won't work for tags with attrs, but we only`
			`// allow <del> anyway.`
			`const matches = /^<\/?(.*)>$/.exec(node.literal);`
			`if (matches && matches.length == 2) {`
			`const tag = matches[1];`
			`return ALLOWED_HTML_TAGS.indexOf(tag) > -1;`
			`}`
			`return false;`
			`}`

			`function html_if_tag_allowed(node) {`
			`if (is_allowed_html_tag(node)) {`
			`this.lit(node.literal);`
			`return;`
			`} else {`
			`this.lit(escape(node.literal));`
			`}`
			`}`

			`/*`
			`* Returns true if the parse output containing the node`
			`* comprises multiple block level elements (ie. lines),`
			`* or false if it is only a single line.`
			`*/`
			`function is_multi_line(node) {`
			`var par = node;`
			`while (par.parent) {`
			`par = par.parent;`
			`}`
			`return par.firstChild != par.lastChild;`
			`}`
Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00
MD-escape URLs/aliases/user IDs prior to parsing markdown So that MD characters in them do not result in formatting being applied. Fixes https://github.com/vector-im/riot-web/issues/3428 Fixes https://github.com/vector-im/riot-web/issues/4674 2017-08-03 17:21:08 +00:00			`import linkifyMatrix from './linkify-matrix';`
			`import * as linkify from 'linkifyjs';`
			`linkifyMatrix(linkify);`

			`// Thieved from draft-js-export-markdown`
			`function escapeMarkdown(s) {`
			return s.replace(/[*_`]/g, '\\$&');
			`}`

			`// Replace URLs, room aliases and user IDs with md-escaped URLs`
			`function linkifyMarkdown(s) {`
			`const links = linkify.find(s);`
			`links.forEach((l) => {`
			// This may replace several instances of `l.value` at once, but that's OK
			`s = s.replace(l.value, escapeMarkdown(l.value));`
			`});`
			`return s;`
			`}`

Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00			`/**`
Update the comments in Markdown.js so they don't claim it's a wrapper around marked when it's now commonmark, and comment why we render markdown to plaintext which is somewhat unintuitive. 2017-02-02 11:27:07 +00:00			`* Class that wraps commonmark, adding the ability to see whether`
Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00			`* a given message actually uses any markdown syntax or whether`
			`* it's plain text.`
			`*/`
			`export default class Markdown {`
			`constructor(input) {`
MD-escape URLs/alises/user IDs prior to parsing markdown So that MD characters in them do not result in formatting being applied. Fixes https://github.com/vector-im/riot-web/issues/3428 Fixes https://github.com/vector-im/riot-web/issues/4674 2017-08-03 17:21:08 +00:00			`this.input = linkifyMarkdown(input);`
Parse once and re-use the parsed output Rather than re-parsing the same output in each function 2017-02-02 11:34:39 +00:00
			`const parser = new commonmark.Parser();`
			`this.parsed = parser.parse(this.input);`
Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00			`}`

			`isPlainText() {`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`const walker = this.parsed.walker();`

			`let ev;`
			`while ( (ev = walker.next()) ) {`
			`const node = ev.node;`
			`if (TEXT_NODES.indexOf(node.type) > -1) {`
			`// definitely text`
			`continue;`
			`} else if (node.type == 'html_inline' \|\| node.type == 'html_block') {`
			`// if it's an allowed html tag, we need to render it and therefore`
			`// we will need to use HTML. If it's not allowed, it's not HTML since`
			`// we'll just be treating it as text.`
			`if (is_allowed_html_tag(node)) {`
			`return false;`
			`}`
			`} else {`
			`return false;`
			`}`
Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00			`}`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`return true;`
Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00			`}`

Markdown: delete remaining pre-split relics Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-19 10:55:36 +00:00			`toHTML() {`
Fix block quotes all being on a single line Fixes https://github.com/vector-im/riot-web/issues/3154 2017-02-17 18:06:00 +00:00			`const renderer = new commonmark.HtmlRenderer({`
			`safe: false,`

			`// Set soft breaks to hard HTML breaks: commonmark`
			`// puts softbreaks in for multiple lines in a blockquote,`
			`// so if these are just newline characters then the`
			`// block quote ends up all on one line`
			`// (https://github.com/vector-im/riot-web/issues/3154)`
			`softbreak: '<br />',`
			`});`
Make ourselves a new renderer each time Rather than keeping one in memory, abusing it in different ways each time and then carefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). 2017-02-02 11:45:21 +00:00			`const real_paragraph = renderer.paragraph;`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00
Make ourselves a new rendered each time Rather than keeping one in memory, abusing it in different ways each time and then craefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). 2017-02-02 11:45:21 +00:00			`renderer.paragraph = function(node, entering) {`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00			`// If there is only one top level node, just return the`
			`// bare text: it's a single line of text and so should be`
			`// 'inline', rather than unnecessarily wrapped in its own`
			`// p tag. If, however, we have multiple nodes, each gets`
			`// its own p tag to keep them as separate paragraphs.`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`if (is_multi_line(node)) {`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00			`real_paragraph.call(this, node, entering);`
Markdown: Don't XML escape the output when not HTML Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-17 21:20:05 +00:00			`}`
Fix a bunch of linting errors eslint --fix and a few manual ones 2017-01-20 14:22:27 +00:00			`};`
Markdown: Don't XML escape the output when not HTML Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-17 21:20:05 +00:00
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`renderer.html_inline = html_if_tag_allowed;`
			`renderer.html_block = function(node) {`
			// as with `paragraph`, we only insert line breaks
			`// if there are multiple lines in the markdown.`
			`const isMultiLine = is_multi_line(node);`

			`if (isMultiLine) this.cr();`
			`html_if_tag_allowed.call(this, node);`
			`if (isMultiLine) this.cr();`
			`}`

Make ourselves a new rendered each time Rather than keeping one in memory, abusing it in different ways each time and then craefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). 2017-02-02 11:45:21 +00:00			`return renderer.render(this.parsed);`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00			`}`

Update the comments in Markdown.js so they don't claim it;s a wrapper around marked when it's now commonmark, and comment why we render markdown to plaintext which is somewhat unintuitive. 2017-02-02 11:27:07 +00:00			`/*`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`* Render the markdown message to plain text. That is, essentially`
Update the comments in Markdown.js so they don't claim it;s a wrapper around marked when it's now commonmark, and comment why we render markdown to plaintext which is somewhat unintuitive. 2017-02-02 11:27:07 +00:00			`* just remove any backslashes escaping what would otherwise be`
			`* markdown syntax`
			`* (to fix https://github.com/vector-im/riot-web/issues/2870)`
			`*/`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00			`toPlaintext() {`
Make ourselves a new rendered each time Rather than keeping one in memory, abusing it in different ways each time and then craefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). 2017-02-02 11:45:21 +00:00			`const renderer = new commonmark.HtmlRenderer({safe: false});`
			`const real_paragraph = renderer.paragraph;`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00
			// The default `out` function only sends the input through an XML
			`// escaping function, which causes messages to be entity encoded,`
			`// which we don't want in this case.`
Make ourselves a new rendered each time Rather than keeping one in memory, abusing it in different ways each time and then craefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). 2017-02-02 11:45:21 +00:00			`renderer.out = function(s) {`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00			// The `lit` function adds a string literal to the output buffer.
			`this.lit(s);`
Fix a bunch of linting errors eslint --fix and a few manual ones 2017-01-20 14:22:27 +00:00			`};`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00
Make ourselves a new rendered each time Rather than keeping one in memory, abusing it in different ways each time and then craefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). 2017-02-02 11:45:21 +00:00			`renderer.paragraph = function(node, entering) {`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`// as with toHTML, only append lines to paragraphs if there are`
			`// multiple paragraphs`
			`if (is_multi_line(node)) {`
			`if (!entering && node.next) {`
Markdown: Split up render function into toHTML/toPlaintext Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2017-01-18 18:29:11 +00:00			`this.lit('\n\n');`
Fix escaping markdown by rendering plaintext We still need to parse "plaintext" messages through the markdown renderer so that escappes are rendered properly. Fixes vector-im/riot-web#2870. Signed-off-by: Johannes Löthberg <johannes@kyriasis.com> 2016-12-02 18:58:35 +00:00			`}`
Better logic for wrapping in p tags or not 2016-09-22 17:57:46 +00:00			`}`
Fix a bunch of linting errors eslint --fix and a few manual ones 2017-01-20 14:22:27 +00:00			`};`
Fix spurious html tags like <shrug> * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '<shrug>' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 2017-02-02 14:17:07 +00:00			`renderer.html_block = function(node) {`
			`this.lit(node.literal);`
			`if (is_multi_line(node) && node.next) this.lit('\n\n');`
			`}`
Better logic for wrapping in p tags or not 2016-09-22 17:57:46 +00:00
Make ourselves a new rendered each time Rather than keeping one in memory, abusing it in different ways each time and then craefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). 2017-02-02 11:45:21 +00:00			`return renderer.render(this.parsed);`
Better detection of when input contains markdown 2016-09-22 16:18:12 +00:00			`}`
			`}`