/*
Copyright 2024 New Vector Ltd.
Copyright 2021 The Matrix.org Foundation C.I.C.
Copyright 2016 OpenMarket Ltd
SPDX-License-Identifier: AGPL-3.0-only OR GPL-3.0-only
Please see LICENSE files in the repository root for full details.
*/
import "./@types/commonmark"; // import better types than @types/commonmark
import * as commonmark from "commonmark";
import { escape } from "lodash";
import { logger } from "matrix-js-sdk/src/logger";
import { linkify } from "./linkify-matrix";
// Names of the HTML tags that isAllowedHtmlTag() accepts. The name is compared
// after an optional leading "/" (closing tag) has been dropped, so the
// self-closing spelling "br/" has to be listed as its own entry.
const ALLOWED_HTML_TAGS = ["sub", "sup", "del", "s", "u", "br", "br/"];
// These types of node are definitely text
const TEXT_NODES = ["text", "softbreak", "linebreak", "paragraph", "document"];
/**
 * Tells whether a raw HTML node holds one of the few tags we let through.
 *
 * Accepted are the opening/closing `div`/`span` wrappers carrying a
 * `data-mx-maths` attribute, and any bare tag listed in ALLOWED_HTML_TAGS.
 *
 * @param node - the node whose `literal` is inspected
 * @returns true if the literal is exactly one allowed tag, false otherwise
 *     (including when the node has no literal)
 */
function isAllowedHtmlTag(node: commonmark.Node): boolean {
    const html = node.literal;
    if (!html) return false;

    // Maths wrappers are the only tags allowed to carry an attribute
    const mathsWrapper = '^<((div|span) data-mx-maths="[^"]*"|/(div|span))>$';
    if (html.match(mathsWrapper) !== null) return true;

    // Regex won't work for tags with attrs, but the tags we allow
    // shouldn't really have any anyway.
    const tagMatch = /^<\/?(.*)>$/.exec(html);
    return tagMatch !== null && tagMatch.length === 2 && ALLOWED_HTML_TAGS.includes(tagMatch[1]);
}
/*
 * Climbs from the given node to the root of its tree and reports whether
 * that root has more than one child, i.e. whether the parse output consists
 * of several block level elements (lines) rather than a single one.
 */
function isMultiLine(node: commonmark.Node): boolean {
    let root = node;
    for (let ancestor = node.parent; ancestor; ancestor = ancestor.parent) {
        root = ancestor;
    }
    // A single child (or none at all) means first and last are the same value
    return root.firstChild != root.lastChild;
}
/**
 * Collects the text that continues the current "word" after a node.
 *
 * Starting at `node` itself, walks forward through the sibling chain (`next`)
 * and concatenates the literals of the text nodes it meets. Collection stops
 * at the first space character, at a softbreak/linebreak node, or when the
 * siblings run out. Nodes of any other type contribute nothing and their
 * children are not visited.
 *
 * @param node - the node to start from (inclusive)
 * @returns the concatenated text, without the terminating space
 */
function getTextUntilEndOrLinebreak(node: commonmark.Node): string {
    let text = "";
    for (let current: commonmark.Node | null = node; current; current = current.next) {
        if (current.type === "softbreak" || current.type === "linebreak") {
            break;
        }
        if (current.type !== "text" || !current.literal) {
            continue;
        }
        const spaceIndex = current.literal.indexOf(" ");
        if (spaceIndex === -1) {
            // No space in this node: the word may carry on into the next sibling
            text += current.literal;
        } else {
            // The word ends here; keep only what precedes the space
            text += current.literal.slice(0, spaceIndex);
            break;
        }
    }
    return text;
}
// Delimiter that repairLinks() writes on both sides of a node's inner text
// when it turns an emphasis node back into plain text, keyed by node type.
const formattingChangesByNodeType = {
emph: "_",
strong: "__",
};
/**
 * Returns the concatenated literals of every text node found while walking
 * the given node and all of its descendants.
 *
 * Only nodes of type "text" contribute; each one is counted when the walker
 * enters it.
 */
const innerNodeLiteral = (node: commonmark.Node): string => {
    const pieces: string[] = [];
    const walker = node.walker();
    for (let step = walker.next(); step; step = walker.next()) {
        const { entering, node: visited } = step;
        if (entering && visited.type === "text" && visited.literal) {
            pieces.push(visited.literal);
        }
    }
    return pieces.join("");
};
/**
 * True when the node stands completely alone: it has neither a previous nor
 * a next sibling, and no children of its own.
 */
const emptyItemWithNoSiblings = (node: commonmark.Node): boolean => {
    const hasSibling = Boolean(node.prev || node.next);
    const hasContent = Boolean(node.firstChild);
    return !hasSibling && !hasContent;
};
/**
* Class that wraps commonmark, adding the ability to see whether
* a given message actually uses any markdown syntax or whether
* it's plain text.
*/
export default class Markdown {
private input: string;
private parsed: commonmark.Node;
/**
 * Parses `input` with commonmark and stores the resulting AST after it has
 * been passed through repairLinks().
 * @param input - the text to parse
 */
public constructor(input: string) {
    this.input = input;
    const ast = new commonmark.Parser().parse(input);
    this.parsed = this.repairLinks(ast);
}
/**
 * Rewrites the parsed AST so that underscores inside a link are not treated
 * as emphasis markers. Without this, a link such as
 *     https://my_weird-link_domain.domain.com
 * would have `_weird-link_` parsed as emphasis and come out as
 *     https://myweird-linkdomain.domain.com
 * When an emph/strong node directly follows a text node, and the text gathered
 * so far contains a link, the node's inner text is re-inserted as plain text
 * wrapped in `_` / `__` and the formatting node is removed from the tree.
 * See: https://github.com/vector-im/element-web/issues/4674
 * @param parsed - root node of the parsed document; its tree is modified in place
 * @returns the same `parsed` node that was passed in
 */
private repairLinks(parsed: commonmark.Node): commonmark.Node {
const walker = parsed.walker();
let event: commonmark.NodeWalkingStep | null = null;
// Text gathered since the last space or line break; this is what gets searched for links
let text = "";
let isInPara = false;
// Node handled by the previous loop iteration (iterations that `continue` below do not update it)
let previousNode: commonmark.Node | null = null;
// Set once a formatting node has been rewritten, so that it is unlinked when the walker leaves it
let shouldUnlinkFormattingNode = false;
while ((event = walker.next())) {
const { node } = event;
// Track whether we are between a paragraph's enter and exit events
if (node.type === "paragraph") {
if (event.entering) {
isInPara = true;
} else {
isInPara = false;
}
}
if (isInPara) {
// Clear saved string when line ends
if (
node.type === "softbreak" ||
node.type === "linebreak" ||
// Also start calculating the text from the beginning on any spaces
(node.type === "text" && node.literal === " ")
) {
text = "";
// Note: this also skips the `previousNode = node` assignment at the bottom of the loop
continue;
}
// Break up text nodes on spaces, so that we don't shoot past them without resetting
if (node.type === "text" && node.literal) {
// The capture group in the pattern makes split() keep every space as its own array element
const [thisPart, ...nextParts] = node.literal.split(/( )/);
node.literal = thisPart;
text += thisPart;
// Add the remaining parts as siblings
// They are inserted in reverse, each directly after `node`, so they end up in their original
// order and the last resumeAt() call targets the part that immediately follows `node`
nextParts.reverse().forEach((part) => {
if (part) {
const nextNode = new commonmark.Node("text");
nextNode.literal = part;
node.insertAfter(nextNode);
// Make the iterator aware of the newly inserted node
walker.resumeAt(nextNode, true);
}
});
}
// We should not do this if previous node was not a textnode, as we can't combine it then.
if ((node.type === "emph" || node.type === "strong") && previousNode?.type === "text") {
if (event.entering) {
const foundLinks = linkify.find(text);
for (const { value } of foundLinks) {
// Once a rewrite below has blanked the first child's literal, this guard skips any further links
if (node?.firstChild?.literal) {
/**
* NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
* but this solution seems to work well and is hopefully slightly easier to understand too
*/
const format = formattingChangesByNodeType[node.type];
const nonEmphasizedText = `${format}${innerNodeLiteral(node)}${format}`;
const f = getTextUntilEndOrLinebreak(node);
// Candidate link: the link found so far + the de-emphasised text + the rest of the word
const newText = value + nonEmphasizedText + f;
const newLinks = linkify.find(newText);
// Should always find only one link here, if it finds more it means that the algorithm is broken
if (newLinks.length === 1) {
const emphasisTextNode = new commonmark.Node("text");
emphasisTextNode.literal = nonEmphasizedText;
previousNode.insertAfter(emphasisTextNode);
node.firstChild.literal = "";
// Replaces the loop's `event` with the first step of a fresh walker rooted at this node
// NOTE(review): presumably that step is the enter event for `node` itself - confirm against commonmark's NodeWalker
event = node.walker().next();
if (event) {
// Remove `em` opening and closing nodes
node.unlink();
previousNode.insertAfter(event.node);
shouldUnlinkFormattingNode = true;
}
} else {
logger.error(
"Markdown links escaping found too many links for following text: ",
text,
);
logger.error(
"Markdown links escaping found too many links for modified text: ",
newText,
);
}
}
}
} else {
// Leaving the formatting node: drop it from the tree if it was rewritten on entry
if (shouldUnlinkFormattingNode) {
node.unlink();
shouldUnlinkFormattingNode = false;
}
}
}
}
previousNode = node;
}
return parsed;
}
/**
 * Reports whether the parsed input is plain text, i.e. whether nothing in
 * it needs to be rendered as HTML.
 *
 * @returns false as soon as a node is found that is a real list, an allowed
 *     HTML tag, or any other non-text node type; true otherwise
 */
public isPlainText(): boolean {
    const walker = this.parsed.walker();
    for (let step = walker.next(); step; step = walker.next()) {
        const { node } = step;
        if (TEXT_NODES.includes(node.type)) {
            // definitely text
            continue;
        }
        switch (node.type) {
            // Special handling for inputs like `+`, `*`, `-` and `2021.` which
            // would otherwise be treated as a list of a single empty item.
            // See https://github.com/vector-im/element-web/issues/7631
            case "list":
                // A list with a single empty item is treated as plain text.
                if (node.firstChild && emptyItemWithNoSiblings(node.firstChild)) break;
                // Everything else is actual lists and therefore not plaintext.
                return false;
            case "item":
                // An empty list item with no sibling items is treated as plain text.
                if (emptyItemWithNoSiblings(node)) break;
                return false;
            case "html_inline":
            case "html_block":
                // if it's an allowed html tag, we need to render it and therefore
                // we will need to use HTML. If it's not allowed, it's not HTML since
                // we'll just be treating it as text.
                if (isAllowedHtmlTag(node)) return false;
                break;
            default:
                return false;
        }
    }
    return true;
}
public toHTML({ externalLinks = false } = {}): string {
const renderer = new commonmark.HtmlRenderer({
safe: false,
// Set soft breaks to hard HTML breaks: commonmark
// puts softbreaks in for multiple lines in a blockquote,
// so if these are just newline characters then the
// block quote ends up all on one line
// (https://github.com/vector-im/element-web/issues/3154)
softbreak: "
",
});
// Trying to strip out the wrapping