From d055dbe522e271d598ba98dabdfd069ea0f0a016 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Wed, 10 Feb 2016 20:25:32 +0000 Subject: [PATCH 1/3] use sanitize-html's textFilter callback to only apply highlights to textNodes when highlighting HTML. fixes https://github.com/vector-im/vector-web/issues/294 --- src/HtmlUtils.js | 62 ++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/src/HtmlUtils.js b/src/HtmlUtils.js index 87f5bd2cca..b90cab5d72 100644 --- a/src/HtmlUtils.js +++ b/src/HtmlUtils.js @@ -17,6 +17,7 @@ limitations under the License. 'use strict'; var React = require('react'); +var ReactDOMServer = require('react-dom/server') var sanitizeHtml = require('sanitize-html'); var highlight = require('highlight.js'); @@ -57,24 +58,17 @@ class Highlighter { this._key = 0; } - applyHighlights(safeSnippet, highlights) { + applyHighlights(safeSnippet, safeHighlights) { var lastOffset = 0; var offset; var nodes = []; - // XXX: when highlighting HTML, synapse performs the search on the plaintext body, - // but we're attempting to apply the highlights here to the HTML body. This is - // never going to end well - we really should be hooking into the sanitzer HTML - // parser to only attempt to highlight text nodes to avoid corrupting tags. - // If and when this happens, we'll probably have to split this method in two between - // HTML and plain-text highlighting. - - var safeHighlight = this.html ? sanitizeHtml(highlights[0], sanitizeHtmlParams) : highlights[0]; + var safeHighlight = safeHighlights[0]; while ((offset = safeSnippet.toLowerCase().indexOf(safeHighlight.toLowerCase(), lastOffset)) >= 0) { // handle preamble if (offset > lastOffset) { var subSnippet = safeSnippet.substring(lastOffset, offset); - nodes = nodes.concat(this._applySubHighlights(subSnippet, highlights)); + nodes = nodes.concat(this._applySubHighlights(subSnippet, safeHighlights)); } // do highlight @@ -86,15 +80,15 @@ class Highlighter { // handle postamble if (lastOffset != safeSnippet.length) { var subSnippet = safeSnippet.substring(lastOffset, undefined); - nodes = nodes.concat(this._applySubHighlights(subSnippet, highlights)); + nodes = nodes.concat(this._applySubHighlights(subSnippet, safeHighlights)); } return nodes; } - _applySubHighlights(safeSnippet, highlights) { - if (highlights[1]) { + _applySubHighlights(safeSnippet, safeHighlights) { + if (safeHighlights[1]) { // recurse into this range to check for the next set of highlight matches - return this.applyHighlights(safeSnippet, highlights.slice(1)); + return this.applyHighlights(safeSnippet, safeHighlights.slice(1)); } else { // no more highlights to be found, just return the unhighlighted string @@ -132,7 +126,7 @@ module.exports = { * * content: 'content' of the MatrixEvent * - * highlights: optional list of words to highlight + * highlights: optional list of words to highlight, ordered by longest word first * * opts.onHighlightClick: optional callback function to be called when a * highlighted word is clicked @@ -142,28 +136,38 @@ module.exports = { var isHtml = (content.format === "org.matrix.custom.html"); - var safeBody; + var safeBody, body; if (isHtml) { + // XXX: We sanitize the HTML whilst also highlighting its text nodes, to avoid accidentally trying + // to highlight HTML tags themselves. However, this does mean that we don't highlight textnodes which + // are interrupted by HTML tags (not that we did before) - e.g. foobar won't get highlighted + // by an attempt to search for 'foobar'. Then again, the search query probably wouldn't work either + if (highlights && highlights.length > 0) { + var highlighter = new Highlighter(isHtml, "mx_EventTile_searchHighlight", opts.onHighlightClick); + // XXX: hacky bodge to temporarily apply a textFilter to the sanitizeHtmlParams structure. + sanitizeHtmlParams.textFilter = function(safeText) { + var html = highlighter.applyHighlights(safeText, highlights).map(function(span) { + // XXX: rather clunky conversion from the react nodes returned by applyHighlights + // (which need to be nodes for the non-html highlighting case), to convert them + // back into raw HTML given that's what sanitize-html works in terms of. + return ReactDOMServer.renderToString(span); + }).join(''); + return html; + }; + } safeBody = sanitizeHtml(content.formatted_body, sanitizeHtmlParams); + delete sanitizeHtmlParams.textFilter; + return ; } else { safeBody = content.body; - } - - var body; - if (highlights && highlights.length > 0) { - var highlighter = new Highlighter(isHtml, "mx_EventTile_searchHighlight", opts.onHighlightClick); - body = highlighter.applyHighlights(safeBody, highlights); - } - else { - if (isHtml) { - body = ; + if (highlights && highlights.length > 0) { + var highlighter = new Highlighter(isHtml, "mx_EventTile_searchHighlight", opts.onHighlightClick); + return highlighter.applyHighlights(safeBody, highlights); } else { - body = safeBody; + return safeBody; } } - - return body; }, highlightDom: function(element) { From 92435c0865a0af7e31da1a7a4395ba355b42bdad Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Wed, 10 Feb 2016 23:45:07 +0000 Subject: [PATCH 2/3] ooops, don't forget to actually sanitize the highlights after all that --- src/HtmlUtils.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/HtmlUtils.js b/src/HtmlUtils.js index b90cab5d72..f13295f24a 100644 --- a/src/HtmlUtils.js +++ b/src/HtmlUtils.js @@ -144,15 +144,17 @@ module.exports = { // by an attempt to search for 'foobar'. Then again, the search query probably wouldn't work either if (highlights && highlights.length > 0) { var highlighter = new Highlighter(isHtml, "mx_EventTile_searchHighlight", opts.onHighlightClick); + var safeHighlights = highlights.map(function(highlight) { + return sanitizeHtml(highlight, sanitizeHtmlParams); + }); // XXX: hacky bodge to temporarily apply a textFilter to the sanitizeHtmlParams structure. sanitizeHtmlParams.textFilter = function(safeText) { - var html = highlighter.applyHighlights(safeText, highlights).map(function(span) { + return highlighter.applyHighlights(safeText, safeHighlights).map(function(span) { // XXX: rather clunky conversion from the react nodes returned by applyHighlights // (which need to be nodes for the non-html highlighting case), to convert them // back into raw HTML given that's what sanitize-html works in terms of. return ReactDOMServer.renderToString(span); }).join(''); - return html; }; } safeBody = sanitizeHtml(content.formatted_body, sanitizeHtmlParams); From 1c30640a92d91371afe00e3fe89f690317db3f14 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 11 Feb 2016 14:03:48 +0000 Subject: [PATCH 3/3] remove unused 'body' var; use a `finally` to clean up the temporary textfilter --- src/HtmlUtils.js | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/HtmlUtils.js b/src/HtmlUtils.js index f13295f24a..0b7f17b2b2 100644 --- a/src/HtmlUtils.js +++ b/src/HtmlUtils.js @@ -136,29 +136,33 @@ module.exports = { var isHtml = (content.format === "org.matrix.custom.html"); - var safeBody, body; + var safeBody; if (isHtml) { // XXX: We sanitize the HTML whilst also highlighting its text nodes, to avoid accidentally trying // to highlight HTML tags themselves. However, this does mean that we don't highlight textnodes which // are interrupted by HTML tags (not that we did before) - e.g. foobar won't get highlighted // by an attempt to search for 'foobar'. Then again, the search query probably wouldn't work either - if (highlights && highlights.length > 0) { - var highlighter = new Highlighter(isHtml, "mx_EventTile_searchHighlight", opts.onHighlightClick); - var safeHighlights = highlights.map(function(highlight) { - return sanitizeHtml(highlight, sanitizeHtmlParams); - }); - // XXX: hacky bodge to temporarily apply a textFilter to the sanitizeHtmlParams structure. - sanitizeHtmlParams.textFilter = function(safeText) { - return highlighter.applyHighlights(safeText, safeHighlights).map(function(span) { - // XXX: rather clunky conversion from the react nodes returned by applyHighlights - // (which need to be nodes for the non-html highlighting case), to convert them - // back into raw HTML given that's what sanitize-html works in terms of. - return ReactDOMServer.renderToString(span); - }).join(''); - }; + try { + if (highlights && highlights.length > 0) { + var highlighter = new Highlighter(isHtml, "mx_EventTile_searchHighlight", opts.onHighlightClick); + var safeHighlights = highlights.map(function(highlight) { + return sanitizeHtml(highlight, sanitizeHtmlParams); + }); + // XXX: hacky bodge to temporarily apply a textFilter to the sanitizeHtmlParams structure. + sanitizeHtmlParams.textFilter = function(safeText) { + return highlighter.applyHighlights(safeText, safeHighlights).map(function(span) { + // XXX: rather clunky conversion from the react nodes returned by applyHighlights + // (which need to be nodes for the non-html highlighting case), to convert them + // back into raw HTML given that's what sanitize-html works in terms of. + return ReactDOMServer.renderToString(span); + }).join(''); + }; + } + safeBody = sanitizeHtml(content.formatted_body, sanitizeHtmlParams); + } + finally { + delete sanitizeHtmlParams.textFilter; } - safeBody = sanitizeHtml(content.formatted_body, sanitizeHtmlParams); - delete sanitizeHtmlParams.textFilter; return ; } else { safeBody = content.body;