From c61eca8c246c3f52e91c4f7e6d77710db41413f3 Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 4 Jul 2024 13:48:07 -0400 Subject: [PATCH] Don't consider textual characters to be emoji (#12582) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Don't consider textual characters to be emoji We were using emojibase-regex to match emoji within messages. However, the docs (https://emojibase.dev/docs/regex/) state that this regex matches both emoji and text presentation characters. This is not what we want, and will result in false positives for characters like '↔' that could turn into an emoji if paired with a variation selector. Unfortunately, none of the other regexes provided by Emojibase do what we want either (https://github.com/milesj/emojibase/issues/174). In the meantime, browser support for the RGI_Emoji character sequence class has made it feasible to write an emoji regex by hand, so that's what I've done. * Add a fallback for BIGEMOJI_REGEX as well --- .eslintrc.js | 10 +++++ src/HtmlUtils.tsx | 34 ++++++++++++--- .../views/rooms/SendMessageComposer.tsx | 2 +- src/editor/parts.ts | 11 +++-- test/HtmlUtils-test.tsx | 12 ++++++ test/__snapshots__/HtmlUtils-test.tsx.snap | 41 +++++++++++++++++++ 6 files changed, 98 insertions(+), 12 deletions(-) diff --git a/.eslintrc.js b/.eslintrc.js index 0f1335b586..c6251ef722 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -78,6 +78,11 @@ module.exports = { name: "matrix-react-sdk/", message: "Please use matrix-react-sdk/src/index instead", }, + { + name: "emojibase-regex", + message: + "This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", + }, ], patterns: [ { @@ -141,6 +146,11 @@ module.exports = { ], message: "Please use matrix-js-sdk/src/matrix instead", }, + { + group: ["emojibase-regex/emoji*"], + message: + "This regex doesn't actually test for emoji. 
See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", + }, ], }, ], diff --git a/src/HtmlUtils.tsx b/src/HtmlUtils.tsx index d8c154440b..888c30d76c 100644 --- a/src/HtmlUtils.tsx +++ b/src/HtmlUtils.tsx @@ -20,7 +20,6 @@ limitations under the License. import React, { LegacyRef, ReactNode } from "react"; import sanitizeHtml from "sanitize-html"; import classNames from "classnames"; -import EMOJIBASE_REGEX from "emojibase-regex"; import katex from "katex"; import { decode } from "html-entities"; import { IContent } from "matrix-js-sdk/src/matrix"; @@ -46,10 +45,35 @@ const SURROGATE_PAIR_PATTERN = /([\ud800-\udbff])([\udc00-\udfff])/; const SYMBOL_PATTERN = /([\u2100-\u2bff])/; // Regex pattern for non-emoji characters that can appear in an "all-emoji" message -// (Zero-Width Joiner, Zero-Width Space, Emoji presentation character, other whitespace) -const EMOJI_SEPARATOR_REGEX = /[\u200D\u200B\s]|\uFE0F/g; +// (Zero-Width Space, other whitespace) +const EMOJI_SEPARATOR_REGEX = /[\u200B\s]/g; -const BIGEMOJI_REGEX = new RegExp(`^(${EMOJIBASE_REGEX.source})+$`, "i"); +// Regex for emoji. This includes any RGI_Emoji sequence followed by an optional +// emoji presentation VS (U+FE0F), but not those sequences that are followed by +// a text presentation VS (U+FE0E). We also count lone regional indicators +// (U+1F1E6-U+1F1FF). Technically this regex produces false negatives for emoji +// followed by U+FE0E when the emoji doesn't have a text variant, but in +// practice this doesn't matter. +export const EMOJI_REGEX = (() => { + try { + // Per our support policy, v mode is available to us, but we still don't + // want the app to completely crash on older platforms. We use the + // constructor here to avoid a syntax error on such platforms. + return new RegExp("\\p{RGI_Emoji}(?!\\uFE0E)(?:(? 
{ + try { + return new RegExp(`^(${EMOJI_REGEX.source})+$`, "iv"); + } catch (_e) { + // Fall back, just like for EMOJI_REGEX + return /(?!)/; + } +})(); /* * Return true if the given string contains emoji @@ -266,7 +290,7 @@ export function formatEmojis(message: string | undefined, isHtmlMessage?: boolea let key = 0; for (const data of graphemeSegmenter.segment(message)) { - if (EMOJIBASE_REGEX.test(data.segment)) { + if (EMOJI_REGEX.test(data.segment)) { if (text) { result.push(text); text = ""; diff --git a/src/components/views/rooms/SendMessageComposer.tsx b/src/components/views/rooms/SendMessageComposer.tsx index 0ea0bdf94c..c5972ee86a 100644 --- a/src/components/views/rooms/SendMessageComposer.tsx +++ b/src/components/views/rooms/SendMessageComposer.tsx @@ -15,7 +15,6 @@ limitations under the License. */ import React, { createRef, KeyboardEvent, SyntheticEvent } from "react"; -import EMOJI_REGEX from "emojibase-regex"; import { IContent, MatrixEvent, @@ -70,6 +69,7 @@ import { doMaybeLocalRoomAction } from "../../../utils/local-room"; import { Caret } from "../../../editor/caret"; import { IDiff } from "../../../editor/diff"; import { getBlobSafeMimeType } from "../../../utils/blobs"; +import { EMOJI_REGEX } from "../../../HtmlUtils"; /** * Build the mentions information based on the editor model (and any related events): diff --git a/src/editor/parts.ts b/src/editor/parts.ts index 3f482357d1..2b732a6dd1 100644 --- a/src/editor/parts.ts +++ b/src/editor/parts.ts @@ -15,11 +15,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -import EMOJIBASE_REGEX from "emojibase-regex"; import { MatrixClient, RoomMember, Room } from "matrix-js-sdk/src/matrix"; import AutocompleteWrapperModel, { GetAutocompleterComponent, UpdateCallback, UpdateQuery } from "./autocomplete"; -import { unicodeToShortcode } from "../HtmlUtils"; +import { EMOJI_REGEX, unicodeToShortcode } from "../HtmlUtils"; import * as Avatar from "../Avatar"; import defaultDispatcher from "../dispatcher/dispatcher"; import { Action } from "../dispatcher/actions"; @@ -197,7 +196,7 @@ abstract class BasePart { abstract class PlainBasePart extends BasePart { protected acceptsInsertion(chr: string, offset: number, inputType: string): boolean { - if (chr === "\n" || EMOJIBASE_REGEX.test(chr)) { + if (chr === "\n" || EMOJI_REGEX.test(chr)) { return false; } // when not pasting or dropping text, reject characters that should start a pill candidate @@ -375,7 +374,7 @@ class NewlinePart extends BasePart implements IBasePart { export class EmojiPart extends BasePart implements IBasePart { protected acceptsInsertion(chr: string, offset: number): boolean { - return EMOJIBASE_REGEX.test(chr); + return EMOJI_REGEX.test(chr); } protected acceptsRemoval(position: number, chr: string): boolean { @@ -573,7 +572,7 @@ export class PartCreator { case "\n": return new NewlinePart(); default: - if (EMOJIBASE_REGEX.test(getFirstGrapheme(input))) { + if (EMOJI_REGEX.test(getFirstGrapheme(input))) { return new EmojiPart(); } return new PlainPart(); @@ -650,7 +649,7 @@ export class PartCreator { let plainText = ""; for (const data of graphemeSegmenter.segment(text)) { - if (EMOJIBASE_REGEX.test(data.segment)) { + if (EMOJI_REGEX.test(data.segment)) { if (plainText) { parts.push(this.plain(plainText)); plainText = ""; diff --git a/test/HtmlUtils-test.tsx b/test/HtmlUtils-test.tsx index d9e75faaa9..ae12a71780 100644 --- a/test/HtmlUtils-test.tsx +++ b/test/HtmlUtils-test.tsx @@ -107,6 +107,12 @@ describe("bodyToHtml", () => { 
expect(html).toMatchInlineSnapshot(`"test foo <b>bar"`); }); + it("generates big emoji for emoji made of multiple characters", () => { + const { asFragment } = render(bodyToHtml({ body: "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ โ†”๏ธ ๐Ÿ‡ฎ๐Ÿ‡ธ", msgtype: "m.text" }, [], {}) as ReactElement); + + expect(asFragment()).toMatchSnapshot(); + }); + it("should generate big emoji for an emoji-only reply to a message", () => { const { asFragment } = render( bodyToHtml( @@ -132,6 +138,12 @@ describe("bodyToHtml", () => { expect(asFragment()).toMatchSnapshot(); }); + it("does not mistake characters in text presentation mode for emoji", () => { + const { asFragment } = render(bodyToHtml({ body: "โ†” โ—๏ธŽ", msgtype: "m.text" }, [], {}) as ReactElement); + + expect(asFragment()).toMatchSnapshot(); + }); + describe("feature_latex_maths", () => { beforeEach(() => { jest.spyOn(SettingsStore, "getValue").mockImplementation((feature) => feature === "feature_latex_maths"); diff --git a/test/__snapshots__/HtmlUtils-test.tsx.snap b/test/__snapshots__/HtmlUtils-test.tsx.snap index c33cc46433..c69eaa7d95 100644 --- a/test/__snapshots__/HtmlUtils-test.tsx.snap +++ b/test/__snapshots__/HtmlUtils-test.tsx.snap @@ -1,5 +1,16 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`bodyToHtml does not mistake characters in text presentation mode for emoji 1`] = ` + + + โ†” โ—๏ธŽ + + +`; + exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"

hello

$\\xi$

world

"`; exports[`bodyToHtml feature_latex_maths should not mangle divs 1`] = `"

hello

world
"`; @@ -8,6 +19,36 @@ exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"

hel exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello ฮพ\\xi world"`; +exports[`bodyToHtml generates big emoji for emoji made of multiple characters 1`] = ` + + + + ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ + + + + โ†”๏ธ + + + + ๐Ÿ‡ฎ๐Ÿ‡ธ + + + +`; + exports[`bodyToHtml should generate big emoji for an emoji-only reply to a message 1`] = `