diff --git a/package-lock.json b/package-lock.json index cd82af0..675f276 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,12 +10,15 @@ "license": "SEE LICENSE FILE", "dependencies": { "abort-controller": "^3.0.0", + "commonmark": "^0.31.2", "fake-indexeddb": "^6.2.5", "fluent-ffmpeg": "^2.1.2", "fs-extra": "^11.1.0", "got": "^12.0.2", "image-size": "^1.0.2", "isomorphic-webcrypto": "^2.3.8", + "linkifyjs": "^4.3.3", + "lodash.escape": "^4.0.1", "matrix-js-sdk": "^41.5.0", "mime": "^3.0.0", "node-fetch": "^3.3.0", @@ -7314,6 +7317,23 @@ "optional": true, "peer": true }, + "node_modules/commonmark": { + "version": "0.31.2", + "resolved": "https://registry.npmjs.org/commonmark/-/commonmark-0.31.2.tgz", + "integrity": "sha512-2fRLTyb9r/2835k5cwcAwOj0DEc44FARnMp5veGsJ+mEAZdi52sNopLu07ZyElQUz058H43whzlERDIaaSw4rg==", + "license": "BSD-2-Clause", + "dependencies": { + "entities": "~3.0.1", + "mdurl": "~1.0.1", + "minimist": "~1.2.8" + }, + "bin": { + "commonmark": "bin/commonmark" + }, + "engines": { + "node": "*" + } + }, "node_modules/compare-versions": { "version": "3.6.0", "resolved": "https://registry.npmjs.org/compare-versions/-/compare-versions-3.6.0.tgz", @@ -7880,6 +7900,18 @@ "once": "^1.4.0" } }, + "node_modules/entities": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-3.0.1.tgz", + "integrity": "sha512-WiyBqoomrwMdFG1e0kqvASYfnlb0lp8M5o5Fw2OFq1hNZxxcNk8Ik0Xm7LxzBhuidnZB/UtBqVCgUz3kBOP51Q==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/env-editor": { "version": "0.4.2", "resolved": "https://registry.npmjs.org/env-editor/-/env-editor-0.4.2.tgz", @@ -10350,6 +10382,12 @@ "optional": true, "peer": true }, + "node_modules/linkifyjs": { + "version": "4.3.3", + "resolved": "https://registry.npmjs.org/linkifyjs/-/linkifyjs-4.3.3.tgz", + "integrity": 
"sha512-P8aEP5U/D1/IlTY2OeYsErdwh9bGuLE30NcXtKEjgdHcahveQoQwM2yZNsioQHsWFz0P7KKudisbrzCgR0sDHg==", + "license": "MIT" + }, "node_modules/locate-path": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", @@ -10379,6 +10417,12 @@ "optional": true, "peer": true }, + "node_modules/lodash.escape": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.escape/-/lodash.escape-4.0.1.tgz", + "integrity": "sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==", + "license": "MIT" + }, "node_modules/lodash.throttle": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/lodash.throttle/-/lodash.throttle-4.1.1.tgz", @@ -10794,6 +10838,12 @@ "optional": true, "peer": true }, + "node_modules/mdurl": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/mdurl/-/mdurl-1.0.1.tgz", + "integrity": "sha512-/sKlQJCBYVY9Ers9hqzKou4H6V5UWc/M59TH2dvkt+84itfnq7uFOMLpOiOS4ujvHP4etln18fmIxA5R5fll0g==", + "license": "MIT" + }, "node_modules/media-typer": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", @@ -11836,8 +11886,6 @@ "version": "1.2.8", "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", - "optional": true, - "peer": true, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -20839,6 +20887,16 @@ "optional": true, "peer": true }, + "commonmark": { + "version": "0.31.2", + "resolved": "https://registry.npmjs.org/commonmark/-/commonmark-0.31.2.tgz", + "integrity": "sha512-2fRLTyb9r/2835k5cwcAwOj0DEc44FARnMp5veGsJ+mEAZdi52sNopLu07ZyElQUz058H43whzlERDIaaSw4rg==", + "requires": { + "entities": "~3.0.1", + "mdurl": "~1.0.1", + "minimist": "~1.2.8" + } + }, "compare-versions": { "version": "3.6.0", "resolved": 
"https://registry.npmjs.org/compare-versions/-/compare-versions-3.6.0.tgz", @@ -21287,6 +21345,11 @@ "once": "^1.4.0" } }, + "entities": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-3.0.1.tgz", + "integrity": "sha512-WiyBqoomrwMdFG1e0kqvASYfnlb0lp8M5o5Fw2OFq1hNZxxcNk8Ik0Xm7LxzBhuidnZB/UtBqVCgUz3kBOP51Q==" + }, "env-editor": { "version": "0.4.2", "resolved": "https://registry.npmjs.org/env-editor/-/env-editor-0.4.2.tgz", @@ -23163,6 +23226,11 @@ "optional": true, "peer": true }, + "linkifyjs": { + "version": "4.3.3", + "resolved": "https://registry.npmjs.org/linkifyjs/-/linkifyjs-4.3.3.tgz", + "integrity": "sha512-P8aEP5U/D1/IlTY2OeYsErdwh9bGuLE30NcXtKEjgdHcahveQoQwM2yZNsioQHsWFz0P7KKudisbrzCgR0sDHg==" + }, "locate-path": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", @@ -23186,6 +23254,11 @@ "optional": true, "peer": true }, + "lodash.escape": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.escape/-/lodash.escape-4.0.1.tgz", + "integrity": "sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==" + }, "lodash.throttle": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/lodash.throttle/-/lodash.throttle-4.1.1.tgz", @@ -23524,6 +23597,11 @@ "optional": true, "peer": true }, + "mdurl": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/mdurl/-/mdurl-1.0.1.tgz", + "integrity": "sha512-/sKlQJCBYVY9Ers9hqzKou4H6V5UWc/M59TH2dvkt+84itfnq7uFOMLpOiOS4ujvHP4etln18fmIxA5R5fll0g==" + }, "media-typer": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", @@ -24351,9 +24429,7 @@ "minimist": { "version": "1.2.8", "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", - "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", - "optional": true, - "peer": true + "integrity": 
"sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==" }, "minipass": { "version": "3.1.6", diff --git a/package.json b/package.json index e77b2e9..d005328 100644 --- a/package.json +++ b/package.json @@ -4,12 +4,15 @@ "description": "Matrix chat server client for Node-RED", "dependencies": { "abort-controller": "^3.0.0", + "commonmark": "^0.31.2", "fake-indexeddb": "^6.2.5", "fluent-ffmpeg": "^2.1.2", "fs-extra": "^11.1.0", "got": "^12.0.2", "image-size": "^1.0.2", "isomorphic-webcrypto": "^2.3.8", + "linkifyjs": "^4.3.3", + "lodash.escape": "^4.0.1", "matrix-js-sdk": "^41.5.0", "mime": "^3.0.0", "node-fetch": "^3.3.0", diff --git a/src/matrix-markdown.js b/src/matrix-markdown.js new file mode 100644 index 0000000..e833042 --- /dev/null +++ b/src/matrix-markdown.js @@ -0,0 +1,385 @@ +// Markdown -> HTML converter for matrix messages. +// +// Ported from matrix-react-sdk's `src/Markdown.ts` (now living at +// element-hq/element-web `apps/web/src/Markdown.ts`) so the HTML this module +// generates lines up with what Element produces for the same markdown source. +// +// Keep this in sync with element-web's Markdown.ts when noticeable changes +// land there. Source of truth: +// https://github.com/element-hq/element-web/blob/develop/apps/web/src/Markdown.ts +// +// Copyright 2024 New Vector Ltd. +// Copyright 2021 The Matrix.org Foundation C.I.C. 
+// Copyright 2016 OpenMarket Ltd +// +// SPDX-License-Identifier: AGPL-3.0-only OR GPL-3.0-only OR LicenseRef-Element-Commercial + +const commonmark = require("commonmark"); +const escape = require("lodash.escape"); +const linkify = require("linkifyjs"); + +const ALLOWED_HTML_TAGS = ["sub", "sup", "del", "s", "u", "br", "br/"]; + +// These types of node are definitely text +const TEXT_NODES = ["text", "softbreak", "linebreak", "paragraph", "document"]; + +function isAllowedHtmlTag(node) { + if (!node.literal) { + return false; + } + + if (node.literal.match('^<((div|span) data-mx-maths="[^"]*"|/(div|span))>$') != null) { + return true; + } + + // Regex won't work for tags with attrs, but the tags we allow + // shouldn't really have any anyway. + const matches = /^<\/?(.*)>$/.exec(node.literal); + if (matches && matches.length == 2) { + const tag = matches[1]; + return ALLOWED_HTML_TAGS.indexOf(tag) > -1; + } + + return false; +} + +/* + * Returns true if the parse output containing the node + * comprises multiple block level elements (ie. lines), + * or false if it is only a single line. + */ +function isMultiLine(node) { + let par = node; + while (par.parent) { + par = par.parent; + } + return par.firstChild != par.lastChild; +} + +function getTextUntilEndOrLinebreak(node) { + let currentNode = node; + let text = ""; + while (currentNode && currentNode.type !== "softbreak" && currentNode.type !== "linebreak") { + const { literal, type } = currentNode; + if (type === "text" && literal) { + let n = 0; + let char = literal[n]; + while (char !== " " && char !== null && n <= literal.length) { + if (char === " ") { + break; + } + if (char) { + text += char; + } + n += 1; + char = literal[n]; + } + if (char === " ") { + break; + } + } + currentNode = currentNode.next; + } + return text; +} + +const formattingChangesByNodeType = { + emph: "_", + strong: "__", +}; + +/** + * Returns the literal of a node and all child nodes. 
+ */ +const innerNodeLiteral = (node) => { + let literal = ""; + + const walker = node.walker(); + let step; + + while ((step = walker.next())) { + const currentNode = step.node; + const currentNodeLiteral = currentNode.literal; + if (step.entering && currentNode.type === "text" && currentNodeLiteral) { + literal += currentNodeLiteral; + } + } + + return literal; +}; + +const emptyItemWithNoSiblings = (node) => { + return !node.prev && !node.next && !node.firstChild; +}; + +/** + * Class that wraps commonmark, adding the ability to see whether + * a given message actually uses any markdown syntax or whether + * it's plain text. + */ +class Markdown { + constructor(input) { + this.input = input; + + const parser = new commonmark.Parser(); + this.parsed = parser.parse(this.input); + this.parsed = this.repairLinks(this.parsed); + } + + /** + * This method is modifying the parsed AST in such a way that links are always + * properly linkified instead of sometimes being wrongly emphasised in case + * if you were to write a link like the example below: + * https://my_weird-link_domain.domain.com + * ^ this link would be parsed to something like this: + * https://myweird-linkdomain.domain.com + * This method makes it so the link gets properly modified to a version where it is + * not emphasised until it actually ends. 
+ * See: https://github.com/vector-im/element-web/issues/4674 + */ + repairLinks(parsed) { + const walker = parsed.walker(); + let event = null; + let text = ""; + let isInPara = false; + let previousNode = null; + let shouldUnlinkFormattingNode = false; + while ((event = walker.next())) { + const { node } = event; + if (node.type === "paragraph") { + isInPara = !!event.entering; + } + if (isInPara) { + // Clear saved string when line ends + if ( + node.type === "softbreak" || + node.type === "linebreak" || + // Also start calculating the text from the beginning on any spaces + (node.type === "text" && node.literal === " ") + ) { + text = ""; + continue; + } + + // Break up text nodes on spaces, so that we don't shoot past them without resetting + if (node.type === "text" && node.literal) { + const [thisPart, ...nextParts] = node.literal.split(/( )/); + node.literal = thisPart; + text += thisPart; + + // Add the remaining parts as siblings + nextParts.reverse().forEach((part) => { + if (part) { + const nextNode = new commonmark.Node("text"); + nextNode.literal = part; + node.insertAfter(nextNode); + // Make the iterator aware of the newly inserted node + walker.resumeAt(nextNode, true); + } + }); + } + + // We should not do this if previous node was not a textnode, as we can't combine it then. 
+ if ( + (node.type === "emph" || node.type === "strong") && + previousNode && previousNode.type === "text" + ) { + if (event.entering) { + const foundLinks = linkify.find(text); + for (const { value } of foundLinks) { + if (node && node.firstChild && node.firstChild.literal) { + /** + * NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings + * but this solution seems to work well and is hopefully slightly easier to understand too + */ + const format = formattingChangesByNodeType[node.type]; + const nonEmphasizedText = `${format}${innerNodeLiteral(node)}${format}`; + const f = getTextUntilEndOrLinebreak(node); + const newText = value + nonEmphasizedText + f; + const newLinks = linkify.find(newText); + // Should always find only one link here, if it finds more it means that the algorithm is broken + if (newLinks.length === 1) { + const emphasisTextNode = new commonmark.Node("text"); + emphasisTextNode.literal = nonEmphasizedText; + previousNode.insertAfter(emphasisTextNode); + node.firstChild.literal = ""; + event = node.walker().next(); + if (event) { + // Remove `em` opening and closing nodes + node.unlink(); + previousNode.insertAfter(event.node); + shouldUnlinkFormattingNode = true; + } + } else { + console.warn( + "matrix-chat markdown: link escaping found too many links for text:", + text, + "modified:", + newText, + ); + } + } + } + } else { + if (shouldUnlinkFormattingNode) { + node.unlink(); + shouldUnlinkFormattingNode = false; + } + } + } + } + previousNode = node; + } + return parsed; + } + + isPlainText() { + const walker = this.parsed.walker(); + let ev; + + while ((ev = walker.next())) { + const node = ev.node; + + if (TEXT_NODES.indexOf(node.type) > -1) { + // definitely text + continue; + } else if (node.type == "list" || node.type == "item") { + // Special handling for inputs like `+`, `*`, `-` and `2021.` which + // would otherwise be treated as a list of a single empty item. 
+ // See https://github.com/vector-im/element-web/issues/7631 + if ( + node.type == "list" && + node.firstChild && + emptyItemWithNoSiblings(node.firstChild) + ) { + // A list with a single empty item is treated as plain text. + continue; + } + + if (node.type == "item" && emptyItemWithNoSiblings(node)) { + // An empty list item with no sibling items is treated as plain text. + continue; + } + + // Everything else is actual lists and therefore not plaintext. + return false; + } else if (node.type == "html_inline" || node.type == "html_block") { + // if it's an allowed html tag, we need to render it and therefore + // we will need to use HTML. If it's not allowed, it's not HTML since + // we'll just be treating it as text. + if (isAllowedHtmlTag(node)) { + return false; + } + } else { + return false; + } + } + return true; + } + + toHTML({ externalLinks = false } = {}) { + const renderer = new commonmark.HtmlRenderer({ + safe: false, + + // Set soft breaks to hard HTML breaks: commonmark + // puts softbreaks in for multiple lines in a blockquote, + // so if these are just newline characters then the + // block quote ends up all on one line + // (https://github.com/vector-im/element-web/issues/3154) + softbreak: "
", + }); + + // Trying to strip out the wrapping

<p/> causes a lot more complication + // than it's worth, I think. For instance, this code will go and strip + // out any <p/> tag (no matter where it is in the tree) which doesn't + // contain \n's. + // On the flip side, <p/>s are quite opinionated and restricted on where + // you can nest them. + // + // Let's try sending with <p/>
s anyway for now, though. + const realParagraph = renderer.paragraph; + renderer.paragraph = function (node, entering) { + // If there is only one top level node, just return the + // bare text: it's a single line of text and so should be + // 'inline', rather than unnecessarily wrapped in its own + // p tag. If, however, we have multiple nodes, each gets + // its own p tag to keep them as separate paragraphs. + // However, if it's a blockquote, adds a p tag anyway + // in order to avoid deviation to commonmark and unexpected + // results when parsing the formatted HTML. + if ((node.parent && node.parent.type === "block_quote") || isMultiLine(node)) { + realParagraph.call(this, node, entering); + } + }; + + renderer.link = function (node, entering) { + const attrs = this.attrs(node); + if (entering && node.destination) { + attrs.push(["href", this.esc(node.destination)]); + if (node.title) { + attrs.push(["title", this.esc(node.title)]); + } + // Modified link behaviour to treat them all as external and + // thus opening in a new tab. + if (externalLinks) { + attrs.push(["target", "_blank"]); + attrs.push(["rel", "noreferrer noopener"]); + } + this.tag("a", attrs); + } else { + this.tag("/a"); + } + }; + + renderer.html_inline = function (node) { + if (node.literal) { + if (isAllowedHtmlTag(node)) { + this.lit(node.literal); + } else { + this.lit(escape(node.literal)); + } + } + }; + + renderer.html_block = function (node) { + renderer.html_inline(node); + }; + + return renderer.render(this.parsed); + } + + /* + * Render the markdown message to plain text. That is, essentially + * just remove any backslashes escaping what would otherwise be + * markdown syntax + * (to fix https://github.com/vector-im/element-web/issues/2870). + * + * N.B. this does **NOT** render arbitrary MD to plain text - only MD + * which has no formatting. Otherwise it emits HTML(!). 
+ */ + toPlaintext() { + const renderer = new commonmark.HtmlRenderer({ safe: false }); + + renderer.paragraph = function (node, entering) { + // as with toHTML, only append lines to paragraphs if there are + // multiple paragraphs + if (isMultiLine(node)) { + if (!entering && node.next) { + this.lit("\n\n"); + } + } + }; + + renderer.html_block = function (node) { + if (node.literal) this.lit(node.literal); + if (isMultiLine(node) && node.next) this.lit("\n\n"); + }; + + // We inhibit the default escape function as we escape the entire output string to correctly handle backslashes + renderer.esc = (input) => input; + + return escape(renderer.render(this.parsed)); + } +} + +module.exports = { Markdown }; diff --git a/src/matrix-send-message.html b/src/matrix-send-message.html index b35781b..f445f07 100644 --- a/src/matrix-send-message.html +++ b/src/matrix-send-message.html @@ -99,6 +99,7 @@ @@ -140,7 +141,7 @@

msg.formatted_payload string
-
the formatted HTML message (uses msg.payload if not defined). This only affects HTML messages.
+
the formatted HTML message (uses msg.payload if not defined). This only affects messages sent in HTML format — in Markdown mode the formatted body is generated from the markdown source.
msg.type string | null @@ -150,7 +151,35 @@
msg.format string | null
-
This is only used and required when configured so on the node. Set to null for plain text and 'html' for HTML.
+
This is only used and required when configured so on the node. Set to null for plain text, 'markdown' for markdown (converted to HTML the same way Element does), or 'html' for HTML.
+ + +

Message formats

+
+
Default (plaintext)
+
The payload is sent as-is as the message body.
+ +
Markdown
+
+ The payload is parsed as CommonMark markdown and converted to HTML + the same way Element does (using the same converter ported from + matrix-react-sdk). If the message turns out to contain + no markdown syntax it is sent as plain text; otherwise the original + markdown source becomes the message body and the + rendered HTML is sent as formatted_body, so clients + without HTML rendering still see a readable fallback. +
+ +
HTML
+
+ The payload is sent as HTML. By default the same HTML is used for + both the plain-text and formatted versions; set + msg.formatted_payload if you want the + formatted_body to differ from msg.payload. +
+ +
msg.format input
+
Set msg.format at runtime to one of the options above (null, 'markdown', or 'html').

Outputs

diff --git a/src/matrix-send-message.js b/src/matrix-send-message.js index c954ca1..5cb300f 100644 --- a/src/matrix-send-message.js +++ b/src/matrix-send-message.js @@ -1,4 +1,5 @@ const sdkPromise = import("matrix-js-sdk"); +const { Markdown } = require("./matrix-markdown"); module.exports = function(RED) { function MatrixSendImage(n) { @@ -143,7 +144,28 @@ module.exports = function(RED) { body: payload.toString() }; - if (msgFormat === 'html') { + if (msgFormat === 'markdown') { + // Convert the markdown body to HTML using the same logic + // as Element (matrix-react-sdk's `Markdown` class). + // + // If the message contains any markdown syntax, send the + // rendered HTML as `formatted_body` and keep the original + // markdown source as `body` (matrix spec convention for + // formatted messages). If the message turns out to be + // plain text and contains backslash escapes, strip those + // from `body` and send no HTML; otherwise leave `body` + // as the original payload. + const source = payload.toString(); + const md = new Markdown(source); + if (md.isPlainText()) { + if (source.indexOf("\\") > -1) { + content.body = md.toPlaintext(); + } + } else { + content.format = "org.matrix.custom.html"; + content.formatted_body = md.toHTML(); + } + } else if (msgFormat === 'html') { content.format = "org.matrix.custom.html"; content.formatted_body = (typeof msg.formatted_payload !== 'undefined' && msg.formatted_payload)