hippofish/packages/backend/src/mfm/from-html.ts

214 lines
4.3 KiB
TypeScript
Raw Normal View History

2023-01-13 05:40:33 +01:00
import { URL } from "node:url";
import * as parse5 from "parse5";
import * as TreeAdapter from "../../node_modules/parse5/dist/tree-adapters/default.js";
// parse5's default tree adapter; this file uses its isTextNode / isElementNode
// guards to classify the nodes of the parsed fragment.
const treeAdapter = TreeAdapter.defaultTreeAdapter;
2022-06-30 17:21:25 +02:00
// Matches a string that STARTS with "http://" or "https://" followed by at least
// one character from the allowed set; anything may follow the matched prefix.
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
// Same prefix and character set, but anchored at both ends: the WHOLE string must
// consist only of allowed characters. A string that matches urlRegex but not
// urlRegexFull therefore contains a character outside the allowed set.
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
2022-02-04 03:10:53 +01:00
/**
 * Converts an HTML fragment into MFM-style text.
 *
 * The fragment is parsed with parse5 and walked depth-first. Text nodes are
 * copied as-is; known elements are translated as follows:
 * - `br` becomes a newline
 * - `a` becomes a hashtag, a mention (with the host restored from the href when
 *   the text has none) or a link (`[text](url)`, a bare URL, or `<url>`)
 * - `h1` is wrapped in `【…】`; `b`/`strong` in `**`; `s`/`del` in `~~`
 * - `small` and `i`/`em` are wrapped in `<small>` / `<i>` tags
 * - `pre` with a single `code` child becomes a fenced block; `code` is wrapped
 *   in backticks
 * - `blockquote` text is prefixed with `> ` on every line
 * - `p`, `h2`–`h6` are preceded by a blank line; other block elements
 *   (`div`, `li`, …) by a single newline
 * - any other element contributes only its children
 *
 * @param html - The HTML fragment to convert.
 * @param hashtagNames - Optional list of hashtag texts. An anchor that has an
 *   href and whose text equals one of these (case-insensitively) is emitted as
 *   its plain text.
 * @returns The converted text with leading and trailing whitespace trimmed.
 */
export function fromHtml(html: string, hashtagNames?: string[]): string {
	// some AP servers like Pixelfed use br tags as well as newlines
	const normalized = html.replace(/<br\s?\/?>\r?\n/gi, "\n");

	const dom = parse5.parseFragment(normalized);

	// Lower-cased once up front instead of once per <a> element.
	const knownHashtags = new Set(
		(hashtagNames ?? []).map((x) => x.toLowerCase()),
	);

	let text = "";

	for (const n of dom.childNodes) {
		analyze(n);
	}

	return text.trim();

	/**
	 * Returns the hostname of `url`, or null when `url` cannot be parsed as an
	 * absolute URL (the URL constructor throws on relative or malformed input).
	 */
	function hostnameOf(url: string): string | null {
		try {
			return new URL(url).hostname;
		} catch {
			return null;
		}
	}

	/** Returns the concatenated text of `node`, turning `br` elements into newlines. */
	function getText(node: TreeAdapter.Node): string {
		if (treeAdapter.isTextNode(node)) return node.value;
		if (!treeAdapter.isElementNode(node)) return "";
		if (node.nodeName === "br") return "\n";

		if (node.childNodes) {
			return node.childNodes.map((n) => getText(n)).join("");
		}

		return "";
	}

	/** Runs `analyze` on each child, in document order. */
	function appendChildren(childNodes: TreeAdapter.ChildNode[]): void {
		if (childNodes) {
			for (const n of childNodes) {
				analyze(n);
			}
		}
	}

	/**
	 * Builds the output for an ordinary (non-hashtag, non-mention) anchor.
	 *
	 * @param txt - The anchor's text content.
	 * @param hrefValue - The href attribute value, or undefined when the anchor
	 *   has no href attribute.
	 */
	function generateLink(txt: string, hrefValue: string | undefined): string {
		// No href: only the text (possibly empty) can be emitted.
		if (hrefValue === undefined) {
			return txt;
		}

		if (!txt || txt === hrefValue) {
			// #6383: Missing text node
			// A URL made only of allowed characters is emitted bare; anything else
			// is wrapped in angle brackets.
			return urlRegexFull.test(hrefValue) ? hrefValue : `<${hrefValue}>`;
		}

		// An http(s) URL containing characters outside the allowed set is wrapped
		// in angle brackets inside the link target.
		if (urlRegex.test(hrefValue) && !urlRegexFull.test(hrefValue)) {
			return `[${txt}](<${hrefValue}>)`; // #6846
		}
		return `[${txt}](${hrefValue})`;
	}

	/** Appends the converted form of `node` (and its descendants) to `text`. */
	function analyze(node: TreeAdapter.Node) {
		if (treeAdapter.isTextNode(node)) {
			text += node.value;
			return;
		}

		// Skip comment or document type node
		if (!treeAdapter.isElementNode(node)) return;

		switch (node.nodeName) {
			case "br": {
				text += "\n";
				break;
			}

			case "a": {
				const txt = getText(node);
				const rel = node.attrs.find((x) => x.name === "rel");
				const href = node.attrs.find((x) => x.name === "href");

				if (href && knownHashtags.has(txt.toLowerCase())) {
					// Hashtag
					text += txt;
				} else if (txt.startsWith("@") && !rel?.value.match(/^me /)) {
					// Mention (anchors whose rel starts with "me " are treated as links)
					const part = txt.split("@");

					if (part.length === 2 && href) {
						//#region The host part is omitted from the text, so restore it from the href
						const host = hostnameOf(href.value);
						// If the href is not a parseable absolute URL (or has no host),
						// keep the mention text as-is instead of throwing.
						text += host ? `${txt}@${host}` : txt;
						//#endregion
					} else if (part.length === 3) {
						// Already in @user@host form.
						text += txt;
					}
				} else {
					// Anything else: an ordinary link
					text += generateLink(txt, href?.value);
				}
				break;
			}

			case "h1": {
				text += "【";
				appendChildren(node.childNodes);
				text += "】\n";
				break;
			}

			case "b":
			case "strong": {
				text += "**";
				appendChildren(node.childNodes);
				text += "**";
				break;
			}

			case "small": {
				text += "<small>";
				appendChildren(node.childNodes);
				text += "</small>";
				break;
			}

			case "s":
			case "del": {
				text += "~~";
				appendChildren(node.childNodes);
				text += "~~";
				break;
			}

			case "i":
			case "em": {
				text += "<i>";
				appendChildren(node.childNodes);
				text += "</i>";
				break;
			}

			// block code (<pre><code>)
			case "pre": {
				if (
					node.childNodes.length === 1 &&
					node.childNodes[0].nodeName === "code"
				) {
					text += "\n```\n";
					text += getText(node.childNodes[0]);
					text += "\n```\n";
				} else {
					appendChildren(node.childNodes);
				}
				break;
			}

			// inline code (<code>)
			case "code": {
				text += "`";
				appendChildren(node.childNodes);
				text += "`";
				break;
			}

			case "blockquote": {
				// Uses the flattened text, so nested markup inside the quote is dropped.
				const t = getText(node);
				if (t) {
					text += "\n> ";
					text += t.split("\n").join("\n> ");
				}
				break;
			}

			case "p":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6": {
				text += "\n\n";
				appendChildren(node.childNodes);
				break;
			}

			// other block elements
			case "div":
			case "header":
			case "footer":
			case "article":
			case "li":
			case "dt":
			case "dd": {
				text += "\n";
				appendChildren(node.childNodes);
				break;
			}

			default: {
				// includes inline elements
				appendChildren(node.childNodes);
				break;
			}
		}
	}
}