2023-01-13 05:40:33 +01:00
|
|
|
import { URL } from "node:url";
|
|
|
|
import * as parse5 from "parse5";
|
|
|
|
import * as TreeAdapter from "../../node_modules/parse5/dist/tree-adapters/default.js";
|
2022-07-01 06:48:03 +02:00
|
|
|
|
|
|
|
// Local alias for parse5's default tree adapter; its isTextNode/isElementNode
// type guards are used while walking the parsed fragment.
const { defaultTreeAdapter: treeAdapter } = TreeAdapter;
|
2021-04-02 03:36:11 +02:00
|
|
|
|
2022-06-30 17:21:25 +02:00
|
|
|
// Matches a string that STARTS with an http:// or https:// URL made of the
// listed characters. There is no end anchor, so trailing characters outside
// the class are allowed after the matched prefix.
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
|
2021-04-02 03:36:11 +02:00
|
|
|
// Same character class as urlRegex but anchored at both ends: the WHOLE
// string must be an http(s) URL made only of the listed characters.
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
|
2018-06-20 18:21:57 +02:00
|
|
|
|
2022-02-04 03:10:53 +01:00
|
|
|
/**
 * Converts an HTML fragment into Markdown-like plain text.
 *
 * The fragment is parsed with parse5 and walked depth-first; each element is
 * mapped to a textual equivalent (for example `<b>` becomes `**…**`, `<br>`
 * becomes a newline, `<a>` becomes a hashtag, a mention or a link).
 *
 * @param html - HTML fragment to convert.
 * @param hashtagNames - Optional list of strings; an `<a>` that has an `href`
 *   and whose text content equals one of them (compared case-insensitively)
 *   is emitted as its text only instead of as a link.
 * @returns The converted text with leading and trailing whitespace trimmed.
 */
export function fromHtml(html: string, hashtagNames?: string[]): string {
	// some AP servers like Pixelfed use br tags as well as newlines
	const normalizedHtml = html.replace(/<br\s?\/?>\r?\n/gi, "\n");

	const dom = parse5.parseFragment(normalizedHtml);

	// Lower-cased once up front instead of once per <a> element.
	const lowerHashtagNames = hashtagNames?.map((x) => x.toLowerCase());

	// Output accumulator shared by the nested helpers below.
	let text = "";

	for (const n of dom.childNodes) {
		analyze(n);
	}

	return text.trim();

	/**
	 * Returns the hostname of `url`, or `undefined` when `url` cannot be
	 * parsed as an absolute URL or has an empty hostname.
	 * `new URL()` throws on relative or malformed input, so it is guarded here.
	 */
	function hostnameOf(url: string): string | undefined {
		try {
			return new URL(url).hostname || undefined;
		} catch {
			return undefined;
		}
	}

	/**
	 * Returns the concatenated text content of `node`: text node values are
	 * taken verbatim, `<br>` yields a newline, other elements yield the text
	 * of their children, and any other node kind yields an empty string.
	 */
	function getText(node: TreeAdapter.Node): string {
		if (treeAdapter.isTextNode(node)) return node.value;
		if (!treeAdapter.isElementNode(node)) return "";
		if (node.nodeName === "br") return "\n";

		if (node.childNodes) {
			return node.childNodes.map((n) => getText(n)).join("");
		}

		return "";
	}

	/** Runs `analyze` on every node in `childNodes`, in order. */
	function appendChildren(childNodes: TreeAdapter.ChildNode[]): void {
		if (childNodes) {
			for (const n of childNodes) {
				analyze(n);
			}
		}
	}

	/**
	 * Builds the textual form of a generic link.
	 *
	 * @param txt - Text content of the anchor.
	 * @param url - Value of the `href` attribute, or `undefined` when the
	 *   anchor has no `href` attribute (an empty string is a present href).
	 */
	function generateLink(txt: string, url: string | undefined): string {
		// No href: only the text (possibly empty) can be emitted.
		if (url === undefined) {
			return txt;
		}

		const isPlainUrl = urlRegexFull.test(url);

		if (!txt || txt === url) {
			// #6383: Missing text node
			return isPlainUrl ? url : `<${url}>`;
		}

		// An http(s) URL containing characters outside the plain-URL set is
		// wrapped in angle brackets inside the link target. (#6846)
		if (urlRegex.test(url) && !isPlainUrl) {
			return `[${txt}](<${url}>)`;
		}

		return `[${txt}](${url})`;
	}

	/** Appends the textual form of `node` (and its descendants) to `text`. */
	function analyze(node: TreeAdapter.Node): void {
		if (treeAdapter.isTextNode(node)) {
			text += node.value;
			return;
		}

		// Skip comment or document type node
		if (!treeAdapter.isElementNode(node)) return;

		switch (node.nodeName) {
			case "br": {
				text += "\n";
				break;
			}

			case "a": {
				const txt = getText(node);
				const rel = node.attrs.find((x) => x.name === "rel");
				const href = node.attrs.find((x) => x.name === "href");

				if (
					lowerHashtagNames &&
					href &&
					lowerHashtagNames.includes(txt.toLowerCase())
				) {
					// Hashtag
					text += txt;
				} else if (txt.startsWith("@") && !rel?.value.startsWith("me ")) {
					// Mention
					const part = txt.split("@");

					if (part.length === 2 && href) {
						// The host part is omitted from the visible text, so
						// restore it from the link target. If the target has no
						// usable hostname, keep the visible text rather than
						// throwing or emitting a dangling "@".
						const host = hostnameOf(href.value);
						text += host ? `${txt}@${host}` : txt;
					} else if (part.length === 3) {
						// Already of the form @user@host.
						text += txt;
					}
				} else {
					// Anything else is a generic link.
					text += generateLink(txt, href?.value);
				}
				break;
			}

			case "h1": {
				text += "【";
				appendChildren(node.childNodes);
				text += "】\n";
				break;
			}

			case "b":
			case "strong": {
				text += "**";
				appendChildren(node.childNodes);
				text += "**";
				break;
			}

			case "small": {
				text += "<small>";
				appendChildren(node.childNodes);
				text += "</small>";
				break;
			}

			case "s":
			case "del": {
				text += "~~";
				appendChildren(node.childNodes);
				text += "~~";
				break;
			}

			case "i":
			case "em": {
				text += "<i>";
				appendChildren(node.childNodes);
				text += "</i>";
				break;
			}

			// block code (<pre><code>)
			case "pre": {
				if (
					node.childNodes.length === 1 &&
					node.childNodes[0].nodeName === "code"
				) {
					text += "\n```\n";
					text += getText(node.childNodes[0]);
					text += "\n```\n";
				} else {
					appendChildren(node.childNodes);
				}
				break;
			}

			// inline code (<code>)
			case "code": {
				text += "`";
				appendChildren(node.childNodes);
				text += "`";
				break;
			}

			case "blockquote": {
				// Only the plain text of the quote is kept; every line of it
				// is prefixed with "> ".
				const t = getText(node);
				if (t) {
					text += "\n> ";
					text += t.split("\n").join("\n> ");
				}
				break;
			}

			case "p":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6": {
				text += "\n\n";
				appendChildren(node.childNodes);
				break;
			}

			// other block elements
			case "div":
			case "header":
			case "footer":
			case "article":
			case "li":
			case "dt":
			case "dd": {
				text += "\n";
				appendChildren(node.childNodes);
				break;
			}

			default: {
				// includes inline elements
				appendChildren(node.childNodes);
				break;
			}
		}
	}
}
|