Merge branch 'html2everything'
This commit is contained in:
commit
9889ca251a
|
@ -0,0 +1,105 @@
|
||||||
|
"""
|
||||||
|
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||||
|
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
"""
|
||||||
|
from typing import Callable, Iterable, Literal, Optional
|
||||||
|
from bs4.element import Tag, PageElement
|
||||||
|
|
||||||
|
# Output flavours a converter can be registered for.
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]

# Glyphs exported for the list (BULLET) and blockquote (STRIPE) converters.
BULLET, STRIPE = "\u2022", "\u258d"
|
||||||
|
|
||||||
|
|
||||||
|
def md_escape(text: str) -> str:
    """Backslash-escape the Markdown control characters found in *text*.

    The escaped characters are the backslash itself, ``*``, ``[``, ``]``,
    ``_``, ``~``, ``|`` and the backtick.
    """
    special = "\\*[]_~|`"
    # One translation pass; every special character becomes "\" + itself.
    return text.translate({ord(ch): "\\" + ch for ch in special})
|
||||||
|
|
||||||
|
|
||||||
|
# Converter registry keyed by (output type, tag name).  Each value is a list
# of callables that take the element and return the rendered text or None.
_Converter = Callable[[PageElement], Optional[str]]
node_processors: dict[tuple[VALID_OUTPUT_TYPES, str], list[_Converter]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Build a decorator that registers a converter for *tag*.

    The decorated function is appended to the list stored under
    ``(output_type, tag)`` in ``node_processors`` and is returned unchanged.
    """
    key = (output_type, tag)

    def decorate(function):
        node_processors.setdefault(key, []).append(function)
        return function

    return decorate
|
||||||
|
|
||||||
|
|
||||||
|
def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
    """Build a decorator that installs the text-node converter.

    The slot ``(output_type, ":text:")`` in ``node_processors`` is overwritten
    with a one-element list, so a later registration replaces an earlier one.
    The decorated function is returned unchanged.
    """
    slot = (output_type, ":text:")

    def decorate(function):
        node_processors[slot] = [function]
        return function

    return decorate
|
||||||
|
|
||||||
|
|
||||||
|
def register_fmt_converter(
    format: str,
    tag: str,
    output_type: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
):
    """Register a template-based converter for *tag*.

    If *format* contains ``%s`` the element's children are rendered for
    *output_type*, joined with *separator* and substituted into the template;
    otherwise *format* is emitted as-is and the children are ignored.
    """

    def fmt_tag(el: Tag) -> str:
        if "%s" not in format:
            return format
        rendered = nodes_process(el.children, output_type, separator)
        return format % rendered

    decorator = register_converter(tag, output_type)
    decorator(fmt_tag)
|
||||||
|
|
||||||
|
|
||||||
|
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Render a single parsed node for the output flavour *type_*.

    Tags are offered to every converter registered under ``(type_, tag name)``
    in registration order; the first one that returns something other than
    ``None`` wins.  If none handles the tag, its children are rendered and
    concatenated.  Non-tag nodes go through the registered text converter, if
    any, and fall back to ``str(el)``.
    """
    if not isinstance(el, Tag):
        text_converters = node_processors.get((type_, ":text:"))
        if text_converters:
            # An empty conversion falls back to the raw node text.
            return text_converters[0](el) or str(el)
        return str(el)

    for func in node_processors.get((type_, el.name), ()):
        result = func(el)
        # Only None means "not handled"; an empty string is a valid
        # rendering (e.g. a converter that drops the element entirely).
        if result is not None:
            return result
    return nodes_process(el.children, type_)
|
||||||
|
|
||||||
|
|
||||||
|
def nodes_process(
    els: Iterable[PageElement],
    type_: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
) -> str:
    """Render every node in *els* for *type_* and join them with *separator*."""
    rendered = (node_process(child, type_) for child in els)
    return separator.join(rendered)
|
||||||
|
|
||||||
|
|
||||||
|
# Names exported by a star-import of this package.
__all__ = [
    "node_process",
    "nodes_process",
    "md_escape",
    "BULLET",
    "STRIPE",
]
|
||||||
|
|
||||||
|
import mastoposter.text.html # noqa F401
|
||||||
|
import mastoposter.text.markdown # noqa F401
|
||||||
|
import mastoposter.text.plain # noqa F401
|
|
@ -0,0 +1,36 @@
|
||||||
|
"""
|
||||||
|
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||||
|
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from mastoposter.text import node_process, VALID_OUTPUT_TYPES
|
||||||
|
from argparse import ArgumentParser, FileType
|
||||||
|
from typing import get_args as T_get_args
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Command-line entry point: read an HTML document and print it converted to
# the requested output flavour.
parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")

parser.add_argument(
    "--type",
    "-t",
    choices=T_get_args(VALID_OUTPUT_TYPES),
    default=T_get_args(VALID_OUTPUT_TYPES)[0],
    dest="output_type",
    help="output flavour (default: %(default)s)",
)
# nargs="?" makes the positional optional.  Without it argparse always
# requires the argument, so the stdin default could never take effect.
parser.add_argument(
    "file",
    nargs="?",
    default=sys.stdin,
    type=FileType("r"),
    help="HTML file to convert (default: standard input)",
)

args = parser.parse_args()

soup = BeautifulSoup(args.file.read(), "lxml")
print(node_process(soup, args.output_type))
|
|
@ -0,0 +1,100 @@
|
||||||
|
"""
|
||||||
|
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||||
|
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
"""
|
||||||
|
from bs4 import NavigableString
|
||||||
|
from mastoposter.text import (
|
||||||
|
nodes_process,
|
||||||
|
register_converter,
|
||||||
|
register_fmt_converter,
|
||||||
|
register_text_node_converter,
|
||||||
|
node_process,
|
||||||
|
STRIPE,
|
||||||
|
BULLET,
|
||||||
|
)
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
from bs4.element import Tag
|
||||||
|
from html import escape
|
||||||
|
|
||||||
|
|
||||||
|
@register_text_node_converter("html")
def proc_text_node_to_html(txt: NavigableString) -> str:
    """Return *txt* with HTML special characters escaped.

    Leading and trailing whitespace is preserved on purpose: stripping each
    text node would glue words to neighbouring inline elements, e.g.
    ``Hello <b>world</b>`` would come out as ``Hello<b>world</b>``.
    """
    return escape(txt)
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("a", "html")
def proc_tag_a_to_html(tag: Tag):
    """Render an anchor as ``<a href="...">...</a>``.

    The ``href`` attribute is HTML-escaped; a missing one becomes ``#``.
    """
    target = escape(tag.attrs.get("href", "#"))
    label = nodes_process(tag.children, "html")
    return f'<a href="{target}">{label}</a>'
|
||||||
|
|
||||||
|
|
||||||
|
# Template-driven converters for the "html" output: (template, tag name).
for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("<i>%s</i>", "i"),
    ("<i>%s</i>", "em"),
    ("<b>%s</b>", "b"),
    ("<b>%s</b>", "strong"),
    ("<s>%s</s>", "s"),
    ("<s>%s</s>", "del"),
    ("<u>%s</u>", "u"),
    ("<u>%s</u>", "ins"),
    ("\n", "br"),
    ("\n<pre>%s</pre>\n", "pre"),
    ("<code>%s</code>", "code"),
):
    register_fmt_converter(_template, _tag_name, "html")
del _template, _tag_name
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("span", "html")
def proc_tag_span_to_html(tag: Tag) -> Optional[str]:
    """Render spans carrying the ``_mfm_blur_`` class as a ``tg-spoiler`` span.

    Returns ``None`` for every other span.
    """
    classes = tag.attrs.get("class", "")
    if "_mfm_blur_" not in classes:
        return None
    inner = nodes_process(tag.children, "html")
    return '<span class="tg-spoiler">%s</span>' % inner
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("blockquote", "html")
def proc_tag_blockquote_to_html(tag: Tag) -> str:
    """Render a blockquote by prefixing each content line with STRIPE."""
    body = nodes_process(tag.children, "html").strip()
    return "\n".join(f"{STRIPE} {line}" for line in body.split("\n"))
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("ul", "html")
def proc_tag_ul_to_html(tag: Tag) -> str:
    """Render an unordered list, one BULLET-prefixed line per item.

    Line breaks inside an item are followed by one space of indentation and
    trailing whitespace of each item is removed.
    """
    # Whitespace-only text nodes between <li> elements are markup layout,
    # not items; rendering them would produce empty bullets.
    items = (
        el for el in tag.children if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        BULLET + " " + node_process(el, "html").replace("\n", "\n ").rstrip()
        for el in items
    )
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("ol", "html")
def proc_tag_li_to_html(tag: Tag) -> str:
    """Render an ordered list as ``1. ...`` lines, numbered from 1.

    Line breaks inside an item are followed by one space of indentation and
    trailing whitespace of each item is removed.
    """
    # Whitespace-only text nodes between <li> elements are markup layout;
    # counting them would yield empty entries and skip numbers.
    items = (
        el for el in tag.children if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "%d. %s" % (i, node_process(el, "html").replace("\n", "\n ").rstrip())
        for i, el in enumerate(items, 1)
    )
|
|
@ -0,0 +1,75 @@
|
||||||
|
from mastoposter.text import (
|
||||||
|
nodes_process,
|
||||||
|
register_converter,
|
||||||
|
register_fmt_converter,
|
||||||
|
node_process,
|
||||||
|
)
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
from bs4.element import Tag
|
||||||
|
from html import escape
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("a", "markdown")
def proc_tag_a_to_markdown(tag: Tag):
    """Render an anchor as a Markdown inline link ``[text](url)``.

    A missing ``href`` becomes ``#``.  The URL is emitted verbatim: HTML
    entity escaping does not apply to Markdown and would turn ``&`` in a
    query string into ``&amp;``, corrupting the link target.
    """
    return "[%s](%s)" % (
        nodes_process(tag.children, "markdown"),
        tag.attrs.get("href", "#"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Template-driven converters for the "markdown" output: (template, tag name).
for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("*%s*", "i"),
    ("*%s*", "em"),
    ("**%s**", "b"),
    ("**%s**", "strong"),
    ("~~%s~~", "s"),
    ("~~%s~~", "del"),
    ("__%s__", "u"),
    ("__%s__", "ins"),
    ("\n", "br"),
    ("\n```%s```\n", "pre"),
    ("`%s`", "code"),
):
    register_fmt_converter(_template, _tag_name, "markdown")
del _template, _tag_name
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("span", "markdown")
def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]:
    """Wrap spans carrying the ``_mfm_blur_`` class in ``||...||``.

    Returns ``None`` for every other span.
    """
    classes = tag.attrs.get("class", "")
    if "_mfm_blur_" not in classes:
        return None
    inner = nodes_process(tag.children, "markdown")
    return "||%s||" % inner
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("blockquote", "markdown")
def proc_tag_blockquote_to_markdown(tag: Tag) -> str:
    """Render a blockquote by prefixing each content line with ``> ``."""
    body = nodes_process(tag.children, "markdown").strip()
    quoted = ["> " + line for line in body.split("\n")]
    return "\n".join(quoted)
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("ul", "markdown")
def proc_tag_ul_to_markdown(tag: Tag) -> str:
    """Render an unordered list, one ``* ``-prefixed line per item.

    Line breaks inside an item are followed by one space of indentation and
    trailing whitespace of each item is removed.
    """
    # Whitespace-only text nodes between <li> elements are markup layout,
    # not items; rendering them would produce empty bullets.
    items = (
        el for el in tag.children if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "* " + node_process(el, "markdown").replace("\n", "\n ").rstrip()
        for el in items
    )
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("ol", "markdown")
def proc_tag_li_to_markdown(tag: Tag) -> str:
    """Render an ordered list as ``1. ...`` lines, numbered from 1.

    Line breaks inside an item are followed by one space of indentation and
    trailing whitespace of each item is removed.
    """
    # Whitespace-only text nodes between <li> elements are markup layout;
    # counting them would yield empty entries and skip numbers.
    items = (
        el for el in tag.children if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "%d. %s"
        % (i, node_process(el, "markdown").replace("\n", "\n ").rstrip())
        for i, el in enumerate(items, 1)
    )
|
|
@ -0,0 +1,75 @@
|
||||||
|
"""
|
||||||
|
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||||
|
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from mastoposter.text import (
|
||||||
|
nodes_process,
|
||||||
|
register_converter,
|
||||||
|
register_fmt_converter,
|
||||||
|
node_process,
|
||||||
|
STRIPE,
|
||||||
|
BULLET,
|
||||||
|
)
|
||||||
|
|
||||||
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("a", "plain")
def proc_tag_a_to_plain(tag: Tag):
    """Render an anchor as ``text (url)``; a missing ``href`` becomes ``#``."""
    label = nodes_process(tag.children, "plain")
    target = tag.attrs.get("href", "#")
    return f"{label} ({target})"
|
||||||
|
|
||||||
|
|
||||||
|
# Template-driven converters for the "plain" output: (template, tag name).
for _template, _tag_name in (("%s\n\n", "p"), ("\n", "br")):
    register_fmt_converter(_template, _tag_name, "plain")
del _template, _tag_name
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("blockquote", "plain")
def proc_tag_blockquote_to_plain(tag: Tag) -> str:
    """Render a blockquote by prefixing each content line with STRIPE."""
    body = nodes_process(tag.children, "plain").strip()
    return "\n".join(f"{STRIPE} {line}" for line in body.split("\n"))
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("ul", "plain")
def proc_tag_ul_to_plain(tag: Tag) -> str:
    """Render an unordered list, one BULLET-prefixed line per item.

    Line breaks inside an item are followed by one space of indentation and
    trailing whitespace of each item is removed.
    """
    # Whitespace-only text nodes between <li> elements are markup layout,
    # not items; rendering them would produce empty bullets.
    items = (
        el for el in tag.children if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        BULLET + " " + node_process(el, "plain").replace("\n", "\n ").rstrip()
        for el in items
    )
|
||||||
|
|
||||||
|
|
||||||
|
@register_converter("ol", "plain")
def proc_tag_li_to_plain(tag: Tag) -> str:
    """Render an ordered list as ``1. ...`` lines, numbered from 1.

    Line breaks inside an item are followed by one space of indentation and
    trailing whitespace of each item is removed.
    """
    # Whitespace-only text nodes between <li> elements are markup layout;
    # counting them would yield empty entries and skip numbers.
    items = (
        el for el in tag.children if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "%d. %s" % (i, node_process(el, "plain").replace("\n", "\n ").rstrip())
        for i, el in enumerate(items, 1)
    )
|
|
@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext
|
from mastoposter.text import node_process
|
||||||
|
|
||||||
|
|
||||||
def _date(val: str) -> datetime:
|
def _date(val: str) -> datetime:
|
||||||
|
@ -355,18 +355,18 @@ class Status:
|
||||||
|
|
||||||
@property
def content_flathtml(self) -> str:
    """Status content converted to the "html" flavour, right-stripped."""
    soup = BeautifulSoup(self.content, features="lxml")
    return node_process(soup, "html").rstrip()
|
||||||
|
|
||||||
@property
def content_markdown(self) -> str:
    """Status content converted to the "markdown" flavour, right-stripped."""
    soup = BeautifulSoup(self.content, features="lxml")
    return node_process(soup, "markdown").rstrip()
|
||||||
|
|
||||||
@property
def content_plaintext(self) -> str:
    """Status content converted to the "plain" flavour, right-stripped."""
    soup = BeautifulSoup(self.content, features="lxml")
    return node_process(soup, "plain").rstrip()
|
||||||
|
|
|
@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
GNU General Public License for more details.
|
GNU General Public License for more details.
|
||||||
"""
|
"""
|
||||||
from configparser import ConfigParser
|
from configparser import ConfigParser
|
||||||
from html import escape
|
|
||||||
from logging import getLogger
|
from logging import getLogger
|
||||||
from typing import Callable, Dict
|
|
||||||
from bs4.element import Tag, PageElement
|
|
||||||
|
|
||||||
logger = getLogger("utils")
|
logger = getLogger("utils")
|
||||||
|
|
||||||
|
|
||||||
def md_escape(text: str) -> str:
    """Backslash-escape the Markdown control characters found in *text*.

    The escaped characters are the backslash itself, ``*``, ``[``, ``]``,
    ``_``, ``~``, ``|`` and the backtick.
    """
    special = "\\*[]_~|`"
    # One translation pass; every special character becomes "\" + itself.
    return text.translate({ord(ch): "\\" + ch for ch in special})
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_config(conf: ConfigParser):
|
def normalize_config(conf: ConfigParser):
|
||||||
for section in conf.sections():
|
for section in conf.sections():
|
||||||
_remove = set()
|
_remove = set()
|
||||||
|
@ -49,238 +33,3 @@ def normalize_config(conf: ConfigParser):
|
||||||
for k in _remove:
|
for k in _remove:
|
||||||
logger.info("removing key %r.%r", section, k)
|
logger.info("removing key %r.%r", section, k)
|
||||||
del conf[section][k]
|
del conf[section][k]
|
||||||
|
|
||||||
|
|
||||||
def node_to_html(el: PageElement) -> str:
    """Convert a parsed node to a flat HTML string.

    ``strong``/``em``/``del``/``ins`` are rendered like ``b``/``i``/``s``/``u``.
    Unknown tags are replaced by their rendered children and non-tag nodes
    are HTML-escaped.  An anchor without ``href`` links to ``#`` instead of
    raising ``KeyError``.
    """
    if not isinstance(el, Tag):
        return escape(str(el))

    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    if name == "br":
        return "\n"

    # NOTE may fail on nested lists
    if name in ("ul", "ol"):
        items = [
            node_to_html(li).replace("\n", "\n ").rstrip()
            for li in el.children
        ]
        if name == "ul":
            lines = [" \u2022 " + item for item in items]
        else:
            lines = ["%d. %s" % (i, item) for i, item in enumerate(items, 1)]
        return "\n" + "\n".join(lines) + "\n"

    inner = "".join(map(node_to_html, el.children))
    if name == "a":
        return '<a href="{}">{}</a>'.format(
            escape(el.attrs.get("href", "#")), inner
        )
    if name == "p":
        return inner + "\n\n"
    if name in ("i", "b", "s", "u", "code"):
        return "<%s>%s</%s>" % (name, inner, name)
    if name == "pre":
        return "\n<pre>%s</pre>\n" % inner
    if name == "span" and "_mfm_blur_" in el.attrs.get("class", ""):
        return '<span class="tg-spoiler">%s</span>' % inner
    if name == "blockquote":
        return "\n" + "\n".join("| %s" % part for part in inner.split("\n"))
    return inner
|
|
||||||
|
|
||||||
|
|
||||||
def node_to_markdown(el: PageElement) -> str:
    """Convert HTML to Markdown (Discord flavor).

    ``strong``/``em``/``del``/``ins`` are rendered like ``b``/``i``/``s``/``u``.
    Unknown tags are replaced by their rendered children and non-tag nodes
    are escaped with ``md_escape``.  An anchor without ``href`` links to
    ``#`` instead of raising ``KeyError``.
    """
    if not isinstance(el, Tag):
        return md_escape(str(el))

    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    if name == "br":
        return "\n"

    # NOTE may fail on nested lists
    if name in ("ul", "ol"):
        items = [
            node_to_markdown(li).replace("\n", "\n ").rstrip()
            for li in el.children
        ]
        if name == "ul":
            lines = ["* " + item for item in items]
        else:
            lines = ["%d. %s" % (i, item) for i, item in enumerate(items, 1)]
        return "\n%s\n" % "\n".join(lines)

    inner = "".join(map(node_to_markdown, el.children))
    if name == "a":
        # Text nodes were already escaped when rendered; escaping the link
        # text again would double the backslashes and break nested markup.
        return "[{}]({})".format(inner, el.attrs.get("href", "#"))
    if name == "p":
        return inner + "\n\n"

    wrappers = {"i": "*%s*", "b": "**%s**", "s": "~~%s~~", "u": "__%s__"}
    if name in wrappers:
        return wrappers[name] % inner
    if name == "pre":
        return "\n```%s```\n" % inner
    if name == "code":
        return "`%s`" % inner
    if name == "span" and "_mfm_blur_" in el.attrs.get("class", ""):
        return "||%s||" % inner
    if name == "blockquote":
        return "\n%s" % "\n".join("> %s" % part for part in inner.split("\n"))
    return inner
|
|
||||||
|
|
||||||
|
|
||||||
def node_to_plaintext(el: PageElement) -> str:
    """Convert a parsed node to plain text.

    Links become ``text (url)``, paragraphs end with a blank line, blockquote
    lines are prefixed with U+258D and lists are bulleted or numbered.
    Unknown tags are replaced by their rendered children.  An anchor without
    ``href`` uses ``#`` instead of raising ``KeyError``.
    """
    if not isinstance(el, Tag):
        return str(el)

    if el.name == "br":
        return "\n"

    if el.name in ("ol", "ul"):
        items = (
            node_to_plaintext(li).replace("\n", "\n ").strip()
            for li in el.children
        )
        if el.name == "ul":
            lines = (" \u2022 %s" % item for item in items)
        else:
            lines = ("%d. %s" % (i, item) for i, item in enumerate(items, 1))
        return "\n%s\n" % "\n".join(lines)

    inner = "".join(map(node_to_plaintext, el.children))
    if el.name == "a":
        return "%s (%s)" % (inner, el.attrs.get("href", "#"))
    if el.name == "p":
        return inner + "\n\n"
    if el.name == "blockquote":
        return "\n".join("\u258d%s" % part for part in inner.split("\n"))
    return inner
|
|
||||||
|
|
Loading…
Reference in New Issue