Merge branch 'html2everything'

This commit is contained in:
Casey 2023-05-13 09:58:55 +03:00
commit 9889ca251a
Signed by: hkc
GPG Key ID: F0F6CFE11CDB0960
7 changed files with 398 additions and 258 deletions

View File

@ -0,0 +1,105 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from typing import Callable, Iterable, Literal, Optional
from bs4.element import Tag, PageElement
# The output formats a converter can be registered for.
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
# U+2022 BULLET: marker placed in front of unordered list items.
BULLET = "\u2022"
# U+258D LEFT THREE EIGHTHS BLOCK: bar placed in front of quoted lines.
STRIPE = "\u258d"
def md_escape(text: str) -> str:
    """Backslash-escape the Markdown control characters in *text*.

    A backslash is put in front of every backslash, ``*``, ``[``, ``]``,
    ``_``, ``~``, ``|`` and backtick; all other characters pass through.
    """
    # One pass over the input: each special character maps to itself with a
    # backslash prefix, so inserted backslashes are never escaped again.
    table = str.maketrans({char: "\\" + char for char in "\\*[]_~|`"})
    return text.translate(table)
# Converter registry: (output type, tag name) -> list of converter functions.
# Text-node converters are stored under the pseudo tag name ":text:".
node_processors: dict[
    tuple[VALID_OUTPUT_TYPES, str],
    list[Callable[[PageElement], Optional[str]]],
] = {}
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Return a decorator that registers a converter for *tag*.

    The decorated function is appended to the list kept in
    ``node_processors`` under ``(output_type, tag)`` and is handed back
    unchanged.
    """

    def decorate(function):
        handlers = node_processors.setdefault((output_type, tag), [])
        handlers.append(function)
        return function

    return decorate
def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
    """Return a decorator installing the text-node converter for *output_type*.

    The ``":text:"`` slot holds exactly one function: registering again
    replaces the previous converter instead of adding to a list.
    """

    def decorate(function):
        key = (output_type, ":text:")
        node_processors[key] = [function]
        return function

    return decorate
def register_fmt_converter(
    format: str,
    tag: str,
    output_type: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
):
    """Register a converter for *tag* driven by a ``%``-format template.

    When *format* contains ``%s`` the placeholder receives the element's
    converted children joined with *separator*; otherwise *format* is
    emitted verbatim and the children are ignored.
    """
    takes_children = "%s" in format

    def fmt_tag(el: Tag) -> str:
        if not takes_children:
            return format
        return format % nodes_process(el.children, output_type, separator)

    register_converter(tag, output_type)(fmt_tag)
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Convert a single parsed node into *type_* text.

    A tag is offered to every converter registered for ``(type_, name)`` in
    order and the first truthy result is returned.  With no converter, or
    only falsy results, the tag's children are converted and concatenated.
    Any other node goes through the ``":text:"`` converter when one is
    registered, falling back to ``str(el)`` if there is none or its result
    is falsy.
    """
    if not isinstance(el, Tag):
        text_converters = node_processors.get((type_, ":text:"))
        if text_converters is None:
            return str(el)
        return text_converters[0](el) or str(el)

    for convert in node_processors.get((type_, el.name), ()):
        converted = convert(el)
        if converted:
            return converted
    return nodes_process(el.children, type_)
def nodes_process(
    els: Iterable[PageElement],
    type_: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
) -> str:
    """Convert every node in *els* and join the pieces with *separator*."""
    converted = [node_process(node, type_) for node in els]
    return separator.join(converted)
# Names exported by a star-import of this module.
__all__ = [
    "node_process",
    "nodes_process",
    "md_escape",
    "BULLET",
    "STRIPE",
]
import mastoposter.text.html # noqa F401
import mastoposter.text.markdown # noqa F401
import mastoposter.text.plain # noqa F401

View File

@ -0,0 +1,36 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from mastoposter.text import node_process, VALID_OUTPUT_TYPES
from argparse import ArgumentParser, FileType
from typing import get_args as T_get_args
from bs4 import BeautifulSoup
import sys
# Command-line entry point: read HTML from the given file (or from standard
# input when no file is named) and print it converted to the chosen type.
parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")
parser.add_argument(
    "--type",
    "-t",
    choices=T_get_args(VALID_OUTPUT_TYPES),
    default=T_get_args(VALID_OUTPUT_TYPES)[0],
    dest="output_type",
)
# nargs="?" makes the positional optional.  Without it argparse always
# demands a value, so the sys.stdin default could never take effect.
parser.add_argument("file", nargs="?", default=sys.stdin, type=FileType("r"))
args = parser.parse_args()

soup = BeautifulSoup(args.file.read(), "lxml")
print(node_process(soup, args.output_type))

100
mastoposter/text/html.py Normal file
View File

@ -0,0 +1,100 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from bs4 import NavigableString
from mastoposter.text import (
nodes_process,
register_converter,
register_fmt_converter,
register_text_node_converter,
node_process,
STRIPE,
BULLET,
)
from typing import Optional
from bs4.element import Tag
from html import escape
@register_text_node_converter("html")
def proc_text_node_to_html(txt: NavigableString) -> str:
    """HTML-escape a text node, keeping its whitespace intact.

    The text is deliberately not stripped: whitespace at the edges of a
    text node is what separates it from neighbouring inline elements, so
    stripping would turn ``Hello <b>world</b>`` into ``Hello<b>world</b>``.
    """
    return escape(txt)
@register_converter("a", "html")
def proc_tag_a_to_html(tag: Tag):
    """Render an ``<a>`` element as an HTML link.

    Only ``href`` is carried over (``"#"`` when absent), HTML-escaped; the
    link body is the element's children converted to HTML.
    """
    href = escape(tag.attrs.get("href", "#"))
    label = nodes_process(tag.children, "html")
    return '<a href="%s">%s</a>' % (href, label)
# Tags whose HTML output is a fixed template around the converted children
# (or a fixed string, for <br>), listed as (template, tag name).
for _fmt, _name in (
    ("%s\n\n", "p"),
    ("<i>%s</i>", "i"),
    ("<i>%s</i>", "em"),
    ("<b>%s</b>", "b"),
    ("<b>%s</b>", "strong"),
    ("<s>%s</s>", "s"),
    ("<s>%s</s>", "del"),
    ("<u>%s</u>", "u"),
    ("<u>%s</u>", "ins"),
    ("\n", "br"),
    ("\n<pre>%s</pre>\n", "pre"),
    ("<code>%s</code>", "code"),
):
    register_fmt_converter(_fmt, _name, "html")
# Do not leave the loop variables behind in the module namespace.
del _fmt, _name
@register_converter("span", "html")
def proc_tag_span_to_html(tag: Tag) -> Optional[str]:
    """Render ``_mfm_blur_`` spans as ``tg-spoiler`` spans.

    Returns ``None`` for every other ``<span>``.
    """
    if "_mfm_blur_" not in tag.attrs.get("class", ""):
        return None
    hidden = nodes_process(tag.children, "html")
    return '<span class="tg-spoiler">%s</span>' % hidden
@register_converter("blockquote", "html")
def proc_tag_blockquote_to_html(tag: Tag) -> str:
    """Render a ``<blockquote>`` with ``STRIPE`` and a space before each line."""
    quoted = nodes_process(tag.children, "html").strip()
    return "\n".join(f"{STRIPE} {line}" for line in quoted.split("\n"))
@register_converter("ul", "html")
def proc_tag_ul_to_html(tag: Tag) -> str:
    """Render a ``<ul>`` as one ``BULLET``-prefixed line per item.

    Whitespace-only text nodes between the ``<li>`` elements (present when
    the source markup is indented or contains line breaks) are skipped so
    they do not show up as empty bullets.  Newlines inside an item are
    followed by a space; trailing whitespace of each item is removed.
    """
    items = (
        el
        for el in tag.children
        if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        BULLET + " " + node_process(el, "html").replace("\n", "\n ").rstrip()
        for el in items
    )
@register_converter("ol", "html")
def proc_tag_li_to_html(tag: Tag) -> str:
    """Render an ``<ol>`` as numbered lines (``1.``, ``2.``, ...).

    Despite its name this is the converter registered for ``<ol>``.
    Whitespace-only text nodes between the ``<li>`` elements are skipped;
    otherwise each of them would consume a number and produce an empty
    entry.  Newlines inside an item are followed by a space; trailing
    whitespace of each item is removed.
    """
    items = (
        el
        for el in tag.children
        if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "%d. %s" % (i, node_process(el, "html").replace("\n", "\n ").rstrip())
        for i, el in enumerate(items, 1)
    )

View File

@ -0,0 +1,75 @@
from mastoposter.text import (
nodes_process,
register_converter,
register_fmt_converter,
node_process,
)
from typing import Optional
from bs4.element import Tag
from html import escape
@register_converter("a", "markdown")
def proc_tag_a_to_markdown(tag: Tag):
    """Render an ``<a>`` element as a Markdown link ``[text](url)``.

    The URL is emitted as-is (``"#"`` when the element has no ``href``).
    It must not be HTML-escaped: in Markdown output an ``&`` in a query
    string would come out as ``&amp;`` and the link target would change.
    """
    return "[%s](%s)" % (
        nodes_process(tag.children, "markdown"),
        tag.attrs.get("href", "#"),
    )
# Tags whose Markdown output is a fixed template around the converted
# children (or a fixed string, for <br>), listed as (template, tag name).
for _fmt, _name in (
    ("%s\n\n", "p"),
    ("*%s*", "i"),
    ("*%s*", "em"),
    ("**%s**", "b"),
    ("**%s**", "strong"),
    ("~~%s~~", "s"),
    ("~~%s~~", "del"),
    ("__%s__", "u"),
    ("__%s__", "ins"),
    ("\n", "br"),
    ("\n```%s```\n", "pre"),
    ("`%s`", "code"),
):
    register_fmt_converter(_fmt, _name, "markdown")
# Do not leave the loop variables behind in the module namespace.
del _fmt, _name
@register_converter("span", "markdown")
def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]:
    """Wrap ``_mfm_blur_`` spans in ``||...||``.

    Returns ``None`` for every other ``<span>``.
    """
    if "_mfm_blur_" not in tag.attrs.get("class", ""):
        return None
    hidden = nodes_process(tag.children, "markdown")
    return "||%s||" % hidden
@register_converter("blockquote", "markdown")
def proc_tag_blockquote_to_markdown(tag: Tag) -> str:
    """Render a ``<blockquote>`` with ``"> "`` in front of each line."""
    quoted = nodes_process(tag.children, "markdown").strip()
    return "\n".join(f"> {line}" for line in quoted.split("\n"))
@register_converter("ul", "markdown")
def proc_tag_ul_to_markdown(tag: Tag) -> str:
    """Render a ``<ul>`` as one ``"* "``-prefixed line per item.

    Whitespace-only text nodes between the ``<li>`` elements (present when
    the source markup is indented or contains line breaks) are skipped so
    they do not show up as empty bullets.  Newlines inside an item are
    followed by a space; trailing whitespace of each item is removed.
    """
    items = (
        el
        for el in tag.children
        if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "* " + node_process(el, "markdown").replace("\n", "\n ").rstrip()
        for el in items
    )
@register_converter("ol", "markdown")
def proc_tag_li_to_markdown(tag: Tag) -> str:
    """Render an ``<ol>`` as numbered lines (``1.``, ``2.``, ...).

    Despite its name this is the converter registered for ``<ol>``.
    Whitespace-only text nodes between the ``<li>`` elements are skipped;
    otherwise each of them would consume a number and produce an empty
    entry.  Newlines inside an item are followed by a space; trailing
    whitespace of each item is removed.
    """
    items = (
        el
        for el in tag.children
        if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "%d. %s"
        % (i, node_process(el, "markdown").replace("\n", "\n ").rstrip())
        for i, el in enumerate(items, 1)
    )

75
mastoposter/text/plain.py Normal file
View File

@ -0,0 +1,75 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from mastoposter.text import (
nodes_process,
register_converter,
register_fmt_converter,
node_process,
STRIPE,
BULLET,
)
from bs4.element import Tag
@register_converter("a", "plain")
def proc_tag_a_to_plain(tag: Tag):
    """Render an ``<a>`` element as ``text (url)``.

    ``"#"`` stands in for the URL when the element has no ``href``.
    """
    label = nodes_process(tag.children, "plain")
    target = tag.attrs.get("href", "#")
    return "%s (%s)" % (label, target)
# Paragraphs end with a blank line; <br> becomes a single newline.
register_fmt_converter(format="%s\n\n", tag="p", output_type="plain")
register_fmt_converter(format="\n", tag="br", output_type="plain")
@register_converter("blockquote", "plain")
def proc_tag_blockquote_to_plain(tag: Tag) -> str:
    """Render a ``<blockquote>`` with ``STRIPE`` and a space before each line."""
    quoted = nodes_process(tag.children, "plain").strip()
    return "\n".join(f"{STRIPE} {line}" for line in quoted.split("\n"))
@register_converter("ul", "plain")
def proc_tag_ul_to_plain(tag: Tag) -> str:
    """Render a ``<ul>`` as one ``BULLET``-prefixed line per item.

    Whitespace-only text nodes between the ``<li>`` elements (present when
    the source markup is indented or contains line breaks) are skipped so
    they do not show up as empty bullets.  Newlines inside an item are
    followed by a space; trailing whitespace of each item is removed.
    """
    items = (
        el
        for el in tag.children
        if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        BULLET + " " + node_process(el, "plain").replace("\n", "\n ").rstrip()
        for el in items
    )
@register_converter("ol", "plain")
def proc_tag_li_to_plain(tag: Tag) -> str:
    """Render an ``<ol>`` as numbered lines (``1.``, ``2.``, ...).

    Despite its name this is the converter registered for ``<ol>``.
    Whitespace-only text nodes between the ``<li>`` elements are skipped;
    otherwise each of them would consume a number and produce an empty
    entry.  Newlines inside an item are followed by a space; trailing
    whitespace of each item is removed.
    """
    items = (
        el
        for el in tag.children
        if isinstance(el, Tag) or str(el).strip()
    )
    return "\n" + "\n".join(
        "%d. %s" % (i, node_process(el, "plain").replace("\n", "\n ").rstrip())
        for i, el in enumerate(items, 1)
    )

View File

@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar
from bs4 import BeautifulSoup
from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext
from mastoposter.text import node_process
def _date(val: str) -> datetime:
@ -355,18 +355,18 @@ class Status:
@property
def content_flathtml(self) -> str:
return node_to_html(
BeautifulSoup(self.content, features="lxml")
return node_process(
BeautifulSoup(self.content, features="lxml"), "html"
).rstrip()
@property
def content_markdown(self) -> str:
return node_to_markdown(
BeautifulSoup(self.content, features="lxml")
return node_process(
BeautifulSoup(self.content, features="lxml"), "markdown"
).rstrip()
@property
def content_plaintext(self) -> str:
return node_to_plaintext(
BeautifulSoup(self.content, features="lxml")
return node_process(
BeautifulSoup(self.content, features="lxml"), "plain"
).rstrip()

View File

@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from configparser import ConfigParser
from html import escape
from logging import getLogger
from typing import Callable, Dict
from bs4.element import Tag, PageElement
logger = getLogger("utils")
def md_escape(text: str) -> str:
    """Backslash-escape the Markdown control characters in *text*.

    A backslash is put in front of every backslash, ``*``, ``[``, ``]``,
    ``_``, ``~``, ``|`` and backtick; all other characters pass through.
    """
    # One pass over the input: each special character maps to itself with a
    # backslash prefix, so inserted backslashes are never escaped again.
    table = str.maketrans({char: "\\" + char for char in "\\*[]_~|`"})
    return text.translate(table)
def normalize_config(conf: ConfigParser):
for section in conf.sections():
_remove = set()
@ -49,238 +33,3 @@ def normalize_config(conf: ConfigParser):
for k in _remove:
logger.info("removing key %r.%r", section, k)
del conf[section][k]
def node_to_html(el: PageElement) -> str:
    """Convert a parsed HTML node into a flattened HTML subset.

    Non-tag nodes are HTML-escaped.  ``strong``/``em``/``del``/``ins`` are
    rendered like ``b``/``i``/``s``/``u``.  Tags without a dedicated
    rendering contribute only their converted children.  An ``<a>`` without
    ``href`` raises ``KeyError``.
    """
    if not isinstance(el, Tag):
        return escape(str(el))

    def inner() -> str:
        return "".join(map(node_to_html, el.children))

    def list_item(li) -> str:
        return node_to_html(li).replace("\n", "\n ").rstrip()

    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    # Tags that simply wrap their converted children in a fixed template.
    wrappers = {
        "i": "<i>%s</i>",
        "b": "<b>%s</b>",
        "s": "<s>%s</s>",
        "u": "<u>%s</u>",
        "pre": "\n<pre>%s</pre>\n",
        "code": "<code>%s</code>",
    }
    if name in wrappers:
        return wrappers[name] % inner()

    if name == "a":
        href = escape(el.attrs["href"])
        return '<a href="{}">{}</a>'.format(href, inner())
    if name == "p":
        return inner() + "\n\n"
    if name == "span":
        if "_mfm_blur_" in el.attrs.get("class", ""):
            template = '<span class="tg-spoiler">%s</span>'
        else:
            template = "%s"
        return template % inner()
    if name == "blockquote":
        parts = inner().split("\n")
        return "\n%s" % "\n".join("| %s" % part for part in parts)
    if name == "br":
        return "\n"
    # NOTE may fail on nested lists
    if name == "ul":
        bullets = (" \u2022 " + list_item(li) for li in el.children)
        return "\n" + "\n".join(bullets) + "\n"
    if name == "ol":
        numbered = (
            "%d. %s" % (i, list_item(li))
            for i, li in enumerate(el.children, 1)
        )
        return "\n" + "\n".join(numbered) + "\n"
    return inner()
def node_to_markdown(el: PageElement) -> str:
    """Convert HTML to Markdown (Discord flavor).

    Non-tag nodes are escaped with ``md_escape``.  ``strong``/``em``/
    ``del``/``ins`` are rendered like ``b``/``i``/``s``/``u``.  Tags without
    a dedicated rendering contribute only their converted children.  An
    ``<a>`` without ``href`` raises ``KeyError``.
    """
    if not isinstance(el, Tag):
        return md_escape(str(el))

    def inner() -> str:
        return "".join(map(node_to_markdown, el.children))

    def list_item(li) -> str:
        return node_to_markdown(li).replace("\n", "\n ").rstrip()

    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    # Tags that simply wrap their converted children in a fixed template.
    wrappers = {
        "i": "*%s*",
        "b": "**%s**",
        "s": "~~%s~~",
        "u": "__%s__",
        "pre": "\n```%s```\n",
        "code": "`%s`",
    }
    if name in wrappers:
        return wrappers[name] % inner()

    if name == "a":
        label = md_escape(inner())
        return "[{}]({})".format(label, el.attrs["href"])
    if name == "p":
        return inner() + "\n\n"
    if name == "span":
        if "_mfm_blur_" in el.attrs.get("class", ""):
            template = "||%s||"
        else:
            template = "%s"
        return template % inner()
    if name == "blockquote":
        parts = inner().split("\n")
        return "\n%s" % "\n".join("> %s" % part for part in parts)
    if name == "br":
        return "\n"
    # NOTE may fail on nested lists
    if name == "ul":
        bullets = ("* " + list_item(li) for li in el.children)
        return "\n%s\n" % "\n".join(bullets)
    if name == "ol":
        numbered = (
            "%d. %s" % (i, list_item(li))
            for i, li in enumerate(el.children, 1)
        )
        return "\n%s\n" % "\n".join(numbered)
    return inner()
def node_to_plaintext(el: PageElement) -> str:
    """Convert a parsed HTML node into plain text.

    Non-tag nodes are returned via ``str``.  Links become ``text (url)``,
    paragraphs end with a blank line, quoted lines get a ``\\u258d`` prefix
    and lists become bulleted or numbered lines.  Any other tag contributes
    only its converted children.  An ``<a>`` without ``href`` raises
    ``KeyError``.
    """
    if not isinstance(el, Tag):
        return str(el)

    def inner() -> str:
        return "".join(map(node_to_plaintext, el.children))

    name = el.name
    if name == "a":
        return "%s (%s)" % (inner(), el.attrs["href"])
    if name == "p":
        return inner() + "\n\n"
    if name == "br":
        return "\n"
    if name == "blockquote":
        return "\n".join("\u258d%s" % part for part in inner().split("\n"))
    if name == "ul":
        items = map(node_to_plaintext, el.children)
        bullets = (
            " \u2022 %s" % item.replace("\n", "\n ").strip() for item in items
        )
        return "\n%s\n" % "\n".join(bullets)
    if name == "ol":
        items = map(node_to_plaintext, el.children)
        numbered = (
            "%d. %s" % (i, item.replace("\n", "\n ").strip())
            for i, item in enumerate(items, 1)
        )
        return "\n%s\n" % "\n".join(numbered)
    return inner()