Reworked node_to_* functions

This commit is contained in:
Casey 2023-05-13 08:45:46 +03:00
parent 1b3a0bbe0b
commit 32d7784276
Signed by: hkc
GPG Key ID: F0F6CFE11CDB0960
6 changed files with 344 additions and 256 deletions

View File

@ -0,0 +1,93 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from typing import Callable, Iterable, Literal, Optional
from bs4.element import Tag, PageElement
from html import escape
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
BULLET = "\u2022"
STRIPE = "\u258d"
def md_escape(text: str) -> str:
    """Backslash-escape the characters that are special in Markdown.

    The escaped characters are the backslash, ``*``, ``[``, ``]``, ``_``,
    ``~``, ``|`` and the backtick; everything else is passed through.
    """
    special = "\\*[]_~|`"
    # Single pass over the input: each special character gains a leading
    # backslash, so a backslash that is already present gets doubled.
    return text.translate({ord(ch): "\\" + ch for ch in special})
# Registry of converters, keyed by (output type, tag name). Each key maps to
# the converters registered for it; a converter receives the Tag and returns
# the rendered string, or None when it declines to handle that tag.
node_processors: dict[
    tuple[VALID_OUTPUT_TYPES, str], list[Callable[[Tag], Optional[str]]]
] = dict()
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Return a decorator that registers a converter for ``tag``.

    Args:
        tag: Name of the HTML tag the converter handles (e.g. ``"a"``).
        output_type: Output format the converter produces.

    Returns:
        A decorator that appends the decorated function to the list stored
        under ``node_processors[output_type, tag]`` and returns the function
        unchanged.
    """

    def decorate(function):
        # The registry is a plain dict, so the list for this key must be
        # created on first registration; indexing it directly would raise
        # KeyError.
        node_processors.setdefault((output_type, tag), []).append(function)
        return function

    return decorate
def register_fmt_converter(
    format: str,
    tag: str,
    output_type: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
):
    """Register a converter for ``tag`` that is driven by a format string.

    When ``format`` contains ``%s`` the placeholder receives the rendered
    children of the tag (joined with ``separator``); otherwise ``format``
    is emitted verbatim.
    """
    wraps_children = "%s" in format

    def render(el: Tag) -> str:
        if not wraps_children:
            return format
        rendered = nodes_process(el.children, output_type, separator)
        return format % rendered

    decorator = register_converter(tag, output_type)
    decorator(render)
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Render a single parsed node in the requested output format.

    For a tag, the converters registered for ``(type_, el.name)`` are tried
    in registration order and the first truthy result wins. A tag that has
    no converter, or whose converters all decline, contributes only its
    rendered children.

    Any other node is rendered from ``str(el)``, escaped for the target
    format: HTML-escaped for ``"html"``, Markdown-escaped for
    ``"markdown"`` and left untouched for ``"plain"``.

    Args:
        el: Node to render.
        type_: Output format.

    Returns:
        The rendered text.
    """
    if isinstance(el, Tag):
        # .get() keeps tags without any registered converter from raising
        # KeyError.
        for func in node_processors.get((type_, el.name), ()):
            result = func(el)  # XXX: could use walrus, but it's py3.8+ only
            if result:
                return result
        # Unhandled tag: drop the tag itself but keep what is inside it,
        # instead of emitting its raw markup as escaped text.
        return nodes_process(el.children, type_)
    text = str(el)
    if type_ == "html":
        return escape(text)
    if type_ == "markdown":
        return md_escape(text)
    return text
def nodes_process(
    els: Iterable[PageElement],
    type_: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
) -> str:
    """Render each element of ``els`` and join the results.

    Every element goes through ``node_process`` with ``type_``; the
    rendered pieces are joined with ``separator``.
    """
    rendered = (node_process(node, type_) for node in els)
    return separator.join(rendered)
# Names exported by ``from mastoposter.text import *``.
__all__ = ["node_process", "nodes_process", "md_escape", "BULLET", "STRIPE"]
# The converter modules import the registration helpers from this package,
# so they are imported here, after those helpers are defined. Importing
# them is what runs their register_* calls.
import mastoposter.text.html # noqa F401
import mastoposter.text.markdown # noqa F401
import mastoposter.text.plain # noqa F401

93
mastoposter/text/html.py Normal file
View File

@ -0,0 +1,93 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from mastoposter.text import (
nodes_process,
register_converter,
register_fmt_converter,
node_process,
STRIPE,
BULLET,
)
from typing import Optional
from bs4.element import Tag
from html import escape
@register_converter("a", "html")
def proc_tag_a_to_html(tag: Tag):
    """Render ``<a>`` as an HTML link; ``#`` is used when ``href`` is absent."""
    href = escape(tag.attrs.get("href", "#"))
    label = nodes_process(tag.children, "html")
    return '<a href="%s">%s</a>' % (href, label)
# Tags whose HTML rendering is a fixed template around the rendered children
# (or a fixed string when the template has no "%s").
for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("<i>%s</i>", "i"),
    ("<i>%s</i>", "em"),
    ("<b>%s</b>", "b"),
    ("<b>%s</b>", "strong"),
    ("<s>%s</s>", "s"),
    ("<s>%s</s>", "del"),
    ("<u>%s</u>", "u"),
    ("<u>%s</u>", "ins"),
    ("\n", "br"),
    ("\n<pre>%s</pre>\n", "pre"),
    ("<code>%s</code>", "code"),
):
    register_fmt_converter(_template, _tag_name, "html")
@register_converter("span", "html")
def proc_tag_span_to_html(tag: Tag) -> Optional[str]:
    """Render a ``_mfm_blur_`` span as a ``tg-spoiler`` span.

    Returns None for any other span so the caller can fall through.
    """
    classes = tag.attrs.get("class", "")
    if "_mfm_blur_" not in classes:
        return None
    inner = nodes_process(tag.children, "html")
    return '<span class="tg-spoiler">%s</span>' % inner
@register_converter("blockquote", "html")
def proc_tag_blockquote_to_html(tag: Tag) -> str:
    """Render a blockquote with every line prefixed by STRIPE and a space."""
    quoted = nodes_process(tag.children, "html").strip()
    return "\n".join(STRIPE + " " + line for line in quoted.split("\n"))
@register_converter("ul", "html")
def proc_tag_ul_to_html(tag: Tag) -> str:
    """Render an unordered list, one BULLET-prefixed entry per child."""
    entries = []
    for child in tag.children:
        # Continuation lines of an entry get a one-space indent.
        body = node_process(child, "html").replace("\n", "\n ").rstrip()
        entries.append(BULLET + " " + body)
    return "\n" + "\n".join(entries)
# Registered for "ol": the body numbers the children of the tag it receives,
# which is ordered-list behavior. Registering it for "li" numbered the
# contents of every list item and left <ol> without a converter.
@register_converter("ol", "html")
def proc_tag_li_to_html(tag: Tag) -> str:
    """Render an ordered list with its children numbered from 1.

    The function keeps its historical name so existing references to it
    continue to work.

    Args:
        tag: The ``<ol>`` tag to render.

    Returns:
        A leading newline followed by one ``"N. text"`` entry per child;
        continuation lines of an entry get a one-space indent.
    """
    return "\n" + "\n".join(
        "%d. %s" % (i, node_process(el, "html").replace("\n", "\n ").rstrip())
        for i, el in enumerate(tag.children, 1)
    )

View File

@ -0,0 +1,75 @@
from mastoposter.text import (
nodes_process,
register_converter,
register_fmt_converter,
node_process,
)
from typing import Optional
from bs4.element import Tag
from html import escape
@register_converter("a", "markdown")
def proc_tag_a_to_markdown(tag: Tag):
    """Render ``<a>`` as a Markdown link ``[text](href)``.

    Args:
        tag: The anchor tag to render.

    Returns:
        The link, with ``#`` as the target when ``href`` is absent.
    """
    # The output is Markdown, not HTML, so the URL is emitted as is:
    # HTML-escaping it would turn "&" in query strings into "&amp;".
    return "[%s](%s)" % (
        nodes_process(tag.children, "markdown"),
        tag.attrs.get("href", "#"),
    )
# Tags whose Markdown rendering is a fixed template around the rendered
# children (or a fixed string when the template has no "%s").
for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("*%s*", "i"),
    ("*%s*", "em"),
    ("**%s**", "b"),
    ("**%s**", "strong"),
    ("~~%s~~", "s"),
    ("~~%s~~", "del"),
    ("__%s__", "u"),
    ("__%s__", "ins"),
    ("\n", "br"),
    ("\n```%s```\n", "pre"),
    ("`%s`", "code"),
):
    register_fmt_converter(_template, _tag_name, "markdown")
@register_converter("span", "markdown")
def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]:
    """Render a ``_mfm_blur_`` span wrapped in ``||``.

    Returns None for any other span so the caller can fall through.
    """
    classes = tag.attrs.get("class", "")
    if "_mfm_blur_" not in classes:
        return None
    inner = nodes_process(tag.children, "markdown")
    return "||%s||" % inner
@register_converter("blockquote", "markdown")
def proc_tag_blockquote_to_markdown(tag: Tag) -> str:
    """Render a blockquote with every line prefixed by ``> ``."""
    quoted = nodes_process(tag.children, "markdown").strip()
    return "\n".join("> " + line for line in quoted.split("\n"))
@register_converter("ul", "markdown")
def proc_tag_ul_to_markdown(tag: Tag) -> str:
    """Render an unordered list, one ``* ``-prefixed entry per child."""
    entries = []
    for child in tag.children:
        # Continuation lines of an entry get a one-space indent.
        body = node_process(child, "markdown").replace("\n", "\n ").rstrip()
        entries.append("* " + body)
    return "\n" + "\n".join(entries)
# Registered for "ol": the body numbers the children of the tag it receives,
# which is ordered-list behavior. Registering it for "li" numbered the
# contents of every list item and left <ol> without a converter.
@register_converter("ol", "markdown")
def proc_tag_li_to_markdown(tag: Tag) -> str:
    """Render an ordered list with its children numbered from 1.

    The function keeps its historical name so existing references to it
    continue to work.

    Args:
        tag: The ``<ol>`` tag to render.

    Returns:
        A leading newline followed by one ``"N. text"`` entry per child;
        continuation lines of an entry get a one-space indent.
    """
    return "\n" + "\n".join(
        "%d. %s"
        % (i, node_process(el, "markdown").replace("\n", "\n ").rstrip())
        for i, el in enumerate(tag.children, 1)
    )

76
mastoposter/text/plain.py Normal file
View File

@ -0,0 +1,76 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from mastoposter.text import (
nodes_process,
register_converter,
register_fmt_converter,
node_process,
STRIPE,
BULLET,
)
from bs4.element import Tag
from html import escape
@register_converter("a", "plain")
def proc_tag_a_to_plain(tag: Tag):
    """Render ``<a>`` as ``text (href)``.

    Args:
        tag: The anchor tag to render.

    Returns:
        The link text followed by the target in parentheses, with ``#`` as
        the target when ``href`` is absent.
    """
    # Plain-text output must carry the URL verbatim: HTML-escaping it would
    # turn "&" in query strings into "&amp;".
    return "%s (%s)" % (
        nodes_process(tag.children, "plain"),
        tag.attrs.get("href", "#"),
    )
# Paragraphs end with a blank line; line breaks become a newline.
register_fmt_converter("%s\n\n", tag="p", output_type="plain")
register_fmt_converter("\n", tag="br", output_type="plain")
@register_converter("blockquote", "plain")
def proc_tag_blockquote_to_plain(tag: Tag) -> str:
    """Render a blockquote with every line prefixed by STRIPE and a space."""
    quoted = nodes_process(tag.children, "plain").strip()
    return "\n".join(STRIPE + " " + line for line in quoted.split("\n"))
@register_converter("ul", "plain")
def proc_tag_ul_to_plain(tag: Tag) -> str:
    """Render an unordered list, one BULLET-prefixed entry per child."""
    entries = []
    for child in tag.children:
        # Continuation lines of an entry get a one-space indent.
        body = node_process(child, "plain").replace("\n", "\n ").rstrip()
        entries.append(BULLET + " " + body)
    return "\n" + "\n".join(entries)
# Registered for "ol": the body numbers the children of the tag it receives,
# which is ordered-list behavior. Registering it for "li" numbered the
# contents of every list item and left <ol> without a converter.
@register_converter("ol", "plain")
def proc_tag_li_to_plain(tag: Tag) -> str:
    """Render an ordered list with its children numbered from 1.

    The function keeps its historical name so existing references to it
    continue to work.

    Args:
        tag: The ``<ol>`` tag to render.

    Returns:
        A leading newline followed by one ``"N. text"`` entry per child;
        continuation lines of an entry get a one-space indent.
    """
    return "\n" + "\n".join(
        "%d. %s" % (i, node_process(el, "plain").replace("\n", "\n ").rstrip())
        for i, el in enumerate(tag.children, 1)
    )

View File

@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext from mastoposter.text import node_process
def _date(val: str) -> datetime: def _date(val: str) -> datetime:
@ -355,18 +355,18 @@ class Status:
@property @property
def content_flathtml(self) -> str: def content_flathtml(self) -> str:
return node_to_html( return node_process(
BeautifulSoup(self.content, features="lxml") BeautifulSoup(self.content, features="lxml"), "html"
).rstrip() ).rstrip()
@property @property
def content_markdown(self) -> str: def content_markdown(self) -> str:
return node_to_markdown( return node_process(
BeautifulSoup(self.content, features="lxml") BeautifulSoup(self.content, features="lxml"), "markdown"
).rstrip() ).rstrip()
@property @property
def content_plaintext(self) -> str: def content_plaintext(self) -> str:
return node_to_plaintext( return node_process(
BeautifulSoup(self.content, features="lxml") BeautifulSoup(self.content, features="lxml"), "plain"
).rstrip() ).rstrip()

View File

@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
""" """
from configparser import ConfigParser from configparser import ConfigParser
from html import escape
from logging import getLogger from logging import getLogger
from typing import Callable, Dict
from bs4.element import Tag, PageElement
logger = getLogger("utils") logger = getLogger("utils")
def md_escape(text: str) -> str:
    """Backslash-escape the characters that are special in Markdown.

    The escaped characters are the backslash, ``*``, ``[``, ``]``, ``_``,
    ``~``, ``|`` and the backtick; everything else is passed through.
    """
    special = "\\*[]_~|`"
    # Single pass over the input: each special character gains a leading
    # backslash, so a backslash that is already present gets doubled.
    return text.translate({ord(ch): "\\" + ch for ch in special})
def normalize_config(conf: ConfigParser): def normalize_config(conf: ConfigParser):
for section in conf.sections(): for section in conf.sections():
_remove = set() _remove = set()
@ -49,236 +33,3 @@ def normalize_config(conf: ConfigParser):
for k in _remove: for k in _remove:
logger.info("removing key %r.%r", section, k) logger.info("removing key %r.%r", section, k)
del conf[section][k] del conf[section][k]
def node_to_html(el: PageElement) -> str:
    """Convert a parsed HTML node into flattened HTML text.

    Known tags are rewritten through a per-tag transform; ``strong``,
    ``em``, ``del`` and ``ins`` reuse the transforms of ``b``, ``i``, ``s``
    and ``u``. Any other tag contributes only its rendered children, and
    non-tag nodes are HTML-escaped.

    Args:
        el: Node to convert.

    Returns:
        The rendered markup.
    """

    def inner(tag: Tag) -> str:
        # Rendered children of ``tag``, concatenated.
        return "".join(map(node_to_html, tag.children))

    def wrap(template: str) -> Callable[[Tag], str]:
        # Transform that substitutes the rendered children into ``template``.
        return lambda tag: template % inner(tag)

    def link(tag: Tag) -> str:
        # An anchor without href falls back to "#" instead of raising
        # KeyError.
        return '<a href="{}">{}</a>'.format(
            escape(tag.attrs.get("href", "#")), inner(tag)
        )

    def span(tag: Tag) -> str:
        # Only blurred spans are kept as markup; other spans are unwrapped.
        if "_mfm_blur_" in tag.attrs.get("class", ""):
            return '<span class="tg-spoiler">%s</span>' % inner(tag)
        return inner(tag)

    def blockquote(tag: Tag) -> str:
        return "\n%s" % "\n".join(
            "| %s" % part for part in inner(tag).split("\n")
        )

    # NOTE may fail on nested lists
    def unordered(tag: Tag) -> str:
        entries = (
            " \u2022 " + node_to_html(li).replace("\n", "\n ").rstrip()
            for li in tag.children
        )
        return "\n" + "\n".join(entries) + "\n"

    def ordered(tag: Tag) -> str:
        entries = (
            "%d. %s" % (i, node_to_html(li).replace("\n", "\n ").rstrip())
            for i, li in enumerate(tag.children, 1)
        )
        return "\n" + "\n".join(entries) + "\n"

    TAG_TRANSFORMS: Dict[str, Callable[[Tag], str]] = {
        "a": link,
        "p": wrap("%s\n\n"),
        "i": wrap("<i>%s</i>"),
        "b": wrap("<b>%s</b>"),
        "s": wrap("<s>%s</s>"),
        "u": wrap("<u>%s</u>"),
        "pre": wrap("\n<pre>%s</pre>\n"),
        "code": wrap("<code>%s</code>"),
        "span": span,
        "blockquote": blockquote,
        "br": lambda _: "\n",
        "ul": unordered,
        "ol": ordered,
    }

    TAG_SUBSTITUTIONS: Dict[str, str] = {
        "strong": "b",
        "em": "i",
        "del": "s",
        "ins": "u",
    }

    if not isinstance(el, Tag):
        return escape(str(el))
    if el.name in TAG_TRANSFORMS:
        return TAG_TRANSFORMS[el.name](el)
    sub = TAG_SUBSTITUTIONS.get(el.name)
    if sub in TAG_TRANSFORMS:
        return TAG_TRANSFORMS[sub](el)
    return inner(el)
def node_to_markdown(el: PageElement) -> str:
    """Convert a parsed HTML node into Markdown text.

    Known tags are rewritten through a per-tag transform; ``strong``,
    ``em``, ``del`` and ``ins`` reuse the transforms of ``b``, ``i``, ``s``
    and ``u``. Any other tag contributes only its rendered children, and
    non-tag nodes are escaped with ``md_escape``.

    Args:
        el: Node to convert.

    Returns:
        The rendered Markdown.
    """

    def inner(tag: Tag) -> str:
        # Rendered children of ``tag``, concatenated.
        return "".join(map(node_to_markdown, tag.children))

    def wrap(template: str) -> Callable[[Tag], str]:
        # Transform that substitutes the rendered children into ``template``.
        return lambda tag: template % inner(tag)

    def link(tag: Tag) -> str:
        # The children are already escaped where they are rendered as text
        # nodes; escaping the joined result again would double every
        # backslash and break nested formatting. An anchor without href
        # falls back to "#" instead of raising KeyError.
        return "[{}]({})".format(inner(tag), tag.attrs.get("href", "#"))

    def span(tag: Tag) -> str:
        # Only blurred spans get spoiler markers; other spans are unwrapped.
        if "_mfm_blur_" in tag.attrs.get("class", ""):
            return "||%s||" % inner(tag)
        return inner(tag)

    def blockquote(tag: Tag) -> str:
        return "\n%s" % "\n".join(
            "\u258d%s" % part for part in inner(tag).split("\n")
        )

    # NOTE may fail on nested lists
    def unordered(tag: Tag) -> str:
        entries = (
            " \u2022 " + node_to_markdown(li).replace("\n", "\n ").rstrip()
            for li in tag.children
        )
        return "\n%s\n" % "\n".join(entries)

    def ordered(tag: Tag) -> str:
        entries = (
            "%d. %s" % (i, node_to_markdown(li).replace("\n", "\n ").rstrip())
            for i, li in enumerate(tag.children, 1)
        )
        return "\n%s\n" % "\n".join(entries)

    TAG_TRANSFORMS: Dict[str, Callable[[Tag], str]] = {
        "a": link,
        "p": wrap("%s\n\n"),
        "i": wrap("_%s_"),
        "b": wrap("*%s*"),
        "s": wrap("~%s~"),
        "u": wrap("__%s__"),
        "pre": wrap("\n``%s``\n"),
        "code": wrap("`%s`"),
        "span": span,
        "blockquote": blockquote,
        "br": lambda _: "\n",
        "ul": unordered,
        "ol": ordered,
    }

    TAG_SUBSTITUTIONS: Dict[str, str] = {
        "strong": "b",
        "em": "i",
        "del": "s",
        "ins": "u",
    }

    if not isinstance(el, Tag):
        return md_escape(str(el))
    if el.name in TAG_TRANSFORMS:
        return TAG_TRANSFORMS[el.name](el)
    sub = TAG_SUBSTITUTIONS.get(el.name)
    if sub in TAG_TRANSFORMS:
        return TAG_TRANSFORMS[sub](el)
    return inner(el)
def node_to_plaintext(el: PageElement) -> str:
    """Convert a parsed HTML node into plain text.

    Links become ``text (href)``, paragraphs end with a blank line, ``br``
    becomes a newline, blockquote lines get a ``\\u258d`` prefix and lists
    are rendered with bullets (``ul``) or numbers (``ol``). Any other tag
    contributes only its rendered children; non-tag nodes are returned as
    ``str(el)``.

    Args:
        el: Node to convert.

    Returns:
        The rendered text.
    """
    if not isinstance(el, Tag):
        return str(el)

    name = el.name
    if name == "br":
        return "\n"

    if name in ("ol", "ul"):
        # Continuation lines of an entry get a one-space indent.
        items = (
            node_to_plaintext(child).replace("\n", "\n ").strip()
            for child in el.children
        )
        if name == "ul":
            entries = [" \u2022 %s" % item for item in items]
        else:
            entries = [
                "%d. %s" % (i, item) for i, item in enumerate(items, 1)
            ]
        return "\n%s\n" % "\n".join(entries)

    text = "".join(map(node_to_plaintext, el.children))
    if name == "a":
        # An anchor without href falls back to "#" instead of raising
        # KeyError.
        return "%s (%s)" % (text, el.attrs.get("href", "#"))
    if name == "p":
        return text + "\n\n"
    if name == "blockquote":
        return "\n".join("\u258d%s" % part for part in text.split("\n"))
    return text