2023-05-08 13:18:44 +03:00
|
|
|
"""
|
|
|
|
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
|
|
|
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 3 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
"""
|
2022-11-01 12:55:23 +03:00
|
|
|
from configparser import ConfigParser
|
2022-08-31 16:19:39 +03:00
|
|
|
from html import escape
|
2022-11-01 14:33:47 +03:00
|
|
|
from logging import getLogger
|
2022-10-11 09:00:37 +03:00
|
|
|
from typing import Callable, Dict
|
2022-08-31 16:19:39 +03:00
|
|
|
from bs4.element import Tag, PageElement
|
|
|
|
|
2022-11-01 14:33:47 +03:00
|
|
|
# Module-level logger; normalize_config() reports key renames through it.
logger = getLogger("utils")
|
|
|
|
|
2022-08-31 16:19:39 +03:00
|
|
|
|
|
|
|
def md_escape(text: str) -> str:
    """Return *text* with Markdown control characters backslash-escaped.

    The escaped characters are the backslash itself, asterisk, both square
    brackets, underscore, tilde, pipe and backtick. Every occurrence is
    prefixed with a single backslash; all other characters pass through
    unchanged.
    """
    specials = "\\*[]_~|`"
    # One translation pass maps each special character to its escaped form,
    # so backslashes inserted by the escaping are never escaped again.
    table = str.maketrans({char: "\\" + char for char in specials})
    return text.translate(table)
|
|
|
|
|
|
|
|
|
2022-11-01 12:55:23 +03:00
|
|
|
def normalize_config(conf: ConfigParser):
    """Rename option keys so that spaces and dashes become underscores.

    Every section returned by ``conf.sections()`` is updated in place: an
    option whose name contains ``" "`` or ``"-"`` is copied to the
    normalized name and the old name is then removed from the section.
    Each move and each removal is logged at INFO level.

    Values are copied as raw text, so interpolation markers (for example an
    escaped ``%%``) survive the move instead of being expanded and then
    rejected when written back.

    Options that a section merely inherits from the default section are
    copied under the normalized name as well; the inherited original cannot
    be removed from the section and is left untouched.
    """
    for section in conf.sections():
        stale = []
        # options() hands back a fresh list, so the section can be modified
        # while we walk over its keys.
        for key in conf.options(section):
            normalized_key = key.replace(" ", "_").replace("-", "_")
            if key == normalized_key:
                continue
            logger.info(
                "moving %r.%r -> %r.%r", section, key, section, normalized_key
            )
            # raw=True: move the stored text verbatim rather than the
            # interpolated result, which may not be a valid value to set.
            conf.set(section, normalized_key, conf.get(section, key, raw=True))
            stale.append(key)
        for key in stale:
            logger.info("removing key %r.%r", section, key)
            # remove_option() returns False (instead of raising KeyError)
            # when the key only exists in the default section.
            conf.remove_option(section, key)
|
|
|
|
|
|
|
|
|
2022-08-31 16:19:39 +03:00
|
|
|
def node_to_html(el: PageElement) -> str:
    """Render a parsed HTML node as a reduced HTML string.

    Non-tag nodes are converted with ``str()`` and HTML-escaped. The tags
    ``a``, ``p``, ``i``, ``b``, ``s``, ``u``, ``pre``, ``code``, ``span``,
    ``blockquote``, ``br``, ``ul`` and ``ol`` are re-emitted in simplified
    form, with ``strong``/``em``/``del``/``ins`` treated as aliases of
    ``b``/``i``/``s``/``u``. Any other tag contributes only its rendered
    children.

    A ``span`` whose ``class`` attribute contains ``_mfm_blur_`` becomes a
    ``tg-spoiler`` span (presumably Telegram's spoiler markup - confirm
    against the consumer of this output).

    An ``a`` tag without an ``href`` attribute is rendered as its children
    only. Whitespace-only text nodes directly inside ``ul``/``ol`` are not
    treated as list items.

    NOTE: nested lists may not render correctly.
    """
    if not isinstance(el, Tag):
        return escape(str(el))

    # Aliases are rendered exactly like the tag they map to.
    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    if name == "br":
        return "\n"

    if name in ("ul", "ol"):
        # Whitespace between <li> elements is source formatting; rendering
        # it would yield empty bullets and shift <ol> numbering.
        entries = [
            node_to_html(child).replace("\n", "\n ").rstrip()
            for child in el.children
            if isinstance(child, Tag) or str(child).strip()
        ]
        if name == "ul":
            lines = [" \u2022 " + entry for entry in entries]
        else:
            lines = [
                "%d. %s" % (index, entry)
                for index, entry in enumerate(entries, 1)
            ]
        return "\n" + "\n".join(lines) + "\n"

    inner = "".join(map(node_to_html, el.children))

    if name == "a":
        href = el.attrs.get("href")
        if href is None:
            # Anchor without a target: keep the text, drop the link.
            return inner
        return '<a href="{}">{}</a>'.format(escape(href), inner)

    if name == "span":
        if "_mfm_blur_" in el.attrs.get("class", ""):
            return '<span class="tg-spoiler">%s</span>' % inner
        return inner

    if name == "blockquote":
        # Prefix every line of the quoted content with a bar.
        return "\n" + "\n".join("| %s" % part for part in inner.split("\n"))

    # Tags that simply wrap their rendered children in fixed text.
    wrappers = {
        "p": "%s\n\n",
        "i": "<i>%s</i>",
        "b": "<b>%s</b>",
        "s": "<s>%s</s>",
        "u": "<u>%s</u>",
        "pre": "\n<pre>%s</pre>\n",
        "code": "<code>%s</code>",
    }
    if name in wrappers:
        return wrappers[name] % inner

    # Unknown tag: transparent, only its children are emitted.
    return inner
|
|
|
|
|
|
|
|
|
|
|
|
def node_to_markdown(el: PageElement) -> str:
    """Convert a parsed HTML node to Markdown (Discord flavor).

    Non-tag nodes are converted with ``str()`` and passed through
    ``md_escape``. The tags ``a``, ``p``, ``i``, ``b``, ``s``, ``u``,
    ``pre``, ``code``, ``span``, ``blockquote``, ``br``, ``ul`` and ``ol``
    are mapped to Markdown syntax, with ``strong``/``em``/``del``/``ins``
    treated as aliases of ``b``/``i``/``s``/``u``. Any other tag
    contributes only its rendered children.

    A ``span`` whose ``class`` attribute contains ``_mfm_blur_`` is wrapped
    in ``||`` markers.

    Link text is escaped exactly once (at the text-node level); it is not
    escaped again when the link is assembled, so formatting nested inside a
    link survives. An ``a`` tag without ``href`` is rendered as its
    children only. Whitespace-only text nodes directly inside ``ul``/``ol``
    are not treated as list items.

    NOTE: nested lists may not render correctly.
    NOTE(review): text inside ``pre``/``code`` is still escaped like any
    other text node - confirm whether the target renderer shows those
    backslashes literally.
    """
    if not isinstance(el, Tag):
        return md_escape(str(el))

    # Aliases are rendered exactly like the tag they map to.
    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    if name == "br":
        return "\n"

    if name in ("ul", "ol"):
        # Whitespace between <li> elements is source formatting; rendering
        # it would yield empty items and shift <ol> numbering.
        entries = [
            node_to_markdown(child).replace("\n", "\n ").rstrip()
            for child in el.children
            if isinstance(child, Tag) or str(child).strip()
        ]
        if name == "ul":
            lines = ["* " + entry for entry in entries]
        else:
            lines = [
                "%d. %s" % (index, entry)
                for index, entry in enumerate(entries, 1)
            ]
        return "\n%s\n" % "\n".join(lines)

    inner = "".join(map(node_to_markdown, el.children))

    if name == "a":
        href = el.attrs.get("href")
        if href is None:
            # Anchor without a target: keep the text, drop the link.
            return inner
        # inner is already escaped by the recursive calls above.
        return "[{}]({})".format(inner, href)

    if name == "span":
        if "_mfm_blur_" in el.attrs.get("class", ""):
            return "||%s||" % inner
        return inner

    if name == "blockquote":
        # Prefix every line of the quoted content with a quote marker.
        return "\n%s" % "\n".join("> %s" % part for part in inner.split("\n"))

    # Tags that simply wrap their rendered children in fixed text.
    wrappers = {
        "p": "%s\n\n",
        "i": "*%s*",
        "b": "**%s**",
        "s": "~~%s~~",
        "u": "__%s__",
        "pre": "\n```%s```\n",
        "code": "`%s`",
    }
    if name in wrappers:
        return wrappers[name] % inner

    # Unknown tag: transparent, only its children are emitted.
    return inner
|
|
|
|
|
|
|
|
|
|
|
|
def node_to_plaintext(el: PageElement) -> str:
    """Flatten a parsed HTML node into plain text.

    Non-tag nodes are returned via ``str()`` without escaping. Handled
    tags:

    * ``a``: link text followed by the target in parentheses; when the tag
      has no ``href`` attribute only the text is returned.
    * ``p``: children followed by a blank line.
    * ``br``: a newline.
    * ``blockquote``: every line of the content prefixed with U+258D.
    * ``ul`` / ``ol``: one bulleted or numbered line per child, skipping
      whitespace-only text nodes that sit between the items.

    Any other tag contributes only the plain text of its children.
    """
    if not isinstance(el, Tag):
        return str(el)

    name = el.name

    if name == "br":
        return "\n"

    if name in ("ol", "ul"):
        # Whitespace between <li> elements is source formatting; rendering
        # it would yield empty items and shift <ol> numbering.
        entries = [
            node_to_plaintext(child).replace("\n", "\n ").strip()
            for child in el.children
            if isinstance(child, Tag) or str(child).strip()
        ]
        if name == "ul":
            lines = [" \u2022 %s" % entry for entry in entries]
        else:
            lines = [
                "%d. %s" % (index, entry)
                for index, entry in enumerate(entries, 1)
            ]
        return "\n%s\n" % "\n".join(lines)

    inner = "".join(map(node_to_plaintext, el.children))

    if name == "a":
        href = el.attrs.get("href")
        if href is None:
            # Anchor without a target: there is no URL to append.
            return inner
        return "%s (%s)" % (inner, href)

    if name == "p":
        return inner + "\n\n"

    if name == "blockquote":
        return "\n".join("\u258d%s" % part for part in inner.split("\n"))

    # Unknown tag: transparent, only its children are emitted.
    return inner
|