mastoposter-oss_images/mastoposter/utils.py

263 lines
7.8 KiB
Python
Raw Normal View History

"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from configparser import ConfigParser
2022-08-31 16:19:39 +03:00
from html import escape
2022-11-01 14:33:47 +03:00
from logging import getLogger
2022-10-11 09:00:37 +03:00
from typing import Callable, Dict
2022-08-31 16:19:39 +03:00
from bs4.element import Tag, PageElement
2022-11-01 14:33:47 +03:00
logger = getLogger("utils")
2022-08-31 16:19:39 +03:00
# Maps each character that needs escaping (backslash, asterisk, square
# brackets, underscore, tilde, pipe and backtick) to the same character
# preceded by a backslash.
_MD_ESCAPE_TABLE = str.maketrans(
    {char: "\\" + char for char in "\\*[]_~|`"}
)


def md_escape(text: str) -> str:
    """Return *text* with Markdown control characters backslash-escaped.

    Every character listed in ``_MD_ESCAPE_TABLE`` is prefixed with a
    single backslash; all other characters are copied through unchanged.
    """
    # A single translation pass escapes each character exactly once, so a
    # backslash inserted for one character is never escaped again.
    return text.translate(_MD_ESCAPE_TABLE)
def normalize_config(conf: ConfigParser):
    """Rename option keys in place so they contain underscores only.

    In every section returned by ``conf.sections()``, each key containing
    spaces or dashes is copied to a key where those characters are replaced
    with ``_``, and the old key is then removed from the section. Keys that
    are already normalized are left alone.

    Iterating a section also yields keys inherited from the ``DEFAULT``
    section. Such a key is copied into the section under its normalized
    name, but the inherited original cannot be removed from the section
    (it is not stored there), so it is left in place instead of raising
    ``KeyError``.

    :param conf: parser to modify in place
    """
    for section in conf.sections():
        # Collect first and delete afterwards so that the section is not
        # shrunk while its items are being walked.
        renamed = []
        for key, value in conf[section].items():
            normalized_key = key.replace(" ", "_").replace("-", "_")
            if key == normalized_key:
                continue
            logger.info(
                "moving %r.%r -> %r.%r", section, key, section, normalized_key
            )
            conf[section][normalized_key] = value
            renamed.append(key)

        for key in renamed:
            logger.info("removing key %r.%r", section, key)
            # remove_option() returns False rather than raising when the
            # option is not stored in this section (i.e. it is inherited
            # from DEFAULT), which ``del conf[section][key]`` would turn
            # into a KeyError.
            if not conf.remove_option(section, key):
                logger.debug(
                    "key %r.%r is inherited, leaving it in place",
                    section,
                    key,
                )
2022-08-31 16:19:39 +03:00
def _html_children(tag: Tag) -> str:
    """Render every child of *tag* with node_to_html and concatenate."""
    return "".join(map(node_to_html, tag.children))


def _html_list_items(tag: Tag) -> list:
    """Render the entries of a list tag, one string per entry."""
    items = []
    for child in tag.children:
        # Whitespace-only text between <li> elements is layout of the
        # source markup, not an entry; rendering it would produce empty
        # bullets and skew the numbering of ordered lists.
        if not isinstance(child, Tag) and not str(child).strip():
            continue
        items.append(node_to_html(child).replace("\n", "\n ").rstrip())
    return items


def _html_link(tag: Tag) -> str:
    """Render an anchor; without an ``href`` only its content is kept."""
    content = _html_children(tag)
    href = tag.attrs.get("href")
    if href is None:
        return content
    return '<a href="{}">{}</a>'.format(escape(href), content)


def _html_blockquote(tag: Tag) -> str:
    """Render a quote by prefixing each of its lines with ``| ``."""
    lines = _html_children(tag).split("\n")
    return "\n" + "\n".join("| %s" % line for line in lines)


# Built once at import time instead of on every (recursive) call.
_HTML_TAG_TRANSFORMS: Dict[str, Callable[[Tag], str]] = {
    "a": _html_link,
    "p": lambda tag: _html_children(tag) + "\n\n",
    "i": lambda tag: "<i>%s</i>" % _html_children(tag),
    "b": lambda tag: "<b>%s</b>" % _html_children(tag),
    "s": lambda tag: "<s>%s</s>" % _html_children(tag),
    "u": lambda tag: "<u>%s</u>" % _html_children(tag),
    "pre": lambda tag: "\n<pre>%s</pre>\n" % _html_children(tag),
    "code": lambda tag: "<code>%s</code>" % _html_children(tag),
    "blockquote": _html_blockquote,
    "br": lambda _: "\n",
    # NOTE may fail on nested lists
    "ul": lambda tag: "\n%s\n"
    % "\n".join(" \u2022 " + item for item in _html_list_items(tag)),
    "ol": lambda tag: "\n%s\n"
    % "\n".join(
        "%d. %s" % (index, item)
        for index, item in enumerate(_html_list_items(tag), 1)
    ),
}

# Tags that are rendered exactly like another tag.
_HTML_TAG_SUBSTITUTIONS: Dict[str, str] = {
    "strong": "b",
    "em": "i",
    "del": "s",
    "ins": "u",
}


def node_to_html(el: PageElement) -> str:
    """Render a parsed node as simplified HTML.

    Only ``a``, ``i``, ``b``, ``s``, ``u``, ``pre`` and ``code`` tags are
    emitted as markup (``strong``, ``em``, ``del`` and ``ins`` are mapped
    onto them). Paragraphs, line breaks, block quotes and lists are turned
    into plain line structure, and any other tag is replaced by its
    rendered children.

    :param el: node to render
    :return: rendered string; text of non-tag nodes is HTML-escaped
    """
    if not isinstance(el, Tag):
        return escape(str(el))

    name = _HTML_TAG_SUBSTITUTIONS.get(el.name, el.name)
    transform = _HTML_TAG_TRANSFORMS.get(name)
    if transform is None:
        # Unknown tag: drop the tag itself, keep what is inside it.
        return _html_children(el)
    return transform(el)
def _markdown_children(tag: Tag) -> str:
    """Render every child of *tag* with node_to_markdown and concatenate."""
    return "".join(map(node_to_markdown, tag.children))


def _markdown_list_items(tag: Tag) -> list:
    """Render the entries of a list tag, one string per entry."""
    items = []
    for child in tag.children:
        # Whitespace-only text between <li> elements is layout of the
        # source markup, not an entry; rendering it would produce empty
        # bullets and skew the numbering of ordered lists.
        if not isinstance(child, Tag) and not str(child).strip():
            continue
        items.append(node_to_markdown(child).replace("\n", "\n ").rstrip())
    return items


def _markdown_link(tag: Tag) -> str:
    """Render an anchor; without an ``href`` only its content is kept."""
    # Text nodes were already escaped when the children were rendered, so
    # the content must not be passed through md_escape a second time:
    # doing so escapes the escaping backslashes and makes them visible.
    content = _markdown_children(tag)
    href = tag.attrs.get("href")
    if href is None:
        return content
    return "[{}]({})".format(content, href)


# Built once at import time instead of on every (recursive) call.
_MARKDOWN_TAG_TRANSFORMS: Dict[str, Callable[[Tag], str]] = {
    "a": _markdown_link,
    "p": lambda tag: _markdown_children(tag) + "\n\n",
    "i": lambda tag: "_%s_" % _markdown_children(tag),
    "b": lambda tag: "*%s*" % _markdown_children(tag),
    "s": lambda tag: "~%s~" % _markdown_children(tag),
    "u": lambda tag: "__%s__" % _markdown_children(tag),
    "pre": lambda tag: "\n``%s``\n" % _markdown_children(tag),
    "code": lambda tag: "`%s`" % _markdown_children(tag),
    # NOTE(review): unlike node_to_html, no per-line quote prefix is added
    # here; the quote is only separated by a leading newline. Confirm
    # whether a prefix was intended.
    "blockquote": lambda tag: "\n" + _markdown_children(tag),
    "br": lambda _: "\n",
    # NOTE may fail on nested lists
    "ul": lambda tag: "\n%s\n"
    % "\n".join(" \u2022 " + item for item in _markdown_list_items(tag)),
    "ol": lambda tag: "\n%s\n"
    % "\n".join(
        "%d. %s" % (index, item)
        for index, item in enumerate(_markdown_list_items(tag), 1)
    ),
}

# Tags that are rendered exactly like another tag.
_MARKDOWN_TAG_SUBSTITUTIONS: Dict[str, str] = {
    "strong": "b",
    "em": "i",
    "del": "s",
    "ins": "u",
}


def node_to_markdown(el: PageElement) -> str:
    """Render a parsed node as Markdown-style text.

    Links, italic, bold, strike-through, underline, ``pre`` and ``code``
    are emitted as markup (``strong``, ``em``, ``del`` and ``ins`` are
    mapped onto them). Paragraphs, line breaks, block quotes and lists are
    turned into plain line structure, and any other tag is replaced by its
    rendered children.

    :param el: node to render
    :return: rendered string; text of non-tag nodes goes through md_escape
    """
    if not isinstance(el, Tag):
        return md_escape(str(el))

    name = _MARKDOWN_TAG_SUBSTITUTIONS.get(el.name, el.name)
    transform = _MARKDOWN_TAG_TRANSFORMS.get(name)
    if transform is None:
        # Unknown tag: drop the tag itself, keep what is inside it.
        return _markdown_children(el)
    return transform(el)
def node_to_plaintext(el: PageElement) -> str:
    """Render a parsed node as plain text.

    Links become ``text (href)``, paragraphs are followed by a blank line,
    ``br`` becomes a newline, unordered lists are bulleted and ordered
    lists are numbered starting from 1. Any other tag is replaced by the
    rendered text of its children.

    :param el: node to render
    :return: rendered string; text of non-tag nodes is returned as is
    """
    if not isinstance(el, Tag):
        return str(el)

    if el.name == "br":
        return "\n"

    if el.name in ("ol", "ul"):
        # Whitespace-only text between <li> elements is layout of the
        # source markup, not an entry, so it is skipped.
        items = [
            node_to_plaintext(child).replace("\n", "\n ").strip()
            for child in el.children
            if isinstance(child, Tag) or str(child).strip()
        ]
        if el.name == "ol":
            # Numbering starts at 1, matching the HTML and Markdown
            # renderers.
            lines = [
                "%d. %s" % (index, item)
                for index, item in enumerate(items, 1)
            ]
        else:
            lines = [" \u2022 %s" % item for item in items]
        return "\n%s\n" % "\n".join(lines)

    text = "".join(map(node_to_plaintext, el.children))

    if el.name == "a":
        # An anchor without href has no target to show; keep its text only.
        href = el.attrs.get("href")
        if href is None:
            return text
        return "%s (%s)" % (text, href)

    if el.name == "p":
        return text + "\n\n"

    return text