From 32d77842764fe1501d6ff4ba3f615ab7d08e8403 Mon Sep 17 00:00:00 2001 From: hkc Date: Sat, 13 May 2023 08:45:46 +0300 Subject: [PATCH] Reworked node_to_* functions --- mastoposter/text/__init__.py | 93 +++++++++++++ mastoposter/text/html.py | 93 +++++++++++++ mastoposter/text/markdown.py | 75 +++++++++++ mastoposter/text/plain.py | 76 +++++++++++ mastoposter/types.py | 14 +- mastoposter/utils.py | 249 ----------------------------------- 6 files changed, 344 insertions(+), 256 deletions(-) create mode 100644 mastoposter/text/__init__.py create mode 100644 mastoposter/text/html.py create mode 100644 mastoposter/text/markdown.py create mode 100644 mastoposter/text/plain.py diff --git a/mastoposter/text/__init__.py b/mastoposter/text/__init__.py new file mode 100644 index 0000000..ec4848d --- /dev/null +++ b/mastoposter/text/__init__.py @@ -0,0 +1,93 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" +from typing import Callable, Iterable, Literal, Optional +from bs4.element import Tag, PageElement +from html import escape + +VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"] +BULLET = "\u2022" +STRIPE = "\u258d" + + +def md_escape(text: str) -> str: + return ( + text.replace("\\", "\\\\") + .replace("*", "\\*") + .replace("[", "\\[") + .replace("]", "\\]") + .replace("_", "\\_") + .replace("~", "\\~") + .replace("|", "\\|") + .replace("`", "\\`") + ) + + +node_processors: dict[ + tuple[VALID_OUTPUT_TYPES, str], + list[ + Callable[ + [ + Tag, + ], + Optional[str], + ] + ], +] = {} + + +def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"): + def decorate(function): + node_processors[output_type, tag].append(function) + return function + + return decorate + + +def register_fmt_converter( + format: str, + tag: str, + output_type: VALID_OUTPUT_TYPES = "plain", + separator: str = "", +): + def fmt_tag(el: Tag) -> str: + if "%s" in format: + return format % nodes_process(el.children, output_type, separator) + return format + + register_converter(tag, output_type)(fmt_tag) + + +def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str: + if isinstance(el, Tag): + for func in node_processors[type_, el.name]: + result = func(el) # XXX: could use walrus, but it's py3.8+ only + if result: + return result + return escape(str(el)) + + +def nodes_process( + els: Iterable[PageElement], + type_: VALID_OUTPUT_TYPES = "plain", + separator: str = "", +) -> str: + return str.join(separator, (node_process(el, type_) for el in els)) + + +__all__ = ["node_process", "nodes_process", "md_escape", "BULLET", "STRIPE"] + +import mastoposter.text.html # noqa F401 +import mastoposter.text.markdown # noqa F401 +import mastoposter.text.plain # noqa F401 diff --git a/mastoposter/text/html.py b/mastoposter/text/html.py new file mode 100644 index 0000000..9cc35c8 --- /dev/null +++ b/mastoposter/text/html.py @@ -0,0 +1,93 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" +from mastoposter.text import ( + nodes_process, + register_converter, + register_fmt_converter, + node_process, + STRIPE, + BULLET, +) + +from typing import Optional +from bs4.element import Tag +from html import escape + + +@register_converter("a", "html") +def proc_tag_a_to_html(tag: Tag): + return '%s' % ( + escape(tag.attrs.get("href", "#")), + nodes_process(tag.children, "html"), + ) + + +register_fmt_converter("%s\n\n", "p", "html") +register_fmt_converter("%s", "i", "html") +register_fmt_converter("%s", "em", "html") +register_fmt_converter("%s", "b", "html") +register_fmt_converter("%s", "strong", "html") +register_fmt_converter("%s", "s", "html") +register_fmt_converter("%s", "del", "html") +register_fmt_converter("%s", "u", "html") +register_fmt_converter("%s", "ins", "html") +register_fmt_converter("\n", "br", "html") +register_fmt_converter("\n
%s
\n", "pre", "html") +register_fmt_converter("%s", "code", "html") + + +@register_converter("span", "html") +def proc_tag_span_to_html(tag: Tag) -> Optional[str]: + if "_mfm_blur_" in tag.attrs.get("class", ""): + return '%s' % nodes_process( + tag.children, "html" + ) + return None + + +@register_converter("blockquote", "html") +def proc_tag_blockquote_to_html(tag: Tag) -> str: + return str.join( + "\n", + ( + STRIPE + " " + line + for line in nodes_process(tag.children, "html").strip().split("\n") + ), + ) + + +@register_converter("ul", "html") +def proc_tag_ul_to_html(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + BULLET + + " " + + node_process(el, "html").replace("\n", "\n ").rstrip() + for el in tag.children + ), + ) + + +@register_converter("li", "html") +def proc_tag_li_to_html(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "%d. %s" + % (i, node_process(el, "html").replace("\n", "\n ").rstrip()) + for i, el in enumerate(tag.children, 1) + ), + ) diff --git a/mastoposter/text/markdown.py b/mastoposter/text/markdown.py new file mode 100644 index 0000000..23935d3 --- /dev/null +++ b/mastoposter/text/markdown.py @@ -0,0 +1,75 @@ +from mastoposter.text import ( + nodes_process, + register_converter, + register_fmt_converter, + node_process, +) + +from typing import Optional +from bs4.element import Tag +from html import escape + + +@register_converter("a", "markdown") +def proc_tag_a_to_markdown(tag: Tag): + return "[%s](%s)" % ( + nodes_process(tag.children, "markdown"), + escape(tag.attrs.get("href", "#")), + ) + + +register_fmt_converter("%s\n\n", "p", "markdown") +register_fmt_converter("*%s*", "i", "markdown") +register_fmt_converter("*%s*", "em", "markdown") +register_fmt_converter("**%s**", "b", "markdown") +register_fmt_converter("**%s**", "strong", "markdown") +register_fmt_converter("~~%s~~", "s", "markdown") +register_fmt_converter("~~%s~~", "del", "markdown") +register_fmt_converter("__%s__", "u", "markdown") +register_fmt_converter("__%s__", "ins", "markdown") +register_fmt_converter("\n", "br", "markdown") +register_fmt_converter("\n```%s```\n", "pre", "markdown") +register_fmt_converter("`%s`", "code", "markdown") + + +@register_converter("span", "markdown") +def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]: + if "_mfm_blur_" in tag.attrs.get("class", ""): + return "||%s||" % nodes_process(tag.children, "markdown") + return None + + +@register_converter("blockquote", "markdown") +def proc_tag_blockquote_to_markdown(tag: Tag) -> str: + return str.join( + "\n", + ( + "> " + line + for line in nodes_process(tag.children, "markdown") + .strip() + .split("\n") + ), + ) + + +@register_converter("ul", "markdown") +def proc_tag_ul_to_markdown(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "* " + node_process(el, "markdown").replace("\n", "\n ").rstrip() + for el in tag.children + ), + ) + + +@register_converter("li", "markdown") +def proc_tag_li_to_markdown(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "%d. %s" + % (i, node_process(el, "markdown").replace("\n", "\n ").rstrip()) + for i, el in enumerate(tag.children, 1) + ), + ) diff --git a/mastoposter/text/plain.py b/mastoposter/text/plain.py new file mode 100644 index 0000000..8fcbb30 --- /dev/null +++ b/mastoposter/text/plain.py @@ -0,0 +1,76 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" + +from mastoposter.text import ( + nodes_process, + register_converter, + register_fmt_converter, + node_process, + STRIPE, + BULLET, +) + +from bs4.element import Tag +from html import escape + + +@register_converter("a", "plain") +def proc_tag_a_to_plain(tag: Tag): + return "%s (%s)" % ( + nodes_process(tag.children, "plain"), + escape(tag.attrs.get("href", "#")), + ) + + +register_fmt_converter("%s\n\n", "p", "plain") +register_fmt_converter("\n", "br", "plain") + + +@register_converter("blockquote", "plain") +def proc_tag_blockquote_to_plain(tag: Tag) -> str: + return str.join( + "\n", + ( + STRIPE + " " + line + for line in nodes_process(tag.children, "plain") + .strip() + .split("\n") + ), + ) + + +@register_converter("ul", "plain") +def proc_tag_ul_to_plain(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + BULLET + + " " + + node_process(el, "plain").replace("\n", "\n ").rstrip() + for el in tag.children + ), + ) + + +@register_converter("li", "plain") +def proc_tag_li_to_plain(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "%d. %s" + % (i, node_process(el, "plain").replace("\n", "\n ").rstrip()) + for i, el in enumerate(tag.children, 1) + ), + ) diff --git a/mastoposter/types.py b/mastoposter/types.py index 3b719b9..4e08038 100644 --- a/mastoposter/types.py +++ b/mastoposter/types.py @@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar from bs4 import BeautifulSoup -from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext +from mastoposter.text import node_process def _date(val: str) -> datetime: @@ -355,18 +355,18 @@ class Status: @property def content_flathtml(self) -> str: - return node_to_html( - BeautifulSoup(self.content, features="lxml") + return node_process( + BeautifulSoup(self.content, features="lxml"), "html" ).rstrip() @property def content_markdown(self) -> str: - return node_to_markdown( - BeautifulSoup(self.content, features="lxml") + return node_process( + BeautifulSoup(self.content, features="lxml"), "markdown" ).rstrip() @property def content_plaintext(self) -> str: - return node_to_plaintext( - BeautifulSoup(self.content, features="lxml") + return node_process( + BeautifulSoup(self.content, features="lxml"), "plain" ).rstrip() diff --git a/mastoposter/utils.py b/mastoposter/utils.py index 4385b5d..2ec843a 100644 --- a/mastoposter/utils.py +++ b/mastoposter/utils.py @@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. """ from configparser import ConfigParser -from html import escape from logging import getLogger -from typing import Callable, Dict -from bs4.element import Tag, PageElement logger = getLogger("utils") -def md_escape(text: str) -> str: - return ( - text.replace("\\", "\\\\") - .replace("*", "\\*") - .replace("[", "\\[") - .replace("]", "\\]") - .replace("_", "\\_") - .replace("~", "\\~") - .replace("|", "\\|") - .replace("`", "\\`") - ) - - def normalize_config(conf: ConfigParser): for section in conf.sections(): _remove = set() @@ -49,236 +33,3 @@ def normalize_config(conf: ConfigParser): for k in _remove: logger.info("removing key %r.%r", section, k) del conf[section][k] - - -def node_to_html(el: PageElement) -> str: - TAG_TRANSFORMS: Dict[ - str, - Callable[ - [ - Tag, - ], - str, - ], - ] = { - "a": lambda tag: '{}'.format( - escape(tag.attrs["href"]), - str.join("", map(node_to_html, tag.children)), - ), - "p": lambda tag: ( - str.join("", map(node_to_html, tag.children)) + "\n\n" - ), - "i": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "b": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "s": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "u": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "pre": lambda tag: ( - "\n
%s
\n" % str.join("", map(node_to_html, tag.children)) - ), - "code": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "span": lambda tag: ( - ( - '%s' - if "_mfm_blur_" in tag.attrs.get("class", "") - else "%s" - ) - % str.join("", map(node_to_html, tag.children)) - ), - "blockquote": lambda tag: "\n%s" - % str.join( - "\n", - ( - "| %s" % part - for part in str.join( - "", map(node_to_html, tag.children) - ).split("\n") - ), - ), - "br": lambda _: "\n", - # NOTE may fail on nested lists - "ul": lambda tag: ( - "\n" - + str.join( - "\n", - ( - " \u2022 " - + node_to_html(li).replace("\n", "\n ").rstrip() - for li in tag.children - ), - ) - + "\n" - ), - "ol": lambda tag: ( - "\n" - + str.join( - "\n", - ( - "%d. %s" - % (i, node_to_html(li).replace("\n", "\n ").rstrip()) - for i, li in enumerate(tag.children, 1) - ), - ) - + "\n" - ), - } - - TAG_SUBSTITUTIONS: Dict[str, str] = { - "strong": "b", - "em": "i", - "del": "s", - "ins": "u", - } - - if isinstance(el, Tag): - if el.name in TAG_TRANSFORMS: - return TAG_TRANSFORMS[el.name](el) - if el.name in TAG_SUBSTITUTIONS: - sub = TAG_SUBSTITUTIONS[el.name] - if sub in TAG_TRANSFORMS: - return TAG_TRANSFORMS[sub](el) - return str.join("", map(node_to_html, el.children)) - return escape(str(el)) - - -def node_to_markdown(el: PageElement) -> str: - TAG_TRANSFORMS: Dict[ - str, - Callable[ - [ - Tag, - ], - str, - ], - ] = { - "a": lambda tag: "[{}]({})".format( - md_escape(str.join("", map(node_to_markdown, tag.children))), - tag.attrs["href"], - ), - "p": lambda tag: ( - str.join("", map(node_to_markdown, tag.children)) + "\n\n" - ), - "i": lambda tag: ( - "_%s_" % str.join("", map(node_to_markdown, tag.children)) - ), - "b": lambda tag: ( - "*%s*" % str.join("", map(node_to_markdown, tag.children)) - ), - "s": lambda tag: ( - "~%s~" % str.join("", map(node_to_markdown, tag.children)) - ), - "u": lambda tag: ( - "__%s__" % str.join("", map(node_to_markdown, tag.children)) - ), - "pre": lambda tag: ( - "\n``%s``\n" % str.join("", map(node_to_markdown, tag.children)) - ), - "code": lambda tag: ( - "`%s`" % str.join("", map(node_to_markdown, tag.children)) - ), - "span": lambda tag: ( - ("||%s||" if "_mfm_blur_" in tag.attrs.get("class", "") else "%s") - % str.join("", map(node_to_markdown, tag.children)) - ), - "blockquote": lambda tag: ( - "\n%s" - % str.join( - "\n", - ( - "\u258d%s" % part - for part in str.join( - "", map(node_to_markdown, tag.children) - ).split("\n") - ), - ) - ), - "br": lambda _: "\n", - # NOTE may fail on nested lists - "ul": lambda tag: ( - "\n%s\n" - % str.join( - "\n", - ( - " \u2022 " - + node_to_markdown(li).replace("\n", "\n ").rstrip() - for li in tag.children - ), - ) - ), - "ol": lambda tag: ( - "\n%s\n" - % str.join( - "\n", - ( - "%d. %s" - % (i, node_to_markdown(li).replace("\n", "\n ").rstrip()) - for i, li in enumerate(tag.children, 1) - ), - ) - ), - } - - TAG_SUBSTITUTIONS: Dict[str, str] = { - "strong": "b", - "em": "i", - "del": "s", - "ins": "u", - } - - if isinstance(el, Tag): - if el.name in TAG_TRANSFORMS: - return TAG_TRANSFORMS[el.name](el) - if el.name in TAG_SUBSTITUTIONS: - sub = TAG_SUBSTITUTIONS[el.name] - if sub in TAG_TRANSFORMS: - return TAG_TRANSFORMS[sub](el) - return str.join("", map(node_to_markdown, el.children)) - return md_escape(str(el)) - - -def node_to_plaintext(el: PageElement) -> str: - if isinstance(el, Tag): - if el.name == "a": - return "%s (%s)" % ( - str.join("", map(node_to_plaintext, el.children)), - el.attrs["href"], - ) - elif el.name == "p": - return str.join("", map(node_to_plaintext, el.children)) + "\n\n" - elif el.name == "br": - return "\n" - elif el.name == "blockquote": - return str.join( - "\n", - ( - "\u258d%s" % part - for part in str.join( - "", map(node_to_plaintext, el.children) - ).split("\n") - ), - ) - elif el.name in ("ol", "ul"): - children = map(node_to_plaintext, el.children) - return "\n%s\n" % str.join( - "\n", - ( - " \u2022 %s" % li.replace("\n", "\n ").strip() - for li in children - ) - if el.name == "ul" - else ( - "%d. %s" % (i, li.replace("\n", "\n ").strip()) - for i, li in enumerate(children, 1) - ), - ) - return str.join("", map(node_to_plaintext, el.children)) - return str(el)