Reworked node_to_* functions

2023-05-13 08:45:46 +03:00 · 2023-05-13 08:45:46 +03:00 · 32d7784276
parent 1b3a0bbe0b
commit 32d7784276
6 changed files with 344 additions and 256 deletions
--- a/mastoposter/text/__init__.py
+++ b/mastoposter/text/__init__.py
@ -0,0 +1,93 @@
 """
 mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
 Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 """
 from typing import Callable, Iterable, Literal, Optional
 from bs4.element import Tag, PageElement
 from html import escape
 # Output flavours a converter can be registered for and rendered as.
 VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
 # U+2022 BULLET: marker placed in front of unordered-list items.
 BULLET = "\u2022"
 # U+258D LEFT THREE EIGHTHS BLOCK: prefix placed in front of quoted lines.
 STRIPE = "\u258d"
 def md_escape(text: str) -> str:
    """Backslash-escape the Markdown control characters found in *text*.

    A backslash is put in front of each backslash, asterisk, square
    bracket, underscore, tilde, pipe and backtick; everything else is kept.
    """
    specials = "\\*[]_~|`"
    table = str.maketrans({ch: "\\" + ch for ch in specials})
    return text.translate(table)
 # Converter registry: (output type, tag name) -> converters for that tag.
 # A converter receives the Tag and returns the rendered text, or None.
 node_processors: dict[
    tuple[VALID_OUTPUT_TYPES, str], list[Callable[[Tag], Optional[str]]]
 ] = {}
 def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Return a decorator that registers a converter for *tag*.

    The decorated function is appended to the list stored in
    ``node_processors`` under the ``(output_type, tag)`` key and is handed
    back unchanged, so it remains directly callable.
    """

    def decorate(function):
        # The registry is a plain dict: the list for a key that has not been
        # seen yet has to be created here.  Indexing the dict directly
        # raised KeyError on the very first registration.
        node_processors.setdefault((output_type, tag), []).append(function)
        return function

    return decorate
 def register_fmt_converter(
    format: str,
    tag: str,
    output_type: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
 ):
    """Register a converter for *tag* that is driven by a ``%``-template.

    When *format* contains ``%s`` the tag's children are rendered with
    nodes_process() (joined by *separator*) and substituted into it;
    otherwise *format* is emitted as-is and the children are ignored.
    """
    needs_children = "%s" in format

    def fmt_tag(el: Tag) -> str:
        if not needs_children:
            return format
        rendered = nodes_process(el.children, output_type, separator)
        return format % rendered

    decorator = register_converter(tag, output_type)
    decorator(fmt_tag)
 def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Render *el* and everything below it as ``type_`` text.

    For a tag, the converters registered under ``(type_, el.name)`` are
    tried in registration order and the first result that is not ``None``
    wins.  A tag nobody handles is replaced by its rendered children.
    Anything that is not a tag is treated as text and escaped according to
    the output type: HTML-escaped for ``"html"``, md_escape()d for
    ``"markdown"`` and left untouched for ``"plain"``.
    """
    if isinstance(el, Tag):
        # .get(): most tag names have no converter at all, and a plain dict
        # lookup would raise KeyError for them.
        for func in node_processors.get((type_, el.name), ()):
            result = func(el)
            if result is not None:
                return result
        # Unhandled tag (unknown name, the document root, a converter that
        # declined by returning None): descend instead of dumping markup.
        return nodes_process(el.children, type_)
    text = str(el)
    if type_ == "html":
        return escape(text)
    if type_ == "markdown":
        return md_escape(text)
    return text
 def nodes_process(
    els: Iterable[PageElement],
    type_: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
 ) -> str:
    """Render each node of *els* with node_process() and join the results.

    The rendered pieces are concatenated in order with *separator* between
    them.
    """
    rendered = [node_process(node, type_) for node in els]
    return separator.join(rendered)
 __all__ = ["node_process", "nodes_process", "md_escape", "BULLET", "STRIPE"]
 # Imported for their side effects only: each module registers its
 # converters while it is being imported.  They come last because the
 # modules import names from this package, which must be defined by then.
 import mastoposter.text.html  # noqa F401
 import mastoposter.text.markdown  # noqa F401
 import mastoposter.text.plain  # noqa F401
--- a/mastoposter/text/html.py
+++ b/mastoposter/text/html.py
@ -0,0 +1,93 @@
 """
 mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
 Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 """
 from mastoposter.text import (
    nodes_process,
    register_converter,
    register_fmt_converter,
    node_process,
    STRIPE,
    BULLET,
 )
 from typing import Optional
 from bs4.element import Tag
 from html import escape
@register_converter("a", "html")
def proc_tag_a_to_html(tag: Tag):
    """Render an ``<a>`` tag as an HTML link.

    The ``href`` value is HTML-escaped; a missing ``href`` becomes ``#``.
    """
    target = escape(tag.attrs.get("href", "#"))
    label = nodes_process(tag.children, "html")
    return '<a href="%s">%s</a>' % (target, label)
 # Template-only converters for HTML output.  The rendered children replace
 # "%s"; a template without "%s" is emitted verbatim.
 for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("<i>%s</i>", "i"),
    ("<i>%s</i>", "em"),
    ("<b>%s</b>", "b"),
    ("<b>%s</b>", "strong"),
    ("<s>%s</s>", "s"),
    ("<s>%s</s>", "del"),
    ("<u>%s</u>", "u"),
    ("<u>%s</u>", "ins"),
    ("\n", "br"),
    ("\n<pre>%s</pre>\n", "pre"),
    ("<code>%s</code>", "code"),
 ):
    register_fmt_converter(_template, _tag_name, "html")
 # Keep the module namespace free of the loop variables.
 del _template, _tag_name
@register_converter("span", "html")
def proc_tag_span_to_html(tag: Tag) -> Optional[str]:
    """Wrap ``_mfm_blur_`` spans in ``<span class="tg-spoiler">``.

    Returns ``None`` for every other span so that the caller can fall back
    to its default handling.
    """
    classes = tag.attrs.get("class", "")
    if "_mfm_blur_" not in classes:
        return None
    inner = nodes_process(tag.children, "html")
    return '<span class="tg-spoiler">%s</span>' % inner
@register_converter("blockquote", "html")
def proc_tag_blockquote_to_html(tag: Tag) -> str:
    """Render a quote by prefixing each of its lines with STRIPE and a space.

    Surrounding whitespace of the rendered content is stripped first.
    """
    body = nodes_process(tag.children, "html").strip()
    prefixed = [STRIPE + " " + line for line in body.split("\n")]
    return "\n".join(prefixed)
@register_converter("ul", "html")
def proc_tag_ul_to_html(tag: Tag) -> str:
    """Render an unordered list: one BULLET-prefixed line per child.

    Line breaks inside an item are followed by three spaces so that the
    continuation is indented; trailing whitespace of each item is dropped.
    """
    items = []
    for child in tag.children:
        rendered = node_process(child, "html")
        items.append(BULLET + " " + rendered.replace("\n", "\n   ").rstrip())
    return "\n" + "\n".join(items)
@register_converter("ol", "html")
def proc_tag_li_to_html(tag: Tag) -> str:
    """Render an ordered list: one ``N. item`` line per child, from 1.

    This numbering logic belongs to ``<ol>``; it used to be registered for
    ``<li>``, which left ``<ol>`` without a converter and numbered the
    content of every single list item.  The function keeps its historical
    name so existing references to it stay valid.
    """
    lines = [
        "%d. %s"
        % (i, node_process(el, "html").replace("\n", "\n   ").rstrip())
        for i, el in enumerate(tag.children, 1)
    ]
    return "\n" + "\n".join(lines)


# An <li> contributes only its content; the bullet or number is added by
# the converter of the enclosing list.
register_fmt_converter("%s", "li", "html")
--- a/mastoposter/text/markdown.py
+++ b/mastoposter/text/markdown.py
@ -0,0 +1,75 @@
 from mastoposter.text import (
    nodes_process,
    register_converter,
    register_fmt_converter,
    node_process,
 )
 from typing import Optional
 from bs4.element import Tag
 from html import escape
@register_converter("a", "markdown")
def proc_tag_a_to_markdown(tag: Tag):
    """Render an ``<a>`` tag as a Markdown inline link ``[text](target)``.

    A missing ``href`` becomes ``#``.
    """
    # The target is emitted verbatim.  Markdown output is not HTML, so
    # html.escape() would turn every "&" of a query string into "&amp;"
    # and corrupt the link.
    return "[%s](%s)" % (
        nodes_process(tag.children, "markdown"),
        tag.attrs.get("href", "#"),
    )
 # Template-only converters for Markdown output.  The rendered children
 # replace "%s"; a template without "%s" is emitted verbatim.
 for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("*%s*", "i"),
    ("*%s*", "em"),
    ("**%s**", "b"),
    ("**%s**", "strong"),
    ("~~%s~~", "s"),
    ("~~%s~~", "del"),
    ("__%s__", "u"),
    ("__%s__", "ins"),
    ("\n", "br"),
    ("\n```%s```\n", "pre"),
    ("`%s`", "code"),
 ):
    register_fmt_converter(_template, _tag_name, "markdown")
 # Keep the module namespace free of the loop variables.
 del _template, _tag_name
@register_converter("span", "markdown")
def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]:
    """Wrap the content of ``_mfm_blur_`` spans in ``||...||``.

    Returns ``None`` for every other span so that the caller can fall back
    to its default handling.
    """
    classes = tag.attrs.get("class", "")
    if "_mfm_blur_" not in classes:
        return None
    inner = nodes_process(tag.children, "markdown")
    return "||%s||" % inner
@register_converter("blockquote", "markdown")
def proc_tag_blockquote_to_markdown(tag: Tag) -> str:
    """Render a quote by prefixing each of its lines with ``"> "``.

    Surrounding whitespace of the rendered content is stripped first.
    """
    content = nodes_process(tag.children, "markdown").strip()
    return "\n".join("> " + line for line in content.split("\n"))
@register_converter("ul", "markdown")
def proc_tag_ul_to_markdown(tag: Tag) -> str:
    """Render an unordered list: one ``"* "``-prefixed line per child.

    Line breaks inside an item are followed by three spaces so that the
    continuation is indented; trailing whitespace of each item is dropped.
    """
    bullets = []
    for child in tag.children:
        rendered = node_process(child, "markdown")
        bullets.append("* " + rendered.replace("\n", "\n   ").rstrip())
    return "\n" + "\n".join(bullets)
@register_converter("ol", "markdown")
def proc_tag_li_to_markdown(tag: Tag) -> str:
    """Render an ordered list: one ``N. item`` line per child, from 1.

    This numbering logic belongs to ``<ol>``; it used to be registered for
    ``<li>``, which left ``<ol>`` without a converter and numbered the
    content of every single list item.  The function keeps its historical
    name so existing references to it stay valid.
    """
    lines = [
        "%d. %s"
        % (i, node_process(el, "markdown").replace("\n", "\n   ").rstrip())
        for i, el in enumerate(tag.children, 1)
    ]
    return "\n" + "\n".join(lines)


# An <li> contributes only its content; the bullet or number is added by
# the converter of the enclosing list.
register_fmt_converter("%s", "li", "markdown")
--- a/mastoposter/text/plain.py
+++ b/mastoposter/text/plain.py
@ -0,0 +1,76 @@
 """
 mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
 Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 """
 from mastoposter.text import (
    nodes_process,
    register_converter,
    register_fmt_converter,
    node_process,
    STRIPE,
    BULLET,
 )
 from bs4.element import Tag
 from html import escape
@register_converter("a", "plain")
def proc_tag_a_to_plain(tag: Tag):
    """Render an ``<a>`` tag as ``text (target)``.

    A missing ``href`` becomes ``#``.
    """
    # The target is emitted verbatim.  Plain text is not HTML, so
    # html.escape() would turn every "&" of a query string into "&amp;"
    # and corrupt the address.
    return "%s (%s)" % (
        nodes_process(tag.children, "plain"),
        tag.attrs.get("href", "#"),
    )
 # Template-only converters for plain-text output: paragraphs end with a
 # blank line and <br> becomes a line break.
 for _template, _tag_name in (("%s\n\n", "p"), ("\n", "br")):
    register_fmt_converter(_template, _tag_name, "plain")
 # Keep the module namespace free of the loop variables.
 del _template, _tag_name
@register_converter("blockquote", "plain")
def proc_tag_blockquote_to_plain(tag: Tag) -> str:
    """Render a quote by prefixing each of its lines with STRIPE and a space.

    Surrounding whitespace of the rendered content is stripped first.
    """
    quoted = nodes_process(tag.children, "plain").strip()
    marked = [STRIPE + " " + line for line in quoted.split("\n")]
    return "\n".join(marked)
@register_converter("ul", "plain")
def proc_tag_ul_to_plain(tag: Tag) -> str:
    """Render an unordered list: one BULLET-prefixed line per child.

    Line breaks inside an item are followed by three spaces so that the
    continuation is indented; trailing whitespace of each item is dropped.
    """
    entries = []
    for child in tag.children:
        rendered = node_process(child, "plain")
        entries.append(BULLET + " " + rendered.replace("\n", "\n   ").rstrip())
    return "\n" + "\n".join(entries)
@register_converter("ol", "plain")
def proc_tag_li_to_plain(tag: Tag) -> str:
    """Render an ordered list: one ``N. item`` line per child, from 1.

    This numbering logic belongs to ``<ol>``; it used to be registered for
    ``<li>``, which left ``<ol>`` without a converter and numbered the
    content of every single list item.  The function keeps its historical
    name so existing references to it stay valid.
    """
    lines = [
        "%d. %s"
        % (i, node_process(el, "plain").replace("\n", "\n   ").rstrip())
        for i, el in enumerate(tag.children, 1)
    ]
    return "\n" + "\n".join(lines)


# An <li> contributes only its content; the bullet or number is added by
# the converter of the enclosing list.
register_fmt_converter("%s", "li", "plain")
--- a/mastoposter/types.py
+++ b/mastoposter/types.py
@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar
 from bs4 import BeautifulSoup
-from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext
+from mastoposter.text import node_process
 def _date(val: str) -> datetime:
@ -355,18 +355,18 @@ class Status:
    @property
    def content_flathtml(self) -> str:
-        return node_to_html(
+        return node_process(
-            BeautifulSoup(self.content, features="lxml")
+            BeautifulSoup(self.content, features="lxml"), "html"
        ).rstrip()
    @property
    def content_markdown(self) -> str:
-        return node_to_markdown(
+        return node_process(
-            BeautifulSoup(self.content, features="lxml")
+            BeautifulSoup(self.content, features="lxml"), "markdown"
        ).rstrip()
    @property
    def content_plaintext(self) -> str:
-        return node_to_plaintext(
+        return node_process(
-            BeautifulSoup(self.content, features="lxml")
+            BeautifulSoup(self.content, features="lxml"), "plain"
        ).rstrip()
--- a/mastoposter/utils.py
+++ b/mastoposter/utils.py
@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 """
 from configparser import ConfigParser
 from html import escape
 from logging import getLogger
 from typing import Callable, Dict
 from bs4.element import Tag, PageElement
 logger = getLogger("utils")
 def md_escape(text: str) -> str:
    """Return *text* with Markdown control characters backslash-escaped.

    Escaped characters: backslash, asterisk, square brackets, underscore,
    tilde, pipe and backtick.
    """
    specials = frozenset("\\*[]_~|`")
    return "".join("\\" + ch if ch in specials else ch for ch in text)
 def normalize_config(conf: ConfigParser):
    for section in conf.sections():
        _remove = set()
@ -49,236 +33,3 @@ def normalize_config(conf: ConfigParser):
        for k in _remove:
            logger.info("removing key %r.%r", section, k)
            del conf[section][k]
 def node_to_html(el: PageElement) -> str:
    """Flatten a parsed node into text that uses a small set of HTML tags.

    Text nodes are HTML-escaped.  Known tags are rewritten by the table
    below, with ``strong``/``em``/``del``/``ins`` sharing the rules of
    ``b``/``i``/``s``/``u``.  Any other tag is replaced by its rendered
    children.
    """
    if not isinstance(el, Tag):
        # Text nodes return before the dispatch table is built.
        return escape(str(el))

    def inner(tag: Tag) -> str:
        # Rendered children of *tag*, concatenated without a separator.
        return str.join("", map(node_to_html, tag.children))

    def wrap(template: str) -> Callable[[Tag], str]:
        # Converter that substitutes the rendered children into *template*.
        return lambda tag: template % inner(tag)

    def list_item(node: PageElement) -> str:
        # Line breaks inside an item get a three-space continuation indent.
        return node_to_html(node).replace("\n", "\n   ").rstrip()

    transforms: Dict[str, Callable[[Tag], str]] = {
        # .get(): an anchor without href must not raise KeyError.
        "a": lambda tag: '<a href="{}">{}</a>'.format(
            escape(tag.attrs.get("href", "#")), inner(tag)
        ),
        "p": wrap("%s\n\n"),
        "i": wrap("<i>%s</i>"),
        "b": wrap("<b>%s</b>"),
        "s": wrap("<s>%s</s>"),
        "u": wrap("<u>%s</u>"),
        "pre": wrap("\n<pre>%s</pre>\n"),
        "code": wrap("<code>%s</code>"),
        # Only spans carrying the _mfm_blur_ class are wrapped; other spans
        # are reduced to their content.
        "span": lambda tag: (
            '<span class="tg-spoiler">%s</span>'
            if "_mfm_blur_" in tag.attrs.get("class", "")
            else "%s"
        )
        % inner(tag),
        "blockquote": lambda tag: "\n%s"
        % "\n".join("| %s" % part for part in inner(tag).split("\n")),
        "br": lambda _: "\n",
        # NOTE may fail on nested lists
        "ul": lambda tag: (
            "\n"
            + "\n".join(" \u2022 " + list_item(li) for li in tag.children)
            + "\n"
        ),
        "ol": lambda tag: (
            "\n"
            + "\n".join(
                "%d. %s" % (i, list_item(li))
                for i, li in enumerate(tag.children, 1)
            )
            + "\n"
        ),
    }
    aliases: Dict[str, str] = {
        "strong": "b",
        "em": "i",
        "del": "s",
        "ins": "u",
    }
    handler = transforms.get(aliases.get(el.name, el.name))
    if handler is None:
        # Unknown tag: keep the content, drop the tag itself.
        return inner(el)
    return handler(el)
 def node_to_markdown(el: PageElement) -> str:
    """Flatten a parsed node into Markdown-style text.

    Text nodes are passed through md_escape().  Known tags are rewritten by
    the table below, with ``strong``/``em``/``del``/``ins`` sharing the
    rules of ``b``/``i``/``s``/``u``.  Any other tag is replaced by its
    rendered children.
    """
    if not isinstance(el, Tag):
        # Text nodes return before the dispatch table is built.
        return md_escape(str(el))

    def inner(tag: Tag) -> str:
        # Rendered children of *tag*, concatenated without a separator.
        return str.join("", map(node_to_markdown, tag.children))

    def wrap(template: str) -> Callable[[Tag], str]:
        # Converter that substitutes the rendered children into *template*.
        return lambda tag: template % inner(tag)

    def list_item(node: PageElement) -> str:
        # Line breaks inside an item get a three-space continuation indent.
        return node_to_markdown(node).replace("\n", "\n   ").rstrip()

    transforms: Dict[str, Callable[[Tag], str]] = {
        # The link text is not escaped again: its text nodes were already
        # escaped when they were rendered, and a second md_escape() would
        # double every backslash.  .get(): an anchor without href must not
        # raise KeyError.
        "a": lambda tag: "[{}]({})".format(
            inner(tag), tag.attrs.get("href", "#")
        ),
        "p": wrap("%s\n\n"),
        "i": wrap("_%s_"),
        "b": wrap("*%s*"),
        "s": wrap("~%s~"),
        "u": wrap("__%s__"),
        "pre": wrap("\n``%s``\n"),
        "code": wrap("`%s`"),
        # Only spans carrying the _mfm_blur_ class are wrapped; other spans
        # are reduced to their content.
        "span": lambda tag: (
            "||%s||" if "_mfm_blur_" in tag.attrs.get("class", "") else "%s"
        )
        % inner(tag),
        "blockquote": lambda tag: "\n%s"
        % "\n".join("\u258d%s" % part for part in inner(tag).split("\n")),
        "br": lambda _: "\n",
        # NOTE may fail on nested lists
        "ul": lambda tag: "\n%s\n"
        % "\n".join(" \u2022 " + list_item(li) for li in tag.children),
        "ol": lambda tag: "\n%s\n"
        % "\n".join(
            "%d. %s" % (i, list_item(li))
            for i, li in enumerate(tag.children, 1)
        ),
    }
    aliases: Dict[str, str] = {
        "strong": "b",
        "em": "i",
        "del": "s",
        "ins": "u",
    }
    handler = transforms.get(aliases.get(el.name, el.name))
    if handler is None:
        # Unknown tag: keep the content, drop the tag itself.
        return inner(el)
    return handler(el)
 def node_to_plaintext(el: PageElement) -> str:
    """Flatten a parsed node into plain text.

    Text nodes are returned unchanged.  Links become ``text (target)``,
    paragraphs end with a blank line, ``<br>`` is a line break, quoted
    lines get a U+258D prefix and list items a bullet or a number.  Any
    other tag is replaced by its rendered children.
    """
    if not isinstance(el, Tag):
        return str(el)

    name = el.name
    if name == "br":
        return "\n"

    if name in ("ol", "ul"):
        # Line breaks inside an item get a three-space continuation indent.
        items = (
            node_to_plaintext(child).replace("\n", "\n   ").strip()
            for child in el.children
        )
        if name == "ul":
            lines = [" \u2022 %s" % item for item in items]
        else:
            lines = ["%d. %s" % (i, item) for i, item in enumerate(items, 1)]
        return "\n%s\n" % "\n".join(lines)

    content = str.join("", map(node_to_plaintext, el.children))
    if name == "a":
        # .get(): an anchor without href must not raise KeyError.
        return "%s (%s)" % (content, el.attrs.get("href", "#"))
    if name == "p":
        return content + "\n\n"
    if name == "blockquote":
        return "\n".join("\u258d%s" % part for part in content.split("\n"))
    return content