Merge branch 'html2everything'

2023-05-13 09:58:55 +03:00 · 2023-05-13 09:58:55 +03:00 · 9889ca251a
parent 9cfbcf635a db100c0f7e
commit 9889ca251a
7 changed files with 398 additions and 258 deletions
--- a/mastoposter/text/__init__.py
+++ b/mastoposter/text/__init__.py
@ -0,0 +1,105 @@
+"""
+mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
+Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+"""
+from typing import Callable, Iterable, Literal, Optional
+from bs4.element import Tag, PageElement
+
+VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
+BULLET = "\u2022"
+STRIPE = "\u258d"
+
+
+def md_escape(text: str) -> str:
+    """Backslash-escape the characters that carry meaning in Markdown.
+
+    The backslash itself is processed first so that the backslashes inserted
+    for the remaining characters are not escaped a second time.
+    """
+    # Order matters: "\\" has to stay at the front of this sequence.
+    for special in ("\\", "*", "[", "]", "_", "~", "|", "`"):
+        text = text.replace(special, "\\" + special)
+    return text
+
+
+# Registry of converters, keyed by (output type, tag name).  Text nodes are
+# stored under the pseudo tag name ":text:".  A converter that returns a
+# falsy value (e.g. None) lets node_process() move on to the next candidate.
+node_processors: dict[
+    tuple[VALID_OUTPUT_TYPES, str],
+    list[Callable[[PageElement], Optional[str]]],
+] = {}
+
+
+def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
+    """Return a decorator that registers a converter for ``tag``.
+
+    The decorated function is appended to the list kept in
+    ``node_processors`` under ``(output_type, tag)`` and is handed back
+    unchanged, so it stays usable under its own name.
+    """
+    key = (output_type, tag)
+
+    def decorate(function):
+        node_processors.setdefault(key, []).append(function)
+        return function
+
+    return decorate
+
+
+def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
+    """Return a decorator installing the text-node converter for a format.
+
+    The entry stored under ``(output_type, ":text:")`` is replaced rather
+    than extended, so only the most recently registered function is kept.
+    """
+    key = (output_type, ":text:")
+
+    def decorate(function):
+        node_processors[key] = [function]
+        return function
+
+    return decorate
+
+
+def register_fmt_converter(
+    format: str,
+    tag: str,
+    output_type: VALID_OUTPUT_TYPES = "plain",
+    separator: str = "",
+):
+    """Register a converter for ``tag`` built from a ``%``-format template.
+
+    When ``format`` contains ``%s`` the placeholder is filled with the tag's
+    children, converted for ``output_type`` and joined with ``separator``.
+    Otherwise ``format`` is emitted as-is and the children are not
+    processed at all.
+    """
+
+    def fmt_tag(el: Tag) -> str:
+        if "%s" not in format:
+            return format
+        rendered_children = nodes_process(el.children, output_type, separator)
+        return format % rendered_children
+
+    register_converter(tag, output_type)(fmt_tag)
+
+
+def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
+    """Convert a single parsed node into text of the requested ``type_``.
+
+    Tags are offered to every converter registered for their name, in
+    registration order; the first truthy result wins.  If none produces one,
+    the tag itself is dropped and only its converted children are returned.
+    Text nodes go through the ``":text:"`` converter when one is registered
+    and fall back to ``str(el)`` when there is none or its result is falsy.
+    """
+    if not isinstance(el, Tag):
+        text_handlers = node_processors.get((type_, ":text:"))
+        if text_handlers is None:
+            return str(el)
+        return text_handlers[0](el) or str(el)
+
+    for convert in node_processors.get((type_, el.name), ()):
+        converted = convert(el)
+        if converted:
+            return converted
+    return nodes_process(el.children, type_)
+
+
+def nodes_process(
+    els: Iterable[PageElement],
+    type_: VALID_OUTPUT_TYPES = "plain",
+    separator: str = "",
+) -> str:
+    """Convert each of ``els`` and join the pieces with ``separator``."""
+    converted = (node_process(el, type_) for el in els)
+    return separator.join(converted)
+
+
+__all__ = ["node_process", "nodes_process", "md_escape", "BULLET", "STRIPE"]
+
+# These imports are deliberately placed at the bottom: each submodule imports
+# the register_* helpers defined above, so those have to exist before the
+# submodule body runs, and running that body is what registers its
+# converters.
+import mastoposter.text.html  # noqa F401
+import mastoposter.text.markdown  # noqa F401
+import mastoposter.text.plain  # noqa F401
--- a/mastoposter/text/__main__.py
+++ b/mastoposter/text/__main__.py
@ -0,0 +1,36 @@
+"""
+mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
+Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+"""
+
+from mastoposter.text import node_process, VALID_OUTPUT_TYPES
+from argparse import ArgumentParser, FileType
+from typing import get_args as T_get_args
+from bs4 import BeautifulSoup
+import sys
+
+# Command-line entry point: read an HTML document and print it converted to
+# the selected output type.
+parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")
+
+parser.add_argument(
+    "--type",
+    "-t",
+    choices=T_get_args(VALID_OUTPUT_TYPES),
+    default=T_get_args(VALID_OUTPUT_TYPES)[0],
+    dest="output_type",
+)
+# nargs="?" makes the positional optional.  argparse ignores ``default`` on a
+# plain (required) positional, so without it stdin could never be selected.
+parser.add_argument(
+    "file",
+    nargs="?",
+    default=sys.stdin,
+    type=FileType("r"),
+)
+
+args = parser.parse_args()
+
+soup = BeautifulSoup(args.file.read(), "lxml")
+print(node_process(soup, args.output_type))
--- a/mastoposter/text/html.py
+++ b/mastoposter/text/html.py
@ -0,0 +1,100 @@
+"""
+mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
+Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+"""
+from bs4 import NavigableString
+from mastoposter.text import (
+    nodes_process,
+    register_converter,
+    register_fmt_converter,
+    register_text_node_converter,
+    node_process,
+    STRIPE,
+    BULLET,
+)
+
+from typing import Optional
+from bs4.element import Tag
+from html import escape
+
+
+@register_text_node_converter("html")
+def proc_text_node_to_html(txt: NavigableString) -> str:
+    """HTML-escape a text node without touching its whitespace.
+
+    Leading and trailing whitespace is kept on purpose: it is what separates
+    the text from neighbouring inline tags (``Hello <b>world</b>``), and the
+    markdown and plain outputs keep it as well.
+    """
+    return escape(txt)
+
+
+@register_converter("a", "html")
+def proc_tag_a_to_html(tag: Tag):
+    """Render a link as an ``<a>`` element around its converted children.
+
+    The ``href`` value is HTML-escaped; a missing ``href`` becomes ``#``.
+    """
+    href = escape(tag.attrs.get("href", "#"))
+    label = nodes_process(tag.children, "html")
+    return f'<a href="{href}">{label}</a>'
+
+
+# Templates for the tags that only wrap or replace their children; "%s"
+# marks where the converted children are inserted.
+for _template, _tag_name in (
+    ("%s\n\n", "p"),
+    ("<i>%s</i>", "i"),
+    ("<i>%s</i>", "em"),
+    ("<b>%s</b>", "b"),
+    ("<b>%s</b>", "strong"),
+    ("<s>%s</s>", "s"),
+    ("<s>%s</s>", "del"),
+    ("<u>%s</u>", "u"),
+    ("<u>%s</u>", "ins"),
+    ("\n", "br"),
+    ("\n<pre>%s</pre>\n", "pre"),
+    ("<code>%s</code>", "code"),
+):
+    register_fmt_converter(_template, _tag_name, "html")
+# Keep the loop variables out of the module namespace.
+del _template, _tag_name
+
+
+@register_converter("span", "html")
+def proc_tag_span_to_html(tag: Tag) -> Optional[str]:
+    """Turn ``_mfm_blur_`` spans into ``tg-spoiler`` spans.
+
+    Returns None for any other span, i.e. the span is not handled here.
+    """
+    if "_mfm_blur_" not in tag.attrs.get("class", ""):
+        return None
+    inner = nodes_process(tag.children, "html")
+    return '<span class="tg-spoiler">%s</span>' % inner
+
+
+@register_converter("blockquote", "html")
+def proc_tag_blockquote_to_html(tag: Tag) -> str:
+    """Render a quote by prefixing each of its lines with ``STRIPE``."""
+    quoted = nodes_process(tag.children, "html").strip()
+    return "\n".join(f"{STRIPE} {line}" for line in quoted.split("\n"))
+
+
+@register_converter("ul", "html")
+def proc_tag_ul_to_html(tag: Tag) -> str:
+    """Render an unordered list, one ``BULLET``-prefixed entry per child.
+
+    Line breaks inside an entry are followed by three spaces so that
+    continuation lines are indented under the entry.
+    """
+    entries = []
+    for child in tag.children:
+        body = node_process(child, "html").replace("\n", "\n   ").rstrip()
+        entries.append(f"{BULLET} {body}")
+    return "\n" + "\n".join(entries)
+
+
+@register_converter("ol", "html")
+def proc_tag_li_to_html(tag: Tag) -> str:
+    """Render an ordered list, numbering the children starting from 1.
+
+    Line breaks inside an entry are followed by three spaces so that
+    continuation lines are indented under the entry.
+    """
+    entries = []
+    for number, child in enumerate(tag.children, start=1):
+        body = node_process(child, "html").replace("\n", "\n   ").rstrip()
+        entries.append(f"{number}. {body}")
+    return "\n" + "\n".join(entries)
--- a/mastoposter/text/markdown.py
+++ b/mastoposter/text/markdown.py
@ -0,0 +1,75 @@
+from mastoposter.text import (
+    nodes_process,
+    register_converter,
+    register_fmt_converter,
+    node_process,
+)
+
+from typing import Optional
+from bs4.element import Tag
+from html import escape
+
+
+@register_converter("a", "markdown")
+def proc_tag_a_to_markdown(tag: Tag):
+    """Render a link as ``[text](url)``.
+
+    The URL is emitted verbatim.  HTML-escaping it would turn ``&`` in query
+    strings into ``&amp;`` and break the link, because Markdown output is
+    not HTML.  A missing ``href`` becomes ``#``.
+    """
+    return "[%s](%s)" % (
+        nodes_process(tag.children, "markdown"),
+        tag.attrs.get("href", "#"),
+    )
+
+
+# Templates for the tags that only wrap or replace their children; "%s"
+# marks where the converted children are inserted.
+for _template, _tag_name in (
+    ("%s\n\n", "p"),
+    ("*%s*", "i"),
+    ("*%s*", "em"),
+    ("**%s**", "b"),
+    ("**%s**", "strong"),
+    ("~~%s~~", "s"),
+    ("~~%s~~", "del"),
+    ("__%s__", "u"),
+    ("__%s__", "ins"),
+    ("\n", "br"),
+    ("\n```%s```\n", "pre"),
+    ("`%s`", "code"),
+):
+    register_fmt_converter(_template, _tag_name, "markdown")
+# Keep the loop variables out of the module namespace.
+del _template, _tag_name
+
+
+@register_converter("span", "markdown")
+def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]:
+    """Wrap ``_mfm_blur_`` spans in ``||`` spoiler markers.
+
+    Returns None for any other span, i.e. the span is not handled here.
+    """
+    if "_mfm_blur_" not in tag.attrs.get("class", ""):
+        return None
+    inner = nodes_process(tag.children, "markdown")
+    return "||%s||" % inner
+
+
+@register_converter("blockquote", "markdown")
+def proc_tag_blockquote_to_markdown(tag: Tag) -> str:
+    """Render a quote by prefixing each of its lines with ``> ``."""
+    quoted = nodes_process(tag.children, "markdown").strip()
+    return "\n".join(f"> {line}" for line in quoted.split("\n"))
+
+
+@register_converter("ul", "markdown")
+def proc_tag_ul_to_markdown(tag: Tag) -> str:
+    """Render an unordered list, one ``* ``-prefixed entry per child.
+
+    Line breaks inside an entry are followed by three spaces so that
+    continuation lines are indented under the entry.
+    """
+    entries = []
+    for child in tag.children:
+        body = node_process(child, "markdown").replace("\n", "\n   ").rstrip()
+        entries.append(f"* {body}")
+    return "\n" + "\n".join(entries)
+
+
+@register_converter("ol", "markdown")
+def proc_tag_li_to_markdown(tag: Tag) -> str:
+    """Render an ordered list, numbering the children starting from 1.
+
+    Line breaks inside an entry are followed by three spaces so that
+    continuation lines are indented under the entry.
+    """
+    entries = []
+    for number, child in enumerate(tag.children, start=1):
+        body = node_process(child, "markdown").replace("\n", "\n   ").rstrip()
+        entries.append(f"{number}. {body}")
+    return "\n" + "\n".join(entries)
--- a/mastoposter/text/plain.py
+++ b/mastoposter/text/plain.py
@ -0,0 +1,75 @@
+"""
+mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
+Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+"""
+
+from mastoposter.text import (
+    nodes_process,
+    register_converter,
+    register_fmt_converter,
+    node_process,
+    STRIPE,
+    BULLET,
+)
+
+from bs4.element import Tag
+
+
+@register_converter("a", "plain")
+def proc_tag_a_to_plain(tag: Tag):
+    """Render a link as its text followed by the target in parentheses.
+
+    A missing ``href`` is shown as ``#``.
+    """
+    label = nodes_process(tag.children, "plain")
+    target = tag.attrs.get("href", "#")
+    return f"{label} ({target})"
+
+
+# Paragraphs end with a blank line; <br> becomes a single line break.
+for _template, _tag_name in (("%s\n\n", "p"), ("\n", "br")):
+    register_fmt_converter(_template, _tag_name, "plain")
+# Keep the loop variables out of the module namespace.
+del _template, _tag_name
+
+
+@register_converter("blockquote", "plain")
+def proc_tag_blockquote_to_plain(tag: Tag) -> str:
+    """Render a quote by prefixing each of its lines with ``STRIPE``."""
+    quoted = nodes_process(tag.children, "plain").strip()
+    return "\n".join(f"{STRIPE} {line}" for line in quoted.split("\n"))
+
+
+@register_converter("ul", "plain")
+def proc_tag_ul_to_plain(tag: Tag) -> str:
+    """Render an unordered list, one ``BULLET``-prefixed entry per child.
+
+    Line breaks inside an entry are followed by three spaces so that
+    continuation lines are indented under the entry.
+    """
+    entries = []
+    for child in tag.children:
+        body = node_process(child, "plain").replace("\n", "\n   ").rstrip()
+        entries.append(f"{BULLET} {body}")
+    return "\n" + "\n".join(entries)
+
+
+@register_converter("ol", "plain")
+def proc_tag_li_to_plain(tag: Tag) -> str:
+    """Render an ordered list, numbering the children starting from 1.
+
+    Line breaks inside an entry are followed by three spaces so that
+    continuation lines are indented under the entry.
+    """
+    entries = []
+    for number, child in enumerate(tag.children, start=1):
+        body = node_process(child, "plain").replace("\n", "\n   ").rstrip()
+        entries.append(f"{number}. {body}")
+    return "\n" + "\n".join(entries)
--- a/mastoposter/types.py
+++ b/mastoposter/types.py
@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar

 from bs4 import BeautifulSoup

-from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext
+from mastoposter.text import node_process


 def _date(val: str) -> datetime:
@ -355,18 +355,18 @@ class Status:

    @property
    def content_flathtml(self) -> str:
-        return node_to_html(
-            BeautifulSoup(self.content, features="lxml")
+        return node_process(
+            BeautifulSoup(self.content, features="lxml"), "html"
        ).rstrip()

    @property
    def content_markdown(self) -> str:
-        return node_to_markdown(
-            BeautifulSoup(self.content, features="lxml")
+        return node_process(
+            BeautifulSoup(self.content, features="lxml"), "markdown"
        ).rstrip()

    @property
    def content_plaintext(self) -> str:
-        return node_to_plaintext(
-            BeautifulSoup(self.content, features="lxml")
+        return node_process(
+            BeautifulSoup(self.content, features="lxml"), "plain"
        ).rstrip()
--- a/mastoposter/utils.py
+++ b/mastoposter/utils.py
@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 """
 from configparser import ConfigParser
-from html import escape
 from logging import getLogger
-from typing import Callable, Dict
-from bs4.element import Tag, PageElement

 logger = getLogger("utils")


-def md_escape(text: str) -> str:
-    return (
-        text.replace("\\", "\\\\")
-        .replace("*", "\\*")
-        .replace("[", "\\[")
-        .replace("]", "\\]")
-        .replace("_", "\\_")
-        .replace("~", "\\~")
-        .replace("|", "\\|")
-        .replace("`", "\\`")
-    )
-
-
 def normalize_config(conf: ConfigParser):
    for section in conf.sections():
        _remove = set()
@ -49,238 +33,3 @@ def normalize_config(conf: ConfigParser):
        for k in _remove:
            logger.info("removing key %r.%r", section, k)
            del conf[section][k]
-
-
-def node_to_html(el: PageElement) -> str:
-    TAG_TRANSFORMS: Dict[
-        str,
-        Callable[
-            [
-                Tag,
-            ],
-            str,
-        ],
-    ] = {
-        "a": lambda tag: '<a href="{}">{}</a>'.format(
-            escape(tag.attrs["href"]),
-            str.join("", map(node_to_html, tag.children)),
-        ),
-        "p": lambda tag: (
-            str.join("", map(node_to_html, tag.children)) + "\n\n"
-        ),
-        "i": lambda tag: (
-            "<i>%s</i>" % str.join("", map(node_to_html, tag.children))
-        ),
-        "b": lambda tag: (
-            "<b>%s</b>" % str.join("", map(node_to_html, tag.children))
-        ),
-        "s": lambda tag: (
-            "<s>%s</s>" % str.join("", map(node_to_html, tag.children))
-        ),
-        "u": lambda tag: (
-            "<u>%s</u>" % str.join("", map(node_to_html, tag.children))
-        ),
-        "pre": lambda tag: (
-            "\n<pre>%s</pre>\n" % str.join("", map(node_to_html, tag.children))
-        ),
-        "code": lambda tag: (
-            "<code>%s</code>" % str.join("", map(node_to_html, tag.children))
-        ),
-        "span": lambda tag: (
-            (
-                '<span class="tg-spoiler">%s</span>'
-                if "_mfm_blur_" in tag.attrs.get("class", "")
-                else "%s"
-            )
-            % str.join("", map(node_to_html, tag.children))
-        ),
-        "blockquote": lambda tag: "\n%s"
-        % str.join(
-            "\n",
-            (
-                "| %s" % part
-                for part in str.join(
-                    "", map(node_to_html, tag.children)
-                ).split("\n")
-            ),
-        ),
-        "br": lambda _: "\n",
-        # NOTE may fail on nested lists
-        "ul": lambda tag: (
-            "\n"
-            + str.join(
-                "\n",
-                (
-                    " \u2022 "
-                    + node_to_html(li).replace("\n", "\n   ").rstrip()
-                    for li in tag.children
-                ),
-            )
-            + "\n"
-        ),
-        "ol": lambda tag: (
-            "\n"
-            + str.join(
-                "\n",
-                (
-                    "%d. %s"
-                    % (i, node_to_html(li).replace("\n", "\n   ").rstrip())
-                    for i, li in enumerate(tag.children, 1)
-                ),
-            )
-            + "\n"
-        ),
-    }
-
-    TAG_SUBSTITUTIONS: Dict[str, str] = {
-        "strong": "b",
-        "em": "i",
-        "del": "s",
-        "ins": "u",
-    }
-
-    if isinstance(el, Tag):
-        if el.name in TAG_TRANSFORMS:
-            return TAG_TRANSFORMS[el.name](el)
-        if el.name in TAG_SUBSTITUTIONS:
-            sub = TAG_SUBSTITUTIONS[el.name]
-            if sub in TAG_TRANSFORMS:
-                return TAG_TRANSFORMS[sub](el)
-        return str.join("", map(node_to_html, el.children))
-    return escape(str(el))
-
-
-def node_to_markdown(el: PageElement) -> str:
-    """ Convert HTML to Markdown (Discord flavor) """
-
-    TAG_TRANSFORMS: Dict[
-        str,
-        Callable[
-            [
-                Tag,
-            ],
-            str,
-        ],
-    ] = {
-        "a": lambda tag: "[{}]({})".format(
-            md_escape(str.join("", map(node_to_markdown, tag.children))),
-            tag.attrs["href"],
-        ),
-        "p": lambda tag: (
-            str.join("", map(node_to_markdown, tag.children)) + "\n\n"
-        ),
-        "i": lambda tag: (
-            "*%s*" % str.join("", map(node_to_markdown, tag.children))
-        ),
-        "b": lambda tag: (
-            "**%s**" % str.join("", map(node_to_markdown, tag.children))
-        ),
-        "s": lambda tag: (
-            "~~%s~~" % str.join("", map(node_to_markdown, tag.children))
-        ),
-        "u": lambda tag: (
-            "__%s__" % str.join("", map(node_to_markdown, tag.children))
-        ),
-        "pre": lambda tag: (
-            "\n```%s```\n" % str.join("", map(node_to_markdown, tag.children))
-        ),
-        "code": lambda tag: (
-            "`%s`" % str.join("", map(node_to_markdown, tag.children))
-        ),
-        "span": lambda tag: (
-            ("||%s||" if "_mfm_blur_" in tag.attrs.get("class", "") else "%s")
-            % str.join("", map(node_to_markdown, tag.children))
-        ),
-        "blockquote": lambda tag: (
-            "\n%s"
-            % str.join(
-                "\n",
-                (
-                    "> %s" % part
-                    for part in str.join(
-                        "", map(node_to_markdown, tag.children)
-                    ).split("\n")
-                ),
-            )
-        ),
-        "br": lambda _: "\n",
-        # NOTE may fail on nested lists
-        "ul": lambda tag: (
-            "\n%s\n"
-            % str.join(
-                "\n",
-                (
-                    "* "
-                    + node_to_markdown(li).replace("\n", "\n   ").rstrip()
-                    for li in tag.children
-                ),
-            )
-        ),
-        "ol": lambda tag: (
-            "\n%s\n"
-            % str.join(
-                "\n",
-                (
-                    "%d. %s"
-                    % (i, node_to_markdown(li).replace("\n", "\n   ").rstrip())
-                    for i, li in enumerate(tag.children, 1)
-                ),
-            )
-        ),
-    }
-
-    TAG_SUBSTITUTIONS: Dict[str, str] = {
-        "strong": "b",
-        "em": "i",
-        "del": "s",
-        "ins": "u",
-    }
-
-    if isinstance(el, Tag):
-        if el.name in TAG_TRANSFORMS:
-            return TAG_TRANSFORMS[el.name](el)
-        if el.name in TAG_SUBSTITUTIONS:
-            sub = TAG_SUBSTITUTIONS[el.name]
-            if sub in TAG_TRANSFORMS:
-                return TAG_TRANSFORMS[sub](el)
-        return str.join("", map(node_to_markdown, el.children))
-    return md_escape(str(el))
-
-
-def node_to_plaintext(el: PageElement) -> str:
-    if isinstance(el, Tag):
-        if el.name == "a":
-            return "%s (%s)" % (
-                str.join("", map(node_to_plaintext, el.children)),
-                el.attrs["href"],
-            )
-        elif el.name == "p":
-            return str.join("", map(node_to_plaintext, el.children)) + "\n\n"
-        elif el.name == "br":
-            return "\n"
-        elif el.name == "blockquote":
-            return str.join(
-                "\n",
-                (
-                    "\u258d%s" % part
-                    for part in str.join(
-                        "", map(node_to_plaintext, el.children)
-                    ).split("\n")
-                ),
-            )
-        elif el.name in ("ol", "ul"):
-            children = map(node_to_plaintext, el.children)
-            return "\n%s\n" % str.join(
-                "\n",
-                (
-                    " \u2022 %s" % li.replace("\n", "\n   ").strip()
-                    for li in children
-                )
-                if el.name == "ul"
-                else (
-                    "%d. %s" % (i, li.replace("\n", "\n   ").strip())
-                    for i, li in enumerate(children, 1)
-                ),
-            )
-        return str.join("", map(node_to_plaintext, el.children))
-    return str(el)