diff --git a/mastoposter/text/__init__.py b/mastoposter/text/__init__.py new file mode 100644 index 0000000..e349396 --- /dev/null +++ b/mastoposter/text/__init__.py @@ -0,0 +1,105 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" +from typing import Callable, Iterable, Literal, Optional +from bs4.element import Tag, PageElement + +VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"] +BULLET = "\u2022" +STRIPE = "\u258d" + + +def md_escape(text: str) -> str: + return ( + text.replace("\\", "\\\\") + .replace("*", "\\*") + .replace("[", "\\[") + .replace("]", "\\]") + .replace("_", "\\_") + .replace("~", "\\~") + .replace("|", "\\|") + .replace("`", "\\`") + ) + + +node_processors: dict[ + tuple[VALID_OUTPUT_TYPES, str], + list[ + Callable[ + [ + PageElement, + ], + Optional[str], + ] + ], +] = {} + + +def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"): + def decorate(function): + node_processors.setdefault((output_type, tag), []) + node_processors[output_type, tag].append(function) + return function + + return decorate + + +def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"): + def decorate(function): + node_processors[output_type, ":text:"] = [function] + return function + + return decorate + + +def register_fmt_converter( + format: str, + tag: str, + output_type: VALID_OUTPUT_TYPES = "plain", + separator: str = "", +): + def fmt_tag(el: Tag) -> str: + if "%s" in format: + return format % nodes_process(el.children, output_type, separator) + return format + + register_converter(tag, output_type)(fmt_tag) + + +def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str: + if isinstance(el, Tag): + if (type_, el.name) in node_processors: + for func in node_processors[type_, el.name]: + result = func(el) + if result: + return result + return nodes_process(el.children, type_) + if (type_, ":text:") in node_processors: + return node_processors[type_, ":text:"][0](el) or str(el) + return str(el) + + +def nodes_process( + els: Iterable[PageElement], + type_: VALID_OUTPUT_TYPES = "plain", + separator: str = "", +) -> str: + return str.join(separator, (node_process(el, type_) for el in els)) + + +__all__ = ["node_process", "nodes_process", "md_escape", "BULLET", "STRIPE"] + +import mastoposter.text.html # noqa F401 +import mastoposter.text.markdown # noqa F401 +import mastoposter.text.plain # noqa F401 diff --git a/mastoposter/text/__main__.py b/mastoposter/text/__main__.py new file mode 100644 index 0000000..dd54d35 --- /dev/null +++ b/mastoposter/text/__main__.py @@ -0,0 +1,36 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" + +from mastoposter.text import node_process, VALID_OUTPUT_TYPES +from argparse import ArgumentParser, FileType +from typing import get_args as T_get_args +from bs4 import BeautifulSoup +import sys + +parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter") + +parser.add_argument( + "--type", + "-t", + choices=T_get_args(VALID_OUTPUT_TYPES), + default=T_get_args(VALID_OUTPUT_TYPES)[0], + dest="output_type", +) +parser.add_argument("file", default=sys.stdin, type=FileType("r")) + +args = parser.parse_args() + +soup = BeautifulSoup(args.file.read(), "lxml") +print(node_process(soup, args.output_type)) diff --git a/mastoposter/text/html.py b/mastoposter/text/html.py new file mode 100644 index 0000000..c36806c --- /dev/null +++ b/mastoposter/text/html.py @@ -0,0 +1,100 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" +from bs4 import NavigableString +from mastoposter.text import ( + nodes_process, + register_converter, + register_fmt_converter, + register_text_node_converter, + node_process, + STRIPE, + BULLET, +) + +from typing import Optional +from bs4.element import Tag +from html import escape + + +@register_text_node_converter("html") +def proc_text_node_to_html(txt: NavigableString) -> str: + return escape(txt).strip() + + +@register_converter("a", "html") +def proc_tag_a_to_html(tag: Tag): + return '%s' % ( + escape(tag.attrs.get("href", "#")), + nodes_process(tag.children, "html"), + ) + + +register_fmt_converter("%s\n\n", "p", "html") +register_fmt_converter("%s", "i", "html") +register_fmt_converter("%s", "em", "html") +register_fmt_converter("%s", "b", "html") +register_fmt_converter("%s", "strong", "html") +register_fmt_converter("%s", "s", "html") +register_fmt_converter("%s", "del", "html") +register_fmt_converter("%s", "u", "html") +register_fmt_converter("%s", "ins", "html") +register_fmt_converter("\n", "br", "html") +register_fmt_converter("\n
%s
\n", "pre", "html") +register_fmt_converter("%s", "code", "html") + + +@register_converter("span", "html") +def proc_tag_span_to_html(tag: Tag) -> Optional[str]: + if "_mfm_blur_" in tag.attrs.get("class", ""): + return '%s' % nodes_process( + tag.children, "html" + ) + return None + + +@register_converter("blockquote", "html") +def proc_tag_blockquote_to_html(tag: Tag) -> str: + return str.join( + "\n", + ( + STRIPE + " " + line + for line in nodes_process(tag.children, "html").strip().split("\n") + ), + ) + + +@register_converter("ul", "html") +def proc_tag_ul_to_html(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + BULLET + + " " + + node_process(el, "html").replace("\n", "\n ").rstrip() + for el in tag.children + ), + ) + + +@register_converter("ol", "html") +def proc_tag_li_to_html(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "%d. %s" + % (i, node_process(el, "html").replace("\n", "\n ").rstrip()) + for i, el in enumerate(tag.children, 1) + ), + ) diff --git a/mastoposter/text/markdown.py b/mastoposter/text/markdown.py new file mode 100644 index 0000000..1d39b65 --- /dev/null +++ b/mastoposter/text/markdown.py @@ -0,0 +1,75 @@ +from mastoposter.text import ( + nodes_process, + register_converter, + register_fmt_converter, + node_process, +) + +from typing import Optional +from bs4.element import Tag +from html import escape + + +@register_converter("a", "markdown") +def proc_tag_a_to_markdown(tag: Tag): + return "[%s](%s)" % ( + nodes_process(tag.children, "markdown"), + escape(tag.attrs.get("href", "#")), + ) + + +register_fmt_converter("%s\n\n", "p", "markdown") +register_fmt_converter("*%s*", "i", "markdown") +register_fmt_converter("*%s*", "em", "markdown") +register_fmt_converter("**%s**", "b", "markdown") +register_fmt_converter("**%s**", "strong", "markdown") +register_fmt_converter("~~%s~~", "s", "markdown") +register_fmt_converter("~~%s~~", "del", "markdown") +register_fmt_converter("__%s__", "u", "markdown") +register_fmt_converter("__%s__", "ins", "markdown") +register_fmt_converter("\n", "br", "markdown") +register_fmt_converter("\n```%s```\n", "pre", "markdown") +register_fmt_converter("`%s`", "code", "markdown") + + +@register_converter("span", "markdown") +def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]: + if "_mfm_blur_" in tag.attrs.get("class", ""): + return "||%s||" % nodes_process(tag.children, "markdown") + return None + + +@register_converter("blockquote", "markdown") +def proc_tag_blockquote_to_markdown(tag: Tag) -> str: + return str.join( + "\n", + ( + "> " + line + for line in nodes_process(tag.children, "markdown") + .strip() + .split("\n") + ), + ) + + +@register_converter("ul", "markdown") +def proc_tag_ul_to_markdown(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "* " + node_process(el, "markdown").replace("\n", "\n ").rstrip() + for el in tag.children + ), + ) + + +@register_converter("ol", "markdown") +def proc_tag_li_to_markdown(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "%d. %s" + % (i, node_process(el, "markdown").replace("\n", "\n ").rstrip()) + for i, el in enumerate(tag.children, 1) + ), + ) diff --git a/mastoposter/text/plain.py b/mastoposter/text/plain.py new file mode 100644 index 0000000..ed589db --- /dev/null +++ b/mastoposter/text/plain.py @@ -0,0 +1,75 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" + +from mastoposter.text import ( + nodes_process, + register_converter, + register_fmt_converter, + node_process, + STRIPE, + BULLET, +) + +from bs4.element import Tag + + +@register_converter("a", "plain") +def proc_tag_a_to_plain(tag: Tag): + return "%s (%s)" % ( + nodes_process(tag.children, "plain"), + tag.attrs.get("href", "#"), + ) + + +register_fmt_converter("%s\n\n", "p", "plain") +register_fmt_converter("\n", "br", "plain") + + +@register_converter("blockquote", "plain") +def proc_tag_blockquote_to_plain(tag: Tag) -> str: + return str.join( + "\n", + ( + STRIPE + " " + line + for line in nodes_process(tag.children, "plain") + .strip() + .split("\n") + ), + ) + + +@register_converter("ul", "plain") +def proc_tag_ul_to_plain(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + BULLET + + " " + + node_process(el, "plain").replace("\n", "\n ").rstrip() + for el in tag.children + ), + ) + + +@register_converter("ol", "plain") +def proc_tag_li_to_plain(tag: Tag) -> str: + return "\n" + str.join( + "\n", + ( + "%d. %s" + % (i, node_process(el, "plain").replace("\n", "\n ").rstrip()) + for i, el in enumerate(tag.children, 1) + ), + ) diff --git a/mastoposter/types.py b/mastoposter/types.py index 3b719b9..4e08038 100644 --- a/mastoposter/types.py +++ b/mastoposter/types.py @@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar from bs4 import BeautifulSoup -from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext +from mastoposter.text import node_process def _date(val: str) -> datetime: @@ -355,18 +355,18 @@ class Status: @property def content_flathtml(self) -> str: - return node_to_html( - BeautifulSoup(self.content, features="lxml") + return node_process( + BeautifulSoup(self.content, features="lxml"), "html" ).rstrip() @property def content_markdown(self) -> str: - return node_to_markdown( - BeautifulSoup(self.content, features="lxml") + return node_process( + BeautifulSoup(self.content, features="lxml"), "markdown" ).rstrip() @property def content_plaintext(self) -> str: - return node_to_plaintext( - BeautifulSoup(self.content, features="lxml") + return node_process( + BeautifulSoup(self.content, features="lxml"), "plain" ).rstrip() diff --git a/mastoposter/utils.py b/mastoposter/utils.py index 636d402..2ec843a 100644 --- a/mastoposter/utils.py +++ b/mastoposter/utils.py @@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. """ from configparser import ConfigParser -from html import escape from logging import getLogger -from typing import Callable, Dict -from bs4.element import Tag, PageElement logger = getLogger("utils") -def md_escape(text: str) -> str: - return ( - text.replace("\\", "\\\\") - .replace("*", "\\*") - .replace("[", "\\[") - .replace("]", "\\]") - .replace("_", "\\_") - .replace("~", "\\~") - .replace("|", "\\|") - .replace("`", "\\`") - ) - - def normalize_config(conf: ConfigParser): for section in conf.sections(): _remove = set() @@ -49,238 +33,3 @@ def normalize_config(conf: ConfigParser): for k in _remove: logger.info("removing key %r.%r", section, k) del conf[section][k] - - -def node_to_html(el: PageElement) -> str: - TAG_TRANSFORMS: Dict[ - str, - Callable[ - [ - Tag, - ], - str, - ], - ] = { - "a": lambda tag: '{}'.format( - escape(tag.attrs["href"]), - str.join("", map(node_to_html, tag.children)), - ), - "p": lambda tag: ( - str.join("", map(node_to_html, tag.children)) + "\n\n" - ), - "i": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "b": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "s": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "u": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "pre": lambda tag: ( - "\n
%s
\n" % str.join("", map(node_to_html, tag.children)) - ), - "code": lambda tag: ( - "%s" % str.join("", map(node_to_html, tag.children)) - ), - "span": lambda tag: ( - ( - '%s' - if "_mfm_blur_" in tag.attrs.get("class", "") - else "%s" - ) - % str.join("", map(node_to_html, tag.children)) - ), - "blockquote": lambda tag: "\n%s" - % str.join( - "\n", - ( - "| %s" % part - for part in str.join( - "", map(node_to_html, tag.children) - ).split("\n") - ), - ), - "br": lambda _: "\n", - # NOTE may fail on nested lists - "ul": lambda tag: ( - "\n" - + str.join( - "\n", - ( - " \u2022 " - + node_to_html(li).replace("\n", "\n ").rstrip() - for li in tag.children - ), - ) - + "\n" - ), - "ol": lambda tag: ( - "\n" - + str.join( - "\n", - ( - "%d. %s" - % (i, node_to_html(li).replace("\n", "\n ").rstrip()) - for i, li in enumerate(tag.children, 1) - ), - ) - + "\n" - ), - } - - TAG_SUBSTITUTIONS: Dict[str, str] = { - "strong": "b", - "em": "i", - "del": "s", - "ins": "u", - } - - if isinstance(el, Tag): - if el.name in TAG_TRANSFORMS: - return TAG_TRANSFORMS[el.name](el) - if el.name in TAG_SUBSTITUTIONS: - sub = TAG_SUBSTITUTIONS[el.name] - if sub in TAG_TRANSFORMS: - return TAG_TRANSFORMS[sub](el) - return str.join("", map(node_to_html, el.children)) - return escape(str(el)) - - -def node_to_markdown(el: PageElement) -> str: - """ Convert HTML to Markdown (Discord flavor) """ - - TAG_TRANSFORMS: Dict[ - str, - Callable[ - [ - Tag, - ], - str, - ], - ] = { - "a": lambda tag: "[{}]({})".format( - md_escape(str.join("", map(node_to_markdown, tag.children))), - tag.attrs["href"], - ), - "p": lambda tag: ( - str.join("", map(node_to_markdown, tag.children)) + "\n\n" - ), - "i": lambda tag: ( - "*%s*" % str.join("", map(node_to_markdown, tag.children)) - ), - "b": lambda tag: ( - "**%s**" % str.join("", map(node_to_markdown, tag.children)) - ), - "s": lambda tag: ( - "~~%s~~" % str.join("", map(node_to_markdown, tag.children)) - ), - "u": lambda tag: ( - "__%s__" % str.join("", map(node_to_markdown, tag.children)) - ), - "pre": lambda tag: ( - "\n```%s```\n" % str.join("", map(node_to_markdown, tag.children)) - ), - "code": lambda tag: ( - "`%s`" % str.join("", map(node_to_markdown, tag.children)) - ), - "span": lambda tag: ( - ("||%s||" if "_mfm_blur_" in tag.attrs.get("class", "") else "%s") - % str.join("", map(node_to_markdown, tag.children)) - ), - "blockquote": lambda tag: ( - "\n%s" - % str.join( - "\n", - ( - "> %s" % part - for part in str.join( - "", map(node_to_markdown, tag.children) - ).split("\n") - ), - ) - ), - "br": lambda _: "\n", - # NOTE may fail on nested lists - "ul": lambda tag: ( - "\n%s\n" - % str.join( - "\n", - ( - "* " - + node_to_markdown(li).replace("\n", "\n ").rstrip() - for li in tag.children - ), - ) - ), - "ol": lambda tag: ( - "\n%s\n" - % str.join( - "\n", - ( - "%d. %s" - % (i, node_to_markdown(li).replace("\n", "\n ").rstrip()) - for i, li in enumerate(tag.children, 1) - ), - ) - ), - } - - TAG_SUBSTITUTIONS: Dict[str, str] = { - "strong": "b", - "em": "i", - "del": "s", - "ins": "u", - } - - if isinstance(el, Tag): - if el.name in TAG_TRANSFORMS: - return TAG_TRANSFORMS[el.name](el) - if el.name in TAG_SUBSTITUTIONS: - sub = TAG_SUBSTITUTIONS[el.name] - if sub in TAG_TRANSFORMS: - return TAG_TRANSFORMS[sub](el) - return str.join("", map(node_to_markdown, el.children)) - return md_escape(str(el)) - - -def node_to_plaintext(el: PageElement) -> str: - if isinstance(el, Tag): - if el.name == "a": - return "%s (%s)" % ( - str.join("", map(node_to_plaintext, el.children)), - el.attrs["href"], - ) - elif el.name == "p": - return str.join("", map(node_to_plaintext, el.children)) + "\n\n" - elif el.name == "br": - return "\n" - elif el.name == "blockquote": - return str.join( - "\n", - ( - "\u258d%s" % part - for part in str.join( - "", map(node_to_plaintext, el.children) - ).split("\n") - ), - ) - elif el.name in ("ol", "ul"): - children = map(node_to_plaintext, el.children) - return "\n%s\n" % str.join( - "\n", - ( - " \u2022 %s" % li.replace("\n", "\n ").strip() - for li in children - ) - if el.name == "ul" - else ( - "%d. %s" % (i, li.replace("\n", "\n ").strip()) - for i, li in enumerate(children, 1) - ), - ) - return str.join("", map(node_to_plaintext, el.children)) - return str(el)