From db100c0f7e5a607face2821fd02c2f97f6d2d256 Mon Sep 17 00:00:00 2001 From: hkc Date: Sat, 13 May 2023 09:33:21 +0300 Subject: [PATCH] Text node processor, typo fixes, text2* cli program --- mastoposter/text/__init__.py | 26 +++++++++++++++++++------- mastoposter/text/__main__.py | 36 ++++++++++++++++++++++++++++++++++++ mastoposter/text/html.py | 9 ++++++++- mastoposter/text/markdown.py | 2 +- mastoposter/text/plain.py | 5 ++--- 5 files changed, 66 insertions(+), 12 deletions(-) create mode 100644 mastoposter/text/__main__.py diff --git a/mastoposter/text/__init__.py b/mastoposter/text/__init__.py index ec4848d..e349396 100644 --- a/mastoposter/text/__init__.py +++ b/mastoposter/text/__init__.py @@ -14,7 +14,6 @@ GNU General Public License for more details. """ from typing import Callable, Iterable, Literal, Optional from bs4.element import Tag, PageElement -from html import escape VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"] BULLET = "\u2022" @@ -39,7 +38,7 @@ node_processors: dict[ list[ Callable[ [ - Tag, + PageElement, ], Optional[str], ] @@ -49,12 +48,21 @@ node_processors: dict[ def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"): def decorate(function): + node_processors.setdefault((output_type, tag), []) node_processors[output_type, tag].append(function) return function return decorate +def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"): + def decorate(function): + node_processors[output_type, ":text:"] = [function] + return function + + return decorate + + def register_fmt_converter( format: str, tag: str, @@ -71,11 +79,15 @@ def register_fmt_converter( def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str: if isinstance(el, Tag): - for func in node_processors[type_, el.name]: - result = func(el) # XXX: could use walrus, but it's py3.8+ only - if result: - return result - return escape(str(el)) + if (type_, el.name) in node_processors: + for func in node_processors[type_, el.name]: + result = func(el) + if result: + return result + return nodes_process(el.children, type_) + if (type_, ":text:") in node_processors: + return node_processors[type_, ":text:"][0](el) or str(el) + return str(el) def nodes_process( diff --git a/mastoposter/text/__main__.py b/mastoposter/text/__main__.py new file mode 100644 index 0000000..dd54d35 --- /dev/null +++ b/mastoposter/text/__main__.py @@ -0,0 +1,36 @@ +""" +mastoposter - configurable reposter from Mastodon-compatible Fediverse servers +Copyright (C) 2022-2023 hatkidchan + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +""" + +from mastoposter.text import node_process, VALID_OUTPUT_TYPES +from argparse import ArgumentParser, FileType +from typing import get_args as T_get_args +from bs4 import BeautifulSoup +import sys + +parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter") + +parser.add_argument( + "--type", + "-t", + choices=T_get_args(VALID_OUTPUT_TYPES), + default=T_get_args(VALID_OUTPUT_TYPES)[0], + dest="output_type", +) +parser.add_argument("file", default=sys.stdin, type=FileType("r")) + +args = parser.parse_args() + +soup = BeautifulSoup(args.file.read(), "lxml") +print(node_process(soup, args.output_type)) diff --git a/mastoposter/text/html.py b/mastoposter/text/html.py index 9cc35c8..c36806c 100644 --- a/mastoposter/text/html.py +++ b/mastoposter/text/html.py @@ -12,10 +12,12 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. """ +from bs4 import NavigableString from mastoposter.text import ( nodes_process, register_converter, register_fmt_converter, + register_text_node_converter, node_process, STRIPE, BULLET, @@ -26,6 +28,11 @@ from bs4.element import Tag from html import escape +@register_text_node_converter("html") +def proc_text_node_to_html(txt: NavigableString) -> str: + return escape(txt).strip() + + @register_converter("a", "html") def proc_tag_a_to_html(tag: Tag): return '%s' % ( @@ -81,7 +88,7 @@ def proc_tag_ul_to_html(tag: Tag) -> str: ) -@register_converter("li", "html") +@register_converter("ol", "html") def proc_tag_li_to_html(tag: Tag) -> str: return "\n" + str.join( "\n", diff --git a/mastoposter/text/markdown.py b/mastoposter/text/markdown.py index 23935d3..1d39b65 100644 --- a/mastoposter/text/markdown.py +++ b/mastoposter/text/markdown.py @@ -63,7 +63,7 @@ def proc_tag_ul_to_markdown(tag: Tag) -> str: ) -@register_converter("li", "markdown") +@register_converter("ol", "markdown") def proc_tag_li_to_markdown(tag: Tag) -> str: return "\n" + str.join( "\n", diff --git a/mastoposter/text/plain.py b/mastoposter/text/plain.py index 8fcbb30..ed589db 100644 --- a/mastoposter/text/plain.py +++ b/mastoposter/text/plain.py @@ -23,14 +23,13 @@ from mastoposter.text import ( ) from bs4.element import Tag -from html import escape @register_converter("a", "plain") def proc_tag_a_to_plain(tag: Tag): return "%s (%s)" % ( nodes_process(tag.children, "plain"), - escape(tag.attrs.get("href", "#")), + tag.attrs.get("href", "#"), ) @@ -64,7 +63,7 @@ def proc_tag_ul_to_plain(tag: Tag) -> str: ) -@register_converter("li", "plain") +@register_converter("ol", "plain") def proc_tag_li_to_plain(tag: Tag) -> str: return "\n" + str.join( "\n",