Text node processor, typo fixes, text2* cli program

This commit is contained in:
Casey 2023-05-13 09:33:21 +03:00
parent 32d7784276
commit db100c0f7e
Signed by: hkc
GPG Key ID: F0F6CFE11CDB0960
5 changed files with 66 additions and 12 deletions

View File

@ -14,7 +14,6 @@ GNU General Public License for more details.
"""
from typing import Callable, Iterable, Literal, Optional
from bs4.element import Tag, PageElement
from html import escape
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
BULLET = "\u2022"
@ -39,7 +38,7 @@ node_processors: dict[
list[
Callable[
[
Tag,
PageElement,
],
Optional[str],
]
@ -49,12 +48,21 @@ node_processors: dict[
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Return a decorator that registers a converter for a tag.

    The decorated function is appended to the list stored in
    ``node_processors`` under the ``(output_type, tag)`` key, so several
    converters may be registered for the same key. The function itself is
    returned unchanged, which allows stacking several registrations on it.
    """

    def _register(converter):
        # setdefault creates the list on first registration and hands back
        # the stored list either way, so one call covers both cases.
        node_processors.setdefault((output_type, tag), []).append(converter)
        return converter

    return _register
def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
    """Return a decorator that sets the text-node converter for a format.

    The converter is stored in ``node_processors`` under the
    ``(output_type, ":text:")`` key as a one-element list, replacing any
    converter stored there before. The decorated function is returned
    unchanged.
    """

    def _register(converter):
        text_key = (output_type, ":text:")
        # Assignment (not append): a later registration overwrites the
        # earlier one for the same output type.
        node_processors[text_key] = [converter]
        return converter

    return _register
def register_fmt_converter(
format: str,
tag: str,
@ -71,11 +79,15 @@ def register_fmt_converter(
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Convert a single parsed node into text of the requested output type.

    For a ``Tag``, the converters registered under ``(type_, el.name)`` are
    tried in registration order and the first truthy result is returned. If
    no converter is registered, or none returns a truthy value, the tag's
    children are processed with ``nodes_process`` instead.

    For any other node, the converter registered under ``(type_, ":text:")``
    is used when present; if it is missing or returns a falsy value, the
    node's ``str()`` form is returned.
    """
    if isinstance(el, Tag):
        # .get() with an empty default: a tag without registered converters
        # must not raise KeyError, it simply falls through to its children.
        for func in node_processors.get((type_, el.name), ()):
            result = func(el)
            if result:
                return result
        return nodes_process(el.children, type_)

    text_converters = node_processors.get((type_, ":text:"))
    if text_converters:
        # Only the first entry is consulted for text nodes.
        return text_converters[0](el) or str(el)
    return str(el)
def nodes_process(

View File

@ -0,0 +1,36 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from mastoposter.text import node_process, VALID_OUTPUT_TYPES
from argparse import ArgumentParser, FileType
from typing import get_args as T_get_args
from bs4 import BeautifulSoup
import sys
# Command-line entry point: read HTML from a file (or stdin) and print it
# converted to the selected output type.
parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")
parser.add_argument(
    "--type",
    "-t",
    choices=T_get_args(VALID_OUTPUT_TYPES),
    default=T_get_args(VALID_OUTPUT_TYPES)[0],
    dest="output_type",
    help="output format (default: %(default)s)",
)
# nargs="?" makes the positional optional. Without it argparse treats "file"
# as required and the stdin default can never take effect.
parser.add_argument(
    "file",
    nargs="?",
    default=sys.stdin,
    type=FileType("r"),
    help="input HTML file (default: standard input)",
)
args = parser.parse_args()

soup = BeautifulSoup(args.file.read(), "lxml")
print(node_process(soup, args.output_type))

View File

@ -12,10 +12,12 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from bs4 import NavigableString
from mastoposter.text import (
nodes_process,
register_converter,
register_fmt_converter,
register_text_node_converter,
node_process,
STRIPE,
BULLET,
@ -26,6 +28,11 @@ from bs4.element import Tag
from html import escape
@register_text_node_converter("html")
def proc_text_node_to_html(txt: NavigableString) -> str:
    """Return the text node HTML-escaped, with surrounding whitespace removed."""
    escaped = escape(txt)
    return escaped.strip()
@register_converter("a", "html")
def proc_tag_a_to_html(tag: Tag):
return '<a href="%s">%s</a>' % (
@ -81,7 +88,7 @@ def proc_tag_ul_to_html(tag: Tag) -> str:
)
@register_converter("li", "html")
@register_converter("ol", "html")
def proc_tag_li_to_html(tag: Tag) -> str:
return "\n" + str.join(
"\n",

View File

@ -63,7 +63,7 @@ def proc_tag_ul_to_markdown(tag: Tag) -> str:
)
@register_converter("li", "markdown")
@register_converter("ol", "markdown")
def proc_tag_li_to_markdown(tag: Tag) -> str:
return "\n" + str.join(
"\n",

View File

@ -23,14 +23,13 @@ from mastoposter.text import (
)
from bs4.element import Tag
from html import escape
@register_converter("a", "plain")
def proc_tag_a_to_plain(tag: Tag):
    """Render an ``<a>`` tag as plain text: ``<link text> (<href>)``.

    The link text is the plain-text conversion of the tag's children. The
    href is emitted as-is (``"#"`` when the attribute is missing): plain
    output is not HTML, so the URL must not be HTML-escaped.
    """
    # Exactly two values for the two placeholders; passing the href twice
    # (escaped and raw) would raise TypeError at format time.
    return "%s (%s)" % (
        nodes_process(tag.children, "plain"),
        tag.attrs.get("href", "#"),
    )
@ -64,7 +63,7 @@ def proc_tag_ul_to_plain(tag: Tag) -> str:
)
@register_converter("li", "plain")
@register_converter("ol", "plain")
def proc_tag_li_to_plain(tag: Tag) -> str:
return "\n" + str.join(
"\n",