Text node processor, typo fixes, text2* cli program

This commit is contained in:
Casey 2023-05-13 09:33:21 +03:00
parent 32d7784276
commit db100c0f7e
Signed by: hkc
GPG Key ID: F0F6CFE11CDB0960
5 changed files with 66 additions and 12 deletions

View File

@ -14,7 +14,6 @@ GNU General Public License for more details.
""" """
from typing import Callable, Iterable, Literal, Optional from typing import Callable, Iterable, Literal, Optional
from bs4.element import Tag, PageElement from bs4.element import Tag, PageElement
from html import escape
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"] VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
BULLET = "\u2022" BULLET = "\u2022"
@ -39,7 +38,7 @@ node_processors: dict[
list[ list[
Callable[ Callable[
[ [
Tag, PageElement,
], ],
Optional[str], Optional[str],
] ]
@ -49,12 +48,21 @@ node_processors: dict[
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Build a decorator that registers a converter for ``tag``.

    The decorated function is appended to the list stored in
    ``node_processors`` under the ``(output_type, tag)`` key; the list is
    created on first registration. The function itself is returned
    unchanged, so it stays usable under its own name.
    """
    key = (output_type, tag)

    def decorate(function):
        # setdefault creates the list for a key seen for the first time.
        node_processors.setdefault(key, []).append(function)
        return function

    return decorate
def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
    """Build a decorator that installs the text-node converter.

    The decorated function is stored in ``node_processors`` under the
    ``(output_type, ":text:")`` key as a one-element list, replacing any
    list previously stored under that key. The function itself is returned
    unchanged.
    """
    key = (output_type, ":text:")

    def decorate(function):
        # Assignment (not append): a later registration overrides an
        # earlier one for the same output type.
        node_processors[key] = [function]
        return function

    return decorate
def register_fmt_converter( def register_fmt_converter(
format: str, format: str,
tag: str, tag: str,
@ -71,11 +79,15 @@ def register_fmt_converter(
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Convert a single element to the ``type_`` output format.

    For a ``Tag``, the converters registered under ``(type_, el.name)`` are
    tried in order and the first truthy result is returned; when there are
    none, or none yields a truthy result, the tag's children are converted
    with ``nodes_process`` instead.

    For any other element, the first converter registered under
    ``(type_, ":text:")`` is applied, falling back to ``str(el)`` when it
    returns a falsy value. Without such a converter the result is
    ``str(el)``.
    """
    if not isinstance(el, Tag):
        text_converters = node_processors.get((type_, ":text:"))
        if text_converters is None:
            return str(el)
        # Only the first registered text converter is consulted.
        return text_converters[0](el) or str(el)

    for converter in node_processors.get((type_, el.name), ()):
        converted = converter(el)
        if converted:
            return converted

    # No converter produced output: descend into the children.
    return nodes_process(el.children, type_)
def nodes_process( def nodes_process(

View File

@ -0,0 +1,36 @@
"""
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
from mastoposter.text import node_process, VALID_OUTPUT_TYPES
from argparse import ArgumentParser, FileType
from typing import get_args as T_get_args
from bs4 import BeautifulSoup
import sys
# Command-line entry point: read an HTML document and print it converted
# to the selected output type.
parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")
parser.add_argument(
    "--type",
    "-t",
    # Offer exactly the literals listed in VALID_OUTPUT_TYPES; the first
    # one is the default.
    choices=T_get_args(VALID_OUTPUT_TYPES),
    default=T_get_args(VALID_OUTPUT_TYPES)[0],
    dest="output_type",
)
# nargs="?" makes the positional optional. Without it argparse requires the
# argument and the sys.stdin default could never take effect.
parser.add_argument(
    "file",
    nargs="?",
    default=sys.stdin,
    type=FileType("r"),
)

args = parser.parse_args()
soup = BeautifulSoup(args.file.read(), "lxml")
print(node_process(soup, args.output_type))

View File

@ -12,10 +12,12 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
""" """
from bs4 import NavigableString
from mastoposter.text import ( from mastoposter.text import (
nodes_process, nodes_process,
register_converter, register_converter,
register_fmt_converter, register_fmt_converter,
register_text_node_converter,
node_process, node_process,
STRIPE, STRIPE,
BULLET, BULLET,
@ -26,6 +28,11 @@ from bs4.element import Tag
from html import escape from html import escape
@register_text_node_converter("html")
def proc_text_node_to_html(txt: NavigableString) -> str:
    """Return the text node escaped with ``html.escape`` and then stripped
    of leading and trailing whitespace."""
    escaped = escape(txt)
    return escaped.strip()
@register_converter("a", "html") @register_converter("a", "html")
def proc_tag_a_to_html(tag: Tag): def proc_tag_a_to_html(tag: Tag):
return '<a href="%s">%s</a>' % ( return '<a href="%s">%s</a>' % (
@ -81,7 +88,7 @@ def proc_tag_ul_to_html(tag: Tag) -> str:
) )
@register_converter("li", "html") @register_converter("ol", "html")
def proc_tag_li_to_html(tag: Tag) -> str: def proc_tag_li_to_html(tag: Tag) -> str:
return "\n" + str.join( return "\n" + str.join(
"\n", "\n",

View File

@ -63,7 +63,7 @@ def proc_tag_ul_to_markdown(tag: Tag) -> str:
) )
@register_converter("li", "markdown") @register_converter("ol", "markdown")
def proc_tag_li_to_markdown(tag: Tag) -> str: def proc_tag_li_to_markdown(tag: Tag) -> str:
return "\n" + str.join( return "\n" + str.join(
"\n", "\n",

View File

@ -23,14 +23,13 @@ from mastoposter.text import (
) )
from bs4.element import Tag from bs4.element import Tag
from html import escape
@register_converter("a", "plain")
def proc_tag_a_to_plain(tag: Tag):
    """Render an ``<a>`` tag as ``text (href)`` for plain-text output.

    The link text is the tag's children converted with ``nodes_process``;
    the target is the ``href`` attribute, or ``"#"`` when it is absent.
    """
    link_text = nodes_process(tag.children, "plain")
    target = tag.attrs.get("href", "#")
    return f"{link_text} ({target})"
@ -64,7 +63,7 @@ def proc_tag_ul_to_plain(tag: Tag) -> str:
) )
@register_converter("li", "plain") @register_converter("ol", "plain")
def proc_tag_li_to_plain(tag: Tag) -> str: def proc_tag_li_to_plain(tag: Tag) -> str:
return "\n" + str.join( return "\n" + str.join(
"\n", "\n",