Text node processor, typo fixes, text2* cli program
This commit is contained in:
parent
32d7784276
commit
db100c0f7e
|
@ -14,7 +14,6 @@ GNU General Public License for more details.
|
|||
"""
|
||||
from typing import Callable, Iterable, Literal, Optional
|
||||
from bs4.element import Tag, PageElement
|
||||
from html import escape
|
||||
|
||||
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
|
||||
BULLET = "\u2022"
|
||||
|
@ -39,7 +38,7 @@ node_processors: dict[
|
|||
list[
|
||||
Callable[
|
||||
[
|
||||
Tag,
|
||||
PageElement,
|
||||
],
|
||||
Optional[str],
|
||||
]
|
||||
|
@ -49,12 +48,21 @@ node_processors: dict[
|
|||
|
||||
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Build a decorator that registers a converter for ``tag``.

    The decorated function is appended to the list stored in
    ``node_processors`` under the ``(output_type, tag)`` key, so several
    converters may be registered for the same tag and output type.  The
    function itself is returned unchanged, which allows stacking several
    registrations on one function.
    """
    key = (output_type, tag)

    def decorate(function):
        # Create the list on first registration, then append to it.
        node_processors.setdefault(key, []).append(function)
        return function

    return decorate
|
||||
|
||||
|
||||
def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
    """Build a decorator that installs the text-node converter.

    The decorated function is stored in ``node_processors`` under the
    ``(output_type, ":text:")`` key as a single-element list, replacing any
    converter previously stored under that key.  The function itself is
    returned unchanged.
    """
    slot = (output_type, ":text:")

    def decorate(function):
        # Assignment (not append): only one text converter per output type.
        node_processors[slot] = [function]
        return function

    return decorate
|
||||
|
||||
|
||||
def register_fmt_converter(
|
||||
format: str,
|
||||
tag: str,
|
||||
|
@ -71,11 +79,15 @@ def register_fmt_converter(
|
|||
|
||||
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Convert a single parsed node into text of the requested output type.

    For a ``Tag``, the converters registered under ``(type_, el.name)`` are
    tried in registration order and the first truthy result is returned.  If
    no converter is registered, or none produces output, the tag's children
    are processed instead via ``nodes_process``.

    For any other node (e.g. a text node), the converter registered under
    ``(type_, ":text:")`` is used when present; if it is absent or returns a
    falsy value, the node's ``str()`` form is returned.
    """
    if isinstance(el, Tag):
        # Call each converter exactly once per node; the first one that
        # yields non-empty output wins.
        for func in node_processors.get((type_, el.name), ()):
            result = func(el)
            if result:
                return result
        # No converter handled this tag: recurse into its children rather
        # than dumping the escaped markup, which would be wrong for
        # non-HTML output types.
        return nodes_process(el.children, type_)

    text_converters = node_processors.get((type_, ":text:"))
    if text_converters:
        return text_converters[0](el) or str(el)
    return str(el)
|
||||
|
||||
|
||||
def nodes_process(
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
"""
|
||||
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
"""
|
||||
|
||||
from mastoposter.text import node_process, VALID_OUTPUT_TYPES
|
||||
from argparse import ArgumentParser, FileType
|
||||
from typing import get_args as T_get_args
|
||||
from bs4 import BeautifulSoup
|
||||
import sys
|
||||
|
||||
# Command-line entry point: read an HTML document and print it converted to
# the selected output type.
parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")

parser.add_argument(
    "--type",
    "-t",
    choices=T_get_args(VALID_OUTPUT_TYPES),
    # Default to the first output type declared in VALID_OUTPUT_TYPES.
    default=T_get_args(VALID_OUTPUT_TYPES)[0],
    dest="output_type",
    help="output format (default: %(default)s)",
)
# nargs="?" makes the positional optional.  Without it argparse always
# requires the argument, so the stdin default could never take effect.
parser.add_argument(
    "file",
    nargs="?",
    default=sys.stdin,
    type=FileType("r"),
    help="HTML file to convert (default: standard input)",
)

args = parser.parse_args()

soup = BeautifulSoup(args.file.read(), "lxml")
print(node_process(soup, args.output_type))
|
|
@ -12,10 +12,12 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
"""
|
||||
from bs4 import NavigableString
|
||||
from mastoposter.text import (
|
||||
nodes_process,
|
||||
register_converter,
|
||||
register_fmt_converter,
|
||||
register_text_node_converter,
|
||||
node_process,
|
||||
STRIPE,
|
||||
BULLET,
|
||||
|
@ -26,6 +28,11 @@ from bs4.element import Tag
|
|||
from html import escape
|
||||
|
||||
|
||||
@register_text_node_converter("html")
def proc_text_node_to_html(txt: NavigableString) -> str:
    """Convert a text node to HTML-safe text.

    The text is passed through ``html.escape`` and then stripped of leading
    and trailing whitespace.
    """
    escaped = escape(txt)
    return escaped.strip()
|
||||
|
||||
|
||||
@register_converter("a", "html")
|
||||
def proc_tag_a_to_html(tag: Tag):
|
||||
return '<a href="%s">%s</a>' % (
|
||||
|
@ -81,7 +88,7 @@ def proc_tag_ul_to_html(tag: Tag) -> str:
|
|||
)
|
||||
|
||||
|
||||
@register_converter("li", "html")
|
||||
@register_converter("ol", "html")
|
||||
def proc_tag_li_to_html(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
|
|
|
@ -63,7 +63,7 @@ def proc_tag_ul_to_markdown(tag: Tag) -> str:
|
|||
)
|
||||
|
||||
|
||||
@register_converter("li", "markdown")
|
||||
@register_converter("ol", "markdown")
|
||||
def proc_tag_li_to_markdown(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
|
|
|
@ -23,14 +23,13 @@ from mastoposter.text import (
|
|||
)
|
||||
|
||||
from bs4.element import Tag
|
||||
from html import escape
|
||||
|
||||
|
||||
@register_converter("a", "plain")
def proc_tag_a_to_plain(tag: Tag):
    """Render an ``<a>`` element as plain text in the form ``text (href)``.

    The link text comes from processing the tag's children as plain text.
    The URL is emitted verbatim: the output is not HTML, so the href must not
    be HTML-escaped.  A missing ``href`` attribute falls back to ``"#"``.
    """
    # Exactly two values for the two placeholders; passing both the escaped
    # and the raw href would raise TypeError at format time.
    return "%s (%s)" % (
        nodes_process(tag.children, "plain"),
        tag.attrs.get("href", "#"),
    )
|
||||
|
||||
|
||||
|
@ -64,7 +63,7 @@ def proc_tag_ul_to_plain(tag: Tag) -> str:
|
|||
)
|
||||
|
||||
|
||||
@register_converter("li", "plain")
|
||||
@register_converter("ol", "plain")
|
||||
def proc_tag_li_to_plain(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
|
|
Loading…
Reference in New Issue