Merge branch 'html2everything'
This commit is contained in:
commit
9889ca251a
|
@ -0,0 +1,105 @@
|
|||
"""
|
||||
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
"""
|
||||
from typing import Callable, Iterable, Literal, Optional
|
||||
from bs4.element import Tag, PageElement
|
||||
|
||||
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
|
||||
BULLET = "\u2022"
|
||||
STRIPE = "\u258d"
|
||||
|
||||
|
||||
def md_escape(text: str) -> str:
    """Return *text* with Markdown control characters backslash-escaped."""
    # Characters that get a backslash put in front of them; the backslash
    # itself is part of the set so existing backslashes are doubled.
    special = "\\*[]_~|`"
    table = str.maketrans({char: "\\" + char for char in special})
    return text.translate(table)
|
||||
|
||||
|
||||
# Converter registry. Keys are (output type, tag name) pairs; the pseudo tag
# name ":text:" holds the converter used for non-tag nodes. Each value is the
# list of converters tried for that key; a converter takes the node and
# returns the rendered string or None.
node_processors: dict[
    tuple[VALID_OUTPUT_TYPES, str],
    list[Callable[[PageElement], Optional[str]]],
] = dict()
|
||||
|
||||
|
||||
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
    """Create a decorator registering a converter for *tag* elements.

    The decorated function is appended to the converters already stored in
    ``node_processors`` for ``(output_type, tag)`` and is returned unchanged.
    """
    key = (output_type, tag)

    def decorate(function):
        # setdefault creates the list on first registration for this key.
        node_processors.setdefault(key, []).append(function)
        return function

    return decorate
|
||||
|
||||
|
||||
def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
    """Create a decorator installing the text-node converter for *output_type*.

    Unlike ``register_converter`` this replaces whatever was stored before:
    the ``":text:"`` entry always holds exactly one function.
    """

    def decorate(function):
        node_processors.update({(output_type, ":text:"): [function]})
        return function

    return decorate
|
||||
|
||||
|
||||
def register_fmt_converter(
    format: str,
    tag: str,
    output_type: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
):
    """Register a converter for *tag* that is driven by a ``%``-template.

    When *format* contains ``%s`` the placeholder receives the element's
    rendered children (joined with *separator*); otherwise *format* is
    emitted as-is and the children are ignored.
    """
    wants_children = "%s" in format

    def fmt_tag(el: Tag) -> str:
        if not wants_children:
            return format
        rendered = nodes_process(el.children, output_type, separator)
        return format % rendered

    register_converter(tag, output_type)(fmt_tag)
|
||||
|
||||
|
||||
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
    """Render a single parsed node as *type_* output.

    For a tag, the converters registered for ``(type_, el.name)`` are tried
    in registration order and the first one that does not return ``None``
    wins; if none handles the tag, its children are rendered instead.
    Any other node goes through the ``":text:"`` converter when one is
    registered, falling back to ``str(el)``.
    """
    if isinstance(el, Tag):
        for func in node_processors.get((type_, el.name), ()):
            result = func(el)
            # None means "not handled, try the next converter". An empty
            # string is a valid result (a converter may deliberately drop an
            # element) and must not fall through to the children.
            if result is not None:
                return result
        return nodes_process(el.children, type_)

    text_converters = node_processors.get((type_, ":text:"))
    if text_converters:
        # Kept as `or`: an empty conversion falls back to the raw text.
        return text_converters[0](el) or str(el)
    return str(el)
|
||||
|
||||
|
||||
def nodes_process(
    els: Iterable[PageElement],
    type_: VALID_OUTPUT_TYPES = "plain",
    separator: str = "",
) -> str:
    """Render every node in *els* as *type_* and join them with *separator*."""
    rendered = [node_process(node, type_) for node in els]
    return separator.join(rendered)
|
||||
|
||||
|
||||
# Public API of the package. The register_* helpers and VALID_OUTPUT_TYPES
# are listed as well because the converter modules and the command-line
# entry point import them from here.
__all__ = [
    "node_process",
    "nodes_process",
    "md_escape",
    "BULLET",
    "STRIPE",
    "VALID_OUTPUT_TYPES",
    "register_converter",
    "register_fmt_converter",
    "register_text_node_converter",
]
|
||||
|
||||
import mastoposter.text.html # noqa F401
|
||||
import mastoposter.text.markdown # noqa F401
|
||||
import mastoposter.text.plain # noqa F401
|
|
@ -0,0 +1,36 @@
|
|||
"""
|
||||
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
"""
|
||||
|
||||
from mastoposter.text import node_process, VALID_OUTPUT_TYPES
|
||||
from argparse import ArgumentParser, FileType
|
||||
from typing import get_args as T_get_args
|
||||
from bs4 import BeautifulSoup
|
||||
import sys
|
||||
|
||||
# Command-line entry point: read an HTML document and print it converted to
# the requested output type.
parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")

parser.add_argument(
    "--type",
    "-t",
    choices=T_get_args(VALID_OUTPUT_TYPES),
    default=T_get_args(VALID_OUTPUT_TYPES)[0],
    dest="output_type",
)
# nargs="?" makes the positional optional. Without it argparse always
# requires the argument, so the stdin default could never take effect.
parser.add_argument("file", nargs="?", default=sys.stdin, type=FileType("r"))

args = parser.parse_args()

soup = BeautifulSoup(args.file.read(), "lxml")
print(node_process(soup, args.output_type))
|
|
@ -0,0 +1,100 @@
|
|||
"""
|
||||
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
"""
|
||||
from bs4 import NavigableString
|
||||
from mastoposter.text import (
|
||||
nodes_process,
|
||||
register_converter,
|
||||
register_fmt_converter,
|
||||
register_text_node_converter,
|
||||
node_process,
|
||||
STRIPE,
|
||||
BULLET,
|
||||
)
|
||||
|
||||
from typing import Optional
|
||||
from bs4.element import Tag
|
||||
from html import escape
|
||||
|
||||
|
||||
@register_text_node_converter("html")
def proc_text_node_to_html(txt: NavigableString) -> str:
    """HTML-escape a text node for the ``html`` output type.

    Surrounding whitespace is kept: stripping it here would glue text to
    neighbouring inline elements ("Hello <b>world</b>" would lose its space).
    """
    return escape(txt)
|
||||
|
||||
|
||||
@register_converter("a", "html")
|
||||
def proc_tag_a_to_html(tag: Tag):
|
||||
return '<a href="%s">%s</a>' % (
|
||||
escape(tag.attrs.get("href", "#")),
|
||||
nodes_process(tag.children, "html"),
|
||||
)
|
||||
|
||||
|
||||
# Template-driven converters for the "html" output type, as
# (template, tag name) pairs. Synonymous tags share one template.
for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("<i>%s</i>", "i"),
    ("<i>%s</i>", "em"),
    ("<b>%s</b>", "b"),
    ("<b>%s</b>", "strong"),
    ("<s>%s</s>", "s"),
    ("<s>%s</s>", "del"),
    ("<u>%s</u>", "u"),
    ("<u>%s</u>", "ins"),
    ("\n", "br"),
    ("\n<pre>%s</pre>\n", "pre"),
    ("<code>%s</code>", "code"),
):
    register_fmt_converter(_template, _tag_name, "html")
# Do not leave the loop variables behind as module attributes.
del _template, _tag_name
|
||||
|
||||
|
||||
@register_converter("span", "html")
|
||||
def proc_tag_span_to_html(tag: Tag) -> Optional[str]:
|
||||
if "_mfm_blur_" in tag.attrs.get("class", ""):
|
||||
return '<span class="tg-spoiler">%s</span>' % nodes_process(
|
||||
tag.children, "html"
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
@register_converter("blockquote", "html")
|
||||
def proc_tag_blockquote_to_html(tag: Tag) -> str:
|
||||
return str.join(
|
||||
"\n",
|
||||
(
|
||||
STRIPE + " " + line
|
||||
for line in nodes_process(tag.children, "html").strip().split("\n")
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@register_converter("ul", "html")
|
||||
def proc_tag_ul_to_html(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
(
|
||||
BULLET
|
||||
+ " "
|
||||
+ node_process(el, "html").replace("\n", "\n ").rstrip()
|
||||
for el in tag.children
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@register_converter("ol", "html")
|
||||
def proc_tag_li_to_html(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
(
|
||||
"%d. %s"
|
||||
% (i, node_process(el, "html").replace("\n", "\n ").rstrip())
|
||||
for i, el in enumerate(tag.children, 1)
|
||||
),
|
||||
)
|
|
@ -0,0 +1,75 @@
|
|||
from mastoposter.text import (
|
||||
nodes_process,
|
||||
register_converter,
|
||||
register_fmt_converter,
|
||||
node_process,
|
||||
)
|
||||
|
||||
from typing import Optional
|
||||
from bs4.element import Tag
|
||||
from html import escape
|
||||
|
||||
|
||||
@register_converter("a", "markdown")
|
||||
def proc_tag_a_to_markdown(tag: Tag):
|
||||
return "[%s](%s)" % (
|
||||
nodes_process(tag.children, "markdown"),
|
||||
escape(tag.attrs.get("href", "#")),
|
||||
)
|
||||
|
||||
|
||||
# Template-driven converters for the "markdown" output type, as
# (template, tag name) pairs. Synonymous tags share one template.
for _template, _tag_name in (
    ("%s\n\n", "p"),
    ("*%s*", "i"),
    ("*%s*", "em"),
    ("**%s**", "b"),
    ("**%s**", "strong"),
    ("~~%s~~", "s"),
    ("~~%s~~", "del"),
    ("__%s__", "u"),
    ("__%s__", "ins"),
    ("\n", "br"),
    ("\n```%s```\n", "pre"),
    ("`%s`", "code"),
):
    register_fmt_converter(_template, _tag_name, "markdown")
# Do not leave the loop variables behind as module attributes.
del _template, _tag_name
|
||||
|
||||
|
||||
@register_converter("span", "markdown")
|
||||
def proc_tag_span_to_markdown(tag: Tag) -> Optional[str]:
|
||||
if "_mfm_blur_" in tag.attrs.get("class", ""):
|
||||
return "||%s||" % nodes_process(tag.children, "markdown")
|
||||
return None
|
||||
|
||||
|
||||
@register_converter("blockquote", "markdown")
|
||||
def proc_tag_blockquote_to_markdown(tag: Tag) -> str:
|
||||
return str.join(
|
||||
"\n",
|
||||
(
|
||||
"> " + line
|
||||
for line in nodes_process(tag.children, "markdown")
|
||||
.strip()
|
||||
.split("\n")
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@register_converter("ul", "markdown")
|
||||
def proc_tag_ul_to_markdown(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
(
|
||||
"* " + node_process(el, "markdown").replace("\n", "\n ").rstrip()
|
||||
for el in tag.children
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@register_converter("ol", "markdown")
|
||||
def proc_tag_li_to_markdown(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
(
|
||||
"%d. %s"
|
||||
% (i, node_process(el, "markdown").replace("\n", "\n ").rstrip())
|
||||
for i, el in enumerate(tag.children, 1)
|
||||
),
|
||||
)
|
|
@ -0,0 +1,75 @@
|
|||
"""
|
||||
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
"""
|
||||
|
||||
from mastoposter.text import (
|
||||
nodes_process,
|
||||
register_converter,
|
||||
register_fmt_converter,
|
||||
node_process,
|
||||
STRIPE,
|
||||
BULLET,
|
||||
)
|
||||
|
||||
from bs4.element import Tag
|
||||
|
||||
|
||||
@register_converter("a", "plain")
|
||||
def proc_tag_a_to_plain(tag: Tag):
|
||||
return "%s (%s)" % (
|
||||
nodes_process(tag.children, "plain"),
|
||||
tag.attrs.get("href", "#"),
|
||||
)
|
||||
|
||||
|
||||
# Template-driven converters for the "plain" output type: paragraphs end
# with a blank line and <br> becomes a newline.
register_fmt_converter(format="%s\n\n", tag="p", output_type="plain")
register_fmt_converter(format="\n", tag="br", output_type="plain")
|
||||
|
||||
|
||||
@register_converter("blockquote", "plain")
|
||||
def proc_tag_blockquote_to_plain(tag: Tag) -> str:
|
||||
return str.join(
|
||||
"\n",
|
||||
(
|
||||
STRIPE + " " + line
|
||||
for line in nodes_process(tag.children, "plain")
|
||||
.strip()
|
||||
.split("\n")
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@register_converter("ul", "plain")
|
||||
def proc_tag_ul_to_plain(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
(
|
||||
BULLET
|
||||
+ " "
|
||||
+ node_process(el, "plain").replace("\n", "\n ").rstrip()
|
||||
for el in tag.children
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@register_converter("ol", "plain")
|
||||
def proc_tag_li_to_plain(tag: Tag) -> str:
|
||||
return "\n" + str.join(
|
||||
"\n",
|
||||
(
|
||||
"%d. %s"
|
||||
% (i, node_process(el, "plain").replace("\n", "\n ").rstrip())
|
||||
for i, el in enumerate(tag.children, 1)
|
||||
),
|
||||
)
|
|
@ -18,7 +18,7 @@ from typing import Any, Callable, Optional, List, Literal, TypeVar
|
|||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from mastoposter.utils import node_to_html, node_to_markdown, node_to_plaintext
|
||||
from mastoposter.text import node_process
|
||||
|
||||
|
||||
def _date(val: str) -> datetime:
|
||||
|
@ -355,18 +355,18 @@ class Status:
|
|||
|
||||
@property
def content_flathtml(self) -> str:
    """Status content converted to the "html" output type.

    ``self.content`` is parsed with the lxml parser and rendered through
    ``node_process``; trailing whitespace is removed.
    """
    return node_process(
        BeautifulSoup(self.content, features="lxml"), "html"
    ).rstrip()
|
||||
|
||||
@property
def content_markdown(self) -> str:
    """Status content converted to the "markdown" output type.

    ``self.content`` is parsed with the lxml parser and rendered through
    ``node_process``; trailing whitespace is removed.
    """
    return node_process(
        BeautifulSoup(self.content, features="lxml"), "markdown"
    ).rstrip()
|
||||
|
||||
@property
def content_plaintext(self) -> str:
    """Status content converted to the "plain" output type.

    ``self.content`` is parsed with the lxml parser and rendered through
    ``node_process``; trailing whitespace is removed.
    """
    return node_process(
        BeautifulSoup(self.content, features="lxml"), "plain"
    ).rstrip()
|
||||
|
|
|
@ -13,27 +13,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|||
GNU General Public License for more details.
|
||||
"""
|
||||
from configparser import ConfigParser
|
||||
from html import escape
|
||||
from logging import getLogger
|
||||
from typing import Callable, Dict
|
||||
from bs4.element import Tag, PageElement
|
||||
|
||||
logger = getLogger("utils")
|
||||
|
||||
|
||||
def md_escape(text: str) -> str:
    """Return *text* with Markdown control characters backslash-escaped."""
    # One pass over the text; the backslash is in the set, so backslashes
    # already present are doubled.
    replacements = {ord(char): "\\" + char for char in "\\*[]_~|`"}
    return text.translate(replacements)
|
||||
|
||||
|
||||
def normalize_config(conf: ConfigParser):
|
||||
for section in conf.sections():
|
||||
_remove = set()
|
||||
|
@ -49,238 +33,3 @@ def normalize_config(conf: ConfigParser):
|
|||
for k in _remove:
|
||||
logger.info("removing key %r.%r", section, k)
|
||||
del conf[section][k]
|
||||
|
||||
|
||||
def node_to_html(el: PageElement) -> str:
    """Render a parsed node as a reduced HTML subset.

    Non-tag nodes are HTML-escaped. Known tags are rewritten (``strong``,
    ``em``, ``del`` and ``ins`` are treated as ``b``, ``i``, ``s`` and ``u``);
    any other tag is replaced by its rendered children. A link without an
    ``href`` attribute gets ``#`` as target instead of raising ``KeyError``.
    """
    if not isinstance(el, Tag):
        return escape(str(el))

    # Synonymous tags share the rendering of their short form.
    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    if name == "br":
        return "\n"

    if name in ("ul", "ol"):
        # NOTE may fail on nested lists
        items = [
            node_to_html(li).replace("\n", "\n ").rstrip()
            for li in el.children
        ]
        if name == "ul":
            lines = [" \u2022 " + item for item in items]
        else:
            lines = ["%d. %s" % (i, item) for i, item in enumerate(items, 1)]
        return "\n" + "\n".join(lines) + "\n"

    inner = "".join(map(node_to_html, el.children))

    if name == "a":
        return '<a href="{}">{}</a>'.format(
            escape(el.attrs.get("href", "#")), inner
        )
    if name == "p":
        return inner + "\n\n"
    if name in ("i", "b", "s", "u", "code"):
        return "<%s>%s</%s>" % (name, inner, name)
    if name == "pre":
        return "\n<pre>%s</pre>\n" % inner
    if name == "span":
        if "_mfm_blur_" in el.attrs.get("class", ""):
            return '<span class="tg-spoiler">%s</span>' % inner
        return inner
    if name == "blockquote":
        return "\n" + "\n".join("| %s" % part for part in inner.split("\n"))
    return inner
|
||||
|
||||
|
||||
def node_to_markdown(el: PageElement) -> str:
    """Convert HTML to Markdown (Discord flavor).

    Non-tag nodes are escaped with ``md_escape``. Known tags are rewritten
    (``strong``, ``em``, ``del`` and ``ins`` are treated as ``b``, ``i``,
    ``s`` and ``u``); any other tag is replaced by its rendered children.
    A link without an ``href`` attribute gets ``#`` as target instead of
    raising ``KeyError``.
    """
    if not isinstance(el, Tag):
        return md_escape(str(el))

    # Synonymous tags share the rendering of their short form.
    aliases = {"strong": "b", "em": "i", "del": "s", "ins": "u"}
    name = aliases.get(el.name, el.name)

    if name == "br":
        return "\n"

    if name in ("ul", "ol"):
        # NOTE may fail on nested lists
        items = [
            node_to_markdown(li).replace("\n", "\n ").rstrip()
            for li in el.children
        ]
        if name == "ul":
            lines = ["* " + item for item in items]
        else:
            lines = ["%d. %s" % (i, item) for i, item in enumerate(items, 1)]
        return "\n%s\n" % "\n".join(lines)

    inner = "".join(map(node_to_markdown, el.children))

    # Inline tags that simply wrap their content in a marker.
    wrappers = {"i": "*", "b": "**", "s": "~~", "u": "__", "code": "`"}
    if name in wrappers:
        marker = wrappers[name]
        return marker + inner + marker
    if name == "a":
        # NOTE(review): the label is escaped a second time here although its
        # text nodes were already escaped - kept as-is, confirm the intent.
        return "[{}]({})".format(md_escape(inner), el.attrs.get("href", "#"))
    if name == "p":
        return inner + "\n\n"
    if name == "pre":
        return "\n```%s```\n" % inner
    if name == "span":
        if "_mfm_blur_" in el.attrs.get("class", ""):
            return "||%s||" % inner
        return inner
    if name == "blockquote":
        return "\n" + "\n".join("> %s" % part for part in inner.split("\n"))
    return inner
|
||||
|
||||
|
||||
def node_to_plaintext(el: PageElement) -> str:
    """Render a parsed node as plain text.

    Non-tag nodes are returned as ``str(el)``. Links, paragraphs, line
    breaks, quotes and lists get a textual rendering; any other tag is
    replaced by its rendered children. A link without an ``href`` attribute
    gets ``#`` as target instead of raising ``KeyError``.
    """
    if not isinstance(el, Tag):
        return str(el)

    name = el.name
    if name == "br":
        return "\n"

    if name in ("ol", "ul"):
        items = [
            node_to_plaintext(li).replace("\n", "\n ").strip()
            for li in el.children
        ]
        if name == "ul":
            lines = [" \u2022 %s" % item for item in items]
        else:
            lines = ["%d. %s" % (i, item) for i, item in enumerate(items, 1)]
        return "\n%s\n" % "\n".join(lines)

    inner = "".join(map(node_to_plaintext, el.children))

    if name == "a":
        return "%s (%s)" % (inner, el.attrs.get("href", "#"))
    if name == "p":
        return inner + "\n\n"
    if name == "blockquote":
        return "\n".join("\u258d%s" % part for part in inner.split("\n"))
    return inner
|
||||
|
|
Loading…
Reference in New Issue