Text node processor, typo fixes, text2* cli program
This commit is contained in:
parent
32d7784276
commit
db100c0f7e
|
@ -14,7 +14,6 @@ GNU General Public License for more details.
|
||||||
"""
|
"""
|
||||||
from typing import Callable, Iterable, Literal, Optional
|
from typing import Callable, Iterable, Literal, Optional
|
||||||
from bs4.element import Tag, PageElement
|
from bs4.element import Tag, PageElement
|
||||||
from html import escape
|
|
||||||
|
|
||||||
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
|
VALID_OUTPUT_TYPES = Literal["plain", "html", "markdown"]
|
||||||
BULLET = "\u2022"
|
BULLET = "\u2022"
|
||||||
|
@ -39,7 +38,7 @@ node_processors: dict[
|
||||||
list[
|
list[
|
||||||
Callable[
|
Callable[
|
||||||
[
|
[
|
||||||
Tag,
|
PageElement,
|
||||||
],
|
],
|
||||||
Optional[str],
|
Optional[str],
|
||||||
]
|
]
|
||||||
|
@ -49,12 +48,21 @@ node_processors: dict[
|
||||||
|
|
||||||
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
|
def register_converter(tag: str, output_type: VALID_OUTPUT_TYPES = "plain"):
|
||||||
def decorate(function):
|
def decorate(function):
|
||||||
|
node_processors.setdefault((output_type, tag), [])
|
||||||
node_processors[output_type, tag].append(function)
|
node_processors[output_type, tag].append(function)
|
||||||
return function
|
return function
|
||||||
|
|
||||||
return decorate
|
return decorate
|
||||||
|
|
||||||
|
|
||||||
|
def register_text_node_converter(output_type: VALID_OUTPUT_TYPES = "plain"):
    """Return a decorator installing the text-node converter for a format.

    The decorated function is stored as the sole entry under the
    ``(output_type, ":text:")`` key of ``node_processors``, replacing any
    converter registered there earlier. The function itself is returned
    unchanged, so it stays usable under its own name.
    """
    slot = (output_type, ":text:")

    def decorate(function):
        # Assign a fresh one-element list instead of appending: the slot
        # holds exactly one converter at a time.
        node_processors[slot] = [function]
        return function

    return decorate
|
||||||
|
|
||||||
|
|
||||||
def register_fmt_converter(
|
def register_fmt_converter(
|
||||||
format: str,
|
format: str,
|
||||||
tag: str,
|
tag: str,
|
||||||
|
@ -71,11 +79,15 @@ def register_fmt_converter(
|
||||||
|
|
||||||
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
|
def node_process(el: PageElement, type_: VALID_OUTPUT_TYPES) -> str:
|
||||||
if isinstance(el, Tag):
|
if isinstance(el, Tag):
|
||||||
|
if (type_, el.name) in node_processors:
|
||||||
for func in node_processors[type_, el.name]:
|
for func in node_processors[type_, el.name]:
|
||||||
result = func(el) # XXX: could use walrus, but it's py3.8+ only
|
result = func(el)
|
||||||
if result:
|
if result:
|
||||||
return result
|
return result
|
||||||
return escape(str(el))
|
return nodes_process(el.children, type_)
|
||||||
|
if (type_, ":text:") in node_processors:
|
||||||
|
return node_processors[type_, ":text:"][0](el) or str(el)
|
||||||
|
return str(el)
|
||||||
|
|
||||||
|
|
||||||
def nodes_process(
|
def nodes_process(
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
"""
|
||||||
|
mastoposter - configurable reposter from Mastodon-compatible Fediverse servers
|
||||||
|
Copyright (C) 2022-2023 hatkidchan <hatkidchan@gmail.com>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from mastoposter.text import node_process, VALID_OUTPUT_TYPES
|
||||||
|
from argparse import ArgumentParser, FileType
|
||||||
|
from typing import get_args as T_get_args
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Command-line entry point: read HTML from a file (or standard input) and
# print it converted to the selected output type.
parser = ArgumentParser("mastoposter.text", description="HTML-to-* converter")

# The accepted values are derived from the VALID_OUTPUT_TYPES Literal so the
# CLI cannot drift from it; the first literal value serves as the default.
parser.add_argument(
    "--type",
    "-t",
    choices=T_get_args(VALID_OUTPUT_TYPES),
    default=T_get_args(VALID_OUTPUT_TYPES)[0],
    dest="output_type",
)
# nargs="?" makes the positional optional. Without it argparse always
# requires the argument, so the sys.stdin default could never take effect.
parser.add_argument("file", nargs="?", default=sys.stdin, type=FileType("r"))

args = parser.parse_args()

soup = BeautifulSoup(args.file.read(), "lxml")
print(node_process(soup, args.output_type))
|
|
@ -12,10 +12,12 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
GNU General Public License for more details.
|
GNU General Public License for more details.
|
||||||
"""
|
"""
|
||||||
|
from bs4 import NavigableString
|
||||||
from mastoposter.text import (
|
from mastoposter.text import (
|
||||||
nodes_process,
|
nodes_process,
|
||||||
register_converter,
|
register_converter,
|
||||||
register_fmt_converter,
|
register_fmt_converter,
|
||||||
|
register_text_node_converter,
|
||||||
node_process,
|
node_process,
|
||||||
STRIPE,
|
STRIPE,
|
||||||
BULLET,
|
BULLET,
|
||||||
|
@ -26,6 +28,11 @@ from bs4.element import Tag
|
||||||
from html import escape
|
from html import escape
|
||||||
|
|
||||||
|
|
||||||
|
@register_text_node_converter("html")
def proc_text_node_to_html(txt: NavigableString) -> str:
    """Convert a text node to HTML output.

    The text is passed through ``html.escape`` and then has its leading and
    trailing whitespace removed.
    """
    escaped = escape(txt)
    return escaped.strip()
|
||||||
|
|
||||||
|
|
||||||
@register_converter("a", "html")
|
@register_converter("a", "html")
|
||||||
def proc_tag_a_to_html(tag: Tag):
|
def proc_tag_a_to_html(tag: Tag):
|
||||||
return '<a href="%s">%s</a>' % (
|
return '<a href="%s">%s</a>' % (
|
||||||
|
@ -81,7 +88,7 @@ def proc_tag_ul_to_html(tag: Tag) -> str:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@register_converter("li", "html")
|
@register_converter("ol", "html")
|
||||||
def proc_tag_li_to_html(tag: Tag) -> str:
|
def proc_tag_li_to_html(tag: Tag) -> str:
|
||||||
return "\n" + str.join(
|
return "\n" + str.join(
|
||||||
"\n",
|
"\n",
|
||||||
|
|
|
@ -63,7 +63,7 @@ def proc_tag_ul_to_markdown(tag: Tag) -> str:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@register_converter("li", "markdown")
|
@register_converter("ol", "markdown")
|
||||||
def proc_tag_li_to_markdown(tag: Tag) -> str:
|
def proc_tag_li_to_markdown(tag: Tag) -> str:
|
||||||
return "\n" + str.join(
|
return "\n" + str.join(
|
||||||
"\n",
|
"\n",
|
||||||
|
|
|
@ -23,14 +23,13 @@ from mastoposter.text import (
|
||||||
)
|
)
|
||||||
|
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
from html import escape
|
|
||||||
|
|
||||||
|
|
||||||
@register_converter("a", "plain")
|
@register_converter("a", "plain")
|
||||||
def proc_tag_a_to_plain(tag: Tag):
|
def proc_tag_a_to_plain(tag: Tag):
|
||||||
return "%s (%s)" % (
|
return "%s (%s)" % (
|
||||||
nodes_process(tag.children, "plain"),
|
nodes_process(tag.children, "plain"),
|
||||||
escape(tag.attrs.get("href", "#")),
|
tag.attrs.get("href", "#"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -64,7 +63,7 @@ def proc_tag_ul_to_plain(tag: Tag) -> str:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@register_converter("li", "plain")
|
@register_converter("ol", "plain")
|
||||||
def proc_tag_li_to_plain(tag: Tag) -> str:
|
def proc_tag_li_to_plain(tag: Tag) -> str:
|
||||||
return "\n" + str.join(
|
return "\n" + str.join(
|
||||||
"\n",
|
"\n",
|
||||||
|
|
Loading…
Reference in New Issue