From 8e1e648f592423a0c35b84880ffcf4535b994db5 Mon Sep 17 00:00:00 2001
From: Thanos <111999343+Sachaa-Thanasius@users.noreply.github.com>
Date: Mon, 15 Apr 2024 13:40:42 -0400
Subject: [PATCH] Move html_to_markdown to utils/misc.

- Centralize this util.

---
 core/utils/misc.py        | 42 ++++++++++++++++++++++++++++++++++++++-
 exts/fandom_wiki.py       | 15 +++++----------
 exts/ff_metadata/utils.py | 39 +--------------------------------------
 requirements.txt          |  2 +-
 4 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/core/utils/misc.py b/core/utils/misc.py
index abd9e2d..0994d4c 100644
--- a/core/utils/misc.py
+++ b/core/utils/misc.py
@@ -7,8 +7,10 @@
 import logging
 import time
+import lxml.html
 
 
-__all__ = ("catchtime",)
+
+__all__ = ("catchtime", "html_to_markdown")
 
 
 class catchtime:
@@ -33,3 +35,41 @@ def __exit__(self, *exc: object) -> None:
         self.total_time = time.perf_counter() - self.total_time
         if self.logger:
             self.logger.info("Time: %.3f seconds", self.total_time)
+
+
+def html_to_markdown(node: lxml.html.HtmlElement, *, include_spans: bool = False, base_url: str | None = None) -> str:
+    # Modified from RoboDanny code:
+    # https://github.com/Rapptz/RoboDanny/blob/6e54be1985793ed29fca6b7c5259677904b8e1ad/cogs/dictionary.py#L532
+
+    text: list[str] = []
+    italics_marker: str = "_"
+
+    if base_url is not None:
+        node.make_links_absolute("".join(base_url.partition(".com/wiki/")[0:-1]), resolve_base_href=True)
+
+    for child in node.iter():
+        child_text = child.text.strip() if child.text else ""
+
+        if child.tag in {"i", "em"}:
+            text.append(f"{italics_marker}{child_text}{italics_marker}")
+            # Toggle the marker so adjacent italic runs don't merge.
+            italics_marker = "_" if italics_marker == "*" else "*"  # type: ignore
+        elif child.tag in {"b", "strong"}:
+            if text and text[-1].endswith("*"):
+                text.append("\u200b")
+            text.append(f"**{child_text.strip()}**")
+        elif child.tag == "a":
+            # No markup for links
+            if base_url is None:
+                text.append(child_text)
+            else:
+                text.append(f"[{child.text}]({child.attrib['href']})")
+        elif child.tag == "p":
+            text.append(f"\n{child_text}\n")
+        elif include_spans and child.tag == "span":
+            text.append(child_text)
+
+        if child.tail:
+            text.append(child.tail)
+
+    return "".join(text).strip()
diff --git a/exts/fandom_wiki.py b/exts/fandom_wiki.py
index 7153e83..70aadb3 100644
--- a/exts/fandom_wiki.py
+++ b/exts/fandom_wiki.py
@@ -8,22 +8,17 @@
 import asyncio
 import logging
 import textwrap
-from typing import TYPE_CHECKING, Any
+from typing import Any
 from urllib.parse import quote as uriquote, urljoin
 
+import aiohttp
 import discord
 from discord.app_commands import Choice
 from discord.ext import commands
 from lxml import etree, html
 
 import core
-from core.utils import EMOJI_URL
-
-from .ff_metadata.utils import html_to_markdown
-
-
-if TYPE_CHECKING:
-    from aiohttp import ClientSession
+from core.utils import EMOJI_URL, html_to_markdown
 
 
 LOGGER = logging.getLogger(__name__)
@@ -70,7 +65,7 @@ def __init__(
         )
 
 
-async def load_wiki_all_pages(session: ClientSession, wiki_url: str) -> dict[str, str]:
+async def load_wiki_all_pages(session: aiohttp.ClientSession, wiki_url: str) -> dict[str, str]:
     pages_dict: dict[str, str] = {}
     next_path: str = urljoin(wiki_url, "/wiki/Special:AllPages")
     while True:
@@ -137,7 +132,7 @@ def clean_fandom_page(element: etree._Element) -> etree._Element:  # type: ignor
     return element
 
 
-async def process_fandom_page(session: ClientSession, url: str) -> tuple[str | None, str | None]:
+async def process_fandom_page(session: aiohttp.ClientSession, url: str) -> tuple[str | None, str | None]:
     """Extract the summary and image from a Fandom page."""
 
     async with session.get(url) as response:
diff --git a/exts/ff_metadata/utils.py b/exts/ff_metadata/utils.py
index a8cc0a8..5e800d8 100644
--- a/exts/ff_metadata/utils.py
+++ b/exts/ff_metadata/utils.py
@@ -10,7 +10,7 @@
 import fichub_api
 import lxml.html
 
-from core.utils import PaginatedSelectView
+from core.utils import PaginatedSelectView, html_to_markdown
 
 
 __all__ = (
@@ -64,43 +64,6 @@ class StoryWebsite(NamedTuple):
 )
 
 
-def html_to_markdown(node: lxml.html.HtmlElement, *, include_spans: bool = False, base_url: str | None = None) -> str:
-    # Modified from RoboDanny code:
-    # https://github.com/Rapptz/RoboDanny/blob/6e54be1985793ed29fca6b7c5259677904b8e1ad/cogs/dictionary.py#L532
-
-    text: list[str] = []
-    italics_marker: str = "_"
-
-    if base_url is not None:
-        node.make_links_absolute("".join(base_url.partition(".com/wiki/")[0:-1]), resolve_base_href=True)
-
-    for child in node.iter():
-        child_text = child.text.strip() if child.text else ""
-
-        if child.tag in {"i", "em"}:
-            text.append(f"{italics_marker}{child_text}{italics_marker}")
-            italics_marker = "_" if italics_marker == "*" else "*"  # type: ignore
-        elif child.tag in {"b", "strong"}:
-            if text and text[-1].endswith("*"):
-                text.append("\u200b")
-            text.append(f"**{child_text.strip()}**")
-        elif child.tag == "a":
-            # No markup for links
-            if base_url is None:
-                text.append(child_text)
-            else:
-                text.append(f"[{child.text}]({child.attrib['href']})")
-        elif child.tag == "p":
-            text.append(f"\n{child_text}\n")
-        elif include_spans and child.tag == "span":
-            text.append(child_text)
-
-        if child.tail:
-            text.append(child.tail)
-
-    return "".join(text).strip()
-
-
 def create_ao3_work_embed(work: ao3.Work) -> discord.Embed:
     """Create an embed that holds all the relevant metadata for an Archive of Our Own work.
 
diff --git a/requirements.txt b/requirements.txt
index 67812c2..3c52d9e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,7 @@ msgspec[toml]
 openpyxl
 Pillow>=10.0.0
 types-lxml
-wavelink>=3.0.0
+wavelink>=3.2.0
 
 # To be used later:
 # parsedatetime