Skip to content

Commit

Permalink
Move html_to_markdown to utils/misc.
Browse files Browse the repository at this point in the history
- Centralize this util.
  • Loading branch information
Sachaa-Thanasius committed Apr 15, 2024
1 parent 33c7286 commit 8e1e648
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 50 deletions.
42 changes: 41 additions & 1 deletion core/utils/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
import logging
import time

import lxml.html

__all__ = ("catchtime",)

__all__ = ("catchtime", "html_to_markdown")


class catchtime:
Expand All @@ -33,3 +35,41 @@ def __exit__(self, *exc: object) -> None:
self.total_time = time.perf_counter() - self.total_time
if self.logger:
self.logger.info("Time: %.3f seconds", self.total_time)


def html_to_markdown(node: lxml.html.HtmlElement, *, include_spans: bool = False, base_url: str | None = None) -> str:
    """Convert an HTML element tree into a rough Markdown string.

    Walks ``node`` and all of its descendants in document order, emitting Markdown for italic, bold, link, and
    paragraph tags, plus the tail text that follows every element. The direct text of any other tag is dropped
    (except ``span`` when ``include_spans`` is set).

    Parameters
    ----------
    node: lxml.html.HtmlElement
        The root element to convert. When ``base_url`` is given, its links are made absolute via
        ``make_links_absolute`` before conversion.
    include_spans: bool, default=False
        Whether the direct text of ``span`` elements is included in the output.
    base_url: str | None, optional
        If given, links are rendered as ``[text](href)``; otherwise only the link text is emitted.

    Returns
    -------
    str
        The concatenated Markdown, stripped of leading and trailing whitespace.
    """

    # Modified from RoboDanny code:
    # https://github.com/Rapptz/RoboDanny/blob/6e54be1985793ed29fca6b7c5259677904b8e1ad/cogs/dictionary.py#L532

    text: list[str] = []
    italics_marker: str = "_"

    if base_url is not None:
        # Keep everything up to and including ".com/wiki/"; if that separator is absent, the whole URL is used.
        node.make_links_absolute("".join(base_url.partition(".com/wiki/")[0:-1]), resolve_base_href=True)

    for child in node.iter():
        child_text = child.text.strip() if child.text else ""

        if child.tag in {"i", "em"}:
            text.append(f"{italics_marker}{child_text}{italics_marker}")
            # Alternate between "_" and "*" so that consecutive italic runs use different delimiters.
            italics_marker = "_" if italics_marker == "*" else "*"  # type: ignore
        elif child.tag in {"b", "strong"}:
            # A zero-width space keeps the bold asterisks from running into a preceding "*".
            if text and text[-1].endswith("*"):
                text.append("\u200b")
            text.append(f"**{child_text}**")
        elif child.tag == "a":
            href = child.attrib.get("href")
            if base_url is None or href is None:
                # No markup for links when no base URL was given or the anchor has no target.
                text.append(child_text)
            else:
                # Use the stripped text so an anchor without direct text never renders as "[None]".
                text.append(f"[{child_text}]({href})")
        elif child.tag == "p":
            text.append(f"\n{child_text}\n")
        elif include_spans and child.tag == "span":
            text.append(child_text)

        # Tail text (what follows the element's closing tag) is always kept, unstripped.
        if child.tail:
            text.append(child.tail)

    return "".join(text).strip()
15 changes: 5 additions & 10 deletions exts/fandom_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,17 @@
import asyncio
import logging
import textwrap
from typing import TYPE_CHECKING, Any
from typing import Any
from urllib.parse import quote as uriquote, urljoin

import aiohttp
import discord
from discord.app_commands import Choice
from discord.ext import commands
from lxml import etree, html

import core
from core.utils import EMOJI_URL

from .ff_metadata.utils import html_to_markdown


if TYPE_CHECKING:
from aiohttp import ClientSession
from core.utils import EMOJI_URL, html_to_markdown


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -70,7 +65,7 @@ def __init__(
)


async def load_wiki_all_pages(session: ClientSession, wiki_url: str) -> dict[str, str]:
async def load_wiki_all_pages(session: aiohttp.ClientSession, wiki_url: str) -> dict[str, str]:
pages_dict: dict[str, str] = {}
next_path: str = urljoin(wiki_url, "/wiki/Special:AllPages")
while True:
Expand Down Expand Up @@ -137,7 +132,7 @@ def clean_fandom_page(element: etree._Element) -> etree._Element: # type: ignor
return element


async def process_fandom_page(session: ClientSession, url: str) -> tuple[str | None, str | None]:
async def process_fandom_page(session: aiohttp.ClientSession, url: str) -> tuple[str | None, str | None]:
"""Extract the summary and image from a Fandom page."""

async with session.get(url) as response:
Expand Down
39 changes: 1 addition & 38 deletions exts/ff_metadata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import fichub_api
import lxml.html

from core.utils import PaginatedSelectView
from core.utils import PaginatedSelectView, html_to_markdown


__all__ = (
Expand Down Expand Up @@ -64,43 +64,6 @@ class StoryWebsite(NamedTuple):
)


def html_to_markdown(node: lxml.html.HtmlElement, *, include_spans: bool = False, base_url: str | None = None) -> str:
    """Render an HTML element and its descendants as a simple Markdown string.

    Elements are visited in document order. Italic, bold, link, and paragraph tags get Markdown markup; the tail
    text after every element is kept; the direct text of other tags is omitted unless it is a ``span`` and
    ``include_spans`` is true.

    Parameters
    ----------
    node: lxml.html.HtmlElement
        The root element to convert. When ``base_url`` is given, its links are made absolute via
        ``make_links_absolute`` first.
    include_spans: bool, default=False
        Whether to include the direct text of ``span`` elements.
    base_url: str | None, optional
        When provided, links become ``[text](href)``; when omitted, only the link text is kept.

    Returns
    -------
    str
        The joined Markdown with surrounding whitespace stripped.
    """

    # Modified from RoboDanny code:
    # https://github.com/Rapptz/RoboDanny/blob/6e54be1985793ed29fca6b7c5259677904b8e1ad/cogs/dictionary.py#L532

    text: list[str] = []
    italics_marker: str = "_"

    if base_url is not None:
        # Truncate the URL just after ".com/wiki/"; without that separator the full URL is passed through.
        node.make_links_absolute("".join(base_url.partition(".com/wiki/")[0:-1]), resolve_base_href=True)

    for child in node.iter():
        child_text = child.text.strip() if child.text else ""

        if child.tag in {"i", "em"}:
            text.append(f"{italics_marker}{child_text}{italics_marker}")
            # Flip the delimiter so back-to-back italic runs do not share the same marker.
            italics_marker = "_" if italics_marker == "*" else "*"  # type: ignore
        elif child.tag in {"b", "strong"}:
            # Separate the bold asterisks from a preceding "*" with a zero-width space.
            if text and text[-1].endswith("*"):
                text.append("\u200b")
            text.append(f"**{child_text}**")
        elif child.tag == "a":
            href = child.attrib.get("href")
            if base_url is None or href is None:
                # No markup for links without a base URL, or for anchors that lack an href.
                text.append(child_text)
            else:
                # The stripped text avoids emitting "[None]" when the anchor has no direct text.
                text.append(f"[{child_text}]({href})")
        elif child.tag == "p":
            text.append(f"\n{child_text}\n")
        elif include_spans and child.tag == "span":
            text.append(child_text)

        # Text following the element's closing tag is appended as-is.
        if child.tail:
            text.append(child.tail)

    return "".join(text).strip()


def create_ao3_work_embed(work: ao3.Work) -> discord.Embed:
"""Create an embed that holds all the relevant metadata for an Archive of Our Own work.
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ msgspec[toml]
openpyxl
Pillow>=10.0.0
types-lxml
wavelink>=3.0.0
wavelink>=3.2.0

# To be used later:
# parsedatetime
Expand Down

0 comments on commit 8e1e648

Please sign in to comment.