Skip to content

Commit

Permalink
Fix fandom wiki page parsing (at least for AoC).
Browse files Browse the repository at this point in the history
  • Loading branch information
Sachaa-Thanasius committed Mar 12, 2024
1 parent 4acdf9e commit 1caa6ad
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
28 changes: 23 additions & 5 deletions exts/fandom_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import core
from core.utils import EMOJI_URL

from .ff_metadata.utils import html_to_markdown


if TYPE_CHECKING:
from aiohttp import ClientSession
Expand Down Expand Up @@ -122,7 +124,7 @@ def clean_fandom_page(element: etree._Element) -> etree._Element: # type: ignor
else:
if index > summary_end_index:
summary_end_index = index
subheading.getparent().remove(subheading) # type: ignore [reportOptionalMemberAccess]
subheading.getparent().remove(subheading) # type: ignore [reportOptionalMemberAccess]

if summary_end_index != 0:
for el in list(element[summary_end_index + 1 :]):
Expand All @@ -142,17 +144,33 @@ async def process_fandom_page(session: ClientSession, url: str) -> tuple[str | N
char_summary, char_thumbnail = None, None

# Extract the main content.
text = await response.text()
element = html.fromstring(text)
element = html.fromstring(await response.text())
content = element.find(".//div[@class='mw-parser-output']")
if content is not None:
# Extract the image.
image = content.find(".//a[@class='image image-thumbnail']")
if image is not None:
char_thumbnail = str(image.attrib["href"])

content = clean_fandom_page(content)
char_summary = content.text
# Filter the content text.
summary_end_index = 0
to_look_for = [".//aside[contains(@class, 'portable-infobox')]", ".//div[@id='toc']", ".//h2"]

for index, node in enumerate(content.xpath(" | ".join(to_look_for))):
if (node.tag == "div" or node.tag == "h2") and summary_end_index == 0 and index > summary_end_index:
summary_end_index = index

node.getparent().remove(node)

if summary_end_index != 0:
for el in list(content[summary_end_index:]):
content.remove(el)

char_summary = html_to_markdown(
content,
include_spans=True,
base_url="".join(url.partition(".com/wiki/")[0:-1]),
)

# Return the remaining text.
return char_summary, char_thumbnail
Expand Down
13 changes: 6 additions & 7 deletions exts/ff_metadata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import re
import textwrap
from typing import Any, NamedTuple
from urllib.parse import urljoin

import ao3
import atlas_api
Expand Down Expand Up @@ -65,21 +64,22 @@ class StoryWebsite(NamedTuple):
)


def html_to_markdown(raw_text: str, *, include_spans: bool = False, base_url: str | None = None) -> str:
def html_to_markdown(node: lxml.html.HtmlElement, *, include_spans: bool = False, base_url: str | None = None) -> str:
# Modified from RoboDanny code:
# https://github.com/Rapptz/RoboDanny/blob/6e54be1985793ed29fca6b7c5259677904b8e1ad/cogs/dictionary.py#L532

text: list[str] = []
italics_marker: str = "_"

node = lxml.html.fromstring(raw_text)
if base_url is not None:
node.make_links_absolute("".join(base_url.partition(".com/wiki/")[0:-1]), resolve_base_href=True)

for child in node.iter():
child_text = child.text.strip() if child.text else ""

if child.tag in {"i", "em"}:
text.append(f"{italics_marker}{child_text}{italics_marker}")
italics_marker = "_" if italics_marker == "*" else "*" # type: ignore
italics_marker = "_" if italics_marker == "*" else "*"

Check failure on line 82 in exts/ff_metadata/utils.py

View workflow job for this annotation

GitHub Actions / Type Coverage and Linting @ 3.10

Condition will always evaluate to False since the types "Literal['_']" and "Literal['*']" have no overlap (reportUnnecessaryComparison)
elif child.tag in {"b", "strong"}:
if text and text[-1].endswith("*"):
text.append("\u200b")
Expand All @@ -89,8 +89,7 @@ def html_to_markdown(raw_text: str, *, include_spans: bool = False, base_url: str | None = None) -> str:
if base_url is None:
text.append(child_text)
else:
url = urljoin(base_url, child.attrib["href"])
text.append(f"[{child.text}]({url})")
text.append(f"[{child.text}]({child.attrib['href']})")
elif child.tag == "p":
text.append(f"\n{child_text}\n")
elif include_spans and child.tag == "span":
Expand Down Expand Up @@ -238,7 +237,7 @@ def create_fichub_embed(story: fichub_api.Story) -> discord.Embed:
else:
stats_str = "No stats available at this time."

md_description = html_to_markdown(story.description)
md_description = html_to_markdown(lxml.html.fromstring(story.description))

# Add the info to the embed appropriately.
story_embed = (
Expand Down

0 comments on commit 1caa6ad

Please sign in to comment.