Fix fandom wiki page parsing (at least for AoC).

Sachaa-Thanasius · Mar 12, 2024 · 1caa6ad · 1caa6ad
1 parent 4acdf9e
commit 1caa6ad
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 12 deletions.
diff --git a/exts/fandom_wiki.py b/exts/fandom_wiki.py
@@ -19,6 +19,8 @@
 import core
 from core.utils import EMOJI_URL
 
+from .ff_metadata.utils import html_to_markdown
+
 
 if TYPE_CHECKING:
  from aiohttp import ClientSession
@@ -122,7 +124,7 @@ def clean_fandom_page(element: etree._Element) -> etree._Element: # type: ignor
  else:
  if index > summary_end_index:
  summary_end_index = index
- subheading.getparent().remove(subheading) # type: ignore [reportOptionalMemberAccess]
+  subheading.getparent().remove(subheading) # type: ignore [reportOptionalMemberAccess]
 
  if summary_end_index != 0:
  for el in list(element[summary_end_index + 1 :]):
@@ -142,17 +144,33 @@ async def process_fandom_page(session: ClientSession, url: str) -> tuple[str | N
  char_summary, char_thumbnail = None, None
 
  # Extract the main content.
- text = await response.text()
- element = html.fromstring(text)
+ element = html.fromstring(await response.text())
  content = element.find(".//div[@class='mw-parser-output']")
  if content is not None:
  # Extract the image.
  image = content.find(".//a[@class='image image-thumbnail']")
  if image is not None:
  char_thumbnail = str(image.attrib["href"])
 
- content = clean_fandom_page(content)
- char_summary = content.text
+ # Filter the content text.
+ summary_end_index = 0
+ to_look_for = [".//aside[contains(@class, 'portable-infobox')]", ".//div[@id='toc']", ".//h2"]
+
+ for index, node in enumerate(content.xpath(" | ".join(to_look_for))):
+ if (node.tag == "div" or node.tag == "h2") and summary_end_index == 0 and index > summary_end_index:
+ summary_end_index = index
+
+ node.getparent().remove(node)
+
+ if summary_end_index != 0:
+ for el in list(content[summary_end_index:]):
+ content.remove(el)
+
+ char_summary = html_to_markdown(
+ content,
+ include_spans=True,
+ base_url="".join(url.partition(".com/wiki/")[0:-1]),
+ )
 
  # Return the remaining text.
  return char_summary, char_thumbnail

diff --git a/exts/ff_metadata/utils.py b/exts/ff_metadata/utils.py
@@ -3,7 +3,6 @@
 import re
 import textwrap
 from typing import Any, NamedTuple
-from urllib.parse import urljoin
 
 import ao3
 import atlas_api
@@ -65,21 +64,22 @@ class StoryWebsite(NamedTuple):
 )
 
 
-def html_to_markdown(raw_text: str, *, include_spans: bool = False, base_url: str | None = None) -> str:
+def html_to_markdown(node: lxml.html.HtmlElement, *, include_spans: bool = False, base_url: str | None = None) -> str:
  # Modified from RoboDanny code:
  # https://github.com/Rapptz/RoboDanny/blob/6e54be1985793ed29fca6b7c5259677904b8e1ad/cogs/dictionary.py#L532
 
  text: list[str] = []
  italics_marker: str = "_"
 
- node = lxml.html.fromstring(raw_text)
+ if base_url is not None:
+ node.make_links_absolute("".join(base_url.partition(".com/wiki/")[0:-1]), resolve_base_href=True)
 
  for child in node.iter():
  child_text = child.text.strip() if child.text else ""
 
  if child.tag in {"i", "em"}:
  text.append(f"{italics_marker}{child_text}{italics_marker}")
- italics_marker = "_" if italics_marker == "*" else "*" # type: ignore
+ italics_marker = "_" if italics_marker == "*" else "*"
  elif child.tag in {"b", "strong"}:
  if text and text[-1].endswith("*"):
  text.append("\u200b")
@@ -89,8 +89,7 @@ def html_to_markdown(raw_text: str, *, include_spans: bool = False, base_url: st
  if base_url is None:
  text.append(child_text)
  else:
- url = urljoin(base_url, child.attrib["href"])
- text.append(f"[{child.text}]({url})")
+ text.append(f"[{child.text}]({child.attrib['href']})")
  elif child.tag == "p":
  text.append(f"\n{child_text}\n")
  elif include_spans and child.tag == "span":
@@ -238,7 +237,7 @@ def create_fichub_embed(story: fichub_api.Story) -> discord.Embed:
  else:
  stats_str = "No stats available at this time."
 
- md_description = html_to_markdown(story.description)
+ md_description = html_to_markdown(lxml.html.fromstring(story.description))
 
  # Add the info to the embed appropriately.
  story_embed = (