-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add extension to support metaformats (#213)
- Loading branch information
Showing
7 changed files
with
235 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
"""Metaformats parser. | ||
https://microformats.org/wiki/metaformats | ||
TODO: | ||
* explicit mf2 classes on meta tags | ||
https://microformats.org/wiki/metaformats#parsing_an_element_for_properties | ||
""" | ||
from .dom_helpers import try_urljoin | ||
from .mf2_classes import filter_classes | ||
|
||
# Recognized meta tags as (attribute, attribute value, mf2 property) triples.
# Order is significant: entries are listed in descending priority.
METAFORMAT_TO_MF2 = [
    # OGP tags, identified by their ``property`` attribute
    *(
        ("property", meta, mf2)
        for meta, mf2 in (
            ("article:author", "author"),
            ("article:published_time", "published"),
            ("article:modified_time", "updated"),
            ("og:audio", "audio"),
            ("og:description", "summary"),
            ("og:image", "photo"),
            ("og:title", "name"),
            ("og:video", "video"),
        )
    ),
    # Twitter card tags, then the HTML standard meta names
    # (https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name),
    # all identified by their ``name`` attribute
    *(
        ("name", meta, mf2)
        for meta, mf2 in (
            ("twitter:title", "name"),
            ("twitter:description", "summary"),
            ("twitter:image", "photo"),
            ("description", "summary"),
        )
    ),
]

# Top-level og:type value (the part before any ".") -> mf2 root class.
OGP_TYPE_TO_MF2 = dict(
    article="h-entry",
    movie="h-cite",
    music="h-cite",
    profile="h-card",
)

# Meta tags whose content is a URL and gets resolved against the document URL.
URL_PROPERTIES = {
    "twitter:image",
    "og:video",
    "og:image",
    "og:audio",
    "article:author",
}
|
||
|
||
def parse(soup, url=None):
    """Extracts and returns a metaformats item from a BeautifulSoup parse tree.

    Looks inside ``<head>`` for the meta tags listed in ``METAFORMAT_TO_MF2``
    and maps the first non-empty match for each mf2 property, in priority
    order. ``<title>`` is used as a fallback for ``name``. The item type comes
    from ``og:type`` via ``OGP_TYPE_TO_MF2`` and defaults to ``h-entry``.

    Args:
      soup (bs4.BeautifulSoup): parsed HTML
      url (str): URL of document; passed to ``try_urljoin`` for the
        properties listed in ``URL_PROPERTIES``

    Returns:
      dict: mf2 item with ``type`` and ``properties`` keys, or None if the
        input is not eligible for metaformats (no ``<head>``, an mf2 root
        class on the html element, or no usable meta tags or title)
    """
    head = soup.head
    if not head:
        return None

    # Is there a microformats2 root class on the html element? The
    # BeautifulSoup document object carries no attributes of its own, so the
    # class list has to be read from the <html> tag. Fall back to the object
    # that was passed in when there is no <html> tag below it.
    html = soup.html
    root = html if html is not None else soup
    if filter_classes(root.get("class", []))["h"]:
        return None

    parsed = {"properties": {}}
    props = parsed["properties"]

    # Properties: METAFORMAT_TO_MF2 is in descending priority order, so the
    # first tag with non-empty content wins for each mf2 property.
    for attr, meta, mf2 in METAFORMAT_TO_MF2:
        if mf2 in props:
            # Already filled by a higher-priority tag.
            continue
        tag = head.find("meta", attrs={attr: meta})
        if not tag:
            continue
        content = tag.get("content")
        if not content:
            continue
        if meta in URL_PROPERTIES:
            content = try_urljoin(url, content)
        props[mf2] = [content]

    # <title> is the lowest-priority source for the name.
    if head.title:
        if text := head.title.text:
            props.setdefault("name", [text])

    if not props:
        # No recognized meta tags with content and no title text
        return None

    # type from OGP or default to h-entry. Only the part of og:type before
    # the first "." is used for the lookup.
    parsed["type"] = ["h-entry"]
    if ogp_type := head.find("meta", property="og:type"):
        if content := ogp_type.get("content"):
            if mf2_type := OGP_TYPE_TO_MF2.get(content.split(".")[0]):
                parsed["type"] = [mf2_type]

    return parsed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
<head> | ||
<meta http-equiv="content-type" content="text/html; charset=utf-8"> | ||
<title>Hello World</title> | ||
<base href="http://tantek.com/" /> | ||
<meta name="description" content="Descrypshun bar" /> | ||
</head> | ||
<body> | ||
<p>Hello world!</p> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
<head> | ||
<meta http-equiv="content-type" content="text/html; charset=utf-8"> | ||
<title>Hello World</title> | ||
<base href="http://tantek.com/" /> | ||
<meta property="og:type" content="article" /> | ||
<meta property="og:title" content="Titull foo" /> | ||
<meta property="og:description" content="Descrypshun bar" /> | ||
<meta property="og:image" content="http://example.com/baz.jpg" /> | ||
<meta property="og:audio" content="http://example.com/biff.mp3" /> | ||
<meta property="og:video" content="http://example.com/boff.mov" /> | ||
<meta property="article:author" content="/me" /> | ||
<meta property="article:published_time" content="2023-01-02T03:04Z" /> | ||
<meta property="article:modified_time" content="2023-01-02T05:06Z" /> | ||
</head> | ||
<body> | ||
<p>Hello world!</p> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
<head> | ||
<meta http-equiv="content-type" content="text/html; charset=utf-8"> | ||
<title>Hello World</title> | ||
<base href="http://tantek.com/" /> | ||
<meta name="twitter:title" content="Titull foo" /> | ||
<meta name="twitter:description" content="Descrypshun bar" /> | ||
<meta name="twitter:image" content="/baz.jpg" /> | ||
</head> | ||
<body> | ||
<p>Hello world!</p> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters