-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_clipper.py
119 lines (98 loc) · 3.15 KB
/
web_clipper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from dataclasses import dataclass
import requests
import pathlib
import re
from markdownify import markdownify as md
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt
from htmldate import find_date
from readability.readability import Document
from globals import ROOT_DIR
from custom_logger import create_logger
# Module-level logger shared by the functions in this file; it is created
# through the project's create_logger helper under the name "web_clipper".
LOGGER = create_logger(name="web_clipper")
@dataclass
class WebpageClipping:
    """Content and metadata of a single clipped webpage.

    Instances are built by clip_webpage() and rendered to a Markdown
    document by format_clipping().
    """

    # Author/byline information taken from the page.
    author: str
    # Markdown link back to the original page, in the form "[title](url)".
    source: str
    # Date on which the page was clipped, formatted as YYYY-MM-DD.
    clipped_date: str
    # Publication date detected for the page.
    # NOTE(review): the date finder may yield None when no date can be
    # detected -- confirm and widen the annotation if so.
    published_date: str
    # Tag string written into the clipping (clip_webpage uses "#clippings").
    tags: str
    # Page title as reported by the readability Document.
    title: str
    # Readable page content converted to Markdown.
    markdown: str
def clip_webpage(url, timeout=30):
    """Clips a webpage.

    Downloads the page, extracts its readable content with readability,
    converts that content to Markdown and collects some metadata
    (byline, title, clipping date, detected publication date).

    Args:
        url (str): The URL of the webpage to archive.
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        WebpageClipping: The contents (and some metadata)
            of the webpage.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without an explicit timeout the request can block forever on an
    # unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Do not archive error pages (404, 500, ...) as if they were content.
    response.raise_for_status()
    html = response.text

    soup = bs(html, "html.parser")
    byline = soup.find(class_="byline")
    # find() returns a Tag (or None when nothing matches). Keep only the
    # visible text so the clipping stores a plain string instead of raw
    # HTML markup or the literal "None".
    author = byline.get_text(" ", strip=True) if byline is not None else ""

    readable = Document(html)
    title = readable.title()

    return WebpageClipping(
        author=author,
        source=f"[{title}]({url})",
        clipped_date=dt.today().strftime("%Y-%m-%d"),
        # NOTE(review): find_date may return None when no date is detected;
        # the value is passed through unchanged, as before.
        published_date=find_date(html),
        tags="#clippings",
        title=title,
        markdown=md(readable.summary()),
    )
def format_clipping(clipping):
    """Renders a WebpageClipping as one Markdown string.

    The metadata is emitted first as "key:: value" lines, followed by the
    title as a level-1 heading and finally the clipped Markdown body.
    Afterwards every run of line breaks is normalised to exactly one
    blank line.

    Args:
        clipping (WebpageClipping): Clipping to format.

    Returns:
        str: Formatted string
    """
    metadata = (
        ("author", clipping.author),
        ("source", clipping.source),
        ("clipped", clipping.clipped_date),
        ("published", clipping.published_date),
        ("tags", clipping.tags),
    )
    sections = [f"{key}:: {value}" for key, value in metadata]
    sections.append(f"# {clipping.title}")
    sections.append(clipping.markdown)
    text = "\n\n".join(sections)

    # Collapse lines that contain nothing but whitespace.
    text = re.sub(r"\n\s+\n", "\n\n", text)
    # Turn every run of one or more line breaks into exactly two.
    return re.sub(r"\n+", "\n\n", text)
# We want to replace all illegal filename characters in the title with a dash
# before writing to the file
def clean_filename(filename):
    """Replaces all illegal filename characters in the title of a filename
    with a dash, before writing to the file.

    The replaced characters are the ones reserved in Windows file names:
    less-than, greater-than, colon, double quote, forward slash,
    backslash, pipe, question mark and asterisk. Each occurrence is
    replaced with " - " (a dash surrounded by spaces).

    Args:
        filename (str): The filename to clean

    Returns:
        str: Cleaned filename.
    """
    # Inside a character class "\|" is just an escaped pipe, so the
    # backslash has to be written as "\\" to be matched itself. The double
    # quote is part of the reserved set as well.
    illegal_chars = re.compile(r'[<>:"/\\|?*]')
    return illegal_chars.sub(" - ", filename)
def archive_web(url):
    """Archives a webpage as a Markdown file under
    /archive/web/

    The file is named "<cleaned title>_<clipped date>.md" and is written
    UTF-8 encoded below ROOT_DIR/archive/web. The target directory is
    created when it does not exist yet.

    Args:
        url (str): The URL of the webpage to archive.
    """
    webpage = clip_webpage(url)
    filename = clean_filename(webpage.title)
    path = pathlib.Path(
        ROOT_DIR, "archive", "web", f"{filename}_{webpage.clipped_date}.md"
    )
    document = format_clipping(clipping=webpage)

    # The archive directory may not exist yet (e.g. on a fresh checkout);
    # opening the file would then fail with FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)

    # Binary mode writes the encoded bytes as-is, without any platform
    # specific newline translation.
    with path.open(mode="wb") as file:
        file.write(document.encode("UTF-8"))
    LOGGER.info(f"Archived webpage at URL {url}")