#!/usr/bin/env python3
"""Scraper is a website scraper and downloader.

It parses the resource at the provided link for URLs, then concurrently downloads
the target resources and repeats the process until it finds no more links.
"""
__version__ = "1.0.0"

import asyncio
from functools import cache
from html.parser import HTMLParser
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import urlopen
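
# Example invocation (illustrative; running the file directly is an assumption,
# the project may instead expose entrypoint() as a console script):
#
#   python scraper.py --output-dir ./mirror https://example.com
#
# This stores same-origin pages under ./mirror/example.com/, mapping "" and "/"
# paths to index.html; pass --cross-origin to also fetch resources from other domains.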


def to_path(url: str, base: str | None = None) -> Path:
    """Given a {url}, convert it to a file system path.
    If {base} is given, prepend it to the url path."""
    here = Path.cwd()
    if base:
        # appending as-is allows us to append an absolute path
        here /= base
    parts = urlparse(url)
    # special case for index pages
    if parts.path in ("", "/"):
        parts = parts._replace(path="/index.html")
    return here / parts.netloc / parts.path[1:]
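
# For example, when run from /tmp (paths are illustrative):
#   to_path("https://example.com/a/b.html", base="out") -> /tmp/out/example.com/a/b.html
#   to_path("https://example.com/")                     -> /tmp/example.com/index.html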


def fetch(url: str) -> bytes | None:
    """Given a {url}, fetch the resource and return it, or None on error."""
    try:
        with urlopen(url) as r:
            html = r.read()
    except (HTTPError, URLError) as err:
        print(f"Error fetching {url}: {err}")
        return None
    return html


def store(url: str, html: bytes, base: str | None = None):
    """Save {html} data to a file, creating a path from the provided {url},
    optionally under the {base} directory."""
    path = to_path(url, base)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(html)


class LinkExtractor(HTMLParser):
    """Parses an HTML document and collects all href attribute values."""

    def __init__(self, baseurl: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.found_links: set[str] = set()
        self.baseurl = baseurl

    def normalise_url(self, url: str) -> str:
        """Clean up {url} to remove duplicate entries by removing
        fragments and trailing slashes.
        If {url} is relative, copy scheme and location from {self.baseurl}."""
        # note: _replace looks like a private interface in order for it to not
        # look like a namedtuple field (ParseResult is a subclass)
        link = urlparse(url)._replace(fragment="")
        if link.path.endswith("/"):
            link = link._replace(path=link.path[:-1])
        if not link.netloc:
            base = urlparse(self.baseurl)
            link = link._replace(scheme=base.scheme, netloc=base.netloc)
        return link.geturl()

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
        """Search {tag} for a "href" attribute, adding its value to {self.found_links}
        if it exists."""
        href = dict(attrs).get("href")
        if href is None:
            return
        self.found_links.add(self.normalise_url(href))

    def extract(self) -> set[str]:
        """Returns unique links found so far."""
        return self.found_links
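
# Minimal usage sketch (values are illustrative):
#   extractor = LinkExtractor("https://example.com")
#   extractor.feed('<a href="/about/">About</a> <a href="#top">Top</a>')
#   extractor.extract()  -> {"https://example.com/about", "https://example.com"}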


class Scraper:
    """Recursively scrapes and stores every URL it finds, starting from {base_url}."""

    def __init__(
        self,
        base_url: str,
        cross_origin: bool = False,
        base_dir: str | None = None,
    ):
        self.seen_links: set[str] = set()
        self.extractor = LinkExtractor(base_url)
        self.base_url = base_url
        # requires construction inside a running event loop
        self.loop = asyncio.get_running_loop()
        self.cross_origin = cross_origin
        self.base_dir = base_dir

    @cache
    def same_origin(self, url: str) -> bool:
        """True if {url} is relative or has the same domain as {self.base_url}."""
        domain = urlparse(url).netloc
        return domain == "" or domain == urlparse(self.base_url).netloc
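
    # For example, on a Scraper built with base_url="https://example.com":
    #   scraper.same_origin("https://example.com/page") -> True
    #   scraper.same_origin("/relative/path")           -> True
    #   scraper.same_origin("https://other.org/page")   -> False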

    async def scrape(self):
        """Scrape {self.base_url}."""
        await self.scrape_all([self.base_url])

    def fetch_and_store(self, url: str) -> bytes | None:
        """Fetches {url}, then stores and returns it."""
        print(f"fetching {url}...")
        html = fetch(url)
        if html:
            store(url, html, self.base_dir)
        return html
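
    # Note: fetch_and_store stays synchronous on purpose; scrape_all hands it to
    # asyncio.to_thread, so the blocking urlopen call runs in a worker thread while
    # other downloads proceed concurrently.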

    async def scrape_all(self, urls: list[str]):
        """Given a list of {urls}, concurrently fetches and scrapes them.
        If this results in new urls being found, these are fetched as well."""
        self.seen_links |= set(urls)
        if not self.cross_origin:
            urls = [url for url in urls if self.same_origin(url)]
        tasks = [asyncio.to_thread(self.fetch_and_store, url) for url in urls]
        for task in asyncio.as_completed(tasks):
            html = await task
            if html:
                try:
                    text = html.decode()
                except UnicodeDecodeError as err:
                    print(err)
                    continue
                self.extractor.feed(text)
        links = self.extractor.extract()
        # filter out already visited links before running the next batch
        new_links = links - self.seen_links
        self.seen_links |= links
        if new_links:
            await self.scrape_all(list(new_links))
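
# Programmatic use, outside the CLI below (URL and directory are illustrative):
#
#   async def mirror():
#       await Scraper("https://example.com", base_dir="mirror").scrape()
#
#   asyncio.run(mirror())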


async def main():
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--cross-origin",
        action="store_true",
        help="also fetch resources from different domains",
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        help="base directory for storing fetched resources",
    )
    parser.add_argument("url", help="URL to fetch")
    args = parser.parse_args()

    scraper = Scraper(
        base_url=args.url,
        cross_origin=args.cross_origin,
        base_dir=args.output_dir,
    )
    await scraper.scrape()


def entrypoint():
    # Can't use async functions as entrypoints
    asyncio.run(main())


if __name__ == "__main__":
    entrypoint()