-
Notifications
You must be signed in to change notification settings - Fork 3
/
Lesson 41 - Web Scraping.py
37 lines (34 loc) · 1.24 KB
/
Lesson 41 - Web Scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Fetch the home page once so the response can be inspected with the
# experiments below. The timeout stops the script from hanging forever when
# the server never answers (requests waits indefinitely by default).
result = requests.get("https://goprogram.co.uk", timeout=10)

# Earlier steps of the lesson, kept for reference -- uncomment to try them:
# print(result)            # the Response object itself
# print(result.content)    # response body as bytes
# print(result.text)       # response body decoded to str
# soup = BeautifulSoup(result.content, features="lxml")
# print(soup.prettify())   # the parsed HTML, re-indented
# for link in soup.find("header").find_all("a"):
#     print(link.get_text(), link.get("href"))
SITE_ROOT = "https://www.goprogram.co.uk"
SITE_DOMAIN = "goprogram.co.uk"
REQUEST_TIMEOUT = 10  # seconds to wait for a response before giving up


def resolve_link(page_url, href):
    """Return the absolute URL that *href* points to when found on *page_url*.

    The fragment (everything from ``#``) is dropped so that anchors within a
    page do not count as separate pages.

    Args:
        page_url: Absolute URL of the page the link was found on.
        href: Raw value of the ``href`` attribute, or ``None`` when the
            ``<a>`` tag has no such attribute.

    Returns:
        The absolute ``http``/``https`` URL, or ``None`` when there is nothing
        to fetch (missing href, fragment-only link, or a non-web scheme such
        as ``mailto:`` or ``javascript:``).
    """
    if href is None:
        return None
    target = href.split("#")[0].strip()
    if not target:
        # Empty or "#anchor" links point back at the page itself.
        return None
    # urljoin handles root-relative ("/x"), document-relative ("x.html"),
    # protocol-relative ("//host/x") and already-absolute links uniformly.
    absolute = urljoin(page_url, target)
    if not absolute.startswith(("http://", "https://")):
        return None
    return absolute


def is_on_site(url):
    """Return True when *url*'s host is SITE_DOMAIN or one of its subdomains.

    The host name is compared rather than searching the whole URL, so an
    external address that merely mentions the domain in its path or query
    string is not mistaken for a page of the site.
    """
    host = urlsplit(url).hostname or ""
    return host == SITE_DOMAIN or host.endswith("." + SITE_DOMAIN)


def crawl(start_url=SITE_ROOT):
    """Follow links starting from *start_url* and collect the site's pages.

    Every discovered link is requested once and its URL printed as it is
    visited. Only pages that are served from the site (after redirects) are
    parsed for further links, so the crawl never spreads across external
    sites.

    Args:
        start_url: Absolute URL where the crawl begins.

    Returns:
        The set of visited URLs whose final response came from the site.
    """
    to_visit = {start_url}
    visited = set()
    valid = set()

    while to_visit:
        location = to_visit.pop()
        visited.add(location)
        print(location)

        try:
            with requests.get(location, timeout=REQUEST_TIMEOUT) as response:
                page_url = response.url  # final URL once redirects are followed
                if not is_on_site(page_url):
                    continue
                soup = BeautifulSoup(response.content, features="lxml")
        except requests.RequestException as error:
            # A dead or malformed link must not abort the whole crawl.
            print("  skipped:", error)
            continue

        # NOTE(review): the original indentation was lost; a page is assumed to
        # be recorded as valid only when it was served from the site -- confirm.
        valid.add(location)

        for link in soup.find_all("a"):
            # Relative links are resolved against the URL actually served,
            # which is what a browser would do after a redirect.
            target = resolve_link(page_url, link.get("href"))
            if target is not None and target not in visited:
                to_visit.add(target)

    return valid


if __name__ == "__main__":
    print(crawl())