# scrapers.py

import requests
from bs4 import BeautifulSoup
import logging
from typing import List


# Functions to get page links: each scraper fetches a magazine-issue page,
# pulls the embedded JavaScript manifest that lists the issue's pages, and
# rebuilds a direct image URL for every page.

def get_vogue_pages(url: str) -> List[str]:
    date = url.split("/")[-2]
    source_url = 'https://archive.vogue.com/image/nCU7lza0PM150sIHOTteljBKCHAc7Y9a/'
    try:
        response = requests.get(url, timeout=30)  # Timeout guards against a hung connection
        response.raise_for_status()  # Raises HTTPError on an unsuccessful status code
    except requests.RequestException as e:
        logging.error(f"Error fetching URL: {e}")
        return []
    soup = BeautifulSoup(response.text, 'lxml')
    data = soup.find_all('script', type='text/javascript')
    pages = data[2].text  # The page manifest lives in the third script tag on this site
    result = pages.split('"PageName":"')
    page_links = []
    for x in result[1:]:
        page_name = x.split('"')[0]  # Text up to the closing quote is the page name
        page_links.append(source_url + date + '/' + page_name)
    return page_links
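
# A minimal sketch of the split-based extraction above, run on a hypothetical
# script payload (the '"PageName":"..."' shape is inferred from the parsing
# logic, not from a captured response):
#
#     payload = '[{"PageName":"0001","Key":"a1"},{"PageName":"0002","Key":"b2"}]'
#     [chunk.split('"')[0] for chunk in payload.split('"PageName":"')[1:]]
#     # -> ['0001', '0002']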

def get_ad_pages(url: str) -> List[str]:
    date = url.split("/")[-2]
    source_url = 'https://architecturaldigest.azurewebsites.net/image/nCU7lza0PM150sIHOTteljBKCHAc7Y9a/'
    try:
        response = requests.get(url, timeout=30)  # Timeout guards against a hung connection
        response.raise_for_status()  # Raises HTTPError on an unsuccessful status code
    except requests.RequestException as e:
        logging.error(f"Error fetching URL: {e}")
        return []
    soup = BeautifulSoup(response.text, 'lxml')
    data = soup.find_all('script', type='text/javascript')
    pages = data[1].text  # The page manifest lives in the second script tag on this site
    result = pages.split('"PageName":"')
    page_links = []
    for x in result[1:]:
        page_name = x.split('"')[0]  # Text up to the closing quote is the page name
        page_links.append(source_url + date + '/' + page_name)
    return page_links

def get_vf_pages(url: str) -> List[str]:
    date = url.split("/")[-2]
    source_url = 'https://archive.vanityfair.com/image/nCU7lza0PM150sIHOTteljBKCHAc7Y9a/'
    try:
        response = requests.get(url, timeout=30)  # Timeout guards against a hung connection
        response.raise_for_status()  # Raises HTTPError on an unsuccessful status code
    except requests.RequestException as e:
        logging.error(f"Error fetching URL: {e}")
        return []
    soup = BeautifulSoup(response.text, 'lxml')
    data = soup.find_all('script', type='text/javascript')
    pages = data[2].text  # The page manifest lives in the third script tag on this site
    result = pages.split('"PageName":"')
    page_links = []
    for x in result[1:]:
        page_name = x.split('"')[0]  # Text up to the closing quote is the page name
        page_links.append(source_url + date + '/' + page_name)
    return page_links

def get_esquire_pages(url: str) -> List[str]:
    date = url.split("/")[-2]
    source_url = 'https://classic.esquire.com/image/nCU7lza0PM150sIHOTteljBKCHAc7Y9a/'
    try:
        response = requests.get(url, timeout=30)  # Timeout guards against a hung connection
        response.raise_for_status()  # Raises HTTPError on an unsuccessful status code
    except requests.RequestException as e:
        logging.error(f"Error fetching URL: {e}")
        return []
    soup = BeautifulSoup(response.text, 'lxml')
    data = soup.find_all('script', type='text/javascript')
    pages = data[1].text  # The page manifest lives in the second script tag on this site
    result = pages.split('"PageName":"')
    page_links = []
    for x in result[1:]:
        page_name = x.split('"')[0]  # Text up to the closing quote is the page name
        page_links.append(source_url + date + '/' + page_name)
    return page_links
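

# Example usage (a sketch): the issue URL below is hypothetical and only
# illustrates the '.../<date>/<page>' shape that url.split("/")[-2] expects.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for link in get_vogue_pages('https://archive.vogue.com/issue/19590101/print'):
        print(link)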