scraper.py
#!/usr/bin/env python3
import os
import re
import urllib.request

import requests


def download_images(img_urls, dest_dir):
    """Given relative image URLs, download each image into the
    destination directory under its original filename (the last
    path component of the URL). Creates the directory if necessary.
    """
    amnh = 'https://www.amnh.org/'
    abs_dest = os.path.abspath(dest_dir)
    if not os.path.exists(abs_dest):
        os.makedirs(abs_dest)
    for img_url in img_urls:
        print('Retrieving...', img_url)
        filename = img_url.split('/')[-1]
        urllib.request.urlretrieve(amnh + img_url, os.path.join(abs_dest, filename))
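
# A usage sketch (the relative path below is hypothetical; real paths are
# whatever the regex in scraper() captures):
#
#     download_images(['images/trilobites/elrathia.jpg'], 'trilobite')
#
# fetches https://www.amnh.org/images/trilobites/elrathia.jpg and saves it
# as trilobite/elrathia.jpg.
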
def scraper(num_pages, dest_dir):
    url = 'https://www.amnh.org/our-research/paleontology/paleontology-faq/trilobite-website/trilobites-dl/trilobite-images-folder'
    page_count = 10
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    for page in range(num_pages):
        print('-' * 20, '\nFetching Page', page, 'imgs\n' + '-' * 20, '\n')
        # Build the URL for each page: page 0 uses the bare folder URL;
        # page 1 appends the site's '/(offset)/10' pagination segments;
        # each later page bumps the trailing offset by 10.
        parts = url.split('/')
        if page == 1:
            parts.append('(offset)')
            parts.append(str(page_count))
        elif page >= 2:
            page_count += 10
            parts.pop()
            parts.append(str(page_count))
        url = '/'.join(parts)
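        # e.g. page 1 yields .../trilobite-images-folder/(offset)/10 and
        # page 2 yields .../trilobite-images-folder/(offset)/20.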
        html_doc = requests.get(url).text
        img_urls = re.findall(r' in-standard-src="/(\S*jpg)', html_doc)
        # Strip the '_small.jpg' suffix (10 characters) and re-add '.jpg'
        # so the URLs point at the full-scale images, not the thumbnails.
        for i in range(len(img_urls)):
            img_urls[i] = img_urls[i][:-10] + '.jpg'
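        # e.g. a captured 'images/elrathia_small.jpg' (hypothetical)
        # becomes 'images/elrathia.jpg'.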
        download_images(img_urls, dest_dir)

def main():
    scraper(88, 'trilobite')


if __name__ == '__main__':
    main()
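
# To run (a sketch; assumes the AMNH trilobite listing still uses the
# ' in-standard-src' attribute and the '/(offset)/<n>' pagination this
# script was written against):
#
#     python scraper.py
#
# scrapes all 88 listing pages and saves the full-scale images under
# ./trilobite/.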