-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_scraper.py
56 lines (43 loc) · 1.85 KB
/
web_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import string
import requests
from bs4 import BeautifulSoup
import os
# Root of the site; listing-page and article paths are appended to it.
BASE_URL = "https://www.nature.com"

print("This will scrape articles in the Nature website")
# Number of listing pages to walk. int() raises ValueError on non-numeric
# input, which stops the script before any request is made.
pages = int(input("Number of article pages to scrape: "))
# The answer is compared verbatim against each article's type label, so
# whitespace typed around it would silently match nothing; strip it.
article_type = input("Article type to scrape:").strip()
def get_data(url, timeout=30):
    """Download *url* and return its content parsed as HTML.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the script forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.

    HTTP error statuses are not raised here; an error page is parsed like
    any other response.
    """
    print(f"getting data {url}")
    response = requests.get(url, timeout=timeout)
    # Hand over the raw bytes rather than decoded text.
    return BeautifulSoup(response.content, 'html.parser')
# Built once: deletes every ASCII punctuation character from a title.
punctuation_table = str.maketrans('', '', string.punctuation)

# Listing pages are requested with page numbers starting at 1.
for page_number in range(1, pages + 1):
    articles_page = f"/nature/articles?searchType=journalSearch&sort=PubDate&page={page_number}"
    folder_name = f"Page_{page_number}"
    # exist_ok avoids the check-then-create race and lets re-runs reuse
    # a folder left over from an earlier run.
    os.makedirs(folder_name, exist_ok=True)
    file_dir = os.path.join(os.getcwd(), folder_name)
    articles_saved = []

    listing = get_data(BASE_URL + articles_page)
    articles = listing.find_all('li', class_='app-article-list-row__item')
    for article in articles:
        # Check if article has correct type; entries without a type
        # label are skipped instead of aborting the run.
        type_tag = article.find('span', class_="c-meta__type")
        if type_tag is None or type_tag.text != article_type:
            continue

        item = article.find('a', {'data-track-action': 'view article'})
        article_link = item.get('href') if item is not None else None
        if not article_link:
            print("Skipping an article: no link found")
            continue

        parser = get_data(BASE_URL + article_link)

        # Getting article title and turning it into a file name:
        # punctuation removed, spaces replaced by underscores.
        heading = parser.find('h1')
        if heading is None:
            print(f"Skipping {article_link}: no title found")
            continue
        title = heading.text.translate(punctuation_table)
        title = title.strip().replace(" ", "_")

        # Getting article body text; the second selector is a fallback
        # for pages that do not use the first layout.
        body = parser.find('div', class_='c-article-body u-clearfix')
        if body is None:
            body = parser.find('div', class_='article-item__body')
        if body is None:
            print(f"Skipping {article_link}: no article body found")
            continue
        text = body.text.strip()

        file_name = os.path.join(file_dir, f"{title}.txt")
        # Binary mode writes the encoded bytes unchanged on every platform;
        # the context manager closes the file even if the write fails.
        with open(file_name, "wb") as article_file:
            article_file.write(text.encode())
        articles_saved.append(file_name)

    print(f"Saved articles: {articles_saved}")

print("All articles were scraped.")