-
Notifications
You must be signed in to change notification settings - Fork 0
/
nasa.py
34 lines (29 loc) · 1003 Bytes
/
nasa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Scrape the NASA climate blog: walk the paginated blog index, follow every
# article link found there, and dump the paragraph text of each article into
# ipsum.txt (one paragraph per line, followed by a trailing blank line).
baseBlogUrl = "https://climate.nasa.gov"
BLOG_INDEX_URL = baseBlogUrl + "/blog/?page="
FIRST_PAGE = 1
LAST_PAGE = 22  # index pages 1..22 inclusive are crawled
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run forever
OUTPUT_PATH = "ipsum.txt"

# Paragraph texts are gathered in a list and joined once at the end, which
# avoids repeated string concatenation inside the nested loops.
paragraphs = []
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    index_response = requests.get(BLOG_INDEX_URL + str(page), timeout=REQUEST_TIMEOUT)
    index_soup = BeautifulSoup(index_response.content, "lxml")
    for title in index_soup.find_all("h1", attrs={"class": "article_title"}):
        anchor = title.find("a")
        href = anchor.get("href") if anchor is not None else None
        # Some headings on the site carry a bare "/" link (or, defensively,
        # no link at all); there is no article to fetch for those.
        if not href or href == "/":
            continue
        # urljoin yields the same URL as plain concatenation for the usual
        # "/path" links, and also copes with absolute or slash-less hrefs.
        article_url = urljoin(baseBlogUrl, href)
        print(article_url)
        article_response = requests.get(article_url, timeout=REQUEST_TIMEOUT)
        article_soup = BeautifulSoup(article_response.content, "lxml")
        for content in article_soup.find_all("div", attrs={"class": "wysiwyg_content"}):
            paragraphs.extend(p.get_text() for p in content.find_all("p"))

allTextString = "".join(text + "\n" for text in paragraphs)

# The file is opened only after the crawl has finished, so a failed run does
# not leave a truncated ipsum.txt behind, and the handle is always closed.
# UTF-8 is explicit because the scraped text may contain characters the
# platform's default encoding cannot represent.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(allTextString + "\n")