scrape_mars.py
# Import Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import pandas as pd

# Initialize browser
def init_browser():
    # Replace the path below with your actual path to the chromedriver
    # Mac users:
    # executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    # return Browser('chrome', **executable_path, headless=False)
    # Windows users:
    # executable_path = {'executable_path': '/Users/cantu/Desktop/Mission-to-Mars'}
    # return Browser('chrome', **executable_path, headless=False)
    exec_path = {'executable_path': '/app/.chromedriver/bin/chromedriver'}
    return Browser('chrome', headless=True, **exec_path)
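# Note: '/app/.chromedriver/bin/chromedriver' matches the default install
# location of the Heroku chromedriver buildpack, which this deployment
# presumably targets; local runs will need one of the commented-out
# paths above instead.
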
# Create a global Mission to Mars dictionary that can be inserted into Mongo
mars_info = {}

# NASA MARS NEWS
def scrape_mars_news():
    # Initialize the browser before the try block so the finally
    # clause can always close it safely
    browser = init_browser()
    try:
        # Visit the NASA news url through the splinter module
        url = 'https://mars.nasa.gov/news/'
        browser.visit(url)
        # HTML object
        html = browser.html
        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html, 'html.parser')
        # Retrieve the latest element that contains the news title and paragraph
        news_title = soup.find('div', class_='content_title').find('a').text
        news_p = soup.find('div', class_='article_teaser_body').text
        # Dictionary entries from MARS NEWS
        mars_info['news_title'] = news_title
        mars_info['news_paragraph'] = news_p
        return mars_info
    finally:
        browser.quit()
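# Note: the news list on mars.nasa.gov is rendered client-side, which is
# presumably why a real browser (splinter) is used here rather than a
# plain requests.get call.
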
# FEATURED IMAGE
def scrape_mars_image():
    # Initialize browser
    browser = init_browser()
    try:
        # Visit Mars Space Images through the splinter module
        image_url_featured = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(image_url_featured)
        # HTML object
        html_image = browser.html
        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_image, 'html.parser')
        # Retrieve the background-image url from the style attribute,
        # stripping the surrounding css and quotes
        featured_image_url = soup.find('article')['style'].replace('background-image: url(', '').replace(');', '')[1:-1]
        # Website url
        main_url = 'https://www.jpl.nasa.gov'
        # Concatenate the website url with the scraped route to get the full link
        featured_image_url = main_url + featured_image_url
        # Dictionary entry from FEATURED IMAGE
        mars_info['featured_image_url'] = featured_image_url
        return mars_info
    finally:
        browser.quit()
# Mars Weather
def scrape_mars_weather():
    # Initialize browser
    browser = init_browser()
    try:
        # Visit the Mars Weather Twitter account through the splinter module
        weather_url = 'https://twitter.com/marswxreport?lang=en'
        browser.visit(weather_url)
        # HTML object
        html_weather = browser.html
        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_weather, 'html.parser')
        # Find all elements that contain tweets
        latest_tweets = soup.find_all('div', class_='js-tweet-text-container')
        # Look for entries that contain weather-related words ('Sol' and
        # 'pressure') to exclude non-weather tweets; keep the first match
        weather_tweet = None
        for tweet in latest_tweets:
            tweet_text = tweet.find('p').text
            if 'Sol' in tweet_text and 'pressure' in tweet_text:
                weather_tweet = tweet_text
                break
        # Dictionary entry from WEATHER TWEET
        mars_info['weather_tweet'] = weather_tweet
        return mars_info
    finally:
        browser.quit()
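# Note: the 'js-tweet-text-container' class belongs to the legacy Twitter
# page layout; if Twitter serves its newer markup instead, the selector
# will match nothing and weather_tweet will stay None.
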
# Mars Facts
def scrape_mars_facts():
    # Visit the Mars facts url and use Pandas' `read_html` to parse the page
    facts_url = 'http://space-facts.com/mars/'
    mars_facts = pd.read_html(facts_url)
    # Find the Mars facts DataFrame in the list of DataFrames and assign it to `mars_df`
    mars_df = mars_facts[0]
    # Assign the columns `['Description', 'Value']`
    mars_df.columns = ['Description', 'Value']
    # Set the index to the `Description` column in place
    mars_df.set_index('Description', inplace=True)
    # Convert the DataFrame to an html table string
    data = mars_df.to_html()
    # Dictionary entry from MARS FACTS
    mars_info['mars_facts'] = data
    return mars_info
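# Note: pd.read_html needs an html parsing backend (lxml, or html5lib
# together with BeautifulSoup) installed in the environment; this is an
# assumption the script does not check for itself.
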
# MARS HEMISPHERES
def scrape_mars_hemispheres():
    # Initialize browser
    browser = init_browser()
    try:
        # Visit the hemispheres website through the splinter module
        hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(hemispheres_url)
        # HTML object
        html_hemispheres = browser.html
        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_hemispheres, 'html.parser')
        # Retrieve all items that contain Mars hemispheres information
        items = soup.find_all('div', class_='item')
        # Create an empty list for the hemisphere image urls
        hiu = []
        # Store the main url
        hemispheres_main_url = 'https://astrogeology.usgs.gov'
        # Loop through the items previously stored
        for i in items:
            # Store the title
            title = i.find('h3').text
            # Store the link that leads to the full image website
            partial_img_url = i.find('a', class_='itemLink product-item')['href']
            # Visit the link that contains the full image website
            browser.visit(hemispheres_main_url + partial_img_url)
            # HTML object of the individual hemisphere information website
            partial_img_html = browser.html
            # Parse HTML with Beautiful Soup for every individual hemisphere website
            soup = BeautifulSoup(partial_img_html, 'html.parser')
            # Retrieve the full image source
            img_url = hemispheres_main_url + soup.find('img', class_='wide-image')['src']
            # Append the retrieved information to the list of dictionaries
            hiu.append({'title': title, 'img_url': img_url})
        # Dictionary entry from MARS HEMISPHERES
        mars_info['hiu'] = hiu
        # Return the mars_info dictionary
        return mars_info
    finally:
        browser.quit()
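
# A minimal usage sketch for running this script directly. The scrape_*
# functions are the ones defined above; running them all in sequence is an
# assumption for illustration, since the repo's app (e.g. a Flask route)
# may call them individually instead.
if __name__ == '__main__':
    scrape_mars_news()
    scrape_mars_image()
    scrape_mars_weather()
    scrape_mars_facts()
    scrape_mars_hemispheres()
    # mars_info now aggregates every scraped entry and is ready to be
    # inserted into Mongo by the calling application
    print(mars_info)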