# scrape_mars.py
# Dependencies and Setup
from bs4 import BeautifulSoup
from splinter import Browser
import pandas as pd
import requests
import time
import re
# Mac - set up the Splinter browser (path to a local chromedriver install)
def init_browser():
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)


mars_info = {}
# Scrape all Mars data into the mars_info dictionary
def scrape_info():
    browser = init_browser()

    # Mars News
    # Visit the NASA news url
    url_news = "https://mars.nasa.gov/news/"
    browser.visit(url_news)
    # HTML Object
    html_news = browser.html
    soup = BeautifulSoup(html_news, "html.parser")

    # Scrape the latest News Title and Paragraph Text
    news_title = soup.find("div", class_="content_title").text
    news_paragraph = soup.find("div", class_="article_teaser_body").text

    mars_info["news_title"] = news_title
    mars_info["news_paragraph"] = news_paragraph
    # Featured Image
    # Visit the JPL Featured Space Image url
    url_spaceimage = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_spaceimage)

    # HTML Object
    img_html = browser.html
    img_soup = BeautifulSoup(img_html, "html.parser")

    # Find the url of the full-size image in the article's inline style
    featured_image = img_soup.find("article")["style"].replace('background-image: url(', '').replace(');', '')[1:-1]

    # Base url of the site
    main_url = "https://www.jpl.nasa.gov"

    # Combine the base url with the scraped route
    featured_image_url = main_url + featured_image
    mars_info["featured_image_url"] = featured_image_url
    # Mars Weather
    # Visit the Mars Weather twitter url
    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)

    # HTML Object
    weather_html = browser.html
    weather_soup = BeautifulSoup(weather_html, "html.parser")

    # Scrape the latest Mars weather tweet from the page
    weather = weather_soup.find("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")

    # Remove the anchor tag from the paragraph
    weather.a.decompose()
    weather = weather.text

    # Clean up the text
    mars_weather = weather.replace(" \n", '')
    mars_info["mars_weather"] = mars_weather
    # Mars Facts
    # Visit the Mars Facts webpage and use Pandas to scrape the table
    url_facts = "https://space-facts.com/mars/"

    # Use Pandas read_html to scrape tabular data from the page
    mars_facts = pd.read_html(url_facts)

    # Grab the Mars facts DataFrame from the list of tables
    mars_df = mars_facts[0]

    # Name the columns and set the index to Description
    mars_df.columns = ["Description", "Value"]
    mars_df.set_index("Description", inplace=True)

    # Convert the DataFrame to an html table string
    html_table = mars_df.to_html()

    # Strip unwanted newlines to clean up the table (reassign - str.replace returns a new string)
    html_table = html_table.replace("\n", '')

    # Save the html table to a file
    mars_df.to_html("mars_facts_data.html")
    mars_info["mars_facts"] = html_table
    # Mars Hemispheres
    # Visit the USGS Astrogeology Science Center url
    url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemisphere)

    # HTML Object
    html_hemisphere = browser.html
    hem_soup = BeautifulSoup(html_hemisphere, "html.parser")

    # Find the containers which hold the Mars hemispheres information
    hemispheres = hem_soup.find_all("div", class_="item")

    # Create an empty list to hold one dictionary per hemisphere
    hemispheres_info = []

    # Base url used to build absolute links inside the loop
    hemispheres_url = "https://astrogeology.usgs.gov"

    # Loop through each hemisphere item
    for item in hemispheres:
        title = item.find("h3").text
        hemispheres_img = item.find("a", class_="itemLink product-item")["href"]

        # Visit the link that contains the full-size image
        browser.visit(hemispheres_url + hemispheres_img)

        # HTML Object
        image_html = browser.html
        web_info = BeautifulSoup(image_html, "html.parser")

        # Build the full image url
        img_url = hemispheres_url + web_info.find("img", class_="wide-image")["src"]

        mars_info["title"] = title.strip()
        mars_info["img_url"] = img_url

        hemispheres_info.append({"title": title, "img_url": img_url})

    mars_info["hemispheres_info"] = hemispheres_info
    # Close the browser after scraping
    browser.quit()

    return mars_info
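

# Minimal usage sketch (an addition, not part of the original script):
# running the module directly performs one scrape and prints the collected
# dictionary. The __main__ guard is an assumption about how the module might
# be exercised locally; in a typical project scrape_info() is instead
# imported and called from a Flask route.
if __name__ == "__main__":
    results = scrape_info()
    print(results)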