-
Notifications
You must be signed in to change notification settings - Fork 2
/
Day Four Notes.R
211 lines (162 loc) · 6.21 KB
/
Day Four Notes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#this is our base code for day four - where we learn about scrapers
#this is our featured library
library(rvest)
#it would be important to have these running as well
library(ggplot2)
library(dplyr)
library(tibble)
#LET'S BE HONEST - YOU WILL NEED A TON OF LIBRARIES.
#rvest is popular and effective; it offers a number of options for quick scraping
# First: store the location that will be scraped.
feetball <- "https://www.sports-reference.com/cfb/years/2018-schedule.html"
# This is the key: the schedule is stored on the page as a true HTML table, so
# we can select it by id with an XPath and hand it straight to html_table().
foot <- feetball %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="schedule"]') %>%
  html_table()
# Inspect the raw result. This used to be done with magrittr's tee pipe
# (%T>%), but magrittr is never attached in this script and the loaded
# packages only re-export %>%, so the tee operator could not be found.
# Calling View() after the assignment has the same effect.
View(foot)
# html_table() on a node set returns a list of tables, so data.frame() is used
# to flatten it before conversion. as_tibble() replaces the deprecated
# as.tibble() spelling.
football <- as_tibble(data.frame(foot))
View(football)
#Let's table the football question for now, we will wrangle this in a few dozen lines
# Movie example.
# Older tutorials use "read"; now we have read_html().
lebowski <- read_html("https://www.imdb.com/title/tt0118715/")
# Check out lebowski in your global environment: it did not simply store the
# URL, it actually hit the website.
# There are two sub-lists: the head and the body.
# The code below reaches into the stored document and basically runs a
# specialized search/find operation.
leb_cast <- lebowski %>%
  # The XPath goes inside single quotes because it contains double quotes.
  html_nodes(xpath = '//*[@id="titleCast"]/table') %>%
  html_table(header = TRUE)
# Print the cast as a tibble. as_tibble() replaces the deprecated as.tibble()
# spelling and matches the call used for the electoral-college table.
as_tibble(data.frame(leb_cast))
# And if we want the full cast, read the dedicated credits page instead.
L2 <- read_html("https://www.imdb.com/title/tt0118715/fullcredits?ref_=tt_cl_sm#cast")
# Key thing: most XPaths that are useful for tables end with /table.
# Written as nested calls: select the table node first, then parse it with the
# first row treated as the header. The result prints at top level.
html_table(
  html_nodes(L2, xpath = '//*[@id="fullcredits_content"]/table'),
  header = TRUE
)
# Perhaps we just want the place where the movie poster is: select the <img>
# inside the .poster element and read its src attribute (the image URL).
poster <- lebowski %>%
  html_nodes(".poster img") %>%
  html_attr("src")
# You can then download the image itself.
# Guard against the selector matching nothing, which would otherwise surface
# as an obscure error from download.file().
if (length(poster) == 0) {
  stop("no poster image found on the page", call. = FALSE)
}
# mode = "wb" forces a binary transfer; without it a .png is written in text
# mode on Windows and the file is corrupted. Only the first match is
# downloaded because a single destination file is given.
download.file(poster[[1]], "poster.png", mode = "wb")
# Sweet, I had this poster in my college dorm.
# More practice: scrape the electoral-college table from Wikipedia and reshape
# it from wide to long (the job reshape2::melt() used to do) before plotting.
elect <- read_html("https://en.wikipedia.org/wiki/Electoral_College_(United_States)")
elect2 <- elect %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/div/table[7]') %>%
  html_table(fill = TRUE)
elect3 <- as_tibble(data.frame(elect2))
View(elect3)
# Keep rows 4 to 54 only.
# NOTE(review): presumably these are the per-state rows; confirm against the
# live page, since the table layout can change.
elect4 <- slice(elect3, 4:54)
# Clean this now. I mean, this is super yucky.
elect5 <- select(elect4, -Electionyear)
# The plot below needs long-format columns named `variable` and `value`.
# Nothing in the script ever created them, so the original ggplot call failed
# with "object 'variable' not found". Build them here with base R: one row per
# (remaining column, California row) pair.
california <- filter(elect5, Electionyear.1 == "California")
year_cols <- setdiff(names(california), "Electionyear.1")
california_long <- data.frame(
  # A factor with levels in column order keeps the x axis in table order
  # rather than alphabetical order.
  variable = factor(
    rep(year_cols, each = nrow(california)),
    levels = year_cols
  ),
  # Go through character so mixed or factor columns combine safely, then let
  # type.convert() turn the values into numbers when every entry parses.
  value = type.convert(
    as.character(
      unlist(lapply(california[year_cols], as.character), use.names = FALSE)
    ),
    as.is = TRUE
  ),
  stringsAsFactors = FALSE
)
ggplot(california_long, aes(variable, value)) + geom_point()
# Let's get some text, shall we?
# Here is an article about steak.
steak <- read_html("https://thetakeout.com/drew-magary-ruins-wagyu-steak-1827996474")
# Pull the text of every <p> element on the page.
steak_text <- steak %>%
  html_nodes("p") %>%
  html_text()
# This used to end in magrittr's tee pipe (%T>% View()), but magrittr is never
# attached in this script, so the operator could not be found. Viewing and
# then printing the stored vector has the same visible effect.
View(steak_text)
steak_text
# A recipe page: parse it once, then pull two different pieces of text.
sprouts <- read_html("https://www.bonappetit.com/recipe/roasted-brussels-sprouts-with-warm-honey-glaze")
# Text of every element with the "ingredient" class.
html_text(html_nodes(sprouts, ".ingredient"))
# Text of the <ul> reached by a full CSS selector path.
html_text(
  html_nodes(
    sprouts,
    "#react-app > div > div.post-layout > div.grid.grid--post-layout > div > div > div.col.col-xs-4.col-sm-10.col-md-8.col--MainWithSidebar.col--main-content > div.content-group > div.row.steps-area > div > ul"
  )
)
#a quick dirty method to do a lot of this ... a better method would be to write a function with a for-loop, but that is in a few weeks
#check out this URL
"http://archive.fortune.com/magazines/fortune/fortune500_archive/full/1955/"
#now this one
"http://archive.fortune.com/magazines/fortune/fortune500_archive/full/1990/"
#what do you notice?
#They did a really good job with consistent URLs
#you can do this in Excel if you need to, and honestly the correct method is a for-loop, but this is just such a great MacGyver
#THINGS TO KNOW adding quotes to things is really tricky.
# Lay out the pieces of one line of R code per year, one piece per column, so
# that gluing a row together yields e.g.
#   fortune1955<-read_html('http://.../full/1955')
fLines <- data.frame(
  A = "fortune",
  B = 1955:2010,
  C = "<-read_html",
  D = "(",
  E = "http://archive.fortune.com/magazines/fortune/fortune500_archive/full/",
  F = 1955:2010,
  G = ")",
  stringsAsFactors = FALSE
)
View(fLines)
# Glue the columns together with no separator. The single quotes around the
# URL are added here as literal pieces, which sidesteps the quoting problem.
FFlines <- with(fLines, paste0(A, B, C, D, "'", E, F, "'", G))
Alright <- data.frame(FFlines, stringsAsFactors = FALSE)
# Run every generated line as if it had been typed at the console.
eval(parse(text = Alright$FFlines))
# Expect a 404 error part-way through. eval() stops at the first line that
# fails, so check your global environment to see which fortuneYYYY objects
# were actually created.
# Try one year by hand: select the second table inside the element with id
# MagListDataTable and parse it with the first row as the header.
# Key thing: most XPaths that are useful for tables end with /table.
html_table(
  html_nodes(fortune2002, xpath = '//*[@id="MagListDataTable"]/table[2]'),
  header = TRUE
)
# fPop was used below but never defined anywhere in the script. Build it from
# fLines, keeping only the years whose page object (fortune1955, ...) actually
# exists. The bulk read_html() run stops at the first failing URL, so later
# years have no object.
downloaded <- vapply(
  paste0(fLines$A, fLines$B),
  exists,
  logical(1),
  USE.NAMES = FALSE
)
fPop <- fLines[downloaded, ]
# With zero rows paste0() would still emit one bogus line of code, so stop
# early with a clear message instead.
if (nrow(fPop) == 0) {
  stop("no fortuneYYYY pages found; run the read_html() step first", call. = FALSE)
}
# cyclops: names for the parsed tables (fortune1955B, ...).
# gambit: names of the downloaded pages (fortune1955, ...).
cyclops <- paste0(fPop$A, fPop$B, "B")
gambit <- paste0(fPop$A, fPop$B)
# One line of code per year:
#   fortune1955B <- fortune1955 %>% html_nodes(...)%>%html_table(header=TRUE)
Wolverine <- paste(
  cyclops, "<-", gambit, "%>%",
  "html_nodes(xpath='//*[@id=\"MagListDataTable\"]/table[2]')%>%html_table(header=TRUE)"
)
writeLines(Wolverine)
# Copy and run one of those lines from Wolverine.
# Even more power: since the lines are stored in a variable, run them all.
eval(parse(text = Wolverine))
# The next step: the table names followed by commas, ready to paste into a call.
rogue <- paste0(cyclops, ",")
writeLines(rogue)
# html_table() returned a list for each year; flatten each into a data frame.
storm <- paste0(cyclops, "<-", "data.frame(", cyclops, ")")
writeLines(storm)
eval(parse(text = storm))
# Tag every table with the year it came from.
Nightcrawler <- paste0(cyclops, "<-mutate(", cyclops, ",", "year=", fPop$B, ")")
writeLines(Nightcrawler)
eval(parse(text = Nightcrawler))
# Stack the yearly tables fortune1955B ... fortune2005B into one data frame.
# The names are generated and looked up with mget() instead of pasting 51
# identifiers by hand. unname() keeps rbind() from prefixing the row names
# with the object names. mget() fails with an error if any table is missing.
Magneto <- do.call(
  rbind,
  unname(mget(paste0("fortune", 1955:2005, "B"), envir = environment()))
)
# This used to end in magrittr's tee pipe (%T>% View()), but magrittr is never
# attached in this script, so the operator could not be found. A plain View()
# call after the assignment does the same job.
View(Magneto)
# Now go ahead and rename those columns.
# List each company once.
Magneto %>%
  distinct(Company)