-
Notifications
You must be signed in to change notification settings - Fork 2
/
Day Three Notes.R
331 lines (241 loc) · 9.36 KB
/
Day Three Notes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#day three pre-code examples, to make things run more smoothly
#first, call your libraries for the day
library(ggplot2)
library(tibble)
library(tidyr)
library(dplyr)
library(magrittr)
#some handy datasets
library(babynames)
library(nycflights13)
library(ggplot2movies)
# ggplot2movies is the new one here: `movies` is a list of about 60k films
# from IMDB. Let's use that today.
# A review of dplyr code: five big functions, starting with filter().
movies %>%
  filter(Comedy == 1)
# So what if we want all comedies after 1990, in descending order of length?
movies %>%
  filter(Comedy == 1, year > 1990) %>%
  arrange(desc(length))
# Here is a really fun variation: save that result AND open it in the viewer.
recentComedy <- movies %>%
  filter(Comedy == 1, year > 1990) %>%
  # oh, and look at that: a "T" pipe on the next line
  arrange(desc(length)) %T>%
  View()
# The T pipe runs its right-hand side for the side effect only and then hands
# the left-hand value onward, so recentComedy receives the arranged table
# rather than whatever View() returns.
# EXERCISE: go back to the arrange() line above, swap the T pipe for a plain
# %>%, and see what happens.
# You did it? Excellent. With a plain pipe the assignment captures the result
# of View() instead of the data.
# But why else would you need the T pipe? Some workflows (with d3.js, for
# example) want to print or store things mid-pipeline. Relatedly, %<>% can be
# used to assign things inside the pipe system.
# Back to the extraction theme, this time with babynames.
# We know there are nearly two million entries in this dataset...
# ...but HOW MANY ACTUAL NAMES?
babynames %>%
  distinct(name)
# how many...
# OK, this time let's really do some science on the NYC airports: instead of
# averaging over a whole year, draw a random 5% of the flights.
flights %>%
  sample_frac(size = 0.05, replace = FALSE)
# Now let's see what the average departure delay was in a fresh 5% sample.
flights %>%
  sample_frac(size = 0.05) %>%
  summarize(mean(dep_delay, na.rm = TRUE))
# Run the sample-and-summarize pipeline just above FIVE TIMES.
# What do you notice?
# Perhaps you just want some fixed number of random rows.
babynames %>%
  sample_n(14)
# If you ever need a random name generator for fiction writing, this is your jam.
# Arrange every babynames row by count (largest first), slice off the top 25
# rows of all time, and list the distinct names among them.
arrange(babynames, desc(n)) %>%
  slice(1:25) %>%
  distinct(name)
# Let's find the height of the age of conformity: the mean year of those same
# top 25 rows.
arrange(babynames, desc(n)) %>%
  slice(1:25) %>%
  summarize(mean(year))
# Well that's it.
# What if we use the top 100?
top_n(babynames, 100, n) %>%
  summarize(mean(year))
# A density graphic for the age of conformity.
babynames %>%
  top_n(500, n) %>%
  ggplot(aes(year, n, colour = sex)) +
  geom_density2d()
# Really stretch out that plot zoom; you won't be disappointed.
# OK, why not try hexagonal bins as well?
babynames %>%
  top_n(500, n) %>%
  ggplot(aes(year, n, colour = sex)) +
  geom_hex()
# OH WAIT, didn't work? Troubleshoot this...
# And just for information's sake, label the points with the names.
babynames %>%
  top_n(500, n) %>%
  ggplot(aes(year, n, colour = sex)) +
  geom_text(aes(label = name), check_overlap = TRUE)
# So what are we really learning about Linda...
babynames %>%
  filter(name == "Linda") %>%
  ggplot(aes(year, prop)) +
  geom_area()
# NERD ALERT - time for kurtosis.
# This is the measure of how sharp that peak is...
# You will need these handy libraries for some real cool maths.
# Both packages export functions named kurtosis() and skewness(); whichever is
# attached last masks the other, and the two packages do not report kurtosis
# on the same scale. Every call below therefore names its package explicitly.
# e1071 is used because it is attached last, so it is the one the unqualified
# calls resolve to.
library(moments)
library(e1071)
linda <- filter(babynames, name == "Linda")
e1071::kurtosis(linda$n)
e1071::skewness(linda$n)
# Let's do a quick comparison.
Jen <- filter(babynames, name == "Jennifer")
e1071::kurtosis(Jen$n)
e1071::skewness(Jen$n)
# And one more for comparison's sake.
Rob <- filter(babynames, name == "Robert")
e1071::kurtosis(Rob$n)
e1071::skewness(Rob$n)
# So which one has a sharper peak?
# Which one has more total children...
sum(linda$n)
sum(Jen$n)
# Professors: what is your conclusion?
# AT THIS POINT RUN THIS HELP CODE.
??base::logic
# You may need to pull stuff out of a column, with pull() or select().
movies %>%
  pull(year)
# That is just all the values from that column as a vector; it could be useful.
# What if I don't care about genre...
movies %>%
  select(title, year, length, rating)
# ...a nice new dataset without the columns I didn't need.
# Let's work with VECTOR summary functions.
# Get all flights on Valentine's Day...
flights %>%
  filter(month == 2, day == 14) %>%
  # ...make a new column called rocker holding the running total of dep_delay...
  mutate(rocker = cumsum(dep_delay)) %T>%
  # ...take a T pipe to dump all this data into a CSV, then T pipe again...
  write.csv(file = "somedata.csv") %T>%
  # ...to pop it up like a spreadsheet and, WHILE we are at it, make a ggplot.
  View() %>%
  ggplot(aes(dep_time, rocker, colour = carrier)) +
  geom_jitter()
# What even was rocker?
# NOTE(review): cumsum() propagates NA, so rocker is NA from the first missing
# dep_delay onward - confirm that is acceptable for this plot.
# I would suggest you ??magrittr and read the original description of the
# code; it is funny and useful.
# Key syntax for really getting pipes: LHS (left-hand side) and RHS
# (right-hand side).
#   %>%   the basic pipe, and the most common one
#   %T>%  returns the LEFT value so it pops back out of the pipe, sort of like
#         a ninja turtle lifting a lid on a pipe
#   %$%   carries the elements of the left side through to the right side
# The %$% pipe only functions if you call magrittr on your own.
# Using the ggplot2movies dataset from IMDB:
movies %>%
  # We want just the movies that are above average, while still retaining the
  # information about what the ratings were.
  subset(rating > mean(rating)) %$%
  # Because we used the exposition pipe on that subset, we can refer to its
  # other columns by name here.
  cor(rating, length)
# Try this code again with a standard pipe: it can't deal with "length",
# because the RIGHT-hand side needs to be able to see inside the pipe result too.
# From last time, with a little programming mojo...
babynames %>%
  filter(name %in% c("Aubri", "Aubrey")) %>%
  mutate(
    accurate = if_else(name == "Aubrey", "Wrong", "Right", missing = "WUT")
  ) %>%
  ggplot(aes(year, n, shape = sex, colour = accurate)) +
  geom_point()
# Summarize functions are really helpful...
movies %>%
  summarize(mean(rating))
babynames %>%
  group_by(year) %>%
  summarize(sum(n), mean(n))
# At this point we need to start talking about putting stuff together.
# bind_cols() puts two tables next to each other.
bind_cols()
# bind_rows() adds more rows to a table with the same structure.
bind_rows()
# Set-theory operators are a little more complicated, but super rewarding;
# let's get some data...
# These are called FILTERING JOINS.
oldnames <- babynames %>%
  filter(year == 1960) %>%
  top_n(100, n)
newnames <- babynames %>%
  filter(year == 2015) %>%
  top_n(100, n)
# So now we have oldnames and newnames, each holding the top 100 entries for
# its year.
# Let's see which names have been retained over those 55 years...
oldnames %>%
  semi_join(newnames, by = "name")
newnames %>%
  semi_join(oldnames, by = "name")
# So names have changed a lot.
# And the names that have NOT been retained:
oldnames %>%
  anti_join(newnames, by = "name")
# Let's make this a little easier by looking at one sex at a time.
oldboynames <- babynames %>%
  filter(sex == "M", year == 1960) %>%
  top_n(100, n)
newboynames <- babynames %>%
  filter(sex == "M", year == 2015) %>%
  top_n(100, n)
oldboynames %>%
  semi_join(newboynames, by = "name")
oldgirlnames <- babynames %>%
  filter(sex == "F", year == 1960) %>%
  top_n(100, n)
newgirlnames <- babynames %>%
  filter(sex == "F", year == 2015) %>%
  top_n(100, n)
oldgirlnames %>%
  semi_join(newgirlnames, by = "name")
# Let's try a test: go all the way back to 1880.
reallyoldgirlnames <- babynames %>%
  filter(sex == "F", year == 1880) %>%
  top_n(100, n)
reallyoldgirlnames %>%
  semi_join(newgirlnames, by = "name")
reallyoldgirlnames %>%
  semi_join(oldgirlnames, by = "name")
# What can you say about these names then?
# You can also use MUTATING JOINS.
# Let's make a version of flights that is pretty easy to read...
concise_flights <- flights %>%
  select(origin, dest, dep_delay, arr_delay, carrier)
concise_flights
# But what if you wanted to know the actual names of those carriers? UA is
# easy, but B6?
# This reveals...
nycflights13::airports
View(airports)
# As you can see, there are five datasets here: airlines, airports, flights,
# planes, and weather.
# Let's start with our concise data and add the names of the air carriers.
# inner_join() lets you use a "key" value shared between datasets to add more
# data.
concise_flights %>%
  inner_join(airlines, by = "carrier")
# Let's assume that JFK has the worst weather and treat it as the norm:
# keep only the JFK weather rows, drop the origin column, and take a look.
weatherB <- weather %>%
  filter(origin == "JFK") %>%
  select(-origin) %T>%
  View()
# You are starting to realize that there could be a massive, nascent dataset
# here: all of the data, joined together.
allflights <- flights %>%
  inner_join(airlines, by = "carrier") %>%
  left_join(planes, by = "tailnum") %>%
  # airports is keyed on faa, so copy dest into a column of that name first
  mutate(faa = dest) %>%
  left_join(airports, by = "faa") %>%
  inner_join(weatherB, by = "time_hour") %T>%
  View()
# NOTE(review): columns that share a name across these tables but are not join
# keys presumably come out with .x/.y suffixes - check names(allflights).
# Special problem: weather is recorded at three airports.
dim(weather)
View(filter(weather, month == 1 & day == 1 & hour == 1))
# Here is the problem: if we just inner_join on time_hour, it will add the
# weather from EACH airport to EVERY flight.
# The fix is to match on the airport as well as the hour, so each flight only
# picks up the weather recorded at its own origin. Using both columns as the
# key does this in a single join: there is no need to split flights and
# weather into per-airport tables and bind them back together, and origin
# stays a single column instead of coming out as origin.x / origin.y.
fullFlights <- inner_join(flights, weather, by = c("origin", "time_hour"))
# Some fun functions.
fullFlights %>%
  ggplot(aes(time_hour, dep_delay, colour = wind_speed)) +
  geom_jitter()
# Test arrival delay against wind speed.
fullFlights %$% cor.test(arr_delay, wind_speed)
# Well, that doesn't help.
# Test departure delay against the other weather measurements, one at a time.
fullFlights %$% cor.test(dep_delay, wind_dir)
fullFlights %$% cor.test(dep_delay, temp)
fullFlights %$% cor.test(dep_delay, pressure)
fullFlights %$% cor.test(dep_delay, humid)
fullFlights %$% cor.test(dep_delay, visib)