-
Notifications
You must be signed in to change notification settings - Fork 0
/
day_six_analysis.R
122 lines (96 loc) · 3.35 KB
/
day_six_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#day six - key variables
#libraries
#dplyr gives us glimpse/filter/mutate/if_else/group_by/summarize and the %>% pipe used below
library(dplyr)
#ggplot2 draws the density plots
library(ggplot2)
#ggthemr provides the ggthemr() theme setter called at the end of this section
library(ggthemr)
#a good bit of this package may be written in FORTRAN
#NOTE(review): nothing in the visible part of this file calls a car function by name - confirm it is still needed
library(car)
#NOTE(review): theme_excel()/scale_fill_excel() used further down appear to come from ggthemes, which is not loaded here - confirm
#optional - applies the "carrot" ggthemr theme to the plots that follow
ggthemr("carrot")
#we have been working with TIDY data, the tidy philosophy has three axioms
#a. each variable has a column
#b. each observation has a row
#c. each value has a cell
#this is also "long data" more on this next week
#also - TIBBLES - are a special class of data frames that
#a. allow you to have more creative names
#b. throw better error messages
#c. print more coherently
#you won't see that many tibbles in your travels unless you intentionally make them
#data.frame is far more common (this is why I use it reflexively)
#at this point I think you have good control of the basic functions like writing a filter
#the more advanced tools are those that create new columns, like MUTATE
#on Wednesday we looked at a formula about brides which did some nested logic, let's work there now
#to start with
#take a quick look at the structure of weddings
weddings %>%
  glimpse()
#we need to start thinking about the idea of the KEY variable
#this is a special column of information, I think of it like a zipper
#here is a column - State is the one we use as a key below
weddings[["State"]]
#some of our coolest things (like maps) will involve zippering two datasets together
#we can also use a key column to make things a little less annoying
#like a serial filter
#let's manually create a list of states that are MID
mid <- c(
  "Florida",
  "Massachusetts",
  "Illinois"
)
#we can play with this in fun ways with a special operator: %in%
#%in% is TRUE for every row whose State appears in mid
weddings %>%
  filter(State %in% mid)
#RESULT:
#same filter written without the pipe - prints the same rows again
filter(weddings, State %in% mid)
#now let's do this the wild way:
#add a column meh that is "gross" for states in mid and "baller" for everything else
#NOTE(review): this starts from weddingsC, which is not created anywhere in the visible file - confirm it is loaded beforehand
weddingsD <- mutate(
  weddingsC,
  meh = if_else(State %in% mid, "gross", "baller")
)
#and graph that bad boi:
#density of Total, filled by Result, one panel per value of meh
ggplot(weddingsD, aes(x = Total, fill = as.factor(Result))) +
  geom_density(alpha = 0.7) +
  facet_grid(cols = vars(meh))
#ok that is legit interesting - florida/mass/illinois have weird stuff
#NOW SLOW DOWN the function and read it one piece at a time
#first call and store
weddingsD <- weddingsC %>%
  mutate(
    #the name of the new column
    meh = if_else(
      #the logical test - is the STATE in MID
      condition = State %in% mid,
      #if YES the value of the cell...
      true = "gross",
      #if NO the value of the cell
      false = "baller"
    )
  )
#what if we wanted more than two labels?
#a second list of states, this time the BALLER ones
baller <- c("California", "Arizona", "New York")
#case_when checks its tests top to bottom and uses the first one that is TRUE,
#which gives the same answer as nesting one if_else inside another
weddingsD <- weddingsC %>%
  mutate(
    meh = case_when(
      #first test - is the STATE in MID
      State %in% mid ~ "mid",
      #if not, second test - is the STATE in BALLER
      State %in% baller ~ "baller",
      #value if neither MID nor BALLER
      TRUE ~ "meh"
    )
  )
#run it again, one panel per value of meh
#NOTE(review): BudgetPerGues looks like a truncated column name (BudgetPerGuest?) - confirm against names(weddingsD)
#NOTE(review): theme_excel() and scale_fill_excel() appear to come from ggthemes, which this file never loads - confirm it is attached
ggplot(weddingsD, aes(x = Total, fill = as.factor(BudgetPerGues))) +
  geom_density(alpha = 0.7) +
  #OVERRIDE CARROT
  theme_excel() +
  scale_fill_excel() +
  facet_grid(cols = vars(meh))
#grouped summary - one row per value of meh (this collapses rows, so it is not a window function)
weddingsD %>%
  group_by(meh) %>%
  #what do you want to know for the average
  #the outputs are named so the columns read mean_age / mean_budget
  #instead of the awkward `mean(Age)` / `mean(Budget)`
  #NOTE(review): mean() returns NA for any group with missing values - add na.rm = TRUE if that is not wanted
  summarize(
    mean_age = mean(Age),
    mean_budget = mean(Budget)
  )
#sugar = the gap between the Age and Age.1 columns
weddingsE <- mutate(weddingsD, sugar = Age - Age.1)
#so what really matters
#time for the FANCY MATH!
#it is FRIDAY after all
#analysis of variance of Total on Generation, sugar, Budget.1 and State,
#with every interaction among them (that is what * does in a formula)
fit <- aov(
  Total ~ Generation * sugar * Budget.1 * State,
  data = weddingsE
)
#diagnostic plots first, then the ANOVA table
plot(fit)
summary(fit)