-
Notifications
You must be signed in to change notification settings - Fork 1
/
Day 2 intermediate basics.R
134 lines (93 loc) · 4.13 KB
/
Day 2 intermediate basics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#Day Two - confirming our filter knowledge
#Learning Outcomes
#Confirm basic dplyr knowledge
#Produce plots quickly using ggplot2
#Use a pipe to filter a full dataset
#Add labels
#map colors for discrete and continuous
#summarize a more complex dataset
#First how do you load a library
library(dplyr)
library(ggplot2)
#let's start with some new data... the weddings dataset
#we need to load that in our Environment pane
#you can preview the first few rows with a function called head()
# Peek at the first rows of the weddings data.
head(x = weddings)

# This data is tidy: each column is a variable and each row is a case.
# Question for the class: which columns are discrete?

# Basic grammar of graphics, built up one piece at a time:
# first the dataset, then the aesthetics, then the geom.

# Step 1 - dataset only.
ggplot(data = weddings)
# THAT GRAPH PRODUCED:

# Step 2 - dataset plus aesthetics (x, y and colour), still no geom.
ggplot(data = weddings, mapping = aes(x = Budget, y = Guests, colour = State))
# THAT GRAPH PRODUCED:

# Step 3 - add a geom so the points are actually drawn.
ggplot(data = weddings, mapping = aes(x = Budget, y = Guests, colour = State)) +
  geom_jitter()

# AND BECAUSE WE CAN... we are doing some MATH INSIDE THE FUNCTION:
# point size is mapped to the sum of three columns.
ggplot(
  data = weddings,
  mapping = aes(
    x = Budget,
    y = Guests,
    colour = State,
    size = Dress + Venue + Experience
  )
) +
  geom_jitter()

# A new plot, this time with a discrete X and Y axis.
# NOTE(review): confirm Result and Budget.1 are discrete in the loaded data.
ggplot(
  data = weddings,
  mapping = aes(x = Result, y = Budget.1, colour = Experience)
) +
  geom_jitter()
#let's play around for a minute, which other GEOMS will be awesome here...
# Research design minute: let's test a hypothesis.
# Hypothesis: if the bride is OLDER, the EXPERIENCE will be lower,
# because they can't party hard.
cor.test(x = weddings$Age, y = weddings$Experience)
# How to interpret the output: check whether the p-value is < .05, then look
# at the correlation coefficient.
# Remember, you are just rejecting the null hypothesis.
# What we are doing here is exploratory data science.
#sometimes we might want to combine some things, that is when we use a pipe
#you can make a pipe with apple-shift-m (Cmd+Shift+M on a Mac; Ctrl+Shift+M on Windows/Linux in RStudio)
#what if we wanted to get average budget by state?
#start BIG with the dataset
# Start BIG with the whole dataset. Each pipe hands its left-hand result to
# the next verb as that verb's first argument: the dataset feeds group_by(),
# and the resulting groups feed the summary.
weddings %>%
  group_by(State) %>%
  summarise(mean(Budget))
#OH fancy, now we can add a graph to that
# Same grouped summary, but the summary column is NAMED (Average) to make it
# easier to use. The new dataset (State and Average) is piped straight into
# ggplot, and the x-axis labels are turned 90 degrees.
weddings %>%
  group_by(State) %>%
  summarise(Average = mean(Budget)) %>%
  ggplot(mapping = aes(x = State, y = Average)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90))
#Let's get in some groups and talk about this data, find some trends, what happened here
#how do you win?
#now lets make some stuff pretty, why not?
library(tvthemes)
library(ggthemes)
# The same jitter plot, coloured with a Brooklyn Nine-Nine scale.
ggplot(
  data = weddings,
  mapping = aes(x = Budget.1, y = Result, colour = Experience)
) +
  geom_jitter() +
  scale_color_brooklyn99(type = "continuous")

# But how many TV themes do we have to play with? Search the help pages.
??tvthemes

# The Simpsons colour scale.
ggplot(
  data = weddings,
  mapping = aes(x = Budget.1, y = Result, colour = Experience)
) +
  geom_jitter() +
  scale_color_simpsons(type = "continuous")

# But Simpsons if it was 1993: add the Excel theme.
ggplot(
  data = weddings,
  mapping = aes(x = Budget.1, y = Result, colour = Experience)
) +
  geom_jitter() +
  scale_color_simpsons(type = "continuous") +
  theme_excel()

# Now give me Nate Silver but with his friend Ron Swanson:
# Parks and Rec colours on the FiveThirtyEight theme.
ggplot(
  data = weddings,
  mapping = aes(x = Budget.1, y = Result, colour = Experience)
) +
  geom_jitter() +
  scale_color_parksAndRec(type = "continuous") +
  theme_fivethirtyeight()
#lets just have fun for a second
# Budget by state, with point size mapped to 1 / Result and Experience shown
# on a green-to-dark-green gradient; x-axis labels are turned 90 degrees.
ggplot(
  data = weddings,
  mapping = aes(x = State, y = Budget, size = 1 / Result, colour = Experience)
) +
  geom_jitter() +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_color_gradient(low = "green", high = "darkgreen") +
  labs(title = "When More is Less")
#let's try it again
#load the weddings data...
# Take another look at the structure of the weddings data.
glimpse(weddings)
# This dataset has everything: categorical, discrete, location, names, and
# even different levels of measure.
# Some things here are dirty and hard to work with, so let's clean them.

# Try this: in rename() the new name comes first (new = old).
# The renamed data is only printed here; it is not assigned back to weddings.
weddings %>%
  rename(Bride.Name = Bride.1)

# Overwrite the first column name of weddings in place.
colnames(weddings)[1] <- "Dan is Cool"
#that makes more sense
#so which one matters most to victory?
# Build w2: the weddings data plus a new Total column holding the total score
# (Dress + Venue + Food + Experience).
w2 <- mutate(weddings, Total = Dress + Venue + Food + Experience)
#now we can use w2
#now change that up a little... take five minutes and chat with your pals...
#Now we need to learn some more advanced dplyr stuff, and for this you will need to use pipes
# What if we want to know some averages?
# Average each of the four score columns within every Result group.
# Each summary column gets an explicit name (like `Average` in the
# budget-by-state summary) so the output has usable column names instead of
# ones like `mean(Experience)` that need backticks to reference.
weddings %>%
  group_by(Result) %>%
  summarize(
    Avg.Experience = mean(Experience),
    Avg.Dress = mean(Dress),
    Avg.Food = mean(Food),
    Avg.Venue = mean(Venue)
  )