-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKOI_ML_explore.R
257 lines (193 loc) · 12.7 KB
/
KOI_ML_explore.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#' ---
#' title: "Kepler Space Telescope Exoplanet Search Data Machine Learning Classification Project"
#' subtitle: "Data Set Exploration and Preliminary Analysis"
#' author: "Timothy Drexler"
#' date: "August 2019"
#' ---
## ---- Setup ------------------------------------------------------------------
# echo chunk source in the rendered output
knitr::opts_chunk$set(echo = TRUE)

# Packages are attached with library() rather than require(): library() stops
# with an error as soon as a package is missing, whereas require() only warns
# and returns FALSE, so the script would fail later with a less obvious error.

# used to load large csv file
library(readr)
# used to manipulate data frame
library(dplyr)
# used for plots
library(ggplot2)
# used for plot colors
library(RColorBrewer)
# used to arrange plots
library(grid)
library(gridExtra)
# used to melt data for plotting
library(data.table)
# used to plot correlation matrix
library(corrplot)
# used to test for multivariate normality
library(MVN)

# prevent masking of 'dplyr' select by other packages
select <- dplyr::select
## ---- Data Import ------------------------------------------------------------
# Build the working data frame: read the CSV, keep the response plus the
# predictors of interest (error columns, flags, ids, etc. are left out), and
# store the response as a factor.
koi_df <-
  read_csv("KOI_data.csv", n_max = 9564) %>%
  data.frame() %>%
  select(
    koi_pdisposition,
    koi_period, koi_impact, koi_duration, koi_depth,
    koi_prad, koi_teq, koi_insol, koi_model_snr,
    koi_steff, koi_slogg, koi_srad, koi_kepmag
  ) %>%
  mutate(koi_pdisposition = as.factor(koi_pdisposition))

# rows with at least one missing value, kept aside for inspection
# (must be extracted before 'koi_df' is reduced to complete cases below)
koi_df_missing <- koi_df %>% filter(!complete.cases(.))

# from here on, work only with rows that have no missing values
koi_df <- koi_df %>% filter(complete.cases(.))
## ---- Data Set Exploration ---------------------------------------------------
## boxplots --------------------------------------------------------------------
# overall distributions: data melted to long form, one free-scaled facet per variable
as.data.table(koi_df) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(variable, value) %>%
  ggplot(aes(y = value, x = variable, fill = variable)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  labs(
    title = "Box Plots of Quantitative Variable Distributions",
    subtitle = "Kepler data set",
    y = "Value",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.text.x = element_blank()
  )

# distributions within each response category: one box per class in every facet
as.data.table(koi_df) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(y = value, x = koi_pdisposition, fill = koi_pdisposition)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  scale_x_discrete(limits = c("CANDIDATE", "FALSE POSITIVE")) +
  labs(
    title = "Box Plots of Quantitative Variable Distributions Within Categories of 'koi_pdisposition'",
    subtitle = "Kepler data set",
    y = "Value",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_blank()
  )
# observation: most variables are strongly right-skewed; the exceptions are
# 'koi_slogg' (left-skewed) and 'koi_kepmag' (~normal, logarithmic scale)
## histograms ------------------------------------------------------------------
# 15-bin histogram of every predictor, one free-scaled facet per variable
as.data.table(koi_df) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(x = value, fill = variable)) +
  geom_histogram(bins = 15, color = "black") +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  labs(
    title = "Histograms of Quantitative Variable Distributions",
    subtitle = "Kepler data set",
    y = "Counts",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none"
  )
# observation: most variables are strongly right-skewed; the exceptions are
# 'koi_slogg' (left-skewed) and 'koi_kepmag' (~normal, logarithmic scale)
## normal probability plots ----------------------------------------------------
# Q-Q points plus reference line for every predictor, one facet per variable
as.data.table(koi_df) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(sample = value, color = variable)) +
  stat_qq(size = 2, shape = 1) +
  stat_qq_line() +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  labs(
    title = "Normal Probability Plots of Quantitative Variable Distributions",
    subtitle = "Kepler data set",
    y = "Sample Quantiles",
    x = "Theoretical Quantiles"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none"
  )
# observation: most variables are strongly right-skewed; the exceptions are
# 'koi_slogg' (left-skewed) and 'koi_kepmag' (~normal, logarithmic scale)
## overlapping density plots ---------------------------------------------------
# Kernel density curve of every predictor, drawn separately for each response
# class and overlaid within one free-scaled facet per variable.
melt(as.data.table(koi_df), id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(x = value, color = koi_pdisposition)) +
  geom_density(lwd = 1) +
  labs(
    title = "Overlapping Density Plots of Quantitative Variable Distributions Within Categories of 'koi_pdisposition'",
    subtitle = "Kepler data set",
    # the vertical axis of a density plot is the estimated density, not the
    # variable's value (the value is on the x axis), so label it accordingly
    y = "Density",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  facet_wrap(~variable, ncol = 3, scales = "free")
# observation: 'koi_slogg', 'koi_steff', & 'koi_kepmag' appear to have ~equal
# variances between categories; 'koi_teq' appears to have unequal variances
# between categories; difficult to tell for other variables
## linear correlations between response and predictor variables ----------------
# scatterplot matrices: the response (column 1) against four predictors at a time
pairs(
  koi_df[c(1, 2:5)],
  main = "Scatterplot Matrices - Quantitative Variables (Kepler data set)"
)
# 'koi_teq' and 'koi_insol' are correlated, but correlation is not linear
pairs(
  koi_df[c(1, 6:9)],
  main = "Scatterplot Matrices - Quantitative Variables (Kepler data set)"
)
# 'koi_srad' correlated with 'koi_slogg', but correlation is not linear
pairs(
  koi_df[c(1, 10:13)],
  main = "Scatterplot Matrices - Quantitative Variables (Kepler data set)"
)

# correlation of every column with the response, after coding the response as
# 1 (CANDIDATE) / 0 (otherwise); only the first column of the matrix is shown
# observation: no strong linear correlation between response and any predictors
cor(
  koi_df %>%
    mutate(koi_pdisposition = ifelse(koi_pdisposition == "CANDIDATE", 1, 0))
)[, 1]

# correlations between predictor variables (response column dropped)
corrplot(
  cor(koi_df[, -1]),
  method = "color",      # squares instead of circles
  type = "upper",        # upper half only
  addCoef.col = "black", # print correlation coefficient values in black
  order = "hclust",      # hierarchical clustering order
  tl.col = "black",      # text label color
  tl.srt = 45,           # text label rotation
  tl.cex = 0.75,         # text label size
  diag = FALSE,          # remove correlation coefficients from principal diagonal
  number.cex = 0.7       # correlation coefficient text size (cex parameter)
)
# observation: some linear correlations ('koi_slogg' & 'koi_srad', 'koi_prad' &
# 'koi_impact'), but all correlation coefficient magnitudes < 0.7, so won't
# eliminate any predictors
## ---- Variable Transformation ------------------------------------------------
# the strong right skew seen in most predictors suggests a transformation;
# print per-column summaries to check value ranges first
summary(koi_df)

# No variable has negative values, but some contain zeroes, so ln(x + 1) is
# applied. Three columns are left as they are: 'koi_kepmag' (already
# approximately normal), 'koi_slogg' (left-skewed), and 'koi_pdisposition'
# (binary response).
koi_transform <-
  koi_df %>%
  mutate_at(
    vars(-c(koi_pdisposition, koi_slogg, koi_kepmag)),
    list(~ log1p(.))
  )
## transformed variable boxplots -----------------------------------------------
# overall distributions: transformed data melted to long form, one facet per variable
as.data.table(koi_transform) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(variable, value) %>%
  ggplot(aes(y = value, x = variable, fill = variable)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  labs(
    title = "Box Plots of Quantitative Variable Distributions",
    subtitle = "Kepler data set",
    y = "Value",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.text.x = element_blank()
  )

# distributions within each response category: one box per class in every facet
as.data.table(koi_transform) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(y = value, x = koi_pdisposition, fill = koi_pdisposition)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  scale_x_discrete(limits = c("CANDIDATE", "FALSE POSITIVE")) +
  labs(
    title = "Box Plots of Quantitative Variable Distributions Within Categories of 'koi_pdisposition'",
    subtitle = "Kepler data set",
    y = "Value",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_blank()
  )
# observations:
# - log(x+1) transformation produces more-symmetrical distributions to some
#   degree for all variables
# - no obvious differences in distribution by response category for any variable
## transformed variable histograms ---------------------------------------------
# 15-bin histogram of every transformed predictor, one free-scaled facet each
as.data.table(koi_transform) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(x = value, fill = variable)) +
  geom_histogram(bins = 15, color = "black") +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  labs(
    title = "Histograms of Quantitative Variable Distributions",
    subtitle = "Kepler data set",
    y = "Counts",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none"
  )
# observations:
# - log(x+1) transformation produces more-symmetrical distributions to some
#   degree for all variables
# - some variables still very right-skewed ('koi_impact', 'koi_prad', 'koi_srad')
## transformed variable normal probability plots -------------------------------
# Q-Q points plus reference line for every transformed predictor, one facet each
as.data.table(koi_transform) %>%
  melt(id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(sample = value, color = variable)) +
  stat_qq(size = 2, shape = 1) +
  stat_qq_line() +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  labs(
    title = "Normal Probability Plots of Quantitative Variable Distributions",
    subtitle = "Kepler data set",
    y = "Sample Quantiles",
    x = "Theoretical Quantiles"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none"
  )
# observations:
# - log(x+1) transformation produces more-symmetrical distributions to some
#   degree for all variables
# - some variables still very right-skewed ('koi_impact', 'koi_prad', 'koi_srad')
## transformed variable overlapping density plots ------------------------------
# Kernel density curve of every transformed predictor, drawn separately for each
# response class and overlaid within one free-scaled facet per variable.
melt(as.data.table(koi_transform), id.vars = "koi_pdisposition") %>%
  select(koi_pdisposition, variable, value) %>%
  ggplot(aes(x = value, color = koi_pdisposition)) +
  geom_density(lwd = 1) +
  labs(
    title = "Overlapping Density Plots of Quantitative Variable Distributions Within Categories of 'koi_pdisposition'",
    subtitle = "Kepler data set",
    # the vertical axis of a density plot is the estimated density, not the
    # variable's value (the value is on the x axis), so label it accordingly
    y = "Density",
    x = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  facet_wrap(~variable, ncol = 3, scales = "free")
# observation: after transformation, unequal distributions by response category
# become visible for some variables ('koi_period', 'koi_depth', 'koi_insol',
# etc.); distinct classes may help with model predictions
## linear correlations between response and transformed predictor variables ----
# scatterplot matrices: the response (column 1) against four predictors at a time
pairs(
  koi_transform[c(1, 2:5)],
  main = "Scatterplot Matrices - Quantitative Variables (Kepler data set)"
)
# transformed 'koi_teq' and 'koi_insol' linearly correlated
pairs(
  koi_transform[c(1, 6:9)],
  main = "Scatterplot Matrices - Quantitative Variables (Kepler data set)"
)
# transformed 'koi_srad' and 'koi_slogg' linearly correlated
pairs(
  koi_transform[c(1, 10:13)],
  main = "Scatterplot Matrices - Quantitative Variables (Kepler data set)"
)

# correlation of every column with the response, after coding the response as
# 1 (CANDIDATE) / 0 (otherwise); only the first column of the matrix is shown
# observation: no strong linear correlation between response and any predictors
cor(
  koi_transform %>%
    mutate(koi_pdisposition = ifelse(koi_pdisposition == "CANDIDATE", 1, 0))
)[, 1]

# correlations between transformed predictor variables (response column dropped)
corrplot(
  cor(koi_transform[, -1]),
  method = "color",      # squares instead of circles
  type = "upper",        # upper half only
  addCoef.col = "black", # print correlation coefficient values in black
  order = "hclust",      # hierarchical clustering order
  tl.col = "black",      # text label color
  tl.srt = 45,           # text label rotation
  tl.cex = 0.75,         # text label size
  diag = FALSE,          # remove correlation coefficients from principal diagonal
  number.cex = 0.7       # correlation coefficient text size (cex parameter)
)
# observation: transformed predictor variables have much higher linear
# correlations, both positive and negative; variance inflation could be an
# issue for linear models
## test for multivariate normality of predictors within each response category ----
# predictor-only subsets (response column dropped) for each class, taken from
# both the transformed and the untransformed data
x_cd_transform <- koi_transform[koi_transform$koi_pdisposition == "CANDIDATE", -1]
x_fp_transform <- koi_transform[koi_transform$koi_pdisposition == "FALSE POSITIVE", -1]
x_cd_notransform <- koi_df[koi_df$koi_pdisposition == "CANDIDATE", -1]
x_fp_notransform <- koi_df[koi_df$koi_pdisposition == "FALSE POSITIVE", -1]

# run the "hz" multivariate normality test on each subset and print only the
# multivariate result
mvn(x_cd_transform, mvnTest = "hz")$multivariateNormality
mvn(x_fp_transform, mvnTest = "hz")$multivariateNormality
mvn(x_cd_notransform, mvnTest = "hz")$multivariateNormality
mvn(x_fp_notransform, mvnTest = "hz")$multivariateNormality
# observation: results show predictor variables are not multivariate normal;
# linear discriminant analysis and quadratic discriminant analysis would not be
# appropriate for model selection