-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChurn Reduction-Final-Final.R
440 lines (295 loc) · 17.2 KB
/
Churn Reduction-Final-Final.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
######################################### READING DATA #################################################
# Locate the input files without touching the session.
# The script previously began with rm(list = ls()) and setwd(<absolute path>);
# both silently alter the caller's R session. The data directory is now
# resolved explicitly: set the CHURN_DATA_DIR environment variable to override
# the original author's default location.
churn_data_dir <- Sys.getenv(
  "CHURN_DATA_DIR",
  unset = "/Users/akash/Desktop/Project 1 -Churn Reduction"
)
if (!dir.exists(churn_data_dir)) {
  stop(
    "Data directory not found: '", churn_data_dir,
    "'. Set the CHURN_DATA_DIR environment variable.",
    call. = FALSE
  )
}

# Load CSV files (strings are kept as character; they are encoded later).
train_data <- read.csv(file.path(churn_data_dir, "Train_data.csv"),
                       header = TRUE, stringsAsFactors = FALSE)
test_data <- read.csv(file.path(churn_data_dir, "Test_data.csv"),
                      header = TRUE, stringsAsFactors = FALSE)

# Getting the dimensions of data
dim(train_data)
dim(test_data)

# Retrieving the raw column names of train and test data.
colnames(train_data)
colnames(test_data)

# Single source of truth for the underscore-separated column names applied to
# both data sets (previously the same 21-name vector was written out twice).
churn_columns <- c(
  "state", "account_length", "area_code", "phone_number",
  "international_plan", "voice_mail_plan", "number_vmail_messages",
  "total_day_minutes", "total_day_calls", "total_day_charge",
  "total_eve_minutes", "total_eve_calls", "total_eve_charge",
  "total_night_minutes", "total_night_calls", "total_night_charge",
  "total_intl_minutes", "total_intl_calls", "total_intl_charge",
  "number_customer_service_calls", "Churn"
)

# Renaming is positional, so fail loudly if a file does not have exactly the
# expected number of columns instead of producing NA or shifted names.
stopifnot(
  ncol(train_data) == length(churn_columns),
  ncol(test_data) == length(churn_columns)
)
colnames(train_data) <- churn_columns
colnames(test_data) <- churn_columns
################################### CHECKING FOR DATA IMBALANCE ########################################
# Absolute number of observations per Churn label.
# (Original author's run: train 2850 negative / 483 positive,
#  test 1443 negative / 224 positive.)
train_label_counts <- table(train_data$Churn)
test_label_counts <- table(test_data$Churn)
train_label_counts
test_label_counts

# Relative class frequencies derived from the counts above.
prop.table(train_label_counts)
prop.table(test_label_counts)

# Bar charts of the label distribution, one per data set.
library(ggplot2)

# Builds the base plot (Churn on the x axis plus a title) for one data set.
churn_bar_base <- function(df, title) {
  ggplot(df, aes(x = Churn)) + ggtitle(title)
}

# For train data
pl <- churn_bar_base(train_data, "Train data distribution")
print(pl + geom_bar(fill = "blue"))

# For test data
pl <- churn_bar_base(test_data, "Test data distribution")
print(pl + geom_bar(fill = "blue"))
"
INFERENCE :
It is evident from the above visualisation that both the train and the test data are imbalanced.
- In train_data :
True : 483
False : 2850
- In test_data :
True : 224
False : 1443
- The negative label (False) accounts for about 85.5 percent of the train data, while the positive label (True)
accounts for only about 14.5 percent.
- The negative label (False) accounts for about 86.6 percent of the test data, while the positive label (True)
accounts for only about 13.4 percent.
"
#################################### CHECKING FOR MISSING VALUES #########################################
# Total number of NA cells, accumulated column by column.
count_missing <- function(df) {
  sum(vapply(df, function(column) sum(is.na(column)), integer(1)))
}

# Missing values in train data (original author's run printed 0)
count_missing(train_data)
# Missing values in test data (original author's run printed 0)
count_missing(test_data)
"
INFERENCE :
Neither the train data nor the test data contains any missing values.
"
############################################ DATA STATISTICS ################################################
# Descriptive statistics for every column of both data sets.
library(psych)
describe(x = train_data)
describe(x = test_data)
############################################ ADDING EXTRA VARIABLES TO THE DATA ##############################
"
It seems like, we can add few more features of our own to add some extra information to our dataset like
'total minutes' , 'total_calls' and 'total charge' that might add some extra information to the dataset.
"
# For each metric, the new total_<metric> column is the element-wise sum of the
# day, evening, night and international columns, added in that order.
usage_periods <- c("day", "eve", "night", "intl")
for (metric in c("minutes", "calls", "charge")) {
  part_columns <- paste0("total_", usage_periods, "_", metric)
  total_column <- paste0("total_", metric)
  train_data[[total_column]] <- Reduce(`+`, train_data[part_columns])
  test_data[[total_column]] <- Reduce(`+`, test_data[part_columns])
}
# Three columns were appended to each data set
# (original author's run: 3333 x 24 for train, 1667 x 24 for test).
dim(train_data)
dim(test_data)
################################# TRANSFORMING CATEGORICAL VARIABLES INTO NUMERIC ##############################
"
In the train and test data given to us, 'international_plan', 'voice_mail_plan' and 'Churn' are categorical features. So, we convert them into numerical
features by creating dummy (indicator) variables for each of them, which is also called one-hot encoding.
"
####### Shared encoder for train and test data
# Replaces the three binary categorical columns with 0/1 integer indicators
# and drops identifier-like columns.
#
# The previous implementation created dummies with fastDummies and
# remove_first_dummy = TRUE, then renamed "voice_mail_plan_ no" for the train
# data but "voice_mail_plan_ yes" for the test data. Which dummy survives
# depends on how the package orders the levels, so the rename could miss the
# column or give the same feature opposite meanings in train and test.
# Encoding explicitly makes 1 always mean "yes" / "True" in both data sets.
#
# df      : data frame with the renamed raw columns.
# returns : df without state, phone_number and account_length, with
#           international_plan, voice_mail_plan and Churn (in that order) as
#           the last three columns, each an integer 0/1 indicator.
encode_churn_data <- function(df) {
  # Raw labels carry surrounding whitespace and, for Churn, a trailing "."
  # (e.g. " yes", " True."); normalise before comparing.
  normalise <- function(x) tolower(sub("\\.$", "", trimws(as.character(x))))

  to_indicator <- function(column, positive, negative) {
    values <- normalise(df[[column]])
    unexpected <- setdiff(unique(values), c(positive, negative))
    if (length(unexpected) > 0) {
      stop(
        "Unexpected value(s) in column '", column, "': ",
        paste(unexpected, collapse = ", "),
        call. = FALSE
      )
    }
    as.integer(values == positive)
  }

  international_plan <- to_indicator("international_plan", "yes", "no")
  voice_mail_plan <- to_indicator("voice_mail_plan", "yes", "no")
  churn <- to_indicator("Churn", "true", "false")

  # state, phone_number and account_length are dropped: per the original
  # author they carry no extra information and would inflate the feature
  # space if one-hot encoded. The raw categorical columns are dropped too and
  # re-appended as indicators so that Churn stays the last column.
  dropped <- c("state", "phone_number", "account_length",
               "international_plan", "voice_mail_plan", "Churn")
  df <- df[, setdiff(colnames(df), dropped), drop = FALSE]
  df$international_plan <- international_plan
  df$voice_mail_plan <- voice_mail_plan
  df$Churn <- churn
  df
}

# plyr is no longer used in this section; it stays attached in case code
# elsewhere relies on it.
library("plyr")

####### Train Data
train_data <- encode_churn_data(train_data)
# Preview only the first rows; rendering the whole table floods the console.
knitr::kable(head(train_data))

####### Test Data
test_data <- encode_churn_data(test_data)
knitr::kable(head(test_data))
####################################### SPLITTING THE DATA IN TRAIN AND TEST SET ############################################
# Predictors are every column except the label; the label is kept as a
# one-column data frame.
train_predictors <- setdiff(colnames(train_data), "Churn")
X_train <- train_data[train_predictors]
y_train <- train_data["Churn"]

test_predictors <- setdiff(colnames(test_data), "Churn")
X_test <- test_data[test_predictors]
y_test <- test_data["Churn"]

# Dimensions of the train split
dim(X_train)
dim(y_train)
# Dimensions of the test split
dim(X_test)
dim(y_test)
####################################################### DATA VISUALISATION ####################################################
################## UNIVARIATE ANALYSIS ###################
#### Box-Plot Visualisation of Individual Features
# One figure per group of columns (total_intl_minutes appears in two groups,
# as in the original layout).
overview_panels <- list(
  c("total_day_minutes", "total_day_calls", "total_day_charge",
    "total_eve_minutes", "total_eve_calls", "total_eve_charge"),
  c("total_night_minutes", "total_night_calls", "total_night_charge",
    "total_intl_minutes"),
  c("total_intl_minutes", "total_intl_calls", "total_intl_charge",
    "number_customer_service_calls", "total_minutes", "total_calls",
    "total_charge")
)
for (panel_columns in overview_panels) {
  boxplot(train_data[, panel_columns])
}

#### Box-Plot Visualisation of Features based on label
# One figure per feature, split by Churn; the feature name is the title.
label_boxplot_features <- c(
  "total_day_minutes", "total_day_calls", "total_day_charge",
  "total_eve_minutes", "total_eve_calls", "total_eve_charge",
  "total_night_minutes", "total_night_calls", "total_night_charge",
  "total_intl_minutes", "total_intl_calls", "total_intl_charge",
  "number_customer_service_calls",
  "total_minutes", "total_calls", "total_charge"
)
for (feature in label_boxplot_features) {
  boxplot(
    reformulate("Churn", response = feature),
    data = train_data,
    col = c("gold", "darkgreen"),
    main = feature,
    xlab = "Churn"
  )
}
##### Violin-Plot Visualisation of Individual Features
library(vioplot)
# Each entry is one figure; the columns are passed to vioplot() as separate
# unnamed positional vectors, exactly as in an explicit call.
violin_panels <- list(
  c("total_day_minutes", "total_day_calls", "total_day_charge"),
  c("total_eve_minutes", "total_eve_calls", "total_eve_charge"),
  c("total_night_minutes", "total_night_calls", "total_night_charge"),
  c("total_intl_minutes", "total_intl_calls", "total_intl_charge"),
  c("total_minutes", "total_calls", "total_charge",
    "number_customer_service_calls")
)
for (panel_columns in violin_panels) {
  do.call(vioplot, unname(as.list(train_data[panel_columns])))
}
##### Violin-Plot Visualisation of Individual Features based on label
library(ggpubr)

# Violin plot (with an embedded white box plot) of one feature of train_data,
# split and filled by the Churn label.
#
# y       : name of the train_data column to plot, as a single string.
# returns : the plot object produced by ggpubr::ggviolin().
#
# The earlier version passed the literal string "y" to ggviolin(), so the
# argument was ignored and a column called "y" was looked up instead.
viplot <- function(y) {
  stopifnot(is.character(y), length(y) == 1L)
  if (!(y %in% colnames(train_data))) {
    stop("Column '", y, "' not found in train_data", call. = FALSE)
  }
  ggviolin(
    train_data, x = "Churn", y = y,
    fill = "Churn", palette = c("#00AFBB", "#E7B800"),
    add = "boxplot", add.params = list(fill = "white")
  )
}

# Features to plot, in the original order. "total_intl_charges" was a typo
# for the existing column "total_intl_charge".
violin_label_features <- c(
  "total_day_minutes", "total_day_calls", "total_day_charge",
  "total_night_minutes", "total_night_calls", "total_night_charge",
  "total_eve_calls", "total_eve_minutes", "total_eve_charge",
  "total_intl_minutes", "total_intl_calls", "total_intl_charge",
  "number_customer_service_calls"
)
for (feature in violin_label_features) {
  # Explicit print(): plots are not auto-printed inside a loop.
  print(viplot(feature))
}
##### Normality check
# Normal Q-Q plot for each numeric feature, one figure per column.
qq_features <- c(
  "total_day_minutes", "total_day_calls", "total_day_charge",
  "total_eve_minutes", "total_eve_calls", "total_eve_charge",
  "total_night_minutes", "total_night_calls", "total_night_charge",
  "total_intl_minutes", "total_intl_calls", "total_intl_charge",
  "number_customer_service_calls",
  "total_charge", "total_calls", "total_minutes"
)
for (feature in qq_features) {
  qqnorm(train_data[[feature]])
}
################################## MULTIVARIATE ANALYSIS ####################################
#### PCA Visualisation
library(ggfortify)
autoplot(prcomp(train_data), data = train_data, colour = 'Churn')
#### t-SNE Visualisation
library(Rtsne)
require(tsne)
train_matrix <- as.matrix(train_data[,1:20])
set.seed(42) # Set a seed if you want reproducible results
tsne_out <- Rtsne(train_matrix) # Run TSNE
# Show the objects in the 2D tsne representation
plot(tsne_out$Y,col=train_data$Churn)
library(ggplot2)
tsne_plot <- data.frame(x = tsne_out$Y[,1], y = tsne_out$Y[,2], col = train_data$Churn)
ggplot(tsne_plot) + geom_point(aes(x=x, y=y, color=col))
################################################ DATA PREPARATION AND CLEANING #############################################
#### Scaling the data
# Standardise every predictor to zero mean and unit variance.
# The centre and spread are estimated on the TRAINING data only and then
# applied to both data sets. Scaling the test data with its own mean and sd
# (as before) puts train and test on different scales and uses test-set
# statistics during preprocessing.
cnames <- colnames(X_train)
train_center <- vapply(train_data[cnames], mean, numeric(1))
train_spread <- vapply(train_data[cnames], sd, numeric(1))

# A constant (or non-finite) column cannot be standardised; fail early
# instead of silently filling the column with NaN/Inf.
bad_columns <- cnames[!is.finite(train_spread) | train_spread == 0]
if (length(bad_columns) > 0) {
  stop(
    "Cannot scale column(s) with zero or undefined standard deviation: ",
    paste(bad_columns, collapse = ", "),
    call. = FALSE
  )
}

# Transform the test data first so both use the same unscaled-train statistics.
test_data[cnames] <- as.data.frame(
  scale(test_data[cnames], center = train_center, scale = train_spread)
)
train_data[cnames] <- as.data.frame(
  scale(train_data[cnames], center = train_center, scale = train_spread)
)
#### Feature Selection
# Correlation plot of all predictors (pie glyphs in the upper panel)
library(corrgram)
corrgram(
  train_data[, cnames],
  order = FALSE,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Correlation Plot"
)
# Columns removed as correlated with other features; the same set is dropped
# from both data sets.
correlated_features <- c(
  "total_charge", "total_intl_charge", "total_night_charge",
  "total_day_charge", "total_eve_charge", "voice_mail_plan"
)
train_data <- train_data[, -match(correlated_features, colnames(train_data)), drop = FALSE]
test_data <- test_data[, -match(correlated_features, colnames(test_data)), drop = FALSE]
dim(train_data)
dim(test_data)
######################################################## MACHINE LEARNING MODELS #############################################################
library(sigmoid)
# Converting the target variable into a factor. Explicit levels guarantee that
# train and test share the same level set ("0", "1").
train_data$Churn <- factor(train_data$Churn, levels = c(0, 1))
test_data$Churn <- factor(test_data$Churn, levels = c(0, 1))
########## NAIVE BAYES ###############
library(e1071)
# Select predictors by name instead of relying on Churn being column 15.
nb_features <- setdiff(colnames(train_data), "Churn")
classifier <- naiveBayes(x = train_data[nb_features], y = train_data$Churn)
y_pred <- predict(classifier, newdata = test_data[nb_features])
# Rows = predicted class, columns = actual class.
conf_matrix <- table(Predicted = y_pred, Actual = test_data$Churn)
# caret expects (data = predictions, reference = truth); the arguments were
# previously swapped, which exchanges sensitivity/specificity with the
# predictive values. positive = "1" reports the metrics for the churn class.
caret::confusionMatrix(data = y_pred, reference = test_data$Churn, positive = "1")
################# KNN #################
library(tidyverse)
getNamespace("grDevices")
library(rpart)
library(class)
# Select predictors by name instead of the hard-coded ranges 1:14 / 15.
knn_features <- setdiff(colnames(train_data), "Churn")
# class::knn breaks distance ties at random; fix the seed so repeated runs
# give the same predictions.
set.seed(42)
KNN_predictions <- knn(
  train = train_data[, knn_features],
  test = test_data[, knn_features],
  cl = train_data$Churn,
  k = 3,
  prob = TRUE
)
# Predicted classes; with prob = TRUE the proportion of votes for the winning
# class is attached as attr(KNN_predictions, "prob").
KNN_predictions
# Rows = predicted class, columns = actual class.
conf_matrix <- table(Predicted = KNN_predictions, Actual = test_data$Churn)
# positive = "1" reports sensitivity/specificity for the churn class.
caret::confusionMatrix(data = KNN_predictions, reference = test_data$Churn, positive = "1")
########## Logistic Regression ########
# Binomial GLM of Churn on all remaining predictors.
logit_model <- glm(Churn ~ ., data = train_data, family = "binomial")
summary(logit_model)
# Predicted probability of the second factor level (churn = 1) per test row.
logit_predictions <- predict(logit_model, newdata = test_data, type = "response")
logit_predictions # Probability scores for the test data points
# Classify with a 0.5 probability cut-off.
logit_predictions <- ifelse(logit_predictions > 0.5, 1, 0)
# Rows = predicted, columns = actual (the orientation caret expects; the
# table used to be transposed). Fixing the levels on both sides keeps the
# table 2 x 2 even if only one class is ever predicted, which would otherwise
# make confusionMatrix() fail.
confMatrix <- table(
  Predicted = factor(logit_predictions, levels = c(0, 1)),
  Actual = factor(test_data$Churn, levels = c(0, 1))
)
caret::confusionMatrix(confMatrix, positive = "1")
########### Decision Trees ############
library(rpart)
# Classification tree of Churn on all remaining predictors; method = "class"
# makes the intended model type explicit.
DT_model <- rpart(Churn ~ ., data = train_data, method = "class")
summary(DT_model)
# Select predictors by name instead of relying on Churn being column 15.
dt_features <- setdiff(colnames(test_data), "Churn")
# Per-class probabilities for the test data points.
DT_predictions_proba <- predict(DT_model, newdata = test_data[dt_features])
DT_predictions_proba
# Hard class predictions.
DT_predictions <- predict(DT_model, newdata = test_data[dt_features], type = "class")
# Rows = predicted, columns = actual (the orientation caret expects; the
# table used to be transposed).
confMatrix <- table(Predicted = DT_predictions, Actual = test_data$Churn)
caret::confusionMatrix(confMatrix, positive = "1")
############## Random Forest #############
library(randomForest)
# Select predictors by name instead of relying on Churn being column 15.
rf_features <- setdiff(colnames(train_data), "Churn")
# Random forests are stochastic (bootstrap samples and random feature
# subsets); fix the seed so the reported results are reproducible.
set.seed(42)
RF_model <- randomForest(
  x = train_data[rf_features],
  y = train_data$Churn,
  importance = TRUE,
  ntree = 500
)
# print() shows the OOB error estimate and confusion matrix; summary() on a
# randomForest object only lists the lengths of its components.
print(RF_model)
RF_predictions <- predict(RF_model, newdata = test_data[rf_features], type = "response")
# Rows = predicted, columns = actual (the orientation caret expects; the
# table used to be transposed).
confMatrix <- table(Predicted = RF_predictions, Actual = test_data$Churn)
caret::confusionMatrix(confMatrix, positive = "1")
################# XGBoost ################
# xgboost needs a numeric 0/1 label. Converting through the label text works
# whether Churn is currently a factor ("0"/"1") or already numeric, so this
# section can be re-run safely (as.integer(factor) - 1 is not idempotent).
train_data$Churn <- as.numeric(as.character(train_data$Churn))
test_data$Churn <- as.numeric(as.character(test_data$Churn))
library(xgboost)
# Select predictors by name instead of relying on Churn being column 15.
xgb_features <- setdiff(colnames(train_data), "Churn")
XG_model <- xgboost(
  data = as.matrix(train_data[xgb_features]),
  label = train_data$Churn,
  nrounds = 10,
  objective = "binary:logistic"
)
# The objective is stored in the trained model. The stray
# `objective = logistic` argument previously passed to predict() referred to
# an unrelated object (presumably sigmoid::logistic) and has been removed.
XG_pred <- predict(XG_model, newdata = as.matrix(test_data[xgb_features]))
XG_pred # Probability scores for the test data points
# Classify with a 0.5 probability cut-off.
XG_pred <- ifelse(XG_pred > 0.5, 1, 0)
# Rows = predicted, columns = actual (the orientation caret expects); fixed
# levels keep the table 2 x 2 even if only one class is predicted.
confMatrix <- table(
  Predicted = factor(XG_pred, levels = c(0, 1)),
  Actual = factor(test_data$Churn, levels = c(0, 1))
)
caret::confusionMatrix(confMatrix, positive = "1")
###########################################----------------------------------------################################################
"XGBoost and random forest give the best results out of all these machine learning models"