# Web-page residue captured along with this file (not part of the program):
# - Notifications
# You must be signed in to change notification settings - Fork 1
# / CreditCardFraud.r
# 168 lines (134 loc) · 4.58 KB
# / CreditCardFraud.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# The libraries below must be loaded before the rest of the script will run.
# Link to the dataset: https://www.kaggle.com/ntnu-testimon/paysim1
# The dataset is approximately 400+ MB.
# The dataset is hosted on Kaggle.
# This script contains both the data preprocessing and the model development steps.
library(plyr)
library(tidyverse)
library(caret)
library(GGally)
library(stringr)
library(rattle)
library(pROC)
library(ROCR)
# Read data ----
# Seed the RNG once, before any of the random sampling done further down.
set.seed(317)
fraud_raw <- read_csv(file = "PS_20174392719_1491204439457_log.csv")
glimpse(fraud_raw)

# Keep only the first character of each account name, then drop the full names.
fraud_df <- fraud_raw %>%
  mutate(
    name_orig_first = str_sub(nameOrig, start = 1, end = 1),
    name_dest_first = str_sub(nameDest, start = 1, end = 1)
  ) %>%
  select(-c(nameOrig, nameDest))

# nameDest: list the distinct first characters, store the column as a factor,
# and count the rows per level.
unique(fraud_df[["name_dest_first"]])
fraud_df[["name_dest_first"]] <- as.factor(fraud_df[["name_dest_first"]])
table(fraud_df[["name_dest_first"]])

# nameOrig: list the distinct first characters.
unique(fraud_df[["name_orig_first"]])

# Drop name_orig_first and isFlaggedFraud, and move isFraud, type and step to
# the front (later code relies on isFraud being the first column).
fraud_df2 <- fraud_df %>%
  select(-c(name_orig_first, isFlaggedFraud)) %>%
  select(isFraud, type, step, everything())
glimpse(fraud_df2)

# Convert type and isFraud to factors; isFraud is relabelled 0 -> "No",
# 1 -> "Yes".
fraud_df2 <- fraud_df2 %>%
  mutate(
    type = as.factor(type),
    isFraud = recode_factor(as.factor(isFraud), `0` = "No", `1` = "Yes")
  )
summary(fraud_df2)
# Fraud transactions ----
# Summarise only the rows labelled as fraud.
fraud_trans <- filter(fraud_df2, isFraud == "Yes")
summary(fraud_trans)

# Reducing dataset ----
# Keep CASH_OUT / TRANSFER rows whose destination name starts with "C" and
# whose amount is at most 10,000,000. After that filter name_dest_first holds a
# single value, so the column is dropped.
fraud_df3 <- fraud_df2 %>%
  filter(
    type %in% c("CASH_OUT", "TRANSFER"),
    name_dest_first == "C",
    amount <= 10000000
  ) %>%
  select(-name_dest_first)
summary(fraud_df3)
# Sample dataset ----
# Build a balanced sample: every fraud row plus an equally sized random draw of
# non-fraud rows. The draw size is taken from the data rather than a
# hard-coded 8213 (presumably the fraud row count), so the two classes stay
# the same size if the filters applied to fraud_df3 ever change.
is_fraud <- fraud_df3 %>%
  filter(isFraud == "Yes")
not_fraud <- fraud_df3 %>%
  filter(isFraud == "No") %>%
  sample_n(nrow(is_fraud))
full_sample <- rbind(not_fraud, is_fraud) %>%
  arrange(step)

# Graphs ----
# Histogram of step for the balanced sample, outlined by class.
ggplot(full_sample, aes(x = step, col = isFraud)) +
  geom_histogram(bins = 743)
# Only fraud
ggplot(is_fraud, aes(x = step)) +
  geom_histogram(bins = 743)
# Pairwise plots and a numeric summary of the balanced sample.
ggpairs(full_sample)
summary(full_sample)
# Preprocessing ----
# Fit a center / scale / near-zero-variance transform on every column except
# the first one (isFraud), apply it, then put isFraud back in front.
# NOTE(review): the transform is fitted on all of fraud_df3, i.e. before the
# train/test split made later in the script.
preproc_model <- preProcess(
  fraud_df3[-1],
  method = c("center", "scale", "nzv")
)
fraud_preproc <- predict(preproc_model, newdata = fraud_df3[-1])
fraud_pp_w_result <- cbind(isFraud = fraud_df3[["isFraud"]], fraud_preproc)
summary(fraud_pp_w_result)

# High correlation ----
# Report the numeric columns whose pairwise correlation exceeds 0.75.
fraud_numeric <- select(fraud_pp_w_result, -c(isFraud, type))
high_cor_cols <- findCorrelation(
  cor(fraud_numeric),
  cutoff = 0.75,
  verbose = TRUE,
  names = TRUE,
  exact = TRUE
)
# newbalanceDest is dropped by name here; high_cor_cols itself is not used.
high_cor_removed <- select(fraud_pp_w_result, -newbalanceDest)

# Check the remaining numeric columns for exact linear dependencies.
fraud_numeric <- select(high_cor_removed, -c(isFraud, type))
comboInfo <- findLinearCombos(fraud_numeric)
comboInfo
# Modelling ----
# Rebuild the balanced sample from the preprocessed data: all fraud rows plus
# an equally sized random draw of non-fraud rows. The draw size follows the
# actual number of fraud rows instead of a hard-coded 8213, so the classes
# remain balanced if the upstream filtering changes.
model_df <- high_cor_removed
is_fraud <- model_df %>%
  filter(isFraud == "Yes")
not_fraud <- model_df %>%
  filter(isFraud == "No") %>%
  sample_n(nrow(is_fraud))
# To mix up the sample set, arrange by `step`
model_full_sample <- rbind(is_fraud, not_fraud) %>%
  arrange(step)
# Splitting ----
# Partition on isFraud: 75% of the rows go to train, the rest to test.
in_train <- createDataPartition(
  y = model_full_sample[["isFraud"]],
  p = 0.75,
  list = FALSE
)
train <- model_full_sample[in_train, ]
test <- model_full_sample[-in_train, ]
gc()

# Resampling setup: 10-fold cross-validation repeated 3 times, with class
# probabilities and the two-class summary function.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# A large non-fraud-only sample (100,000 rows) drawn from model_df.
# NOTE(review): it is drawn from the same pool as the training "No" rows, so
# the two may overlap.
big_no_sample <- sample_n(filter(model_df, isFraud == "No"), 100000)
# SVM ----
# Fit a radial-kernel SVM on the training set, selecting on ROC across 3
# candidate values of the cost parameter, and time the fit.
start_time <- Sys.time()
svm_model <- train(
  isFraud ~ .,
  data = train,
  method = "svmRadial", # radial kernel
  metric = "ROC",
  tuneLength = 3, # 3 values of the cost function
  trControl = control
)
end_time <- Sys.time()
difftime(end_time, start_time)
print(svm_model[["finalModel"]])
# confusionMatrix() takes the predictions as `data` and the true labels as
# `reference`. The arguments are named explicitly because passing them the
# other way round swaps sensitivity/specificity with the predictive values.

# Prediction on training set
svm_train_pred <- predict(svm_model, train)
confusionMatrix(
  data = svm_train_pred,
  reference = train$isFraud,
  positive = "Yes"
)

# Prediction on test set
svm_test_pred <- predict(svm_model, test)
confusionMatrix(
  data = svm_test_pred,
  reference = test$isFraud,
  positive = "Yes"
)

# Predict on the no-fraud dataset and time the prediction. The reference
# labels here are all "No", so this measures how often legitimate
# transactions are flagged as fraud.
start_time <- Sys.time()
svm_big_no_pred <- predict(svm_model, big_no_sample)
end_time <- Sys.time()
end_time - start_time
confusionMatrix(
  data = svm_big_no_pred,
  reference = big_no_sample$isFraud,
  positive = "Yes"
)
# ROC ----
# Class probabilities on the test set; the "Yes" column is the score used to
# build the ROC curve.
svm_probs <- predict(svm_model, newdata = test, type = "prob")
svm_ROC <- roc(
  response = test[["isFraud"]],
  predictor = svm_probs[["Yes"]],
  levels = levels(test[["isFraud"]])
)
plot(svm_ROC, col = "black")

# Area under curve
auc(svm_ROC)