-
Notifications
You must be signed in to change notification settings - Fork 0
/
classificationscript.R
72 lines (72 loc) · 3.61 KB
/
classificationscript.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
library(tidyr)
library(readr)
library(dplyr)
library(zoo)
library(caret)
# First look at the raw table. `train` must already exist in the session;
# nothing in this script creates it.
train
dim(train)
summary(train)

# Turn Name into a factor and summarise the table again.
class(train$Name)
train[["Name"]] <- as.factor(train[["Name"]])
summary(train)

# Apply the same factor conversion to both identifier columns, then
# summarise ID on its own.
train[c("ID", "Customer_ID")] <- lapply(
  train[c("ID", "Customer_ID")],
  as.factor
)
summary(train$ID)
# Remove stray '-' and '_' characters from columns that should hold numbers,
# then convert those columns to numeric. gsub() always returns character, so
# without the conversion later steps such as `Age > 80` or
# mean(Monthly_Balance) would operate on strings instead of numbers.

# Age loses its '-' characters first because the snapshot below is written at
# exactly this point in the cleaning sequence.
train$Age <- gsub("-", "", train$Age, fixed = TRUE)

# NOTE(review): this overwrites archive/train.csv with a partially cleaned
# table (only '-' removed from Age). Kept to preserve the script's existing
# side effect -- confirm whether a read_csv() was intended here instead.
write_csv(train, "archive/train.csv")

noisy_numeric_cols <- c(
  "Age", "Annual_Income", "Num_of_Loan",
  "Changed_Credit_Limit", "Outstanding_Debt", "Monthly_Balance"
)

# One pass with the character class [-_] is equivalent to removing '-' and
# then '_'. Removing '-' also discards the sign of any negative value.
# as.numeric() warns and yields NA for anything that is still not a number
# after the stray characters are removed.
train[noisy_numeric_cols] <- lapply(
  train[noisy_numeric_cols],
  function(x) as.numeric(gsub("[-_]", "", x))
)
# Drop the identifier columns together with the other columns that are
# removed at this stage, in a single select() call.
train <- select(
  train,
  -c(
    ID, Customer_ID, Name,
    Month, SSN, Monthly_Inhand_Salary, Type_of_Loan,
    Num_of_Delayed_Payment, Credit_History_Age, Amount_invested_monthly
  )
)

# Inspect what is left: column summaries, the storage class of Age, and the
# table itself in the data viewer.
summary(train)
class(train[["Age"]])
View(train)
# Impute implausible or missing values.
#
# na.locf() is always called with na.rm = FALSE: its default (na.rm = TRUE)
# drops NAs that have no neighbour to copy from, which shortens the vector
# and makes the column assignment fail. With na.rm = FALSE such NAs simply
# remain NA.

# Age: coerce to numeric so the threshold is a numeric comparison rather than
# a string comparison, blank out values above 80, then carry the previous
# observation forward.
train$Age <- as.numeric(train$Age)
train$Age <- ifelse(train$Age > 80, NA_real_, train$Age)
train$Age <- na.locf(train$Age, na.rm = FALSE)

# Monthly_Balance: coerce to numeric so mean() returns a number (on a
# character vector it returns NA), then fill missing entries with the mean.
train$Monthly_Balance <- as.numeric(train$Monthly_Balance)
train$Monthly_Balance[is.na(train$Monthly_Balance)] <-
  mean(train$Monthly_Balance, na.rm = TRUE)

# Num_Credit_Inquiries: mean imputation.
# NOTE(review): assumes this column is already numeric -- confirm.
train$Num_Credit_Inquiries[is.na(train$Num_Credit_Inquiries)] <-
  mean(train$Num_Credit_Inquiries, na.rm = TRUE)

# Occupation: gsub() with an NA replacement turns every element that matches
# the placeholder into NA; those are then filled from the previous row.
train$Occupation <- gsub("_______", NA, train$Occupation)
train$Occupation <- na.locf(train$Occupation, na.rm = FALSE)

# Credit_Mix: entries containing '_' become NA and are filled from the next
# row (fromLast = TRUE).
train$Credit_Mix <- gsub("_", NA, train$Credit_Mix)
train$Credit_Mix <- na.locf(train$Credit_Mix, fromLast = TRUE, na.rm = FALSE)
# Replace text labels with integer codes.

# Applies gsub() once per entry of `codes`, in order, substituting each label
# (the element name, used as the pattern) with its code (the element value).
# Returns the resulting character vector.
swap_labels <- function(values, codes) {
  for (label in names(codes)) {
    values <- gsub(label, codes[[label]], values)
  }
  values
}

# Credit_Mix: Bad / Standard / Good -> 0 / 1 / 2.
train$Credit_Mix <- swap_labels(
  train$Credit_Mix,
  c("Bad" = "0", "Standard" = "1", "Good" = "2")
)
class(train$Credit_Mix)
train$Credit_Mix <- as.integer(train$Credit_Mix)

# Payment_of_Min_Amount: show the label counts, then NM / No / Yes -> 0 / 1 / 2.
train_copy <- as.factor(train$Payment_of_Min_Amount)
summary(train_copy)
train$Payment_of_Min_Amount <- as.integer(swap_labels(
  train$Payment_of_Min_Amount,
  c("NM" = "0", "No" = "1", "Yes" = "2")
))

# Payment_Behaviour: show the label counts, then map the placeholder string
# to 0 and the six spend/payment labels to 1..6.
train_copy <- as.factor(train$Payment_Behaviour)
summary(train_copy)
train$Payment_Behaviour <- as.integer(swap_labels(
  train$Payment_Behaviour,
  c(
    "!@9#%8" = "0",
    "Low_spent_Small_value_payments" = "1",
    "Low_spent_Medium_value_payments" = "2",
    "Low_spent_Large_value_payments" = "3",
    "High_spent_Small_value_payments" = "4",
    "High_spent_Medium_value_payments" = "5",
    "High_spent_Large_value_payments" = "6"
  )
))

# Credit_Score: show the label counts, then Poor / Standard / Good -> 0 / 1 / 2.
train_copy <- as.factor(train$Credit_Score)
summary(train_copy)
train$Credit_Score <- as.integer(swap_labels(
  train$Credit_Score,
  c("Poor" = "0", "Standard" = "1", "Good" = "2")
))
# One-hot encode Occupation with caret::dummyVars(), append the indicator
# columns to `train`, then drop the original Occupation column along with
# Changed_Credit_Limit.
#
# The formula is written inline: a bare `formula` with no such variable
# defined resolves to the stats::formula() function, not to a model formula.
one_hot <- predict(dummyVars(~ Occupation, data = train), newdata = train)
train <- cbind(train, one_hot)
train <- select(train, -Occupation, -Changed_Credit_Limit)