-
Notifications
You must be signed in to change notification settings - Fork 0
/
kagglefinal.r
76 lines (60 loc) · 2.09 KB
/
kagglefinal.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#---
#title: "KaggleFinal"
#output: html_document
#---
#Loading the data \
#```{r}
# Read the labelled training set and the unlabelled test set from disk.
train <- read.csv(file = "/stats-202-fa21/train.csv")
test <- read.csv(file = "/stats-202-fa21/test.csv")
#```
#Installing packages\
#```{r}
# Attach the packages used by this script. `crossval` and `caret` are
# installed only when they are missing, so re-running the script does not
# reinstall them (or need network access) on every run.
library(class)
if (!requireNamespace("crossval", quietly = TRUE)) {
  install.packages("crossval")
}
library(crossval)
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret) # createFolds()
library(e1071) # svm()
#```
#70/30 train/test split (hold-out validation) \
#```{r}
# Hold-out evaluation of a linear SVM (cost = 10): fit on a random 70% of the
# training rows, then report the error rate on those rows and on the
# remaining 30%.
#
# `labels` has to be assigned before it is used to size the sample. In a fresh
# session an unassigned `labels` resolves to the base R function of that name,
# whose length is 1, so the split would not be a 70% sample of the rows.
labels <- as.factor(train$Activity)
train.data <- train[, 2:562] # columns 2 through 562 are used as predictors
n_obs <- length(labels)
set <- sample(n_obs, floor(0.7 * n_obs))

svm.lin <- svm(
  labels[set] ~ .,
  data = train.data[set, ],
  kernel = "linear",
  cost = 10
)

# Error rate = 1 - (diagonal of the confusion table) / (number of rows scored).
train_tab <- table(labels[set], predict(svm.lin, train.data[set, ]))
train_err_rate <- 1 - sum(diag(train_tab)) / length(labels[set])
test_tab <- table(labels[-set], predict(svm.lin, train.data[-set, ]))
test_err_rate <- 1 - sum(diag(test_tab)) / length(labels[-set])
print(paste("training err rate:", train_err_rate, "test err rate:", test_err_rate))
#```
#10-Fold Cross validation to tune cost parameter \
#```{r}
# Response as a factor, and the predictor columns (2 through 562 of `train`).
labels <- as.factor(train$Activity)
train.data <- train[, seq(from = 2, to = 562)]
# Misclassification rate of a confusion matrix `x`: one minus the share of
# all counts that lie on the diagonal.
err_rate <- function(x) {
  total <- sum(rowSums(x))
  on_diagonal <- diag(x) / total
  1 - sum(on_diagonal)
}
# Tune the SVM cost by 10-fold cross-validation over cost = 10^e for
# e = -2, -1.75, ..., 1. After the loop, `err_arr[k]` holds the mean
# held-out error rate for the k-th exponent.
cost_exponents <- seq(-2, 1, by = 0.25)

# Draw the folds once so every cost value is scored on the same partition;
# redrawing them per cost would mix fold-to-fold noise into the comparison.
fold <- createFolds(labels, k = 10)

# Preallocate instead of growing the result vectors inside the loops.
err_arr <- numeric(length(cost_exponents))
for (e in seq_along(cost_exponents)) {
  cv_err <- numeric(length(fold))
  for (f in seq_along(fold)) {
    j <- fold[[f]] # row indices held out in this fold
    svm.out <- svm(
      labels[-j] ~ .,
      data = train.data[-j, ],
      kernel = "linear",
      cost = 10^cost_exponents[e]
    )
    prediction <- predict(svm.out, train.data[j, ])
    tab <- table(prediction, labels[j])
    cv_err[f] <- err_rate(tab)
  }
  err_arr[e] <- mean(cv_err)
}

# The x values are exponents, not costs, so the axis is labelled log10(cost).
plot(cost_exponents, err_arr, xlab = "log10(cost)", ylab = "error rate")
#```
#Using the best cost parameter and training on all the training data \
#```{r}
# Refit on every training row using the cost exponent with the lowest mean
# cross-validation error, then print the resulting training error rate.
cv_cost <- seq(from = -2, to = 1, by = 0.25)[which.min(err_arr)]
svm.best <- svm(
  labels ~ .,
  data = train.data,
  kernel = "linear",
  cost = 10^cv_cost
)
fitted_labels <- predict(svm.best, train.data)
train_err_rate <- 1 - sum(diag(table(labels, fitted_labels))) / length(labels)
print(train_err_rate)
#```
#Predicting the unlabeled data \
#```{r}
# Predict an activity for each row of the unlabelled test set and save the
# predictions next to the test rows' Id values as a CSV without row names.
prediction <- predict(svm.best, newdata = test)
pred_df <- data.frame(Id = test$Id, Activity = prediction)
write.csv(
  pred_df,
  file = "/Users/amalyajohnson/Desktop/Data Science/STATS 202/sv_lin.csv",
  row.names = FALSE
)
#```