-
Notifications
You must be signed in to change notification settings - Fork 0
/
ml_predict.R
98 lines (68 loc) · 3.26 KB
/
ml_predict.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
## Load the necessary libraries.
library(caret) # For training/machine learning.
## Read the training data. Both the literal string "NA" and empty fields are
## treated as missing values.
the_data <- read.csv("data/pml-training.csv", na.strings = c("NA", ""))
## Show size (rows, columns) before cleaning.
print(dim(the_data))
########### Pre-processing: Mostly cleaning! #############
## The data set has lots of NAs. Count the NAs in every column and keep only
## the columns that contain none at all (i.e. a column is dropped as soon as
## it has a single NA, not only when it is entirely NA). The original data
## frame is then replaced by this cleaner version of itself.
## colSums(is.na(.)) counts per column directly on the data frame, instead of
## apply(), which first coerces the whole frame to a matrix.
na_colsum <- colSums(is.na(the_data))
## drop = FALSE keeps the result a data frame even if only one column survives.
the_data <- the_data[, which(na_colsum == 0), drop = FALSE]
## Show size after cleaning.
print(dim(the_data))
########### Partitioning (hold-out validation set) ############
## Fix the RNG seed so the random split below (and the model fit that follows
## it in this script) gives the same result from run to run.
set.seed(1234)
## Pick the training rows: p = 0.7 asks for 70% of the rows, chosen with respect
## to the outcome column `classe`. list = FALSE requests a non-list result
## that can be used directly as a row index.
train_sample <- createDataPartition(y = the_data$classe, p = 0.7, list = FALSE)
## Rows selected by the partition form the training set ...
training_set <- the_data[train_sample, ]
## ... and every remaining row goes to the validation set.
validation_set <- the_data[-train_sample, ]
## Size up and preview.
print(dim(training_set))
print(dim(validation_set))
# print(head(training_set))
########### Training the model (random forest) #############
## Resampling set-up: cross-validation ("cv") limited to 2 folds. The low fold
## count is a deliberate choice to keep the run time down.
cv_control <- trainControl(method = "cv", number = 2)
## Fit a random forest ("rf") through caret, with `classe` as the outcome and
## every other column of training_set as a predictor.
## NOTE(review): `classe ~ .` also picks up any bookkeeping columns still
## present in the data (row ids, user names, timestamps, ...) -- confirm that
## none of them leaks the outcome.
the_forest1 <- train(
  classe ~ .,
  data = training_set,
  method = "rf",
  trControl = cv_control
)
## Let's 'see' the fitted model.
print(the_forest1$finalModel)
## Tuning value that train() selected as best.
print(the_forest1$bestTune)
## Build the accuracy plot for the fitted model, then print it so that it is
## actually drawn.
accuracy_plot <- plot(
  the_forest1,
  col = "red",
  main = "Accuracy per Number of Random Predictors"
)
print(accuracy_plot)
############ Validate on the held-out set ############
## Check how well the model does on rows that were not used to fit it.
## Predict the outcome for every row of the validation set with the forest.
v_predictions <- predict(the_forest1, newdata = validation_set)
## Flag each prediction that matches the recorded `classe`, then turn the
## number of matches into an accuracy and its complement, the error rate.
is_correct <- v_predictions == validation_set$classe
v_accuracy <- sum(is_correct) / length(v_predictions)
oos_error <- 1 - v_accuracy
## Print the error rate as a percentage, rounded to 3 decimals.
print(paste0(
  "Out of sample error rate: ",
  round(oos_error * 100, 3),
  "% (percent)"
))
########### Use test data! ############
## Read the test data with the same missing-value markers as the training file.
test_data <- read.csv("data/pml-testing.csv", na.strings = c("NA", ""))
########### Same pre-processing as before. ###########
## Count NAs per column and keep only the columns that contain none.
## colSums(is.na(.)) avoids apply()'s coercion of the data frame to a matrix;
## drop = FALSE keeps a data frame even if a single column survives.
na_colsum2 <- colSums(is.na(test_data))
test_data <- test_data[, which(na_colsum2 == 0), drop = FALSE]
## Show size.
print(dim(test_data))
## The NA filter above is computed on the test file alone, so it can remove a
## column the model was trained on. Check for that here and fail early with
## the offending column names instead of an opaque error inside predict().
missing_predictors <- setdiff(
  setdiff(names(training_set), "classe"),
  names(test_data)
)
if (length(missing_predictors) > 0) {
  stop(
    "Test data is missing predictors used in training: ",
    paste(missing_predictors, collapse = ", "),
    call. = FALSE
  )
}
## Test predictions
t_predictions <- predict(the_forest1, test_data)
########### Get test values ########
## Write every element of `x` to its own text file.
##
## Element i is written to "problem_id_<i>.txt" (a relative path, so the file
## lands in the current working directory) as a single unquoted value with no
## row or column names.
##
## x: a vector of values to write, one file per element.
## Returns NULL invisibly; the function is called for its side effect.
##
## NOTE(review): nothing in this script calls the function; presumably it is
## meant to be run as pml_write_files(t_predictions) -- confirm.
pml_write_files <- function(x) {
  ## seq_along() gives an empty sequence for empty input, whereas 1:length(x)
  ## would count down over 1 and 0 and write two bogus files.
  for (i in seq_along(x)) {
    filename <- paste0("problem_id_", i, ".txt")
    write.table(
      x[i],
      file = filename,
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }
}