# algorithm.R
library(ggplot2)        # boxplots
library(grid)           # grid graphics, used by gridExtra
library(gridExtra)      # arranging multiple plots on one page
library(lattice)        # required by caret
library(caret)          # data partitioning, model training, confusion matrices
library(rpart)          # decision trees
library(rattle)         # fancyRpartPlot
library(randomForest)   # random forests
url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
filename <- "pml-training.csv"
# download the file if it is not already present
# (method = "curl" assumes curl is installed; drop it to use the default method)
if (!file.exists(filename)) {
  download.file(url, destfile = filename, method = "curl")
}
data <- read.csv(filename, header = TRUE, na.strings = c("NA", "#DIV/0!"))
colnames(data)[1] <- "observationId"
# ensure the outcome is a factor (read.csv no longer converts strings by default in R >= 4.0)
data$classe <- factor(data$classe)
names(data)
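# (optional sanity check, an addition to the original script) confirm the
# outcome looks right before modelling; in this dataset classe should be a
# factor with the five activity classes A-E
table(data$classe)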
# create a T/F vector identifying variables with at least one NA
missingcols <- sapply(data, function(x) { any(is.na(x)) })
# keep only those variables with no missing data
data <- data[ , !missingcols]
names(data)
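# (optional sanity check, an addition) report how many variables were dropped
# for missingness and how many rows and columns remain
sum(missingcols)   # number of columns removed
dim(data)          # rows and remaining columns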
# set seed
set.seed(123)
# create training and test sets
inTrain <- caret::createDataPartition(y = data$classe, p = 0.7, list = FALSE)
# subset
training <- data[inTrain, ]
testing <- data[-inTrain, ]
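# (optional check, an addition) createDataPartition samples within each class,
# so the class proportions of the two subsets should match closely
prop.table(table(training$classe))
prop.table(table(testing$classe))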
# four ggplot boxplots of the belt measurements by class
p.roll.belt <- ggplot(training, aes(classe, roll_belt))
p.roll.belt <- p.roll.belt + geom_boxplot() + ggtitle("Roll Belt")
p.pitch.belt <- ggplot(training, aes(classe, pitch_belt))
p.pitch.belt <- p.pitch.belt + geom_boxplot() + ggtitle("Pitch Belt")
p.yaw.belt <- ggplot(training, aes(classe, yaw_belt))
p.yaw.belt <- p.yaw.belt + geom_boxplot() + ggtitle("Yaw Belt")
p.accel.belt <- ggplot(training, aes(classe, total_accel_belt))
p.accel.belt <- p.accel.belt + geom_boxplot() + ggtitle("Total Accel Belt")
gridExtra::grid.arrange(p.roll.belt, p.pitch.belt, p.yaw.belt, p.accel.belt, ncol = 2, nrow = 2)
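# caret's featurePlot can produce the same comparison in one call; a minimal
# alternative sketch (not in the original script), assuming lattice is attached
caret::featurePlot(x = training[, 8:11], y = training$classe, plot = "box")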
# simple decision tree model, with classe (column 60) as the outcome and
# columns 8-11 (the four belt variables plotted above) as predictors
modFit <- caret::train(classe ~ ., method = "rpart", data = training[,c(8:11,60)])
rattle::fancyRpartPlot(modFit$finalModel)
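# (optional inspection, an addition) printing the train object shows the
# resampling accuracy caret estimated while tuning the tree's cp parameter
print(modFit)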
# make sure to use the same variables from the testing set as were used in the training set.
predictions <- predict(modFit, newdata = testing[,c(8:11,60)])
# output confusion matrix
confusionMatrix(predictions, testing$classe)
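# (optional, an addition) pull out the tree's overall hold-out accuracy so it
# can be compared against the random forest below
acc.tree <- confusionMatrix(predictions, testing$classe)$overall["Accuracy"]
acc.tree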
# random forest using all remaining sensor variables (columns 8-59) as predictors
modFit.rf <- randomForest::randomForest(classe ~ ., data = training[,c(8:60)])
modFit.rf
predictions.rf <- predict(modFit.rf, newdata = testing[,c(8:60)])
confusionMatrix(predictions.rf, testing$classe)
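# (optional, an addition) the estimated out-of-sample error is one minus the
# hold-out accuracy from the confusion matrix above
acc.rf <- confusionMatrix(predictions.rf, testing$classe)$overall["Accuracy"]
1 - acc.rf   # estimated out-of-sample error rate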
plot(modFit.rf, main = "Error rate of random forest")
varImpPlot(modFit.rf, pch = 20, main = "Importance of Variables")
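# (optional, an addition) list the ten most important variables numerically,
# complementing the plot above; for this classification forest, importance()
# returns a one-column matrix of mean decrease in Gini
imp <- randomForest::importance(modFit.rf)
head(imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE], 10)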