-
Notifications
You must be signed in to change notification settings - Fork 2
/
03-01-Kmeans-Clustering.Rmd
40 lines (29 loc) · 1.36 KB
/
03-01-Kmeans-Clustering.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Clustering Methods {-}
## Kmeans Clustering {-}
```{r a1, warning=FALSE, message=FALSE, comment=NA, fig.width=8, fig.height=7}
## The famous iris dataset
data(iris)
flower = iris
head(flower)
summary(flower)
## Kmeans requires you to predetermine the number of clusters
mdl = kmeans(flower[, 1:4], centers = 3)
flower$cluster = mdl$cluster
## plot kmeans for a few of the variable combinations
par(mfrow = c(2, 2))
plot(x = flower$Sepal.Length, y = flower$Sepal.Width, col = factor(flower$Species), main = "Actual")
plot(x = flower$Sepal.Length, y = flower$Sepal.Width, col = factor(flower$cluster), main = "Model")
plot(x = flower$Petal.Length, y = flower$Petal.Width, col = factor(flower$Species), main = "Actual")
plot(x = flower$Petal.Length, y = flower$Petal.Width, col = factor(flower$cluster), main = "Model")
## Try using principal components to draw one picture that catures all of the variance
pc = prcomp(flower[, 1:4], cor = TRUE)
summary(pc)
## The first 2 principal components explains 97% of all variation
flower.pc = as.data.frame(as.matrix(flower[, 1:4]) %*% pc$rotation)
flower.pc$cluster = flower$cluster
flower.pc$species = flower$Species
## How well did the model do?
par(mfrow = c(2, 1))
plot(flower.pc$PC1, flower.pc$PC2, col = factor(flower.pc$cluster), main = "Model")
plot(flower.pc$PC1, flower.pc$PC2, col = factor(flower.pc$species), main = "Actual")
```