-
Notifications
You must be signed in to change notification settings - Fork 0
/
R File Happiest Countries.R
113 lines (74 loc) · 3.35 KB
/
R File Happiest Countries.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
## ----warning=F,message=F-------------------------------------------
library(tidyverse)
library(dplyr)
library(factoextra)
library(cluster)
library(gridExtra)
# Read the 2018 happiness table and add a numeric copy of the corruption
# column (values that cannot be converted to a number become NA).
happiness_0 <- read_csv("happiness_2018.csv", show_col_types = FALSE)
happiness_1 <- mutate(
  happiness_0,
  Corruption = as.numeric(`Perceptions of corruption`)
)
# Keep only complete rows. happiness_2 still carries the country names,
# which are looked up again later in the script.
happiness_2 <- drop_na(happiness_1)
# Dimensions of the table before the incomplete rows were removed
# (happiness_1 has not been filtered at this point).
dim(happiness_1)
# Modelling table: complete rows only, without the rank, the country name
# and the original corruption column.
happiness_1 <- select(
  happiness_2,
  -c(`Overall rank`, `Country or region`, `Perceptions of corruption`)
)
## ------------------------------------------------------------------
# Scatterplot matrix of the first four columns of the modelling table
happiness_1 %>%
  select(1:4) %>%
  pairs(pch = 19)
## ------------------------------------------------------------------
# Principal component analysis of every column except Score.
# `scale. = TRUE` standardises each variable to unit variance before the
# decomposition.
pc.happiness <- prcomp(select(happiness_1, -Score), scale. = TRUE)
pc.happiness
## ------------------------------------------------------------------
# Bar chart of the five variables contributing most to the first
# principal component
fviz_contrib(
  pc.happiness,
  choice = "var",
  axes = 1,
  top = 5
)
## ----warning=F-----------------------------------------------------
# Graph the explanatory power of the principal components (scree plot).
# The number of components is taken from the PCA result rather than being
# hard-coded, so no NA rows (and no "removed rows" warnings) are produced
# when the PCA has fewer than nine components.
PRVar <- pc.happiness$sdev^2
# Proportion of variance explained by each component.
PVE <- PRVar / sum(PRVar)
PC <- seq_along(PVE)
data <- data.frame(PC, PVE)
ggplot(data = data, aes(x = PC, y = PVE)) +
  geom_line(color = "navy") +
  # Hand-placed highlight at component 3 / PVE 0.1.
  geom_point(aes(x = 3, y = 0.1), cex = 5, color = "orange", alpha = 0.3) +
  geom_point(color = "red", cex = 2) +
  labs(
    title = "Proportion of Variance Explained",
    x = "Principal Component",
    y = "pve"
  ) +
  scale_x_continuous(breaks = PC)
## ------------------------------------------------------------------
# Scores of every observation on the first two principal components;
# PC12 is reused later in the script.
PC12 <- select(as_tibble(pc.happiness$x), 1:2)
# Scatter plot of PC1 against PC2 (the original variables are bound on as
# extra columns but are not mapped to any aesthetic).
ggplot(
  bind_cols(select(as_tibble(pc.happiness$x), PC1, PC2), happiness_1),
  aes(x = PC1, y = PC2)
) +
  geom_point()
## ------------------------------------------------------------------
# Standardise every column of happiness_1 except the first one
# (presumably Score -- confirm the column order).
happiness_4 <- happiness_1 %>%
  select(-1) %>%
  scale()
# Gap-statistic plot for choosing the number of k-means clusters.
# NOTE(review): this is evaluated on the two PC scores in PC12, whereas the
# clustering further down is fitted on happiness_4 -- confirm this is intended.
fviz_nbclust(x = PC12, FUNcluster = kmeans, method = "gap_stat")
## ------------------------------------------------------------------
# Partition the standardised data into three groups twice -- once with
# k-means and once with PAM -- so the two results can be compared.
km_mod <- kmeans(x = happiness_4, centers = 3)
pam_mod <- pam(x = happiness_4, k = 3)
## ------------------------------------------------------------------
# Compare the k-means and PAM partitions side by side
p1 <- fviz_cluster(km_mod, data = happiness_4)
p2 <- fviz_cluster(pam_mod, data = happiness_4)
grid.arrange(p1, p2, ncol = 2)
## ------------------------------------------------------------------
# Attach each observation's PAM cluster label and country name to its
# PC1/PC2 scores. Country names come from happiness_2, which has the same
# rows as the table the PCA and clustering were fitted on.
# (An earlier assignment of PC12_cluster built from happiness_1 was removed:
# it was overwritten here before ever being read.)
PC12_cluster <- PC12 %>%
  mutate(
    cluster = factor(pam_mod$cluster),
    countryOrRegion = factor(happiness_2$`Country or region`)
  )
## ------------------------------------------------------------------
# View cluster disparity along the first two principal components.
# The plotted values are PC scores, so the titles name the components
# rather than a single original variable.
p1 <- ggplot(PC12_cluster, aes(x = cluster, y = PC1, color = cluster)) +
  geom_boxplot() +
  labs(title = "PC1 Score by Cluster")
p2 <- ggplot(PC12_cluster, aes(x = cluster, y = PC2, color = cluster)) +
  geom_boxplot() +
  labs(title = "PC2 Score by Cluster")
grid.arrange(p1, p2, ncol = 2)
## ------------------------------------------------------------------
# Filter and display the happiest countries.
# Cluster ids are arbitrary labels, so the "happiest" cluster is taken to be
# the one with the highest mean Score instead of assuming it is cluster 1.
# happiness_1 and PC12_cluster hold the same rows in the same order, so the
# scores can be grouped directly by the cluster labels.
happiest_cluster <- names(which.max(
  tapply(happiness_1$Score, PC12_cluster$cluster, mean)
))
PC12_cluster %>%
  filter(cluster == happiest_cluster) %>%
  select(countryOrRegion) %>%
  as.data.frame()