-
Notifications
You must be signed in to change notification settings - Fork 0
/
TenYears.r
247 lines (195 loc) · 11.6 KB
/
TenYears.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# This code builds on a range of extremely useful resources for implementing sequence analysis using TraMineR.
# In particular [Gabadinho et al. (2011)](http://traminer.unige.ch/documentation.shtml) and [Nikos et al. (2019)](https://github.com/patnik/neighbourhood_change).
#### Install relevant packages and libraries
library(ggplot2)
library(tidyr)
library(RColorBrewer)
library(TraMineR)
library(cluster)
library(dplyr)
library(gridExtra)
library(WeightedCluster)
library(sf)
library(tmap)
#### Explore the data
Local Authorities were selected, as sub-regional fuel poverty estimates are understood to be reliable at this scale, allowing for comparison between particular areas (unlike estimates at LSOA scale).
# Read data as csv
LA_data <- read.csv("fuelpoverty_data.csv", check.names=FALSE, fileEncoding = 'UTF-8-BOM')
# Convert data into long format
LA_data_long <- gather(LA_data, key="Year", value="FP_households", c(4:13))
# Make sure that R recognises the year format as a date
LA_data_long$Year<-lubridate::ymd(LA_data_long$Year, truncated = 2L)
head(LA_data_long)
# Once the data is prepared, we can compute a *relative rank or decile* for each fuel poor households for each year.
# This allowed for analysis of how Local Authorities move through different states over time. Relative deciles were selected as an appropriate categorical classification for the dataset, classifying Local Authorities into deciles (i.e. the top 10%, top 10% etc.) for each year between 2010-2019.
# Relative ranks can then be used to create a *state sequence object*.
# We use the TraMineR package in R, a useful package for mining and visualising sequences of states (Gabadinho et al. 2011).
# Find out the names of the columns in the dataset
colnames(LA_data)
# Compute relative rank for each year of data
LA_data_deciles = mutate(LA_data, x2010 = ntile(LA_data$"2010", 10),
x2011 = ntile(LA_data$"2011", 10),
x2012 = ntile(LA_data$"2012", 10),
x2013 = ntile(LA_data$"2013", 10),
x2014 = ntile(LA_data$"2014", 10),
x2015 = ntile(LA_data$"2015", 10),
x2016 = ntile(LA_data$"2016", 10),
x2017 = ntile(LA_data$"2017", 10),
x2018 = ntile(LA_data$"2018", 10),
x2019 = ntile(LA_data$"2019", 10))
# Create a list of states
data_alpha <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
# Create a state sequence object from the fuel poverty variables
LA_seq <- seqdef(LA_data_deciles, 14:23, ststep=NULL, alphabet=data_alpha) # Alphabet is the list of states
# Set the colour palette according to the sequence
cpal(LA_seq) <- c("#313695", "#4575b4", "#74add1", "#abd9e9",
"#e0f3f8", "#fee090", "#fdae61", "#f46d43", "#d73027", "#a50026")
# The sequence data can then be plotted, both from the start and from the end.
seqIplot(LA_seq, sortv = "from.start", with.legend = FALSE)
seqfplot(LA_seq, border = NA, with.legend = FALSE)
#### Compute distance between sequences (i.e. dissimilarities)
# We can then compute pairwise dissimilarities between sequences, or the dissimilarity from a reference sequence.
# It is necessary to chose between several dissimilarity measures, including optimal matching (OM) and many of its variants, distance based on the count of common attributes, and distances between sequence state distributions.
# We opt for *DHD method* (Dynamic Hamming). This approach reflects important differences, or distinct timings, within sequences.
# In our case, the fuel poverty indicator changes across time. The DHD has a strong time sensitivity, allowing substitution costs to depend on the position **t** in the sequence.
# See [Studer (2016)](https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/rssa.12125).
LA_DHD <- seqdist(LA_seq, method = "DHD")
#### PAM ward clustering
Once the temporal sequences are derived, clustering is applied to aggregate the sequences into a reduced number of groups. We opt for a PAM ward hierarchical clustering approach. This provides a more detailed cluster analysis as each cluster has the chance to be redivided (http://www.ijfcc.org/vol5/434-S059.pdf).
# PAM ward clustering
pamwardcluster5 <- wcKMedoids(LA_DHD, k = 5)
unique(pamwardcluster5$clustering)
# Rename labels with cluster names
cl5.lab <- factor(pamwardcluster5$clustering, labels = paste(c("Sustained energy affluence", "Fluctuating energy affluence", "Changeable middle", "Fluctuating energy deprivation", "Entrenched energy deprivation")))
# Change labels in csv file
LA_data_deciles$Cluster_5 <- factor(pamwardcluster5$clustering, labels = paste(1:5))
# Write results to csv.
write.csv(LA_data_deciles, "data_deciles.csv")
head(LA_data_deciles)
# State distribution plots
seqdplot(LA_seq, group = cl5.lab, border = NA, rows = 3, cols = 3)
# Sequence frequency plots
seqfplot(LA_seq,group = cl5.lab, border = NA, rows = 3, cols = 3)
# Transversal entropy plots
seqHtplot(LA_seq,group = cl5.lab, border = NA, rows = 3, cols = 3)
# Mean times plots
seqmtplot(LA_seq,group = cl5.lab, border = NA, rows = 3, cols = 3)
# Whole set index plots
seqIplot(LA_seq,group = cl5.lab, border = NA, rows = 3, cols = 3)
# Individual sequences
seqiplot(LA_seq,group = cl5.lab, border = NA, rows = 3, cols = 3)
# Parallel coordinate plots
seqpcplot(LA_seq,group = cl5.lab, border = NA, rows = 3, cols = 3)
# Modal state sequences
seqmsplot(LA_seq,group = cl5.lab, border = NA, rows = 3, cols = 3)
#### Working with contextual census variables
# Add additional variables
LA_additional_variables <- read.csv("LA_variables.csv", check.names=FALSE, fileEncoding = 'UTF-8-BOM')
# Merge with trajectories dataset
LA_finaldataset <- merge(LA_data_deciles, LA_additional_variables, by.x=c("LACode"), by.y=c("geography code"))
# Change cluster data to character
LA_finaldataset$Cluster_5 <- as.character(LA_finaldataset$Cluster_5)
# Unpaid Care
p1 <- ggplot(LA_finaldataset, aes(x=Cluster_5, y = UnpaidCare_person_per))+
geom_violin(aes(fill=Cluster_5), color=NA)+
labs(title="Unpaid care",y="Percentage of persons", x="", caption = "")+
scale_fill_manual(values=c("#440154", "#3a528b", "#20908d","#5dc962", "#fde725"),
name="Cluster",
breaks=c("1", "2", "3", "4", "5"),
labels=c("1", "2", "3", "4", "5"))+
stat_summary(fun.y=median, geom="point", size=2, color="black")+
theme(legend.position = "none", plot.title = element_text(size=10), axis.title.y=element_text(size=10))
# Private rent
h1 <- ggplot(LA_finaldataset, aes(x=Cluster_5, y = PrivateRent_hh_per))+
geom_violin(aes(fill=Cluster_5), color=NA)+
labs(title="Private renting",y="Percentage of households", x="", caption = "")+
scale_fill_manual(values=c("#440154", "#3a528b", "#20908d","#5dc962", "#fde725"),
name="Cluster",
breaks=c("1", "2", "3", "4", "5"),
labels=c("1", "2", "3", "4", "5"))+
stat_summary(fun.y=median, geom="point", size=2, color="black")+
theme(legend.position = "none", plot.title = element_text(size=10), axis.title.y=element_text(size=10))
# Social rent
h2 <- ggplot(LA_finaldataset, aes(x=Cluster_5, y = SocialRent_hh_per))+
geom_violin(aes(fill=Cluster_5), color=NA)+
labs(title="Social renting",y="", x="Clusters", caption = "")+
scale_fill_manual(values=c("#440154", "#3a528b", "#20908d","#5dc962", "#fde725"),
name="Cluster",
breaks=c("1", "2", "3", "4", "5"),
labels=c("1", "2", "3", "4", "5"))+
stat_summary(fun.y=median, geom="point", size=2, color="black")+
theme(legend.position = "none", plot.title = element_text(size=10), axis.title.x=element_text(size=10))
# Ethnicity
p2 <- ggplot(LA_finaldataset, aes(x=Cluster_5, y = Ethnicity_per))+
geom_violin(aes(fill=Cluster_5), color=NA)+
labs(title="Ethnic minority", x="", y = "", caption = "")+
scale_fill_manual(values=c("#440154", "#3a528b", "#20908d","#5dc962", "#fde725"),
name="Cluster",
breaks=c("1", "2", "3", "4", "5"),
labels=c("1", "2", "3", "4", "5"))+
stat_summary(fun.y=median, geom="point", size=2, color="black")+
theme(legend.position = "none", plot.title = element_text(size=10))
# Deprivaton
h3 <- ggplot(LA_finaldataset, aes(x=Cluster_5, y = Deprivation_hh_per))+
geom_violin(aes(fill=Cluster_5), color=NA)+
labs(title="Deprivation",y="", x="", caption = "")+
scale_fill_manual(values=c("#440154", "#3a528b", "#20908d","#5dc962", "#fde725"),
name="Cluster",
breaks=c("1", "2", "3", "4", "5"),
labels=c("1", "2", "3", "4", "5"))+
stat_summary(fun.y=median, geom="point", size=2, color="black")+
theme(legend.position = "none", plot.title = element_text(size=10))
# No central heating
h4 <- ggplot(LA_finaldataset, aes(x=Cluster_5, y = NoCentralHeating_per_hh))+
geom_violin(aes(fill=Cluster_5), color=NA)+
labs(title="No central heating",y="Percentage of households", x="Clusters", caption = "")+
scale_fill_manual(values=c("#440154", "#3a528b", "#20908d","#5dc962", "#fde725"),
name="Cluster",
breaks=c("1", "2", "3", "4", "5"),
labels=c("1", "2", "3", "4", "5"))+
stat_summary(fun.y=median, geom="point", size=2, color="black")
# Disability or illness
p3 <- ggplot(LA_finaldataset, aes(x=Cluster_5, y = LimitedAlot_person_per))+
geom_violin(aes(fill=Cluster_5), color=NA)+
labs(title="Daily activities limited a lot", x = "", y="", caption = "")+
scale_fill_manual(values=c("#440154", "#3a528b", "#20908d","#5dc962", "#fde725"),
name="Cluster",
breaks=c("1", "2", "3", "4", "5"),
labels=c("1", "2", "3", "4", "5"))+
stat_summary(fun.y=median, geom="point", size=2, color="black")+
theme(legend.position = "none", plot.title = element_text(size=10))
# Final grid image
grid.arrange(p1, p2, p3, h1, h2, h3, nrow = 2)
#### Plots of raw ful poverty data according to cluster
# Gather raw fuel poverty data
LA_data_cluster_long <- gather(LA_finaldataset, key="Year", value="FP_households", c(4:13))
head(LA_data_cluster_long)
# Use only every second label on x axis
everysecond <- function(x){
x <- sort(unique(x))
x[seq(2, length(x), 2)] <- ""
x
}
# Plot fuel poverty estimates for each cluster using bins
bins <- ggplot(LA_data_cluster_long, aes(x=Year, y = FP_households))+
geom_bin_2d()+
labs(x = "", y="Fuel poor households (%)")+
facet_wrap(~ Cluster_5)+
scale_x_discrete(labels = everysecond(LA_data_cluster_long$Year))+
scale_y_continuous()+
guides(fill = guide_legend(title = "LA count"))+
theme_minimal()
plot(bins)
#### Mapping clusters
# Import LA shapefile
LA <- st_read("EnglandandWales_LA_2019.shp")
# Merge with cluster attibutes
LA_clusters <- merge(LA, LA_data_deciles, by.x = "lad19cd", by.y = "LACode")
# Map clusters using tmap functions
Clusters_mapped <- tm_shape(LA)+
tm_fill(col = "grey87")+ # Provide grey outline without borders of England and Wales
tm_shape(LA_clusters)+
tm_polygons(col = "Cluster_5", palette =c("1" ="#440154", "2" ="#3a528b", "3"= "#20908d", "4" = "#5dc962", "5" = "#fde725"), title = "Trajectory clusters")+
tm_scale_bar(text.size = 0.6)+
tm_layout(frame = FALSE, legend.title.fontface = "bold")
Clusters_mapped