-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathHarsh_RTraining_Liberia.R
176 lines (136 loc) · 7.79 KB
/
Harsh_RTraining_Liberia.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#Library that lets you load shapefiles
library(maptools)
#Library that lets you manipulate Shapefiles
library(sp)
#Library that lets you handle projections
library(rgdal)
#Define the projection ("Coordinate Reference System")
proj.def <- CRS("+proj=longlat +datum=WGS84")
#The location of the DHS Shapefile Boundary with the Buffers
DHS.shpfile.location <- "/home/aiddata/Desktop/Github/LiberiaInfraConflict/Analysis_Data/all_buffers_merged.shp"
#Load in the buffers, and project them. You always have to manually project in R.
DHS.locs <- readShapePoly(DHS.shpfile.location,proj4string=proj.def)
#The path to the boundary of liberia for visualizaitons.
Lib.bound.loc <- "/home/aiddata/Desktop/Github/LiberiaInfraConflict/LiberiaBoundaries/LBR_adm2.shp"
#Load & project
Lib.bound <- readShapePoly(Lib.bound.loc,proj4string=proj.def)
#Where the CSV with ancillary data is located (Seth's exports)
anc.data.loc <- "/home/aiddata/Desktop/Github/LiberiaInfraConflict/Analysis_Data/liberia_data.csv"
#Load a CSV. You can type View(anc.dta) to see what loaded (note capital V)
anc.dta <- read.csv(anc.data.loc)
#You can also type names(anc.dta) to see the column names, or head(anc.dta)
#to see a quick summary
#I'm merging the ancillary data into the shapefile.
#Note the @data - that tells R I'm looking at the Data of the shapefile.
#You can also type names(DHS.locs@data), head, etc. to see the data.
DHS.locs@data <- merge(DHS.locs@data, anc.dta, by="DHSID")
#A quick example of how to make a helpful visualization
#First, subset the data so it only contains one year:
DHS_2013 <- DHS.locs[DHS.locs@data$DHSYEAR.x == 2013,]
#Now, create a quick map.
saved_DHS_2013_map <- spplot(DHS_2013["sslp_e"], main="2013", sp.layout=list(Lib.bound))
#Plot it by simply calling the name you saved it as:
saved_DHS_2013_map
#Note there is a point WAY outside of Liberia.
#You can quickly remove all points
#That don't fall inside Liberia by subsetting the data
#Spatially:
DHS_2013.subset <- DHS_2013[Lib.bound,]
#Now, plot the map just like before, but with the subset:
DHS_2013_map.subset <- spplot(DHS_2013.subset["sslp_e"], main="2013", sp.layout=list(Lib.bound))
#And, remember to call your new map:
DHS_2013_map.subset
#Now, for a very quick example analysis.
#To do this, we'll need to construct some data.
#We also want to make sure we're doing the analysis on the subset
#So we don't get the erroneous point included
#First, let's consider the data we have:
names(DHS_2013.subset)
#Because we aren't using a panel model (rather, we're cross-sectional)
#We need to summarize this data.
#I.e., the average monthly temperature for a single point in time
#doesn't do us much good (at41_200411e would be the mean (e) temperature (at)
#in Novermber of 2004)
#Luckily for you, we already have functions to do this!
#They are included here, but you may want to skip over this part
#as it's a bit more advanced.
#If you want to check it out, look at the file "SciClone_functions.R".
#--------------------------------------------------
source("SciClone_functions.R")
#First, let's make a copy of our data to save any permutations into.
analysis_DTA <- DHS_2013.subset
#There are two types of aggregations we have tools for - trends and averages
#First we'll calculate an average:
#To do this you need the data you're creating the summaries from
#(DHS_2013.subset in this case), the prefix of the temporal data
#you want to summarize (here, we using pc41_, which is precip;
#to see all names, remember the names(analysis_DTA@data) command).
#You then need the affix - in this case we're using "e", which stands for mean
#(i.e., we're summarizing the mean monthly precip).
#The last three are the start year, end year, and type of dates.
#Here we're calculating 1992 to 2005, with "year month" dates.
#When run, this saves the new variable into "precip_average".
analysis_DTA@data$precip_preAverage <- timeRangeAvg(DHS_2013.subset, "pc41_","e",1992, 2005, "ym")
#View what we calculated:
analysis_DTA@data$precip_preAverage
hist(analysis_DTA@data$precip_preAverage)
analysis_DTA@data$precip_postAverage <- timeRangeAvg(DHS_2013.subset, "pc41_","e",2005, 2014, "ym")
#here, we calculate the linear trend over a given time period.
#This is a little trickier.
#Just like the average, the first command is the data.
#The second is the search command for the data you want to calculate over.
#In this example, names(DHS_2013.subset) shows us the format of precipitation is
#pc41_201410e". All we do is replace the numeric values (years and months)
#wiht [0-9]. If you had only years, you would just use [0-9] four times.
#The next arguments are start year, end year, unique ID, and finally the
#type of data (yearly or year-month)
analysis_DTA$precip_preTrend <-timeRangeTrend(DHS_2013.subset,"pc41_[0-9][0-9][0-9][0-9][0-9][0-9]e",1992,2005,"DHSID", "ym")
analysis_DTA@data$precip_preTrend
hist(analysis_DTA@data$precip_preTrend)
#Here's an example with a different variable and range (nighttime lights):
#Note for this variable we only have yearly data.
analysis_DTA@data$NTL_preAverage <- timeRangeAvg(DHS_2013.subset, "ncc4_","e",1992, 2005, "y")
analysis_DTA@data$NTL_preTrend <- timeRangeTrend(DHS_2013.subset,"ncc4_[0-9][0-9][0-9][0-9]e",1992,2005,"DHSID", "y")
analysis_DTA@data$NTL_postAverage <- timeRangeAvg(DHS_2013.subset, "ncc4_","e",2005, 2012, "y")
#Now we have some example data we can produce an example model with.
#First, we need to load in our treatment indicators.
treatment_data <- read.csv("/home/aiddata/Desktop/Github/LiberiaInfraConflict/Analysis_Data/merged_final_treatments.csv")
#Merge the data
analysis.trt.data <- merge(analysis_DTA, treatment_data, by="DHSID")
#For this analysis, let's only look at Rural areas to start.
analysis.trt.data <- analysis.trt.data[analysis.trt.data@data$URBAN_RURA == "U",]
#A simple definition of treatment - overlap greater than the median:
#set all units to 0:
analysis.trt.data$Treatment <- 0
#Calculate the Median Overlap
med_overlap <- median(analysis.trt.data@data$percent_overlap_dissolved)
#Set units with overlap >med_overlap to 1:
analysis.trt.data$Treatment <- (as.numeric(analysis.trt.data@data$percent_overlap_dissolved) > med_overlap)
#Double check:
summary(analysis.trt.data$Treatment)
#Check how many units of observation this leaves us with (our initial n for the
#analysis).
length(analysis.trt.data)
#First, we want to create a paired dataset where
#we match treatment and control units
#on relevant covariates.
#We're going to just use the stock MatchIt functionality for now.
#Make a list of all the variables you're using in the analysis
#(We do this to get rid of NA data that might exist)
aVars <- c("Treatment", "tc00_e","dari_e","droa_e","selv_e","sslp_e","am50_e","gpw4_2000e","NTL_preTrend","NTL_preAverage","precip_preTrend","precip_preAverage", "NTL_postAverage","precip_postAverage")
#Configure matchit
#This is the first stage of a propensity score model
matchit.results <- matchit(Treatment ~ tc00_e + dari_e + droa_e + selv_e + sslp_e + am50_e + gpw4_2000e + NTL_preTrend + NTL_preAverage + precip_preTrend + precip_preAverage,
data=analysis.trt.data@data[aVars],
method="nearest", distance="logit",
caliper=0.5)
#Examine our balance
summary(matchit.results)
#Subset our data to only include matches
modelData <- match.data(matchit.results)
#Let's construct a very simple outcome measure - the post-average of nighttime lights compared
#to the pre-average of nighttime lights.
modelData$outcome <- modelData$NTL_postAverage - modelData$NTL_preAverage
#Fit the model using out matched data from the matchit procedure
linearModel <- lm(outcome ~ Treatment + tc00_e + dari_e + droa_e + selv_e + sslp_e + am50_e + gpw4_2000e + NTL_preTrend + NTL_preAverage + precip_preTrend + precip_preAverage,
data=modelData)