---
title: "Data preparation"
bibliography: references.bib
date: "Last compiled on `r format(Sys.time(), '%B, %Y')`"
output:
  html_document:
    css: tweaks.css
    toc: true
    toc_float: true
    number_sections: false
    toc_depth: 1
    code_folding: show
    code_download: yes
---
```{r, globalsettings, echo=FALSE, warning=FALSE, results='hide'}
library(knitr)
knitr::opts_chunk$set(echo = TRUE)
opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE, warning = FALSE, message = FALSE,comment = "#>", cache=TRUE, class.source=c("test"), class.output=c("test2"))
options(width = 100)
rgl::setupKnitr()
colorize <- function(x, color) {sprintf("<span style='color: %s;'>%s</span>", color, x) }
```
```{r klippy, echo=FALSE, include=TRUE}
klippy::klippy(position = c('top', 'right'))
#klippy::klippy(color = 'darkred')
#klippy::klippy(tooltip_message = 'Click to copy', tooltip_success = 'Done')
```
---
# Getting started
## clean up
```{r, results='hide'}
rm(list=ls())
```
<br>
## general custom functions
- `fpackage.check`: check whether packages are installed (and install them if not) in R ([source](https://vbaliga.github.io/verify-that-r-packages-are-installed-and-loaded/))
- `fload.R`: load saved R objects under a new name
- `fidmaker`: create mock (anonymized) IDs
```{r, eval=FALSE}
fpackage.check <- function(packages) {
lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
})
}
fload.R <- function(fileName){
load(fileName)
get(ls()[ls() != "fileName"])
}
fidmaker <- function(x) {
max.val = x * 1e+05
count <- nchar(as.character(max.val)) # find out how many 'numbers' each id will have after the letter
size <- paste("%0", count, "d", sep = "") # set the variable to be fed into 'sprintf' to ensure we have leading 0's
lets <- toupper(sample(letters, x, replace = T)) # randomizing the letters
nums <- sprintf(size, sample(1:max.val)[1:x]) # randomizing the numbers, and ensuring they all have the same number of characters
ids <- paste(lets, nums, sep = "") # joining them together
return(ids)
}
```
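A quick usage sketch of these helpers (the file path is hypothetical; `fidmaker` samples at random, so the IDs differ between runs):
```{r, eval=FALSE}
# generate five mock IDs: one random uppercase letter followed by random digits
fidmaker(5)
# load a previously saved R object under a new name (hypothetical file)
club1 <- fload.R("clubs/clubclubid1.RData")
```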
<br>
## necessary packages
- `RSiena`: creating RSiena data objects
- `dplyr`: package for data wrangling
```{r, eval=FALSE}
packages = c("RSiena", "dplyr")
fpackage.check(packages)
```
<br>
----
# clubdata.RData
In the following script, we construct a list containing the (anonymized) data of all clubs and save it as clubdata.RData.
* Our primary network variable is the Kudos network. A tie i -> j exists if ego i awarded at least one Kudo to alter j in the corresponding month (see the toy sketch below).
* For the behavioral data we include information on the *frequency* (i.e., in times per week) and the *volume* (i.e., in hours per week) of running activities. We include activity (frequency and volume) in other sports (e.g., cycling and swimming) as a time-varying covariate.
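As a toy illustration of these two coding decisions (the numbers below are made up; the actual preparation is done in the chunk that follows):
```{r, eval=FALSE}
# toy 3x3 matrix of monthly Kudos counts (rows = ego, columns = alter)
kudos_count <- matrix(c(0, 2, 0,
                        1, 0, 0,
                        0, 5, 0), nrow = 3, byrow = TRUE)
kudos_tie <- ifelse(kudos_count > 0, 1, 0)  # tie i -> j if at least one Kudo was awarded

# toy vector of monthly running minutes, converted to (capped) hours per week
minutes_month <- c(0, 360, 2400)
hours_week <- ceiling((minutes_month / 60) / 4)  # minutes per month -> hours per week, rounded up
hours_week_capped <- ifelse(hours_week > 7, 7, hours_week)  # cap at 7+ hours per week
```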
```{r clubs, eval=FALSE}
# club_str holds the club IDs
club_str <- c("clubid1", "clubid2", "clubid3", "clubid4", "clubid5")
# the following loop reads the data for each club from its folder, processes it, and saves a
# club-specific .RData object; these objects are combined into one list at the end of this script.
for (i in (1: length(club_str))) {
club_id <- club_str[i]
# read the data from the club
clubdata <- read.csv(paste("clubs/", club_id, "/", "egoData_extended.csv", sep = ""), row.names = NULL,
sep = ",")
# saving club size
size <- length(clubdata[, "gender"])
# the number of months that we want to add as waves
n_waves <- 12
# let's load the friendship/following network
friend_data <- as.matrix(read.csv(paste("clubs/", club_id, "/", "socialnetwork.csv", sep = ""), row.names = NULL,
sep = ","))
# remove the first column (represents index made in the csv)
friend_data <- friend_data[, 2:ncol(friend_data)]
# and anonymize the user id, with our id-maker function
fakeid <- fidmaker(nrow(friend_data)) # generate random id
colnames(friend_data) <- fakeid # anonymizing users
# let's load the kudos network
path <- paste("clubs/", club_id, "/", "kudos", sep = "") # create path
# read the 12 monthly kudos matrices and drop the first column (csv index) of each
kudo_list <- lapply(1:n_waves, function(w) {
    m <- as.matrix(read.csv(paste(path, w, "-2019.csv", sep = ""), row.names = NULL, sep = ","))
    m[, 2:ncol(m)]
})
kudos <- array(unlist(kudo_list), dim = c(size, size, n_waves))  # Kudos count array (actors x actors x waves)
kudo_data <- ifelse(kudos > 0, 1, 0)  # if at least 1 Kudo is sent, a tie exists
# running time (in hours per week)
time_run <- array(c(clubdata[, "time_run_1.2019"], clubdata[, "time_run_2.2019"], clubdata[, "time_run_3.2019"],
clubdata[, "time_run_4.2019"], clubdata[, "time_run_5.2019"], clubdata[, "time_run_6.2019"],
clubdata[, "time_run_7.2019"], clubdata[, "time_run_8.2019"], clubdata[, "time_run_9.2019"],
clubdata[, "time_run_10.2019"], clubdata[, "time_run_11.2019"], clubdata[, "time_run_12.2019"]),
dim = c(size, 1, n_waves)) # minutes per month
time_run_h <- time_run/60 # hours per month
# time_run_30 <- time_run_h * 2 # half hours
time <- ceiling(time_run_h/4) # per week
# provide some descriptives
# time %>% table()  # absolute
# time %>% table() %>% prop.table() %>% round(3)  # proportionate
# cumprop <- time %>% table() %>% prop.table() %>% round(3) %>% cumsum()  # cumulative proportions
# print(cumprop)
# 1 - cumprop  # the percentage of all values that is higher than the particular value
# our idea was to cap running time at 7+ hours per week. This resulted in a rather smooth
# (right-skewed) distribution of values for most clubs, except for club 5, where the 7+ hours
# category was highly populated over time. For that club we added 2 extra categories (capping
# at 9+ hours), which resulted in a rather smooth distribution.
if (club_id == "clubid5") {
time_run_temp <- ifelse(time > 9, 9, time)
} else {
time_run_temp <- ifelse(time > 7, 7, time)
}
# running frequency (times per week)
freq_run <- array(c(clubdata[, "X.run_1.2019"], clubdata[, "X.run_2.2019"], clubdata[, "X.run_3.2019"],
clubdata[, "X.run_4.2019"], clubdata[, "X.run_5.2019"], clubdata[, "X.run_6.2019"], clubdata[,
"X.run_7.2019"], clubdata[, "X.run_8.2019"], clubdata[, "X.run_9.2019"], clubdata[, "X.run_10.2019"],
clubdata[, "X.run_11.2019"], clubdata[, "X.run_12.2019"]), dim = c(size, 1, n_waves)) # frequencies per month
frequencies <- ceiling(freq_run/4) # per week
freq_run_temp <- ifelse(frequencies > 7, 7, frequencies) # cap off at 7 times per week
# we deal with composition change: actors joining the Strava club over time (we assume no leavers);
# the node set was determined at t12; from there, we 'scraped backward' in time.
# below, we count the number of waves, from t1 onward, in which actors scored 0 on all DVs,
# that is, where the sum of the number of Kudos in- and out-ties (network activity) and running frequency and volume (behavioral activity) equals 0.
# this number + 1 is the wave in which actors are assumed to have joined;
# our model will then assume that entries of joiners happen at the midpoint of the unobserved time period between this and the subsequent wave.
sum_dv <- matrix(NA, nr=size, nc=n_waves)
for (i in 1:size) {
for (j in 1:n_waves) {
sum_dv[i,j] <- sum( c( # sum of
kudo_data[i,,j], # number of out-ties of actor i at time j
kudo_data[,i,j], # number of in-ties of actor i at time j
freq_run_temp[i,,j], # running frequency of i at j
time_run_temp[i,,j] # running volume of i at j
))
}
}
t <- rep(1, size) # starting entry t=1
for ( i in 1:size) {
for( j in 1:n_waves) {
# if sum = 0 for actor i in wave j,
# increment the entry period t with 1
if(sum_dv[i,j] == 0) {
t[i] = t[i] + 1
}
# if sum > 0, break the loop
if(sum_dv[i,j] > 0) {
break
}
}
}
# entry-waves cannot surpass 12:
t[which(t>12)] <- 12
# actors that join at t=12 cannot be included in the estimation;
# they are structural zeros;
# we exclude the rows (i) and columns (j) corresponding to these actors
# from the kudos matrices
# we also remove these actors' elements from the running variable objects.
# first, identify these actors (get their index)
x <- which(t==12)
# make new arrays to store entries of actors, excluding structural zeros
kudo_data.nm <- array(NA, dim = c(size - length(x),
size - length(x),
n_waves))
freq_run_temp.nm <- array(NA, dim = c(size - length(x), 1, n_waves))
time_run_temp.nm <- array(NA, dim = c(size - length(x), 1, n_waves))
# fill arrays
for (i in 1:n_waves) {
kudo_data.nm[,,i] <- subset(kudo_data[,,i], select = -x)[-x, ]
freq_run_temp.nm[,,i] <- freq_run_temp[,,i][-x]
time_run_temp.nm[,,i] <- time_run_temp[,,i][-x]
}
# and adjust network size accordingly
size.nm <- size - length(x)
# create a list for the times of composition change
# see 4.3.3 of RSIENA manual
comp <- rep(list(NA), size.nm)
for (i in 1:size.nm) {
comp[[i]] <- c(t[-x][i],n_waves)
}
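# e.g., an actor first active in wave 3 gets the presence interval c(3, 12)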
# RSiena GOF functions do not combine properly with the method of joiners and leavers
# But if we use NA codes for the tie variables of absent actors, sienaGOF will work properly.
# get time of joining for each actor:
t_j <- t[which(t!=12)]
# for actors that are partially absent, set their entries in the kudos matrices to NA
# for the waves in which they are absent, i.e., waves 1 : (t_j - 1)
for ( i in which(t_j>1)) { # for actors that join later
for (j in 1:n_waves) { # for all waves
if(t_j[i]>j) { # if their time of joining comes after the current wave
kudo_data.nm[i,,j] <- NA # set their out-degree entries to NA
kudo_data.nm[,i,j] <- NA # but also their in-degree entries
freq_run_temp.nm[i,1,j] <- NA # and their behaviors
time_run_temp.nm[i,1,j] <- NA
}
}
}
# let's load time spent on other activities (e.g., cycling, swimming)
time_ride <- array(c(clubdata[, "time_ride_1.2019"], clubdata[, "time_ride_2.2019"], clubdata[, "time_ride_3.2019"],
clubdata[, "time_ride_4.2019"], clubdata[, "time_ride_5.2019"], clubdata[, "time_ride_6.2019"],
clubdata[, "time_ride_7.2019"], clubdata[, "time_ride_8.2019"], clubdata[, "time_ride_9.2019"],
clubdata[, "time_ride_10.2019"], clubdata[, "time_ride_11.2019"], clubdata[, "time_ride_12.2019"]),
dim = c(size, 1, n_waves))
time_other <- array(c(clubdata[, "time_other_1.2019"], clubdata[, "time_other_2.2019"], clubdata[,
    "time_other_3.2019"], clubdata[, "time_other_4.2019"], clubdata[, "time_other_5.2019"], clubdata[,
    "time_other_6.2019"], clubdata[, "time_other_7.2019"], clubdata[, "time_other_8.2019"], clubdata[,
    "time_other_9.2019"], clubdata[, "time_other_10.2019"], clubdata[, "time_other_11.2019"], clubdata[,
    "time_other_12.2019"]), dim = c(size, 1, n_waves))
time_other <- time_ride + time_other # minutes per month
time_other_h <- time_other/60 # hours per month
# time_other_h <- time_other_h * 2 # half hours
time <- ceiling(time_other_h/4) #per week
time_other_temp <- ifelse(time > 7, 7, time) # cap off at 7 hours per week
#table(time_other_temp)
# frequency
freq_ride <- array(c(clubdata[, "X.ride_1.2019"], clubdata[, "X.ride_2.2019"], clubdata[, "X.ride_3.2019"],
clubdata[, "X.ride_4.2019"], clubdata[, "X.ride_5.2019"], clubdata[, "X.ride_6.2019"], clubdata[,
"X.ride_7.2019"], clubdata[, "X.ride_8.2019"], clubdata[, "X.ride_9.2019"], clubdata[, "X.ride_10.2019"],
clubdata[, "X.ride_11.2019"], clubdata[, "X.ride_12.2019"]), dim = c(size, 1, n_waves))
freq_other <- array(c(clubdata[, "X.other_1.2019"], clubdata[, "X.other_2.2019"], clubdata[, "X.other_3.2019"],
clubdata[, "X.other_4.2019"], clubdata[, "X.other_5.2019"], clubdata[, "X.other_6.2019"], clubdata[,
"X.other_7.2019"], clubdata[, "X.other_8.2019"], clubdata[, "X.other_9.2019"], clubdata[,
"X.other_10.2019"], clubdata[, "X.other_11.2019"], clubdata[, "X.other_12.2019"]), dim = c(size,
1, n_waves))
freq_other <- freq_ride + freq_other
frequencies <- ceiling(freq_other/4)
freq_other_temp <- ifelse(frequencies > 7, 7, frequencies)
# and again, exclude the structural zeros:
freq_other_temp.nm <- array(NA, dim = c(size.nm, 1, n_waves))
time_other_temp.nm <- array(NA, dim = c(size.nm, 1, n_waves))
for (i in 1:n_waves) {
freq_other_temp.nm[,,i] <- freq_other_temp[,,i][-x]
time_other_temp.nm[,,i] <- time_other_temp[,,i][-x]
}
# separating male/female/other
male <- ifelse(clubdata[, "gender"][-x] == "M", 1,0)
female <- ifelse(clubdata[, "gender"][-x] == "F", 1, 0)
other <- ifelse(clubdata[, "gender"][-x] == "O", 1, 0)
# specify the months of winter, in case we want to use it as a time-varying covariate;
# starts with december
winter <- rep(c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), size.nm)
winter <- matrix(winter, nr = size.nm, nc = n_waves, byrow = TRUE)
# in case we want to condition on the first activity recorded by users:
st_yr <- clubdata[,'start_date_year']
st_yr <- st_yr[-x]  # exclude the structural-zero actors identified above
start_year <- ifelse(st_yr <= 2011, 1,
ifelse(st_yr == 2012, 2,
ifelse(st_yr == 2013, 3,
ifelse(st_yr == 2014, 4,
ifelse(st_yr == 2015, 5,
ifelse(st_yr == 2016, 6,
ifelse(st_yr == 2017, 7,
ifelse(st_yr == 2018, 8,
9))))))))
#hist(start_year)
# create a list containing all the read club data for the current club
club <- list(friendship = friend_data, kudo = kudo_data.nm, freq_run = freq_run_temp.nm, time_run = time_run_temp.nm,
freq_other = freq_other_temp.nm, time_other = time_other_temp.nm, winter = winter, male = male, female = female,
other = other, netsize = size.nm, composition = comp, novice=start_year)
# save the club object
save(club, file = paste("clubs/", "club", club_id, ".RData", sep = ""))
}
####################################################
# Now that we have saved the clubdata for all clubs, let's combine them in one list
# first clean the working environment, except our club string and our function fload.R;
# we still need those.
rm(list = setdiff(ls(), c("club_str", "fload.R")))
# load in the separate club-objects
club1 <- fload.R(paste("clubs", "/", "club", club_str[1], ".RData", sep=""))
club2 <- fload.R(paste("clubs", "/", "club", club_str[2], ".RData", sep=""))
club3 <- fload.R(paste("clubs", "/", "club", club_str[3], ".RData", sep=""))
club4 <- fload.R(paste("clubs", "/", "club", club_str[4], ".RData", sep=""))
club5 <- fload.R(paste("clubs", "/", "club", club_str[5], ".RData", sep=""))
# and make a list containing all the clubdata
clubdata <- list(club1, club2, club3, club4, club5)
# save the output
save(clubdata, file = "clubs/clubdata.RData")
```
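A minimal sanity check on the saved object (a sketch, assuming the file paths used above):
```{r, eval=FALSE}
# reload the combined list and inspect some basic quantities per club
load("clubs/clubdata.RData")
length(clubdata)  # should be 5 clubs
sapply(clubdata, function(club) club$netsize)  # club sizes after excluding structural zeros
sapply(clubdata, function(club) sum(club$kudo, na.rm = TRUE))  # total number of Kudos ties across all waves
```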
---
# clubdata_rsiena.RData
The following script creates a list containing RSiena data objects for all clubs.
We create one list with running frequency as the behavioral variable (clubdata_rsiena_freq.RData) and one with running volume (clubdata_rsiena_vol.RData).
Note: in some clubs, the dependent variables (either the Kudos network or running behavior) show only upward or only downward changes in particular periods. We lift RSiena's automatic restriction that the simulations follow this pattern by setting '*allowOnly* = FALSE'; this is required for the subsequent meta-analysis.
```{r eval=FALSE}
# clean the working environment
rm(list = ls())
# load the clubdata
load("clubs/clubdata.RData")
str(clubdata) # inspect structure
# clubdata is a list of 5 lists,
# with each of these lists containing data of the corresponding club.
####################################################
clubdata_rsiena_freq <- list()
clubdata_rsiena_vol <- list()
for (i in 1:5) {
club <- clubdata[[i]]
# specify the roles of variables
names(club)
# A: network variables
kudonet <- sienaDependent(club$kudo, allowOnly = FALSE) #at least one Kudo
# B: behavioral variables
time_run <- sienaDependent(club$time_run, type= "behavior", allowOnly = FALSE)
freq_run <- sienaDependent(club$freq_run, type= "behavior", allowOnly = FALSE)
time_other <- varCovar(club$time_other[,,])
freq_other <- varCovar(club$freq_other[,,])
# C: covariates
winter <- varCovar(club$winter)
gender <- NA #we dichotomize gender as binary (men vs. women and other)
gender <- ifelse(club$male == 1, 1, gender)
gender <- ifelse(club$female == 1, 2, gender)
gender <- ifelse(club$other == 1, 2, gender)
gender <- coCovar(gender)
novice <- coCovar(club$novice)
# D: composition change (joiners)
joiners <- sienaCompositionChange(club$composition)
# now combine the dependent and independent variables in a data object
mydata <- sienaDataCreate(kudonet, freq_run, time_other, freq_other, gender, winter, joiners, novice)
mydata2 <- sienaDataCreate(kudonet, time_run, time_other, freq_other, gender, winter, joiners, novice)
# this finishes the data specification
clubdata_rsiena_freq[[i]] <- mydata
clubdata_rsiena_vol[[i]] <- mydata2
}
#clean environment
rm(list = setdiff(ls(), c("clubdata_rsiena_freq", "clubdata_rsiena_vol")))
# save the output
save(clubdata_rsiena_freq, file = "clubs/clubdata_rsiena_freq.RData")
save(clubdata_rsiena_vol, file = "clubs/clubdata_rsiena_vol.RData")
```
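As a quick check on the resulting objects (a sketch, assuming the paths above; the `print01Report()` call is optional and its exact arguments may depend on your RSiena version):
```{r, eval=FALSE}
# reload the RSiena data objects and inspect the first club
load("clubs/clubdata_rsiena_freq.RData")
length(clubdata_rsiena_freq)  # should be 5
clubdata_rsiena_freq[[1]]  # prints a brief summary of the siena data object
# RSiena::print01Report(clubdata_rsiena_freq[[1]], modelname = "club1_freq")  # writes club1_freq.txt
```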
---