We are cognizant that the use of LBS data, and the inference of meaningful locations in particular, raises specific privacy considerations and obliges us to take research ethics and potential harm seriously. For this reason, we de-identify the dataset through perturbation, random noise, and spatial and temporal aggregation. Although this undoubtedly affects the precision and accuracy of any location inference, we argue that this is an acceptable trade-off for most analytical applications at the urban scale.
Ultimately, we only retain a (randomly generated) user ID, a location attribute, and a date/time attribute, while all other variables, such as the tweet content and user description, are removed from the source data.
# load required packages
library(tidyverse)   # dplyr, tidyr, purrr, readr, tibble
library(lubridate)   # timestamp handling
library(data.table)
library(sf)          # spatial operations
library(here)        # project-relative paths
library(homelocator) # home location recipes

# load your original dataset
df <- readRDS("path_to_original_dataset") %>%
  dplyr::select(id, u_id, created_at, lon, lat) %>% # keep the tweet id (needed for the swapping step below) plus the essential attributes
  mutate(created_at = with_tz(created_at, tzone = "Asia/Singapore")) %>% # make sure the timestamp is in the SGT timezone
  data.table::as.data.table()
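Before continuing, it can help to confirm that the loaded data contains every column used downstream; a minimal, optional check (the column set mirrors the select above):
# sanity check: all columns used in later steps are present
stopifnot(all(c("id", "u_id", "created_at", "lon", "lat") %in% names(df)))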
# nest dataset by user
df_nested <- df %>% nest(data = !u_id)
Replace the original user IDs with randomly generated numbers.
df_nested <- df_nested %>%
  mutate(u_id = sample(1:100000000, n(), replace = FALSE))
# users with fewer than 10 tweets are removed
df_nested <- df_nested %>%
  mutate(n_tweets = map_dbl(data, nrow)) %>%
  filter(n_tweets >= 10)
Users with an unusually large number of data points are likely to be bots, so we remove the top 0.1% most active users by tweet frequency.
# remove the top 0.1% most active users based on tweet frequency
n_user <- n_distinct(df_nested$u_id)
df_nested <- df_nested %>%
  arrange(desc(n_tweets)) %>%
  slice((round(n_user * 0.1 / 100) + 1):n_user) %>% # drop the first 0.1% of rows, i.e. the most active users
  dplyr::select(-n_tweets)
For each user, 5% of their tweets are randomly swapped with tweets from other users, effectively introducing a small amount of noise.
# tweets pool
df_pool <- df_nested %>% unnest(col = data) %>% data.table::as.data.table()
pool_ids <- df_pool$id

swap_tweets <- function(user_data){
  n_tweets <- nrow(user_data) # number of tweets sent by the user
  n_keep_tweets <- ceiling(0.95 * n_tweets) # number of tweets the user keeps
  n_swap_tweets <- n_tweets - n_keep_tweets # number of tweets to swap
  # tweet ids from the user
  user_tweets_ids <- user_data$id
  # sample a pool of candidate tweet ids and drop the user's own tweets
  noise_pool_ids <- setdiff(pool_ids %>% sample(5000), user_tweets_ids)
  # select 5% random tweets (noise) from the pool
  random_noise_ids <- sample(noise_pool_ids, size = n_swap_tweets, replace = FALSE)
  random_noise <- df_pool[id %in% random_noise_ids] %>% dplyr::select(-u_id)
  # select 95% random tweets from the user's own tweets
  random_user_data <- user_data %>% sample_n(., size = n_keep_tweets)
  output <- bind_rows(random_user_data, random_noise)
  return(output)
}
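Note that with_progress() is not defined in this document; we assume it is a small project helper that wraps a function so a progress bar advances on each call. A minimal sketch of such a helper using the progress package (the name, signature, and default total are assumptions):
# hypothetical helper: wrap a function so every call advances a progress bar
with_progress <- function(fn, total = nrow(df_nested)){
  pb <- progress::progress_bar$new(total = total) # assumes the progress package
  function(...){
    pb$tick() # advance the bar once per processed user
    fn(...)
  }
}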
df_nested <- df_nested %>% mutate(data = map(data, with_progress(swap_tweets)))
The timestamps are shifted by a random number of seconds, up to one hour in either direction.
offset_timestamps <- function(user_data, timestamp){
  timestamp <- rlang::sym(timestamp)
  user_data %>%
    mutate(hr = lubridate::hour({{timestamp}}),
           min = lubridate::minute({{timestamp}}),
           noise = runif(nrow(.), min = -3600, max = 3600)) %>% # generate random noise, in seconds, within one hour
    mutate({{timestamp}} := case_when(
      hr == 0 & (noise/60 + min) < 0 ~ {{timestamp}} + abs(noise), # force positive noise to avoid moving the date to the previous day
      hr == 23 & (noise/60 + min) >= 59 ~ {{timestamp}} - abs(noise), # force negative noise to avoid moving the date to the next day
      TRUE ~ {{timestamp}} + noise
    )) %>%
    dplyr::select(-c(noise, hr, min))
}
df_nested <- df_nested %>% mutate(data = map(data, with_progress(function(x) offset_timestamps(x, timestamp = "created_at"))))
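To see the effect on a single record, a toy example (illustrative; the exact result varies with the random draw, but the shifted timestamp stays within one hour of the original and on the same day):
# illustrative: shift one toy timestamp and inspect the result
toy <- dplyr::tibble(created_at = lubridate::ymd_hms("2015-06-01 13:05:00", tz = "Asia/Singapore"))
offset_timestamps(toy, timestamp = "created_at")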
The day of the week is swapped with a random day of the same type (weekdays with weekdays, weekend days with weekend days).
# swap the day of the week within its type (weekday or weekend)
random_wday <- function(day){
  if(day %in% seq(1, 5)){ # Monday to Friday (week_start = 1)
    random_day <- sample(seq(1, 5), 1, replace = T)
  } else{ # Saturday or Sunday
    random_day <- sample(c(6, 7), 1, replace = T)
  }
  return(random_day)
}
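For example, with week_start = 1 (Monday = 1), a Wednesday can only be replaced by another weekday and a Saturday only by Saturday or Sunday:
random_wday(3) # returns a random value in 1..5 (a weekday stays a weekday)
random_wday(6) # returns 6 or 7 (a weekend day stays a weekend day)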
swap_days <- function(user_data, timestamp){
  timestamp <- rlang::sym(timestamp)
  user_data %>%
    mutate(ori_day = lubridate::wday({{timestamp}}, week_start = getOption("lubridate.week.start", 1))) %>%
    mutate(random_day = map_dbl(ori_day, random_wday),
           offsets = random_day - ori_day,
           {{timestamp}} := {{timestamp}} + lubridate::days(offsets)) %>%
    dplyr::select(-c(ori_day, random_day, offsets))
}
df_nested <- df_nested %>% mutate(data = map(data, with_progress(function(x) swap_days(x, timestamp = "created_at"))))
Locations are offset by a random distance of less than 100 meters and then aggregated to 750 m hexagonal grid cells.
# generate aggregated grid cells
grids <- st_read(here("analysis/data/raw_data/MP14_SUBZONE_NO_SEA_PL.shp"), quiet = T) %>% # read Singapore map
  st_transform(., crs = 3414) %>%
  st_make_valid() %>%
  st_make_grid(., cellsize = 750, square = F) %>% # generate 750 m hexagonal grid cells
  st_sf() %>%
  rowid_to_column("grid_id")
The generated hexagonal grid dataset is saved in analysis/data/derived_data.
st_write(grids, here("analysis/data/derived_data/spatial_hex_grid.shp"))
offset_point <- function(point){
  # create a 100 m buffer around a point and draw 1 random sample within the buffer
  random_pt <- point %>%
    st_buffer(., dist = 100) %>%
    st_sample(., size = 1, type = "random")
  while(length(random_pt) == 0){ # st_sample may return no point; retry until one falls within the buffer
    random_pt <- point %>%
      st_buffer(., dist = 100) %>%
      st_sample(., size = 1, type = "random")
  }
  return(random_pt)
}
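A quick way to confirm that the jitter is bounded (the coordinates below are an arbitrary, hypothetical point on the SVY21 / EPSG:3414 plane):
# illustrative check: the jittered point falls within 100 m of the original
pt <- st_sfc(st_point(c(30000, 30000)), crs = 3414)
st_distance(pt, offset_point(pt)) # should never exceed 100 m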
geomasking <- function(user_data, grids){
  # convert to sf object
  user_data_sf <- user_data %>%
    st_as_sf(., coords = c("lon", "lat"), crs = 4326) %>%
    st_transform(crs = 3414)
  # offset points and aggregate points to grid cells
  grid_id <- st_sfc(unlist(map(st_geometry(user_data_sf), offset_point), recursive = FALSE), crs = 3414) %>%
    st_sf(crs = 3414) %>% # offset points
    st_join(., grids) %>% # aggregate points to grid cells
    st_set_geometry(NULL) # drop geometry
  user_data_sf %>%
    st_set_geometry(NULL) %>% # drop original geometry
    bind_cols(., grid_id)
}
df_nested <- df_nested %>% mutate(data = map(data, with_progress(function(x) geomasking(x, grids))))
Remove grid cells visited by fewer than 5 distinct users or containing fewer than 5 tweets.
df <- df_nested %>%
  unnest(cols = "data") %>%
  filter(!is.na(grid_id)) %>%
  group_by(grid_id) %>%
  mutate(n_user = n_distinct(u_id),
         n_tweets = n()) %>%
  ungroup() %>%
  filter(n_user >= 5 & n_tweets >= 5) %>%
  dplyr::select(-c(n_user, n_tweets))
# First, run the four recipes implemented in the homelocator package to identify the residents at each location.
# The input dataset is the anonymized dataset from step 9.
## Recipe: HMLC
hm_hmlc <- identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
                             tz = "Asia/Singapore", show_n_loc = 1, keep_score = F, recipe = "HMLC")
## Recipe: OSNA
hm_osna <- identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
                             tz = "Asia/Singapore", show_n_loc = 1, recipe = "OSNA")
## Recipe: FREQ
hm_freq <- identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
                             tz = "Asia/Singapore", show_n_loc = 1, recipe = "FREQ")
## Recipe: APDM
### Generate neighbors for locations. This step is required for running the APDM recipe.
if(file.exists(here("analysis/data/derived_data/neighbors.rds"))){
  df_neighbors <- readRDS(here("analysis/data/derived_data/neighbors.rds"))
} else{
  # generate grid neighbors
  grids <- st_read(here("analysis/data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
    st_transform(crs = 3414)
  # queen contiguity: geometries whose boundaries touch (DE-9IM pattern "F***T****")
  st_queen <- function(a, b = a) st_relate(a, b, pattern = "F***T****")
  neighbors <- st_queen(grids)
  # convert list to tibble
  list_to_tibble <- function(index, neighbors){
    tibble(grid_id = as.character(index)) %>%
      mutate(neighbor = list(neighbors[[index]]))
  }
  df_neighbors <- do.call(rbind, map(1:length(neighbors), function(x) list_to_tibble(x, neighbors)))
  saveRDS(df_neighbors, here("analysis/data/derived_data/neighbors.rds"))
}
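The resulting df_neighbors pairs each grid_id with a list-column holding the ids of its queen-contiguous neighbours; a quick peek:
# one row per grid cell, with the ids of its neighbouring cells as a list-column
head(df_neighbors)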
hm_apdm <- identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
                             tz = "Asia/Singapore", keep_score = F, recipe = "APDM")
# Second, remove users whose home grid cell has fewer than 5 identified residents.
hm_apdm <- hm_apdm %>% mutate(name = "APDM")
hm_freq <- hm_freq %>% mutate(name = "FREQ")
hm_hmlc <- hm_hmlc %>% mutate(name = "HMLC")
hm_osna <- hm_osna %>% mutate(name = "OSNA")
hm_all <- bind_rows(hm_apdm, hm_freq, hm_hmlc, hm_osna)
## get users for whom all four recipes identify the same single home location
shared_user_unique_hm <- hm_all %>%
  group_by(u_id) %>%
  dplyr::summarise(n_method = n_distinct(name),
                   n_home = n_distinct(home)) %>%
  filter(n_method == 4 & n_home == 1)
# get users whose home grid has fewer than 5 identified home users
users2rm <- hm_all %>%
  filter(u_id %in% shared_user_unique_hm$u_id) %>%
  dplyr::select(u_id, home) %>%
  distinct(u_id, .keep_all = TRUE) %>%
  group_by(home) %>%
  mutate(n_user = n_distinct(u_id)) %>%
  filter(n_user < 5) %>%
  ungroup() %>%
  pull(u_id)
# remove the extracted users from the dataset
df <- df %>% filter(!u_id %in% users2rm)
The de-identified dataset is saved in analysis/data/derived_data.
# save the de-identified dataset, dropping the tweet id, which was only needed for the swapping step
write_csv(df %>% dplyr::select(-id), file = here("analysis/data/derived_data/deidentified_sg_tweets.csv"))