-
Notifications
You must be signed in to change notification settings - Fork 2
/
MVF method.R
216 lines (174 loc) · 17 KB
/
MVF method.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#Example process of the MVF protocol
#This script is modeled around one lion Daily Diary (DD) motion sensor data set (exported at 10 Hz) and a technosmart (GiPSy-5) GPS dataset (1 Hz)
#Required libraries for data manipulation
install.packages("zoo") ; install.packages("dplyr") ; install.packages("lubridate")
library(zoo) ; library(dplyr) ; library(lubridate)
#Required libraries for visual inspection of results
install.packages('ggforce') ; install.packages('tidyverse') ; install.packages('ggplot2')
library(ggforce) ; library(tidyverse) ; library(ggplot2)
#################################################The thresholds#####################################################################################################################################
#The various values used to compute MVF - these values are user-defined, changed according to the species in question. For more information, see the below code where they are implemented
s = 20 #Smoothing window for computing static acceleration and post-smoothing VeDBA (~ 2 s because data = 10 Hz) - The post-smoothing value can obviously be different to that used to derive static acceleration though here they are the same
dist.step = 5 #Stepping interval to compute Haversine distance between - Here we use five fix stepping interval = ~ 5 s (assuming no missing fixes) when computing distance between GPS fixes
flex.1 = 3 #A degree of optional flexibility - This is used to both interpolate between speed values if maximum gap between fixes is <= than 3 s here. Any values converted from NA to an interpolated value assumes that a fix was indeed taken (so passes first stage of MVF protocol)
speed.sm = 5 #the smoothing window used for GPS-derived speed (s)
MeFF = 60 #The median rolling filter to pass over the longitude and latitude coordinates prior to deriving the Z threshold
Z = 100 #Z threshold (units in m) for filtering out extreme outliers
X = 0.11 #The DBA (VeDBA) threshold (units in g)
Y = 0.35 #The GPS~derived speed threshold (units in m/s)
flex.2 = 2 #Another degree of optional flexibility - Potential MVF periods encoded as 1 (that surpassed the above thresholds) occurring <= 2 s (flex.2) from one another (separated by MVF values of zero) are merged prior to the T threshold. But only if GPS fix was present (though see flex.1) and the Z threshold passed the MVF protocol (e.g., < 100 m here)
T = 5 #Time threshold (T) - the uninterrupted (consecutive) time (s) that 'potential' MVF values of 1 must be present for, for a period to be classified as genuine travelling movement
#################################################COMPUTE VEDBA#####################################################################################################################################
#(1) Set working directory and load in motion sensor (DD) data
setwd("D:/Dropbox/MVF.method/Example") #Change as appropriate to set working directory containing the motion sensor data sets and GPS file
#Here, the motion sensor data is composed of multiple .txt files (name ending with '.txt'), with these 2 lines of code we bind them all into one data frame (termed 'df')
temp = list.files(pattern="*.txt")
df = do.call(rbind,lapply(temp, function(x) read.delim(x, stringsAsFactors = T, header=T)))
#Now to ensure the data is in the correct order, we arrange based on a column termed 'Total.Event.no.', though this could be any column or multiple columns, e.g., containing month, day, time etc.)
df<-arrange(df, Total.Event.no.)
df$X = NULL #Remove this column, as this was just filled with NA values
#(2) Create time stamp object and subset data frame based on a start and end time stamp reading
df$timestamp = paste(df$Date, df$Time.hh.mm.ss.ddd)
options(digits.secs = 3) #Ensure 3 digits shown for infra-second times
df$timestamp<-as.POSIXct(strptime(df$timestamp, format= "%d/%m/%Y %H:%M:%OS"), tz = "GMT")
head(df$timestamp, n = 1) ; tail(df$timestamp, n = 1) #See what the dtart and end times are
#Arbitrary example, subset 5 days of data
date1 <- as.POSIXct("2019-02-26 08:00:00.000")
date2 <- as.POSIXct("2019-03-03 08:00:00.000")
int <- interval(date1, date2)
df<-df[df$timestamp %within% int,]
#(3) Compute static acceleration (2 s used here = approx. 20 events/rows))
df$Acc_x.sm = rollapply(df$Acc_x, width=s, FUN=mean, align="center", fill="extend")
df$Acc_y.sm = rollapply(df$Acc_y, width=s, FUN=mean, align="center", fill="extend")
df$Acc_z.sm = rollapply(df$Acc_z, width=s, FUN=mean, align="center", fill="extend")
#(4) Calculate VeDBA (Also post-smooth VeDBA (2 s used here = approx. 20 events/rows))
df$VeDBA = sqrt((df$Acc_x - df$Acc_x.sm)^2 + (df$Acc_y - df$Acc_y.sm)^2 + (df$Acc_z - df$Acc_z.sm)^2)
df$VeDBA.sm = rollapply(df$VeDBA, width = s, FUN=mean, align="center", fill="extend")
#################################################COMPUTE GPS-derived speed#########################################################################################################################
#(5) load in GPS data (here only 1 file) and term data frame; 'L1.GPS'
L1.GPS <- read.delim("D:/Dropbox/MVF.method/Example/L1.GPS.csv") #Change directory path as appropriate
#(6) Create time stamp object and subset data frame based on a start and end time stamp reading
L1.GPS$timestamp = paste(L1.GPS$Date, L1.GPS$Time)
L1.GPS$timestamp <- as.POSIXct(strptime(L1.GPS$timestamp, format= "%d/%m/%Y %H:%M:%OS"), tz = "GMT")
#Subset to same time period as the motion sensor df
L1.GPS <- L1.GPS[L1.GPS$timestamp %within% int,]
L1.GPS <- L1.GPS[!duplicated(L1.GPS$timestamp), ] #Remove any potential duplicated time readings
#Note that this GPS data file has already been converted to local S.Africa time to match with DD data, this required adding 2 hours (7200 s) because GPS is typically logged with GMT time zone and the
#DD is typically calibrated to local time. In R, this just involved the following; L1.GPS$timestamp = L1.GPS$timestamp + 7200
L1.GPS <- L1.GPS[order(L1.GPS$timestamp),] #Ensure time data frame is in correct time order
#(7) Compute GPS-derived speed by first computing distance with the Haversine formula (made into a function below)
disty = function(long1, lat1, long2, lat2) { #longitude and latitude supplied in degrees
long1 = long1 * pi/180 ; long2 = long2 * pi/180 ; lat1 = lat1 * pi/180 ; lat2 = lat2 * pi/180 #Function converts to radians
a = sin((lat2 - lat1) / 2) * sin((lat2 - lat1) / 2) + cos(lat1) * cos(lat2) * sin((long2 - long1) / 2) * sin((long2 - long1) / 2)
c = 2 * atan2(sqrt(a), sqrt(1 - a))
d1 = 6378137 * c
return(d1)
}
#(8) Specify the stepping range you want between consecutive fixes to compute distance. Too short of an interval for high res GPS data sets (e.g., 1 Hz), may incorporate precision error. Too big of an interval may miss out the tortuoisty involved in genuine movement
L1.GPS$GPS.loni = c(L1.GPS[-c(1:dist.step), 'location.long'], rep(0, dist.step)) ; L1.GPS$GPS.lati = c(L1.GPS[-c(1:dist.step), 'location.lat'], rep(0, dist.step)) #Shift column values forward by the specified stepping range and add relevant number of NA's to the end (to maintain vector length of column)
L1.GPS$GPS.distance = disty(L1.GPS$location.long, L1.GPS$location.lat, L1.GPS$GPS.loni, L1.GPS$GPS.lati) #Calculate row-wise distance between successive GPS coordinates (according to stepping range)
L1.GPS$GPS.distance = c(rep(0, dist.step), L1.GPS$GPS.distance[c(1:(nrow(L1.GPS)-dist.step))]) #Shift values back by by the specified stepping range
#(9) Now compute the time difference (TD) between values in similar manner
L1.GPS$TD = c(L1.GPS[-c(1:dist.step), 'timestamp'], rep(NA, dist.step))
L1.GPS$TD = c(difftime(L1.GPS$TD, L1.GPS$timestamp, units = "secs")) #Row-wise time difference
L1.GPS$TD = c(rep(0, dist.step), L1.GPS$TD[c(1:(nrow(L1.GPS)-dist.step))]) #Shift values back by by the specified stepping range
#(10) Calculate speed (m/s)
L1.GPS$GPS.speed = L1.GPS$GPS.distance / L1.GPS$TD
L1.GPS$GPS.speed[1:dist.step] = 0 #Make the first 'x' number of rows (length of assigned dist.step zero rather than NAN value)
#################################################Merge motion sensor and GPS data together##########################################################################################################
#(11) Merge the relevant columns of the GPS data frame with the motion sensor one.
df = merge(df, L1.GPS[, c('timestamp', 'location.long', 'location.lat', 'GPS.distance', 'TD', 'GPS.speed')], by = 'timestamp', all = TRUE)
#Ensure Time stamps of DD (or any IMU data) and GPS at times when fixes were present match exactly
#(12) Because we have many motion sensor data values between GPS data values, we want to specify two things;
#(i) Are GPS related NA values due to the standard inter-fix (1 s) intervals (between second values, e.g., motion sensor time stamps not ending in '.000'), or
#(ii) Are GPS related NA values due to missing location data where fix success rate dropped out - for greater than 1 s in this case.
df$index.sec = df$timestamp - round_date(df$timestamp, unit="1 seconds") #(i) Subtracting the infra-second time stamps with a rounded version (no decimal seconds) will read a difference of exactly zero at the location of each 'whole second'. #Notably variations of this 'index' computation may be needed if GPS was set at different logging rate (e.g., 2 Hz)
df$index.sec = ifelse(df$index.sec == 0, 1, 0) #(i) Values of one code for 'whole seconds' (where GPS is meant to be present) and values of zero code for infra-second periods
df$GPS.fix.present = ifelse(is.na(df$location.long) == TRUE, 0, 1) #(ii) Values of one code for the GPS fix being present (on the second mark) and values of zero code for no fixes
#(13) Subset df to 1 Hz (same as GPS logging frequency) #This is based on the 'index.sec' column made above because GPS was set to record at 1 Hz.
df = subset(df, df$index.sec == 1)
#(13) Optional but add a degree of interpolation between speed values if maximum gap between fixes is less than x s
#This essentially gives the user the option of still considering speed values during small periods of missing data.
df$GPS.speed.interp = na.approx(df$GPS.speed, maxgap = flex.1, rule = 2)# Here if values occur <= 3 s to one another, we interpolate
#(14) Optional, but add a degree of (center-aligned) post-smoothing to GPS speed values
df$GPS.speed.interp.sm = rollapply(df$GPS.speed.interp, width = speed.sm, FUN = mean, fill = 'extend') #5 smoothing window used here
df$GPS.speed.interp.sm = ifelse(is.na(df$GPS.speed.interp) == TRUE, NA, df$GPS.speed.interp.sm) #If interpolated speed is NA then max gap between readings > 3 s (flex.1), so replace speed values with NA to ensure they are not considered at such times
#################################################The MVF thresholds##################################################################################################################################
#(15) Should we consider the VeDBA & GPS speed in first place - This is based on step (13), if NA present we will not consider the fix (or rather 'interpolated speed')
df$thresh.consid = ifelse(is.na(df$GPS.speed.interp.sm) == TRUE, 0, 1) #Values of one mean we pass step one of MVF protocol. Note if step 13 is not performed, then this is essentially the 'GPS.fix.present' column
#(16) Using GPS-derived distance to identify extreme outliers - Distance threshold (Z). This example does not have 'extreme' outlier per se, though as an example...
#By applying a rolling median using a suitable window length, large distance estimates reflecting either a single or multiple 'batched' outlier(s) can be distinguished from fixes deemed 'accurate' but highly separated
#in space due to large gaps in locational data. The window length size and Z threshold should be chosen according to the animal in question due to the scales of movement undertaken by different species
#median filter window length (MeFF) of 60 s and a lenient threshold of 100 m used for lions.
df$Lon.approx = na.approx(df$location.long, rule = 2) #First linearly interpolate between NA's - ensure first observation is not an NA (row one))
df$Lat.approx = na.approx(df$location.lat, rule = 2)
df$Med.lon = rollapply(df$Lon.approx, width = MeFF, align = 'center', fill = "extend", FUN=median) # 60 s used here
df$Med.lat = rollapply(df$Lat.approx, width = MeFF, align = 'center', fill = "extend", FUN=median)
df$dist.thresh = disty(df$Lon.approx, df$Lat.approx, df$Med.lon, df$Med.lat)
df$Z.thresh = ifelse(df$dist.thresh < Z, 1, 0) #Values of one mean that we pass step two of MVF protocol
#Investigatory plots
par(mfrow=c(1,1))
plot(df$dist.thresh, type = "l") ; abline(h=100, col="red", lty=2) #Gives an idea of thresh to use (this thresh could indeed probably be lower than 100)
sub = subset(df, df$Z.thresh == 0)
df %>%
ggplot(aes(x = Lon.approx, y = Lat.approx, color= factor(Z.thresh)))+
geom_path(group = -1)+
geom_point(data = sub, aes(x = Lon.approx, y = Lat.approx), col = "red")+
facet_zoom(xlim = c(20.8380, 20.843), ylim = c(-26.206, -26.196))+ theme_bw()
#Note the 'jitter hotspots'
#(17) The VeDBA (X) and GPS-speed (Y) thresholds (0.11 g and 0.35 m/s respectively here)
df$X.thresh = ifelse(df$VeDBA.sm > X, 1, 0) #Values of one mean we pass step three (DBA component) of MVF protocol
df$Y.thresh = ifelse(df$GPS.speed.interp.sm > Y, 1, 0) #Values of one mean we pass step three (GPS speed component) of MVF protocol
#(18) Potential MVF values prior to the time threshold
df$MVF = ifelse(df$thresh.consid == 1 & df$Z.thresh == 1 & df$X.thresh == 1 & df$Y.thresh == 1, 1, 0) #Values of one mean that steps 1 to 3 of MVF protocol are passed
#(19) #Time threshold (T) - the uninterrupted (consecutive) time (s) that 'potential' MVF values of 1 must be present for, for a period to be classified as genuine travelling movement
#Here, we use 5 s, #Though to add another degree of optional flexibility (flex.2) - Potential MVF periods encoded as 1 (that surpassed the above thresholds) occurring <= 2 s (flex.2) from one another (separated by MVF values of zero) are merged prior to the T threshold
#First, compute unique recurrence of the MVF values
y1 = rle(df$MVF)$values
y2 = rle(df$MVF)$lengths
z = c()
for(i in unique(y1)){
z[y1==i] = 1:sum(y1 ==i)
}
df$MVF.unique.occurence = rep(z,y2) #Unique recurrence of MVF values of 0 and 1
df$MVF.and.unique.occurence = paste(df$MVF, df$MVF.unique.occurence) # Acts as the grouping variable to compute maximum time of uninterrupted duration
df = df %>% group_by(MVF.and.unique.occurence) %>% mutate(time.thresh = max(sequence(rle(MVF.and.unique.occurence)$lengths))) #Maximum time (s) of each recurrence
#Now this optional step is to convert MVF periods of zero to one if only 1 or 2 s in duration (as governed by flex.2 input).
#However, we only do this if the first two thresholds 'thresh.consid' and Z.thresh' remain surpassed during such times (e.g., they remained one)
df = df %>% group_by(MVF.and.unique.occurence) %>% mutate(consid.check = min(thresh.consid)) # If one then all good
df = df %>% group_by(MVF.and.unique.occurence) %>% mutate(Z.check = min(Z.thresh)) # If one then all good
df$MVF = ifelse(df$MVF == 0 & df$time.thresh <= flex.2 & df$consid.check > 0 & df$Z.check > 0, 1, ifelse(df$MVF == 1, 1, 0)) #Values converted to one if the above criteria is reached
#Now repeat with the updated MVF values
y1 = rle(df$MVF)$values
y2 = rle(df$MVF)$lengths
z = c()
for(i in unique(y1)){
z[y1==i] = 1:sum(y1 ==i)
}
df$MVF.unique.occurence = rep(z,y2) #Unique recurrence of MVF values of 0 and 1
df$MVF.and.unique.occurence = paste(df$MVF, df$MVF.unique.occurence) # Acts as the grouping variable to compute maximum time of uninterrupted duration
df = df %>% group_by(MVF.and.unique.occurence) %>% mutate(time.thresh = max(sequence(rle(MVF.and.unique.occurence)$lengths))) #Maximum time (s) of each recurrence
#The time threshold
df$T.thresh = ifelse(df$time.thresh >= T, 1, 0) ##Values of one mean either mmoving or non-moving periods >= 5 s in duration
#(20) MVF values with all thresholds (1 = genuine movement and 0 = non-travelling movement/missing/erroneous GPS fixes)
df$MVF = ifelse(df$T.thresh == 1 & df$MVF == 1, 1, 0)
#Investigatory plots
df %>%
ggplot(aes(x = location.long, y = location.lat, color= factor(MVF)))+
geom_path(group = -1)+
facet_zoom(xlim = c(20.8380, 20.843), ylim = c(-26.206, -26.196))+ theme_bw()
#Subset a section of data to plot
date1 <- as.POSIXct("2019-02-28 06:00:00.000")
date2 <- as.POSIXct("2019-02-28 12:00:00.000")
int <- interval(date1, date2)
df.sub<-df[df$timestamp %within% int,]
par(mfrow = c(3, 1))
col.list = c("blue", "red") ; palette(col.list)
plot(df.sub$MVF, col=factor(df.sub$MVF), ylab = "MVF")
plot(df.sub$GPS.speed.interp.sm, type = "l", col = "red", ylab = "GPS-speed (m/s)")
plot(df.sub$VeDBA.sm, type = "l", col = "blue", ylab = "VeDBA (g)")
#zoom in on a section of mostly walking
plot(df.sub$MVF[1:500], col=factor(df.sub$MVF), ylab = "MVF")
plot(df.sub$GPS.speed.interp.sm[1:500], type = "l", col = "red", ylab = "GPS-speed (m/s)")
plot(df.sub$VeDBA.sm[1:500], type = "l", col = "blue", ylab = "VeDBA (g)")
############################################################END##########################################################################################################