-
Notifications
You must be signed in to change notification settings - Fork 0
/
lachesis.py
297 lines (242 loc) · 9.82 KB
/
lachesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import pandas as pd
from datetime import timedelta
from sklearn.metrics.pairwise import euclidean_distances, pairwise_distances
import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform
from scipy.cluster.hierarchy import linkage, fcluster
from joblib import delayed, Parallel
from IPython.display import clear_output
def meters2degrees(meters):
"""
Converts meters to degrees.
Parameters:
meters -- input in meters
Returns:
degrees -- output in degrees
"""
degrees = meters * (1 / 111319.9)
return degrees
def degrees2meters(degrees):
"""
Converts degrees to meters.
Parameters:
degrees -- input in degrees
Returns:
meters -- output in meters
"""
return degrees * 111319.9
def get_stop_location(df, min_stay_duration, roaming_distance,
is_sorted=False):
"""
Extract the stop-locations and return them as a Pandas dataframe.
Source: "Hariharan, Toyama. 2004. Project Lachesis: Parsing and
Modeling Location Histories"
Parameters:
df -- Numpy array [[timestamp, longitude, latitude]] of size (N x 3)
min_stay_duration -- Minimum stay duration in minutes
roaming_distance -- Maximum roaming distance in meters
is_sorted -- Boolean Flag indicating if df is time-ordered.
Returns:
-- Pandas df [[timestamp, latitude, longitude, t_start, t_end]]
"""
# Sort array by timestamp if is_sorted flag has not been set
if not is_sorted:
df = df[df[:, 0].argsort()]
# Initialise variables
i = 0
medoids_set = set()
while i < df.shape[0]:
# Get the first item that is at least min_stay_duration away from
# point i
time_cutoff = df[i, 0] + timedelta(minutes=min_stay_duration)
idxs_after_time_cutoff = np.where(df[:, 0] >= time_cutoff)
# Break out of while loop if there are no more items that
# satisfy the time_cutoff criterium
if len(idxs_after_time_cutoff[0]) == 0:
break
# This is the first item after that satisfies time_cutoff
j_star = idxs_after_time_cutoff[0][0]
# Check if roaming_distance criterium is satisfied. If point
# exceed roaming distance them move on. Otherwise incrementally
# expand set of points to be included until roaming distance is
# exceeded (or no more datapoints are left)
if np.max(pdist(df[i:j_star + 1, [1, 2]])) > roaming_distance:
i += 1
else:
j_star = i + 1
# Expand sets of points by incrementally checking subsequent
# points in time and checking whether they are within the
# roaming distance.
while j_star < df.shape[0]:
max_distance = np.max(cdist(df[i:j_star + 1, [1, 2]],
df[j_star, [1, 2]].reshape(1, -1)))
if max_distance <= roaming_distance:
j_star += 1
else:
break
j_star -= 1
# Get medoid, if there are only 2 points just take the first
if (j_star - i) == 1:
medoid_idx = 0
else:
# Compute full distance matrix of this current set of
# points,convert to squareform and get the medoid of all
# these points
medoid_idx = np.argmin(
np.sum(squareform(pdist(df[i:j_star + 1, [1, 2]])),
axis=0))
# Add medoid to list and increment i-index
medoids_set.add(tuple(list(df[i + medoid_idx]) +
[df[i, 0], df[j_star, 0]]))
i = j_star + 1
# Convert to dataframe and return as result
df_result = pd.DataFrame(list(medoids_set),
columns=["timestamp", "latitude", "longitude",
"t_start", "t_end"])
return df_result
def get_medoid_index(X):
"""
Returns the index of the medoid of the points in X.
Note: The medoid is the point closest to all the other points in
the set.
Parameters:
X -- Numpy vector [[longitude, latitude]]
Returns:
Index of the medoid of X.
"""
dist_mat = pairwise_distances(X)
return np.argmin(np.sum(dist_mat, axis=0))
def get_clustermedoids_indeces(X):
"""
Return the index of the medoid of each cluster in X.
Parameters:
X -- Numpy array [[cluster_assignment, longitude, latitude]]
Returns:
A list of indeces pointing to the medoid for each cluster in X.
"""
X = X.values
medoid_idx_list = []
for cluster_i in np.unique(X[:, 0]):
current_cluster_idx = np.where(X[:, 0] == cluster_i)[0]
medoid_idx_list.append(
current_cluster_idx[get_medoid_index(X[current_cluster_idx, 1:])])
return medoid_idx_list
def calculate_centroid(X):
"""
Calculates the centroid of an array of longitude-latitude pairs.
Parameters:
X -- Numpy array [[longitude, latitude]]
Returns:
Centroid of X
"""
return np.mean(X, axis=0)
def rgiration_at_k(X, k=None, ignore_weigths=False):
"""
Calculates the radius of gyration at k.
Parameters:
ignore_weigths -- True if the visitation count at a destination
should not be used as weight in final R-gyration calculation.
X -- A pandas dataframe with columns [[longitude, latitude, count]]
:param k -- How many of the most visited destinations to consider.
Returns:
Integer representing the radius of giraiton @k. If k is not
specified it returns the total radius of giration. If the number of
destinations in X is smaller than k np.NaN is returned.
"""
# Check that k is admissibile
if k == None:
k = X.shape[0]
elif (k == 0) or (k > X.shape[0]):
return np.nan
# Convert to numpy array and sort
X = X.values
X = X[X[:, 2].argsort()][::-1]
X = X[:k]
# In case the ignore weights parameter is set.
if ignore_weigths:
X[:, 2] = 1
counts = X[:, 2]
difference_squared = euclidean_distances(X[:, :2], calculate_centroid(X[:, :2])\
.reshape(1, -1), squared=True)
return float(degrees2meters((np.dot(counts.T, difference_squared) / sum(counts))**0.5))
def process_user(df_current, i, current_user, n_users, min_stay, roam_dist, print_output=None):
"""
Helper function to extract the stop locations of one user out of many users.
Note: Required to parallel process data in process_data function.
Parameters:
df_current -- Pandas dataframe [[user_id, timestamp, latitude, longitude]].
Note: Columns "user_id" and "timestamp" are set as the index
i -- Incremental counter
current_user -- user_id of user being processed
n_users -- Total number of users
min_stay -- Minimum stay duration in minutes
roam_dist -- Maximum roaming distance in meters
print_output -- String to indicate whether to print output. ["yes", "notebook"]
Returns:
df_user_stops -- Pandas dataframe of stop locations structured like
[[timestamp, latitude, longitude, t_start, t_end]]
"""
# Print progress only occasionally.
if ((i+1) % (n_users//50)) == 0:
if print_output == "yes":
print("Processing user {} of {}.".format(i + 1, n_users))
elif print_output == "notebook":
clear_output(wait=True)
print("Processing user {} of {}.".format(i + 1, n_users))
df_user_stops = get_stop_location(
df_current, min_stay_duration=min_stay, roaming_distance=roam_dist)
df_user_stops["user_id"] = current_user
return df_user_stops
def process_data(df, roam_dist, min_stay, n_jobs=1, print_output=None):
"""
Process entire dataset to extract stop locations.
Note: for parallel processing set n_jobs > 1.
Parameters:
df -- Pandas dataframe [[user_id, timestamp, latitude, longitude]].
Note: Columns "user_id" and "timestamp" are set as the index
n_users -- Total number of users
min_stay -- Minimum stay duration in minutes
roam_dist -- Maximum roaming distance in meters
print_output -- String to indicate whether to print output.
["yes" "notebook"]
Returns:
df_user_stops -- Pandas dataframe of stop locations structured like
[[timestamp, latitude, longitude, t_start, t_end]]
"""
userids = list(df.index.levels[0])
n_users = len(userids)
# Get stops for each user
df_stops = Parallel(n_jobs=n_jobs)\
(delayed(process_user)\
(np.array(df.loc[userids[i], ["latitude", "longitude"]].reset_index()),
i, userids[i], n_users, min_stay, roam_dist, print_output)
for i in range(n_users))
return df_stops
def cluster_stoplocations(df, method, max_d):
"""
Takes df with lon-lat coordinates and assigns each observation to a
cluster based on the clustering method and max distance choosen.
Parameters:
df -- Pandas dataframe. Must contain columns ["longitude","latitude"]
method -- clustering method. See SciPy docs for linkage methods
max_d -- spatial distance threshold to use. See SciPy docs
Returns:
df -- Pandas dataframe same as input but with added
cluster_assignment column.
"""
Z = linkage(df.loc[:, ["longitude", "latitude"]],
method=method, metric="euclidean")
df.loc[:, "cluster_assignment"] = fcluster(Z, max_d, criterion='distance')
return df
def get_clustermedoids(df):
"""
Gets the medoids for each cluster in a dataframe.
Parameters:
df -- Pandas dataframe consisting of lon-lat coordinates and cluster
assignments for each observation.
Returns:
df -- Pandas dataframe containing the medoid of each cluster
"""
medoids_idx = get_clustermedoids_indeces(
df.loc[:, ["cluster_assignment","longitude", "latitude"]])
return df.iloc[medoids_idx,:]