import os

import numpy
from scipy.spatial import distance as spdist
from sklearn import neighbors
import nibabel
import dipy.segment.clustering
import dipy.segment.metric

import neuro_atlas_features
from neuro_atlas_util import unique_rows


def find_duplicates(labeled_tracks, identical_track_path):
    """
    Exhaustive search over every pair of tracks to find exact duplicates.
    :param labeled_tracks: An iterable of (label, index, track) tuples
    :param identical_track_path: The file in which to write duplicates. The output format is tab-separated with
        fields: label_1, index_track_1, label_2, index_track_2
    :return: The number of identical pairs of tracks
    """

    numeric_label_to_label = dict()
    label_to_numeric_label = dict()

    # first compute the nearest neighbors on a small number of interpolated points to get a candidate set;
    # this is much faster than a true exhaustive search
    numeric_labels = list()
    relative_indices = list()
    interpolated_tracks = list()
    # we're just going to store these all in memory, which is probably fine
    original_tracks = list()
    short_tracks = list()
    short_labels = list()
    short_relative_indices = list()
    for label, index, track in labeled_tracks:
        if label not in label_to_numeric_label:
            label_to_numeric_label[label] = len(numeric_label_to_label)
            numeric_label_to_label[len(numeric_label_to_label)] = label
        try:
            interpolated_tracks.append(
                neuro_atlas_features.interpolate(track, num_resulting_points=3).reshape((1, -1)))
            original_tracks.append(track)
            numeric_labels.append(label_to_numeric_label[label])
            relative_indices.append(index)
        except neuro_atlas_features.TooFewPointsError:
            short_tracks.append(track)
            short_labels.append(label_to_numeric_label[label])
            short_relative_indices.append(index)

    # add information about tracks that were too short to featurize to the end
    original_tracks += short_tracks
    numeric_labels += short_labels
    relative_indices += short_relative_indices

    numeric_labels = numpy.array(numeric_labels)
    relative_indices = numpy.array(relative_indices)

    feature_vectors = numpy.vstack(interpolated_tracks)
    nn = neighbors.NearestNeighbors(n_neighbors=2)
    nn.fit(feature_vectors)

    # find any vector with an exact neighbor that is not itself
    # (the nearest neighbor of a training vector is the vector itself, at distance 0)
    distances, indices = nn.kneighbors(feature_vectors, n_neighbors=2)
    candidates = feature_vectors[distances[:, 1] == 0]

    # expand k until we find all exact neighbors of every vector with at least one exact neighbor
    for k in range(3, len(numeric_labels)):
        distances, indices = nn.kneighbors(candidates, n_neighbors=k)
        if numpy.count_nonzero(distances[:, k - 1] == 0) == 0:
            break

    candidate_pairs = list()
    indicator_zero_distance = distances == 0
    for index in range(distances.shape[0]):
        # skip the first zero-distance neighbor, which is the candidate itself
        indices_zero_distance = indices[index, indicator_zero_distance[index, :]][1:]
        for index_zero in indices_zero_distance:
            if indices[index, 0] < index_zero:
                candidate_pairs.append((indices[index, 0], index_zero))
            else:
                candidate_pairs.append((index_zero, indices[index, 0]))

    # tracks that were too short to featurize must all be compared against each other directly
    for index_1 in range(len(short_tracks)):
        for index_2 in range(index_1 + 1, len(short_tracks)):
            candidate_pairs.append((index_1 + feature_vectors.shape[0], index_2 + feature_vectors.shape[0]))

    candidate_pairs = unique_rows(numpy.array(candidate_pairs))

    # verify the candidates on the original (uninterpolated) tracks
    exact_pairs = list()
    for index_pair in range(candidate_pairs.shape[0]):
        if numpy.array_equal(
                original_tracks[candidate_pairs[index_pair, 0]],
                original_tracks[candidate_pairs[index_pair, 1]]):
            exact_pairs.append(candidate_pairs[index_pair, :])

    exact_pairs = [
        ((numeric_label_to_label[numeric_labels[p[0]]], relative_indices[p[0]]),
         (numeric_label_to_label[numeric_labels[p[1]]], relative_indices[p[1]]))
        for p in exact_pairs]

    num_identical = len(exact_pairs)
    with open(identical_track_path, 'w') as identical_file:
        for pair in exact_pairs:
            identical_file.write('{0}\t{1}\t{2}\t{3}\n'.format(pair[0][0], pair[0][1], pair[1][0], pair[1][1]))

    return num_identical
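

# A minimal usage sketch (illustrative only; the labels, tracks, and file name below are
# hypothetical, and the real project presumably builds the iterable from its own track reader):
#
#     labeled = [('left_arcuate', 0, track_0), ('left_arcuate', 1, track_1), ...]
#     num_identical = find_duplicates(labeled, 'identical_tracks.txt')
#     print('{0} identical pairs written'.format(num_identical))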


def label_counts(labeled_tracks):
    """
    Counts how many tracks have each label.
    :param labeled_tracks: An iterable of (label, index, track) tuples
    :return: A dictionary mapping each label to the number of tracks with that label
    """

    counts = dict()
    for label, index, track in labeled_tracks:
        if label not in counts:
            counts[label] = 1
        else:
            counts[label] += 1
    return counts
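

# For example (hypothetical data), this behaves like collections.Counter applied to the
# first element of each tuple:
#
#     tracks = [('cst', 0, t0), ('cst', 1, t1), ('arcuate', 0, t2)]
#     label_counts(tracks)  # {'cst': 2, 'arcuate': 1}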


def make_atlas(labeled_tracks, output_directory):
    """
    Writes one NIfTI image per label to output_directory. Each voxel's value is the count of
    (rounded) track points landing in that voxel, normalized by the number of tracks with that label.
    :param labeled_tracks: An iterable of (label, index, track) tuples
    :param output_directory: The directory in which to write the per-label images
    """

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    label_to_voxel_counts = dict()
    label_to_total = dict()
    for label, index, track in labeled_tracks:
        if label not in label_to_voxel_counts:
            label_to_voxel_counts[label] = dict()
            label_to_total[label] = 0
        voxel_counts = label_to_voxel_counts[label]
        label_to_total[label] += 1
        # round each point to the nearest voxel
        voxels = numpy.round(track).astype(int)
        for index_point in range(voxels.shape[0]):
            point_tuple = tuple(voxels[index_point, :])
            if point_tuple not in voxel_counts:
                voxel_counts[point_tuple] = 1
            else:
                voxel_counts[point_tuple] += 1

    # this is the affine transform in HCP-MMP1.nii from dsi studio
    affine = numpy.array([
        [1., 0., 0., -98.],
        [0., 1., 0., -134.],
        [0., 0., 1., -72.],
        [0., 0., 0., 1.]])

    # likewise, this shape comes from HCP-MMP1.nii
    shape = (197, 233, 189)

    for label, total in label_to_total.items():
        voxel_counts = label_to_voxel_counts[label]
        label_atlas = numpy.zeros(shape)
        for voxel, count in voxel_counts.items():
            label_atlas[voxel[0], voxel[1], voxel[2]] = float(count) / total
        atlas_img = nibabel.Nifti1Image(label_atlas, affine)
        nibabel.save(atlas_img, os.path.join(output_directory, label + '.nii'))
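

# Sketch of reading one of the resulting images back (the directory and label name here are
# made up; actual names depend on what appears in labeled_tracks):
#
#     img = nibabel.load(os.path.join('atlas_out', 'left_arcuate.nii'))
#     density = img.get_fdata()  # values are normalized voxel visit counts
#     print(density.shape)       # (197, 233, 189)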


def distance_investigate(numeric_labels, feature_vectors, metric=None):
    """
    Tool to find, for each label, the mean distance to the furthest fiber with the same label and the
    mean distance to the closest fiber with a different label.
    :param numeric_labels: The numeric labels assigned to each fiber
    :param feature_vectors: The points in the track, vectorized
    :param metric: What metric to use for computing distance
    :return: A dictionary mapping numeric labels to a tuple of
        (mean distance to furthest same, mean distance to closest different)
    """

    # first find the furthest fiber with the same label
    unique_labels = numpy.unique(numeric_labels)
    mean_furthest_same = numpy.full(unique_labels.shape, numpy.nan)
    for index_label, label in enumerate(unique_labels):
        print('Finding mean furthest same for {0} of {1} ({2:.2%})'.format(
            index_label, len(unique_labels), float(index_label) / len(unique_labels)))
        current_set = feature_vectors[label == numeric_labels]
        if metric is not None:
            same_distances = spdist.pdist(current_set, metric=metric)
        else:
            same_distances = spdist.pdist(current_set)
        mean_furthest_same[index_label] = numpy.mean(numpy.max(spdist.squareform(same_distances), axis=1))

    knn = neighbors.KNeighborsClassifier(metric=metric, n_neighbors=5000) if metric is not None else \
        neighbors.KNeighborsClassifier(n_neighbors=5000)

    print('Fitting ... ')
    knn.fit(feature_vectors, numeric_labels)

    closest_different = numpy.full(feature_vectors.shape[0], numpy.nan)

    def __update_closest(closest, k, is_same):
        distances, indices = knn.kneighbors(feature_vectors[numpy.isnan(closest)], n_neighbors=k)
        labels = numeric_labels[indices]
        current_closest = numpy.full(indices.shape[0], numpy.nan)
        for index in range(labels.shape[0]):
            # column 0 is (ordinarily) the query point itself, so positions in the [1:] slice
            # are offset by one relative to the columns of distances
            if is_same:
                matching_indices = numpy.nonzero(labels[index, 1:] == labels[index, 0])[0]
            else:
                matching_indices = numpy.nonzero(labels[index, 1:] != labels[index, 0])[0]
            if len(matching_indices) > 0:
                current_closest[index] = distances[index, matching_indices[0] + 1]
        closest[numpy.isnan(closest)] = current_closest
        return numpy.count_nonzero(numpy.isnan(closest))

    # repeatedly double k until a closest different-label neighbor has been found for every fiber
    current_k = 2
    print('Checking at k={0}'.format(current_k))
    num_different_not_found = __update_closest(closest_different, current_k, False)
    while num_different_not_found > 0 and current_k < feature_vectors.shape[0] - 1:
        current_k = min(current_k * 2, feature_vectors.shape[0] - 1)
        print('Still need to find closest different neighbors for {0}'.format(num_different_not_found))
        print('Checking at k={0}'.format(current_k))
        num_different_not_found = __update_closest(closest_different, current_k, False)
    if num_different_not_found > 0:
        num_different_not_found = __update_closest(closest_different, feature_vectors.shape[0], False)
    print('Are we done? {0}'.format(num_different_not_found == 0))

    label_to_mean_closest_same_different = dict()
    for index_label, numeric_label in enumerate(unique_labels):
        mean_distance_to_closest_different = numpy.mean(closest_different[numeric_label == numeric_labels])
        label_to_mean_closest_same_different[numeric_label] = (
            mean_furthest_same[index_label], mean_distance_to_closest_different)
    return label_to_mean_closest_same_different
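

# Illustrative reading of the result (the variable name is hypothetical): if, for some label,
# the mean furthest same-label distance exceeds the mean closest different-label distance, fibers
# with that label are not cleanly separable from other labels by this distance alone:
#
#     result = distance_investigate(numeric_labels, feature_vectors)
#     for numeric_label, (furthest_same, closest_different) in sorted(result.items()):
#         print('{0}: furthest same {1:.2f}, closest different {2:.2f}'.format(
#             numeric_label, furthest_same, closest_different))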


class SupervisedQuickBundles:
    """
    You can imagine two ways of doing this.
    1) Since we have labels, just take the centroid of the feature vectors with a given label, then run
       nearest neighbor classification using the resulting centroids as the training data.
    2) Run the unsupervised QuickBundles algorithm, assign a label to each cluster by majority vote, and
       use the labeled cluster centroids as the training data.
    When unsupervised_thresh is None this does (1); when unsupervised_thresh is not None, this does (2).
    Either way, the classifier is simply nearest neighbor with k=1, trained on the centroids.
    """

    def __init__(self, unsupervised_thresh):
        self.__unsupervised_thresh = unsupervised_thresh
        self.__centroids = None
        self.__centroid_labels = None

    def get_params(self, deep=False):
        return {'unsupervised_thresh': self.__unsupervised_thresh}

    def fit(self, X, y):
        if self.__unsupervised_thresh is not None:
            # the user can pass in a negative value to let us decide
            if self.__unsupervised_thresh <= 0:
                self.__unsupervised_thresh = 20
            qb = dipy.segment.clustering.QuickBundles(
                threshold=self.__unsupervised_thresh,
                metric=dipy.segment.metric.AveragePointwiseEuclideanMetric())
            # un-vectorize each row of X back into an (n_points, 3) track for clustering
            clusters = qb.cluster(numpy.vsplit(numpy.reshape(X, (-1, 3)), X.shape[0]))
            cluster_labels = list()
            for cluster in clusters:
                current_labels, current_label_counts = numpy.unique(y[cluster.indices], return_counts=True)
                cluster_labels.append(current_labels[numpy.argmax(current_label_counts)])
            self.__centroids = [cluster.centroid for cluster in clusters]
            self.__centroid_labels = numpy.array(cluster_labels)
        else:
            unique_labels = numpy.unique(y)
            centroids = numpy.full((len(unique_labels), X.shape[1]), numpy.nan)
            for index_label, unique_label in enumerate(unique_labels):
                centroids[index_label] = numpy.mean(X[y == unique_label], axis=0)
            self.__centroids = numpy.vsplit(centroids.reshape((-1, 3)), centroids.shape[0])
            self.__centroid_labels = unique_labels
        return self

    def score(self, X, y):
        closest_clusters = numpy.argmin(dipy.segment.metric.distance_matrix(
            dipy.segment.metric.AveragePointwiseEuclideanMetric(),
            numpy.vsplit(numpy.reshape(X, (-1, 3)), X.shape[0]),
            self.__centroids), axis=1)
        return numpy.sum(self.__centroid_labels[closest_clusters] == y) / float(len(y))
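

# A rough usage sketch (not from the original source; the data names are illustrative).
# Because the class exposes get_params, fit, and score, it should plug into scikit-learn's
# cross-validation, with X the vectorized tracks and y the numeric labels:
#
#     from sklearn.model_selection import cross_val_score
#     estimator = SupervisedQuickBundles(unsupervised_thresh=None)  # approach (1)
#     scores = cross_val_score(estimator, feature_vectors, numeric_labels, cv=5)
#     print(scores.mean())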