-
Notifications
You must be signed in to change notification settings - Fork 313
/
cs120_lab4_pca.py
1329 lines (1030 loc) · 70.7 KB
/
cs120_lab4_pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Databricks notebook source exported at Sat, 30 Jul 2016 02:42:14 UTC
# MAGIC %md
# MAGIC <a rel="license" href="http://creativecommons.org/licenses/by-nc-nd/4.0/"> <img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-nd/4.0/88x31.png"/> </a> <br/> This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-nd/4.0/"> Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. </a>
# COMMAND ----------
# MAGIC %md
# MAGIC ![ML Logo](http://spark-mooc.github.io/web-assets/images/CS190.1x_Banner_300.png)
# MAGIC # Principal Component Analysis Lab
# MAGIC
# MAGIC This lab delves into exploratory analysis of neuroscience data, specifically using principal component analysis (PCA) and feature-based aggregation. We will use a dataset of light-sheet imaging recorded by the [Ahrens Lab](http://www.janelia.org/lab/ahrens-lab) at Janelia Research Campus.
# MAGIC
# MAGIC Our dataset is generated by studying the movement of a larval [zebrafish](http://en.wikipedia.org/wiki/Zebrafish), an animal that is especially useful in neuroscience because it is transparent, making it possible to record activity over its entire brain using a technique called [light-sheet microscopy](http://en.wikipedia.org/wiki/Light_sheet_fluorescence_microscopy). Specifically, we'll work with time-varying images containing patterns of the zebrafish's neural activity as it is presented with a moving visual pattern. Different stimuli induce different patterns across the brain, and we can use exploratory analyses to identify these patterns. Read ["Mapping brain activity at scale with cluster computing"](http://thefreemanlab.com/work/papers/freeman-2014-nature-methods.pdf) for more information about these kinds of analyses.
# MAGIC
# MAGIC During this lab you will learn about PCA, and then compare and contrast different exploratory analyses of the same data set to identify which neural patterns they best highlight.
# MAGIC
# MAGIC ## This lab will cover:
# MAGIC
# MAGIC + *Part 1:* Work through the steps of PCA on a sample dataset
# MAGIC + *Visualization 1:* Two-dimensional Gaussians
# MAGIC
# MAGIC + *Part 2:* Write a PCA function and evaluate PCA on sample datasets
# MAGIC + *Visualization 2:* PCA projection
# MAGIC + *Visualization 3:* Three-dimensional data
# MAGIC + *Visualization 4:* 2D representation of 3D data
# MAGIC
# MAGIC + *Part 3:* Parse, inspect, and preprocess neuroscience data then perform PCA
# MAGIC + *Visualization 5:* Pixel intensity
# MAGIC + *Visualization 6:* Normalized data
# MAGIC + *Visualization 7:* Top two components as images
# MAGIC + *Visualization 8:* Top two components as one image
# MAGIC
# MAGIC + *Part 4:* Perform feature-based aggregation followed by PCA
# MAGIC + *Visualization 9:* Top two components by time
# MAGIC + *Visualization 10:* Top two components by direction
# MAGIC
# MAGIC Note that, for reference, you can look up the details of the relevant Spark methods in [Spark's Python API](https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD) and the relevant NumPy methods in the [NumPy Reference](http://docs.scipy.org/doc/numpy/reference/index.html)
# COMMAND ----------
# Version identifier string for this lab notebook.
labVersion = 'cs120.1x-lab4-1.0.5'
# COMMAND ----------
# MAGIC %md
# MAGIC ## Part 1: Work through the steps of PCA on a sample dataset
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 1: Two-dimensional Gaussians
# MAGIC
# MAGIC Principal Component Analysis, or PCA, is a strategy for dimensionality reduction. To better understand PCA, we'll work with synthetic data generated by sampling from the [two-dimensional Gaussian distribution](http://en.wikipedia.org/wiki/Multivariate_normal_distribution). This distribution takes as input the mean and variance of each dimension, as well as the covariance between the two dimensions.
# MAGIC
# MAGIC In our visualizations below, we will specify the mean of each dimension to be 50 and the variance along each dimension to be 1. We will explore two different values for the covariance: 0 and 0.9. When the covariance is zero, the two dimensions are uncorrelated, and hence the data looks spherical. In contrast, when the covariance is 0.9, the two dimensions are strongly (positively) correlated and thus the data is non-spherical. As we'll see in Parts 1 and 2, the non-spherical data is amenable to dimensionality reduction via PCA, while the spherical data is not.
# COMMAND ----------
import matplotlib.pyplot as plt
import numpy as np
def prepare_plot(xticks, yticks, figsize=(10.5, 6), hide_labels=False, grid_color='#999999',
                 grid_width=1.0):
    """Create a blank, lightly styled figure to draw a plot on.

    Closes the current pyplot figure, then builds a new white figure whose axes
    use the given tick positions, gray tick labels, a solid grid, and no visible
    spines.

    Args:
        xticks: Tick positions for the x axis.
        yticks: Tick positions for the y axis.
        figsize (tuple): Figure size forwarded to `plt.subplots`.
        hide_labels (bool): If True, the tick labels of both axes are blanked.
        grid_color (str): Color of the grid lines.
        grid_width (float): Line width of the grid lines.

    Returns:
        tuple: The `(fig, ax)` pair created by `plt.subplots`.
    """
    plt.close()
    fig, ax = plt.subplots(figsize=figsize, facecolor='white', edgecolor='white')
    ax.axes.tick_params(labelcolor='#999999', labelsize='10')
    for axis, ticks in [(ax.get_xaxis(), xticks), (ax.get_yaxis(), yticks)]:
        axis.set_ticks_position('none')
        axis.set_ticks(ticks)
        axis.label.set_color('#999999')
        if hide_labels:
            axis.set_ticklabels([])
    plt.grid(color=grid_color, linewidth=grid_width, linestyle='-')
    # Use an explicit loop instead of map(): map() is lazy on Python 3, where the
    # spines would silently stay visible, and a loop is clearer for side effects.
    for position in ('bottom', 'top', 'left', 'right'):
        ax.spines[position].set_visible(False)
    return fig, ax
def create_2D_gaussian(mn, variance, cov, n):
    """Draw `n` reproducible samples from a symmetric two-dimensional Gaussian.

    Both dimensions share the mean `mn` and the variance `variance`; `cov` is the
    covariance between them. NumPy's global random seed is reset to 142 on every
    call, so identical arguments always yield identical samples.
    """
    np.random.seed(142)
    center = np.array([mn, mn])
    covariance = np.array([[variance, cov], [cov, variance]])
    return np.random.multivariate_normal(center, covariance, n)
# COMMAND ----------
# Draw 100 samples whose two dimensions have zero covariance.
data_random = create_2D_gaussian(mn=50, variance=1, cov=0, n=100)

# Build the figure layout, label the axes, and scatter the samples.
fig, ax = prepare_plot(np.arange(46, 55, 2), np.arange(46, 55, 2))
ax.set_xlabel(r'Simulated $x_1$ values')
ax.set_ylabel(r'Simulated $x_2$ values')
ax.set_xlim(45, 54.5)
ax.set_ylim(45, 54.5)
plt.scatter(
    data_random[:, 0],
    data_random[:, 1],
    s=14 ** 2,
    c='#d6ebf2',
    edgecolors='#8cbfd0',
    alpha=0.75
)
display(fig)
# COMMAND ----------
# Draw 100 samples whose two dimensions have covariance 0.9.
data_correlated = create_2D_gaussian(mn=50, variance=1, cov=.9, n=100)

# Build the figure layout, label the axes, and scatter the samples.
fig, ax = prepare_plot(np.arange(46, 55, 2), np.arange(46, 55, 2))
ax.set_xlabel(r'Simulated $x_1$ values')
ax.set_ylabel(r'Simulated $x_2$ values')
ax.set_xlim(45.5, 54.5)
ax.set_ylim(45.5, 54.5)
plt.scatter(
    data_correlated[:, 0],
    data_correlated[:, 1],
    s=14 ** 2,
    c='#d6ebf2',
    edgecolors='#8cbfd0',
    alpha=0.75
)
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ### (1a) Interpreting PCA
# MAGIC
# MAGIC PCA can be interpreted as identifying the "directions" along which the data vary the most. In the first step of PCA, we must first center our data. Working with our correlated dataset, first compute the mean of each feature (column) in the dataset. Then for each observation, modify the features by subtracting their corresponding mean, to create a zero mean dataset.
# MAGIC
# MAGIC > Note:
# MAGIC > * `correlated_data` is an RDD of NumPy arrays.
# MAGIC > * This allows us to perform certain operations more succinctly.
# MAGIC > * For example, we can sum the columns of our dataset using `correlated_data.sum()`.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
# Exercise (1a): compute the per-feature mean of the correlated data and
# subtract it from every observation to obtain a zero-mean dataset.
correlated_data = sc.parallelize(data_correlated)
mean_correlated = <FILL IN>
correlated_data_zero_mean = correlated_data.<FILL IN>
print mean_correlated
print correlated_data.take(1)
print correlated_data_zero_mean.take(1)
# COMMAND ----------
# TEST Interpreting PCA (1a)
# Checks the computed mean and the first centered observation against reference values.
from databricks_test_helper import Test
Test.assertTrue(np.allclose(mean_correlated, [49.95739037, 49.97180477]),
'incorrect value for mean_correlated')
Test.assertTrue(np.allclose(correlated_data_zero_mean.take(1)[0], [-0.28561917, 0.10351492]),
'incorrect value for correlated_data_zero_mean')
# COMMAND ----------
# MAGIC %md
# MAGIC ### (1b) Sample covariance matrix
# MAGIC
# MAGIC We are now ready to compute the sample covariance matrix. If we define \\(\scriptsize \mathbf{X} \in \mathbb{R}^{n \times d}\\) as the zero mean data matrix, then the sample covariance matrix is defined as: \\[ \mathbf{C}_{\mathbf X} = \frac{1}{n} \mathbf{X}^\top \mathbf{X} \,.\\]
# MAGIC
# MAGIC To compute this matrix, compute the outer product of each data point, add together these outer products, and divide by the number of data points. The data are two dimensional, so the resulting covariance matrix should be a 2x2 matrix.
# MAGIC
# MAGIC > Note:
# MAGIC > * [np.outer()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.outer.html) can be used to calculate the outer product of two NumPy arrays.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
# Exercise (1b): sum the outer product of each zero-mean point with itself and
# divide by the number of points.
# Compute the covariance matrix using outer products and correlated_data_zero_mean
correlated_cov = <FILL IN>
print correlated_cov
# COMMAND ----------
# TEST Sample covariance matrix (1b)
# Reference 2x2 sample covariance matrix for the correlated dataset.
cov_result = [[ 0.99558386, 0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(cov_result, correlated_cov), 'incorrect value for correlated_cov')
# COMMAND ----------
# MAGIC %md
# MAGIC ### (1c) Covariance Function
# MAGIC
# MAGIC Next, use the expressions above to write a function to compute the sample covariance matrix for an arbitrary `data` RDD.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
def estimate_covariance(data):
    """Compute the sample covariance matrix for a given RDD.

    Exercise template: the body below is a placeholder to be completed.

    Note:
        The covariance matrix should be calculated using outer products: center the data by
        first subtracting the mean, sum the outer product of each centered point with itself,
        and divide by the number of points.

    Args:
        data (RDD of np.ndarray): An `RDD` consisting of NumPy arrays.

    Returns:
        np.ndarray: A two-dimensional array where the number of rows and columns both equal the
            length of the arrays in the input `RDD`.
    """
    <FILL IN>
# Run the covariance function on the (uncentered) correlated data.
correlated_cov_auto= estimate_covariance(correlated_data)
print correlated_cov_auto
# COMMAND ----------
# TEST Covariance function (1c)
# First check: same reference matrix as the manual computation in (1b).
correct_cov = [[ 0.99558386, 0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(correct_cov, correlated_cov_auto),
'incorrect value for correlated_cov_auto')
# Second check: four 4-dimensional points whose columns differ only by a constant
# offset, so every entry of the covariance matrix is expected to be 20.
test_data = np.array([[0,1,2,3], [4,5,6,7], [8,9,10,11], [12,13,14,15]])
cov_test_data = sc.parallelize(test_data)
correct_test_cov = [[20., 20., 20., 20.],
[ 20., 20., 20., 20.],
[ 20., 20., 20., 20.],
[ 20., 20., 20., 20.]]
Test.assertTrue(np.allclose(correct_test_cov, estimate_covariance(cov_test_data)), 'incorrect value returned by estimate_covariance')
# COMMAND ----------
# MAGIC %md
# MAGIC ### (1d) Eigendecomposition
# MAGIC
# MAGIC Now that we've computed the sample covariance matrix, we can use it to find directions of maximal variance in the data. Specifically, we can perform an eigendecomposition of this matrix to find its eigenvalues and eigenvectors. The \\(\scriptsize d \\) eigenvectors of the covariance matrix give us the directions of maximal variance, and are often called the "principal components." The associated eigenvalues are the variances in these directions. In particular, the eigenvector corresponding to the largest eigenvalue is the direction of maximal variance (this is sometimes called the "top" eigenvector). Eigendecomposition of a \\(\scriptsize d \times d \\) covariance matrix has a (roughly) cubic runtime complexity with respect to \\(\scriptsize d \\). Whenever \\(\scriptsize d \\) is relatively small (e.g., less than a few thousand) we can quickly perform this eigendecomposition locally.
# MAGIC
# MAGIC Use a function from `numpy.linalg` called [eigh](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html) to perform the eigendecomposition. Next, sort the eigenvectors based on their corresponding eigenvalues (from high to low), yielding a matrix where the columns are the eigenvectors (and the first column is the top eigenvector). Note that [np.argsort](http://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html#numpy-argsort) can be used to obtain the indices that would sort the eigenvalues in ascending order. Finally, set the `top_component` variable equal to the top eigenvector or principal component, which is a \\(\scriptsize 2 \\)-dimensional vector (array with two values).
# MAGIC
# MAGIC > Note:
# MAGIC > * The eigenvectors returned by `eigh` appear in the columns and not the rows.
# MAGIC > * For example, the first eigenvector of `eig_vecs` would be found in the first column and could be accessed using `eig_vecs[:,0]`.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
# Exercise (1d): eigendecompose the covariance matrix and pick the eigenvector
# that belongs to the largest eigenvalue.
from numpy.linalg import eigh
# Calculate the eigenvalues and eigenvectors from correlated_cov_auto
eig_vals, eig_vecs = <FILL IN>
print 'eigenvalues: {0}'.format(eig_vals)
print '\neigenvectors: \n{0}'.format(eig_vecs)
# Use np.argsort to find the top eigenvector based on the largest eigenvalue
# (np.argsort orders ascending; eigenvectors are stored in the columns of eig_vecs)
inds = np.argsort(<FILL IN>)
top_component = <FILL IN>
print '\ntop principal component: {0}'.format(top_component)
# COMMAND ----------
# TEST Eigendecomposition (1d)
def check_basis(vectors, correct):
    """Return True if `vectors` or its negation is element-wise close to `correct`."""
    if np.allclose(vectors, correct):
        return True
    flipped = np.negative(vectors)
    return np.allclose(flipped, correct)
# check_basis also accepts the negated vector, so either orientation of the
# top component passes.
Test.assertTrue(check_basis(top_component, [0.68915649, 0.72461254]),
'incorrect value for top_component')
# COMMAND ----------
# MAGIC %md
# MAGIC ### (1e) PCA scores
# MAGIC
# MAGIC We just computed the top principal component for a 2-dimensional non-spherical dataset. Now let's use this principal component to derive a one-dimensional representation for the original data. To compute these compact representations, which are sometimes called PCA "scores", calculate the dot product between each data point in the raw data and the top principal component.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
# Exercise (1e): take the dot product of every raw data point with the top
# principal component to get its one-dimensional score.
# Use the top_component and the data from correlated_data to generate PCA scores
correlated_data_scores = <FILL IN>
print 'one-dimensional data (first three):\n{0}'.format(np.asarray(correlated_data_scores.take(3)))
# COMMAND ----------
# TEST PCA Scores (1e)
# Reference scores for the first three observations (either sign is accepted).
first_three = [70.51682806, 69.30622356, 71.13588168]
Test.assertTrue(check_basis(correlated_data_scores.take(3), first_three),
'incorrect value for correlated_data_scores')
# COMMAND ----------
# MAGIC %md
# MAGIC ## Part 2: Write a PCA function and evaluate PCA on sample datasets
# COMMAND ----------
# MAGIC %md
# MAGIC ### (2a) PCA function
# MAGIC
# MAGIC We now have all the ingredients to write a general PCA function. Instead of working with just the top principal component, our function will compute the top \\(\scriptsize k\\) principal components and principal scores for a given dataset. The top \\(\scriptsize k\\) principal components should be returned in descending order when ranked by their corresponding eigenvalues. Write this general function `pca`, and run it with `correlated_data` and \\(\scriptsize k = 2\\). Hint: Use results from Part (1c), Part (1d), and Part (1e).
# MAGIC
# MAGIC Note: As discussed in lecture, our implementation is a reasonable strategy when \\(\scriptsize d \\) is small, though more efficient distributed algorithms exist when \\(\scriptsize d \\) is large.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
def pca(data, k=2):
    """Computes the top `k` principal components, corresponding scores, and all eigenvalues.

    Exercise template: the body below consists of placeholders to be completed.

    Note:
        All eigenvalues should be returned in sorted order (largest to smallest). `eigh` returns
        each eigenvector as a column. This function should also return eigenvectors as columns.

    Args:
        data (RDD of np.ndarray): An `RDD` consisting of NumPy arrays.
        k (int): The number of principal components to return.

    Returns:
        tuple of (np.ndarray, RDD of np.ndarray, np.ndarray): A tuple of (eigenvectors, `RDD` of
            scores, eigenvalues). Eigenvectors is a two-dimensional array where the number of
            rows equals the length of the arrays in the input `RDD` and the number of columns equals
            `k`. The `RDD` of scores has the same number of rows as `data` and consists of arrays
            of length `k`. Eigenvalues is an array of length d (the number of features).
    """
    <FILL IN>
    # Return the `k` principal components, `k` scores, and all eigenvalues
    <FILL IN>
# Run pca on correlated_data with k = 2
top_components_correlated, correlated_data_scores_auto, eigenvalues_correlated = <FILL IN>
# Note that the 1st principal component is in the first column
print 'top_components_correlated: \n{0}'.format(top_components_correlated)
print ('\ncorrelated_data_scores_auto (first three): \n{0}'
.format('\n'.join(map(str, correlated_data_scores_auto.take(3)))))
print '\neigenvalues_correlated: \n{0}'.format(eigenvalues_correlated)
# Create a higher dimensional test set
# (five 4-dimensional points: [0..3], [4..7], ..., [16..19]) and keep k = 3 components
pca_test_data = sc.parallelize([np.arange(x, x + 4) for x in np.arange(0, 20, 4)])
components_test, test_scores, eigenvalues_test = pca(pca_test_data, 3)
print '\npca_test_data: \n{0}'.format(np.array(pca_test_data.collect()))
print '\ncomponents_test: \n{0}'.format(components_test)
print ('\ntest_scores (first three): \n{0}'
.format('\n'.join(map(str, test_scores.take(3)))))
print '\neigenvalues_test: \n{0}'.format(eigenvalues_test)
# COMMAND ----------
# TEST PCA Function (2a)
# Components are compared row-wise after transposing; check_basis tolerates a global sign flip.
Test.assertTrue(check_basis(top_components_correlated.T,
[[0.68915649, 0.72461254], [-0.72461254, 0.68915649]]),
'incorrect value for top_components_correlated')
# Scores are compared by absolute value, so their sign does not matter.
first_three_correlated = [[70.51682806, 69.30622356, 71.13588168], [1.48305648, 1.5888655, 1.86710679]]
Test.assertTrue(np.allclose(first_three_correlated,
np.vstack(np.abs(correlated_data_scores_auto.take(3))).T),
'incorrect value for first three correlated values')
Test.assertTrue(np.allclose(eigenvalues_correlated, [1.94345403, 0.13820481]),
'incorrect values for eigenvalues_correlated')
# Repeat with k = 1: one component and one score per point, but still all eigenvalues.
top_components_correlated_k1, correlated_data_scores_k1, eigenvalues_correlated_k1 = pca(correlated_data, 1)
Test.assertTrue(check_basis(top_components_correlated_k1.T, [0.68915649, 0.72461254]),
'incorrect value for components when k=1')
Test.assertTrue(np.allclose([70.51682806, 69.30622356, 71.13588168],
np.vstack(np.abs(correlated_data_scores_k1.take(3))).T),
'incorrect value for scores when k=1')
Test.assertTrue(np.allclose(eigenvalues_correlated_k1, [1.94345403, 0.13820481]),
'incorrect values for eigenvalues when k=1')
# Checks on the 4-dimensional test set created in the previous cell.
Test.assertTrue(check_basis(components_test.T[0], [.5, .5, .5, .5]),
'incorrect value for components_test')
Test.assertTrue(np.allclose(np.abs(test_scores.first()[0]), 3.),
'incorrect value for test_scores')
Test.assertTrue(np.allclose(eigenvalues_test, [128, 0, 0, 0]), 'incorrect value for eigenvalues_test')
# COMMAND ----------
# MAGIC %md
# MAGIC ### (2b) PCA on `data_random`
# MAGIC
# MAGIC Next, use the PCA function we just developed to find the top two principal components of the spherical `data_random` we created in Visualization 1.
# MAGIC
# MAGIC First, we need to convert `data_random` to the RDD `random_data_rdd`, and do all subsequent operations on `random_data_rdd`.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
# Exercise (2b): wrap the spherical dataset in an RDD and run pca on that RDD.
random_data_rdd = sc.parallelize(data_random)
# Use pca on random_data_rdd
top_components_random, random_data_scores_auto, eigenvalues_random = <FILL IN>
print 'top_components_random: \n{0}'.format(top_components_random)
print ('\nrandom_data_scores_auto (first three): \n{0}'
.format('\n'.join(map(str, random_data_scores_auto.take(3)))))
print '\neigenvalues_random: \n{0}'.format(eigenvalues_random)
# COMMAND ----------
# TEST PCA on `data_random` (2b)
# Components tolerate a global sign flip; scores are compared by absolute value.
Test.assertTrue(check_basis(top_components_random.T,
[[-0.2522559 , 0.96766056], [-0.96766056, -0.2522559]]),
'incorrect value for top_components_random')
first_three_random = [[36.61068572, 35.97314295, 35.59836628],
[61.3489929 , 62.08813671, 60.61390415]]
Test.assertTrue(np.allclose(first_three_random, np.vstack(np.abs(random_data_scores_auto.take(3))).T),
'incorrect value for random_data_scores_auto')
Test.assertTrue(np.allclose(eigenvalues_random, [1.4204546, 0.99521397]),
'incorrect value for eigenvalues_random')
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 2: PCA projection
# MAGIC
# MAGIC Plot the original data and the 1-dimensional reconstruction using the top principal component to see how the PCA solution looks. The original data is plotted as before; however, the 1-dimensional reconstruction (projection) is plotted in green on top of the original data and the vectors (lines) representing the two principal components are shown as dotted lines.
# COMMAND ----------
def project_points_and_get_lines(data, components, x_range):
    """Project the data onto the first component and build line segments for two components.

    Args:
        data: RDD of NumPy arrays; only its `mean`, `map` and `collect` methods are used.
        components (np.ndarray): Matrix whose columns are the principal components; the first
            two columns and first two rows are used.
        x_range: Half-width, along the first axis, of each returned line segment.

    Returns:
        tuple: `((x1, x2), (line1_x1, line1_x2), (line2_x1, line2_x2))` where `x1`/`x2` are the
            coordinates of the points reconstructed from the first component and each line is
            a pair of `[start, end]` coordinate lists centered on the data mean.
    """
    first_pc = components[:, 0]
    # Slope (rise over run) of each of the first two components in the x1-x2 plane.
    slope_first, slope_second = components[1, :2] / components[0, :2]
    center = data.mean()[:2]

    def reconstruct(point):
        # Center, project onto the first component, then shift back to the data mean.
        shifted = point - center
        coefficient = shifted.dot(first_pc) / first_pc.dot(first_pc)
        return coefficient * first_pc + center

    x1, x2 = zip(*data.map(reconstruct).collect())

    def segment(slope):
        # Segment through the mean extending `x_range` to either side along the first axis.
        offset = np.asarray([x_range, x_range * slope])
        start_x1, start_x2 = center - offset
        end_x1, end_x2 = center + offset
        return [start_x1, end_x1], [start_x2, end_x2]

    return (x1, x2), segment(slope_first), segment(slope_second)
# COMMAND ----------
# Reconstruct the correlated data from its first component and get the component lines.
(x1, x2), (line1X1, line1X2), (line2X1, line2X2) = project_points_and_get_lines(
    correlated_data, top_components_correlated, 5)

# Build the layout, then draw the component lines, the original points, and the projection.
fig, ax = prepare_plot(np.arange(46, 55, 2), np.arange(46, 55, 2), figsize=(7, 7))
ax.set_xlabel(r'Simulated $x_1$ values')
ax.set_ylabel(r'Simulated $x_2$ values')
ax.set_xlim(45.5, 54.5)
ax.set_ylim(45.5, 54.5)
plt.plot(line1X1, line1X2, linewidth=3.0, c='#8cbfd0', linestyle='--')
plt.plot(line2X1, line2X2, linewidth=3.0, c='#d6ebf2', linestyle='--')
plt.scatter(
    data_correlated[:, 0],
    data_correlated[:, 1],
    s=14 ** 2,
    c='#d6ebf2',
    edgecolors='#8cbfd0',
    alpha=0.75
)
plt.scatter(x1, x2, s=14 ** 2, c='#62c162', alpha=.75)
display(fig)
# COMMAND ----------
# Reconstruct the spherical data from its first component and get the component lines.
(x1, x2), (line1X1, line1X2), (line2X1, line2X2) = project_points_and_get_lines(
    random_data_rdd, top_components_random, 5)

# Build the layout, then draw the component lines, the original points, and the projection.
fig, ax = prepare_plot(np.arange(46, 55, 2), np.arange(46, 55, 2), figsize=(7, 7))
ax.set_xlabel(r'Simulated $x_1$ values')
ax.set_ylabel(r'Simulated $x_2$ values')
ax.set_xlim(45.5, 54.5)
ax.set_ylim(45.5, 54.5)
plt.plot(line1X1, line1X2, linewidth=3.0, c='#8cbfd0', linestyle='--')
plt.plot(line2X1, line2X2, linewidth=3.0, c='#d6ebf2', linestyle='--')
plt.scatter(
    data_random[:, 0],
    data_random[:, 1],
    s=14 ** 2,
    c='#d6ebf2',
    edgecolors='#8cbfd0',
    alpha=0.75
)
plt.scatter(x1, x2, s=14 ** 2, c='#62c162', alpha=.75)
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 3: Three-dimensional data
# MAGIC
# MAGIC So far we have worked with two-dimensional data. Now let's generate three-dimensional data with highly correlated features. As in Visualization 1, we'll create samples from a multivariate Gaussian distribution, which in three dimensions requires us to specify three means, three variances, and three covariances.
# MAGIC
# MAGIC In the 3D graphs below, we have included the 2D plane that corresponds to the top two principal components, i.e. the plane with the smallest Euclidean distance between the points and itself. Notice that the data points, despite living in three dimensions, are found near a two-dimensional plane: the left graph shows how most points are close to the plane when it is viewed from its side, while the right graph shows that the plane covers most of the variance in the data. Note that darker blues correspond to points with higher values for the third dimension.
# COMMAND ----------
# Generate 100 correlated three-dimensional Gaussian samples and show them, together
# with a fixed plane, in two 3D scatter plots viewed from different angles.
# NOTE(review): Axes3D is never referenced by name below; presumably it is imported
# so that projection='3d' is available -- confirm.
from mpl_toolkits.mplot3d import Axes3D
m = 100
mu = np.array([50, 50, 50])
# Pairwise coefficients r and per-dimension sigmas: each off-diagonal covariance entry
# below is r_ij * sigma_i * sigma_j and each diagonal entry is sigma_i ** 2.
r1_2 = 0.9
r1_3 = 0.7
r2_3 = 0.1
sigma1 = 5
sigma2 = 20
sigma3 = 20
c = np.array([[sigma1 ** 2, r1_2 * sigma1 * sigma2, r1_3 * sigma1 * sigma3],
[r1_2 * sigma1 * sigma2, sigma2 ** 2, r2_3 * sigma2 * sigma3],
[r1_3 * sigma1 * sigma3, r2_3 * sigma2 * sigma3, sigma3 ** 2]])
# Fixed seed so the sampled dataset is reproducible.
np.random.seed(142)
data_threeD = np.random.multivariate_normal(mu, c, m)
# ListedColormap is imported here but not used in this cell.
from matplotlib.colors import ListedColormap, Normalize
from matplotlib.cm import get_cmap
norm = Normalize()
cmap = get_cmap("Blues")
# Color each point by its normalized third coordinate; keep only the first three color channels.
clrs = cmap(np.array(norm(data_threeD[:,2])))[:,0:3]
fig = plt.figure(figsize=(11, 6))
# Left subplot.
ax = fig.add_subplot(121, projection='3d')
ax.azim=-100
ax.scatter(data_threeD[:,0], data_threeD[:,1], data_threeD[:,2], c=clrs, s=14**2)
xx, yy = np.meshgrid(np.arange(-15, 10, 1), np.arange(-50, 30, 1))
# Hard-coded plane normal; presumably the direction orthogonal to the top two
# principal components of this dataset -- TODO confirm.
normal = np.array([0.96981815, -0.188338, -0.15485978])
# Solve normal . (x, y, z) = 0 for z, then shift the plane by 50 along every axis
# (the value used for each entry of mu).
z = (-normal[0] * xx - normal[1] * yy) * 1. / normal[2]
xx = xx + 50
yy = yy + 50
z = z + 50
ax.set_zlim((-20, 120)), ax.set_ylim((-20, 100)), ax.set_xlim((30, 75))
ax.plot_surface(xx, yy, z, alpha=.10)
# Right subplot: same points and plane from a different azimuth and elevation.
ax = fig.add_subplot(122, projection='3d')
ax.azim=10
ax.elev=20
#ax.dist=8
ax.scatter(data_threeD[:,0], data_threeD[:,1], data_threeD[:,2], c=clrs, s=14**2)
ax.set_zlim((-20, 120)), ax.set_ylim((-20, 100)), ax.set_xlim((30, 75))
ax.plot_surface(xx, yy, z, alpha=.1)
plt.tight_layout()
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ### (2c) 3D to 2D
# MAGIC
# MAGIC We will now use PCA to see if we can recover the 2-dimensional plane on which the data live. Parallelize the data, and use our PCA function from above, with \\( \scriptsize k=2 \\) components.
# COMMAND ----------
# TODO: Replace <FILL IN> with appropriate code
threeD_data = sc.parallelize(data_threeD)
components_threeD, threeD_scores, eigenvalues_threeD = <FILL IN>
print 'components_threeD: \n{0}'.format(components_threeD)
print ('\nthreeD_scores (first three): \n{0}'
.format('\n'.join(map(str, threeD_scores.take(3)))))
print '\neigenvalues_threeD: \n{0}'.format(eigenvalues_threeD)
# COMMAND ----------
# TEST 3D to 2D (2c)
Test.assertEquals(components_threeD.shape, (3, 2), 'incorrect shape for components_threeD')
Test.assertTrue(np.allclose(np.sum(eigenvalues_threeD), 969.796443367),
'incorrect value for eigenvalues_threeD')
Test.assertTrue(np.allclose(np.abs(np.sum(components_threeD)), 1.77238943258),
'incorrect value for components_threeD')
Test.assertTrue(np.allclose(np.abs(np.sum(threeD_scores.take(3))), 237.782834092),
'incorrect value for threeD_scores')
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 4: 2D representation of 3D data
# MAGIC
# MAGIC See the 2D version of the data that captures most of its original structure. Note that darker blues correspond to points with higher values for the original data's third dimension.
# COMMAND ----------
# Bring the two-dimensional PCA scores back to the driver as a single array.
scores_threeD = np.asarray(threeD_scores.collect())

# Build the layout and scatter the scores, colored with the per-point colors in clrs.
fig, ax = prepare_plot(np.arange(20, 150, 20), np.arange(-40, 110, 20))
ax.set_xlabel(r'New $x_1$ values')
ax.set_ylabel(r'New $x_2$ values')
ax.set_xlim(5, 150)
ax.set_ylim(-45, 50)
plt.scatter(
    scores_threeD[:, 0],
    scores_threeD[:, 1],
    s=14 ** 2,
    c=clrs,
    edgecolors='#8cbfd0',
    alpha=0.75
)
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ### (2d) Variance explained
# MAGIC
# MAGIC Finally, let's quantify how much of the variance is being captured by PCA in each of the three synthetic datasets we've analyzed. To do this, we'll compute the fraction of retained variance by the top principal components. Recall that the eigenvalue corresponding to each principal component captures the variance along this direction. If our initial data is \\(\scriptsize d\\)-dimensional, then the total variance in our data equals: \\( \scriptsize \sum_{i=1}^d \lambda_i \\), where \\(\scriptsize \lambda_i\\) is the eigenvalue corresponding to the \\(\scriptsize i\\)th principal component. Moreover, if we use PCA with some \\(\scriptsize k < d\\), then we can compute the variance retained by these principal components by adding the top \\(\scriptsize k\\) eigenvalues. The fraction of retained variance equals the sum of the top \\(\scriptsize k\\) eigenvalues divided by the sum of all of the eigenvalues.
# COMMAND ----------
def variance_explained(data, k=1):
    """Calculate the fraction of variance explained by the top `k` eigenvectors.

    The fraction is the sum of the top `k` eigenvalues divided by the sum of all
    of the eigenvalues.

    Args:
        data (RDD of np.ndarray): An RDD that contains NumPy arrays which store the
            features for an observation.
        k (int): The number of principal components to consider.

    Returns:
        float: A number between 0 and 1 representing the fraction of variance explained
            by the top `k` eigenvectors.
    """
    # NOTE(review): relies on the Part (2a) `pca(data, k)` helper returning, as its third
    # element, ALL eigenvalues sorted in descending order (not only the top k) -- confirm
    # against that definition before relying on this.
    _, _, eigenvalues = pca(data, k)
    return np.sum(eigenvalues[:k]) / np.sum(eigenvalues)
# Fraction of retained variance for each synthetic dataset and choice of k
variance_random_1 = variance_explained(random_data_rdd, 1)
variance_random_2 = variance_explained(random_data_rdd, 2)
variance_correlated_1 = variance_explained(correlated_data, 1)
variance_correlated_2 = variance_explained(correlated_data, 2)
variance_threeD_2 = variance_explained(threeD_data, 2)

# Report each fraction as a percentage with one decimal place
print(
    'Percentage of variance explained by the first component of random_data_rdd: {0:.1f}%'
    .format(variance_random_1 * 100)
)
print(
    'Percentage of variance explained by both components of random_data_rdd: {0:.1f}%'
    .format(variance_random_2 * 100)
)
print(
    '\nPercentage of variance explained by the first component of correlated_data: {0:.1f}%'
    .format(variance_correlated_1 * 100)
)
print(
    'Percentage of variance explained by both components of correlated_data: {0:.1f}%'
    .format(variance_correlated_2 * 100)
)
print(
    '\nPercentage of variance explained by the first two components of threeD_data: {0:.1f}%'
    .format(variance_threeD_2 * 100)
)
# COMMAND ----------
# TEST Variance explained (2d)
# Each fraction is compared against a hard-coded expected value with np.allclose.
# Every failure message names the variable being checked.
Test.assertTrue(np.allclose(variance_random_1, 0.588017172066), 'incorrect value for variance_random_1')
Test.assertTrue(np.allclose(variance_correlated_1, 0.933608329586),
                'incorrect value for variance_correlated_1')
Test.assertTrue(np.allclose(variance_random_2, 1.0), 'incorrect value for variance_random_2')
Test.assertTrue(np.allclose(variance_correlated_2, 1.0), 'incorrect value for variance_correlated_2')
Test.assertTrue(np.allclose(variance_threeD_2, 0.993967356912), 'incorrect value for variance_threeD_2')
# COMMAND ----------
# MAGIC %md
# MAGIC ## Part 3: Parse, inspect, and preprocess neuroscience data then perform PCA
# COMMAND ----------
# MAGIC %md
# MAGIC ### Data introduction
# MAGIC
# MAGIC A central challenge in neuroscience is understanding the organization and function of neurons, the cells responsible for processing and representing information in the brain. New technologies make it possible to monitor the responses of large populations of neurons in awake animals. In general, neurons communicate through electrical impulses that must be recorded with electrodes, which is a challenging process. As an alternative, we can genetically engineer animals so that their neurons express special proteins that fluoresce or light up when active, and then use microscopy to record neural activity as images.
# MAGIC
# MAGIC A recently developed method called light-sheet microscopy lets us do this in a special, transparent animal, the larval zebrafish, over nearly its entire brain. The resulting data are time-varying images containing the activity of hundreds of thousands of neurons. Given the raw data, which is enormous, we want to find compact spatial and temporal patterns: Which groups of neurons are active together? What is the time course of their activity? Are those patterns specific to particular events happening during the experiment (e.g. a stimulus that we might present)? PCA is a powerful technique for finding spatial and temporal patterns in these kinds of data, and that's what we'll explore here!
# COMMAND ----------
# MAGIC %md
# MAGIC ### (3a) Load neuroscience data
# MAGIC
# MAGIC In the next sections we will use PCA to capture structure in neural datasets. Before doing the analysis, we will load and do some basic inspection of the data. The raw data are currently stored as a text file. Every line in the file contains the time series of image intensity for a single pixel in a time-varying image (i.e. a movie). The first two numbers in each line are the spatial coordinates of the pixel, and the remaining numbers are the time series. We'll use `first()` to inspect a single row, and print just the first 100 characters.
# COMMAND ----------
import os

# Build the path to the raw neuroscience text file and open it as an RDD of lines
input_file = os.path.join('databricks-datasets', 'cs190', 'data-001', 'neuro.txt')
lines = sc.textFile(input_file)

# Fetch the first record once and reuse it for both the preview and the sanity check
first_line = lines.first()
# Parenthesized single-argument print behaves the same as a statement and as a function call
print(first_line[0:100])

# Check that everything loaded properly
assert len(first_line) == 1397
assert lines.count() == 46460
# COMMAND ----------
# MAGIC %md
# MAGIC ### (3b) Parse the data
# MAGIC
# MAGIC Parse the data into a key-value representation. We want each key to be a tuple of two-dimensional spatial coordinates and each value to be a NumPy array storing the associated time series. Write a function that converts a line of text into a (`tuple`, `np.ndarray`) pair. Then apply this function to each record in the RDD, and inspect the first entry of the new parsed data set. Now would be a good time to cache the data, and force a computation by calling count, to ensure the data are cached.
# COMMAND ----------
def parse(line):
    """Parse the raw data into a (`tuple`, `np.ndarray`) pair.

    Note:
        The pixel coordinates are stored as a tuple of two ints and the elements of the
        pixel intensity time series as an np.ndarray of floats.

    Args:
        line (str): A string representing an observation. Elements are separated by
            whitespace. The first two elements represent the coordinates of the pixel,
            and the rest of the elements represent the pixel intensity over time.

    Returns:
        tuple of tuple, np.ndarray: A (coordinate, pixel intensity array) `tuple` where
            coordinate is a `tuple` containing two ints and the pixel intensity is stored
            in a NumPy array of floats (240 values for each line of the lab's data file).

    Raises:
        ValueError: If the line has fewer than two elements or an element is not numeric.
    """
    fields = line.split()
    if len(fields) < 2:
        raise ValueError(
            'expected two coordinates followed by the time series, got: {0!r}'.format(line))
    # Go through float() first so coordinates written as "3" or "3.0" both become the int 3
    coordinates = (int(float(fields[0])), int(float(fields[1])))
    intensities = np.array([float(value) for value in fields[2:]], dtype=float)
    return (coordinates, intensities)
# Parse every line and mark the parsed RDD for caching
raw_data = lines.map(parse)
raw_data.cache()
entry = raw_data.first()
# Each print takes a single parenthesized argument, so the output is the same whether
# print is a statement or a function.
print('Length of movie is {0} seconds'.format(len(entry[1])))
# count() forces a full pass over the data
print('Number of pixels in movie is {0:,}'.format(raw_data.count()))
print('\nFirst entry of raw_data (with only the first five values of the NumPy array):\n({0}, {1})'
      .format(entry[0], entry[1][:5]))
# COMMAND ----------
# TEST Parse the data (3b)
# Structure checks on the first parsed record: key is a 2-tuple of ints, value is a float array
Test.assertTrue(isinstance(entry[0], tuple), "entry's key should be a tuple")
Test.assertEquals(len(entry), 2, 'entry should have a key and a value')
Test.assertTrue(isinstance(entry[0][1], int), 'coordinate tuple should contain ints')
Test.assertEquals(len(entry[0]), 2, "entry's key should have two values")
Test.assertTrue(isinstance(entry[1], np.ndarray), "entry's value should be an np.ndarray")
# `np.float` was only an alias for the builtin `float` and has been removed from NumPy;
# checking against `float` is the same test and works on every NumPy version.
Test.assertTrue(isinstance(entry[1][0], float), 'the np.ndarray should consist of np.float values')
# Value checks against hard-coded expectations for the first record
Test.assertEquals(entry[0], (0, 0), 'incorrect key for entry')
Test.assertEquals(entry[1].size, 240, 'incorrect length of entry array')
Test.assertTrue(np.allclose(np.sum(entry[1]), 24683.5), 'incorrect values in entry array')
Test.assertTrue(raw_data.is_cached, 'raw_data is not cached')
# COMMAND ----------
# MAGIC %md
# MAGIC ### (3c) Min and max fluorescence
# MAGIC
# MAGIC Next we'll do some basic preprocessing on the data. The raw time-series data are in units of image fluorescence, and baseline fluorescence varies somewhat arbitrarily from pixel to pixel. First, compute the minimum and maximum values across all pixels.
# COMMAND ----------
# Reduce each pixel's time series to its own min/max, then reduce across all pixels
mn = raw_data.map(lambda kv: min(kv[1])).min()
mx = raw_data.map(lambda kv: max(kv[1])).max()
# '%s %s' reproduces the space-separated str() output of `print mn, mx`
print('%s %s' % (mn, mx))
# COMMAND ----------
# TEST Min and max fluorescence (3c)
Test.assertTrue(np.allclose(mn, 100.6), 'incorrect value for mn')
Test.assertTrue(np.allclose(mx, 940.8), 'incorrect value for mx')
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 5: Pixel intensity
# MAGIC
# MAGIC Let's now see how a random pixel varies in value over the course of the time series. We'll visualize a pixel that exhibits a standard deviation of over 100.
# COMMAND ----------
# Take the first time series whose standard deviation exceeds 100.
# The lambda indexes the (key, value) pair instead of unpacking it in the parameter
# list, because tuple parameters are not valid syntax on Python 3.
example = raw_data.filter(lambda kv: np.std(kv[1]) > 100).values().first()
# generate layout and plot data
fig, ax = prepare_plot(np.arange(0, 300, 50), np.arange(300, 800, 100))
ax.set_xlabel(r'time')
ax.set_ylabel(r'fluorescence')
ax.set_xlim(-20, 270)
ax.set_ylim(270, 730)
# linewidth is a number, not a string
plt.plot(range(len(example)), example, c='#8cbfd0', linewidth=3.0)
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ### (3d) Fractional signal change
# MAGIC
# MAGIC To convert from these raw fluorescence units to more intuitive units of fractional signal change, write a function that takes a time series for a particular pixel and subtracts and divides by the mean. Then apply this function to all the pixels. Confirm that this changes the maximum and minimum values.
# COMMAND ----------
def rescale(ts):
    """Take a np.ndarray and return the standardized array by subtracting and dividing by the mean.

    Note:
        The mean is subtracted first and the result is then divided by the same mean,
        i.e. the output is `(ts - mean) / mean`. A series whose mean is zero yields
        non-finite values (NumPy division by zero); no special handling is applied.

    Args:
        ts (np.ndarray): Time series data (floats) representing pixel intensity.

    Returns:
        np.ndarray: The time series adjusted by subtracting the mean and dividing by the mean.
    """
    # asarray is a no-op for float arrays and lets plain sequences be passed as well
    series = np.asarray(ts, dtype=float)
    mean = np.mean(series)
    return (series - mean) / mean
# Apply rescale to every value while keeping the keys
scaled_data = raw_data.mapValues(rescale)
# Per-pixel extreme first, then the extreme across all pixels. The pair is indexed rather
# than unpacked in the lambda's parameter list (tuple parameters are Python-2-only syntax).
mn_scaled = scaled_data.map(lambda kv: kv[1]).map(lambda v: min(v)).min()
mx_scaled = scaled_data.map(lambda kv: kv[1]).map(lambda v: max(v)).max()
# '%s %s' reproduces the space-separated str() output of `print mn_scaled, mx_scaled`
print('%s %s' % (mn_scaled, mx_scaled))
# COMMAND ----------
# TEST Fractional signal change (3d)
# The rescaled values must still be NumPy arrays
Test.assertTrue(
    isinstance(scaled_data.first()[1], np.ndarray),
    'incorrect type returned by rescale',
)
# Global extremes after rescaling, compared against hard-coded expected values
Test.assertTrue(
    np.allclose(mn_scaled, -0.27151288),
    'incorrect value for mn_scaled',
)
Test.assertTrue(
    np.allclose(mx_scaled, 0.90544876),
    'incorrect value for mx_scaled',
)
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 6: Normalized data
# MAGIC
# MAGIC Now that we've normalized our data, let's once again see how a random pixel varies in value over the course of the time series. We'll visualize a pixel that exhibits a standard deviation of over 0.1. Note the change in scale on the y-axis compared to the previous visualization.
# COMMAND ----------
# Take the first rescaled time series whose standard deviation exceeds 0.1.
# The lambda indexes the (key, value) pair instead of unpacking it in the parameter
# list, because tuple parameters are not valid syntax on Python 3.
example = scaled_data.filter(lambda kv: np.std(kv[1]) > 0.1).values().first()
# generate layout and plot data
fig, ax = prepare_plot(np.arange(0, 300, 50), np.arange(-.1, .6, .1))
ax.set_xlabel(r'time')
ax.set_ylabel(r'fluorescence')
ax.set_xlim(-20, 260)
ax.set_ylim(-.12, .52)
# linewidth is a number, not a string
plt.plot(range(len(example)), example, c='#8cbfd0', linewidth=3.0)
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ### (3e) PCA on the scaled data
# MAGIC
# MAGIC We now have a preprocessed dataset with \\(\scriptsize n = 46460\\) pixels and \\(\scriptsize d = 240\\) seconds of time series data for each pixel. We can interpret the pixels as our observations and each pixel value in the time series as a feature. We would like to find patterns in brain activity during this time series, and we expect to find correlations over time. We can thus use PCA to find a more compact representation of our data and allow us to visualize it.
# MAGIC
# MAGIC Use the `pca` function from Part (2a) to perform PCA on the preprocessed neuroscience data with \\(\scriptsize k = 3\\), resulting in a new low-dimensional 46460 by 3 dataset. The `pca` function takes an RDD of arrays, but `scaled_data` is an RDD of key-value pairs, so you'll need to extract the values.
# COMMAND ----------
# Run pca using scaled_data
# scaled_data holds (key, value) pairs, so only the value arrays are passed on.
# NOTE(review): assumes the Part (2a) helper is callable as `pca(rdd_of_arrays, k)` and
# returns (components, scores, eigenvalues) -- confirm against its definition.
components_scaled, scaled_scores, eigenvalues_scaled = pca(scaled_data.map(lambda kv: kv[1]), 3)
# COMMAND ----------
# TEST PCA on the scaled data (3e)
# Shape check for the components array
Test.assertEquals(
    components_scaled.shape, (240, 3),
    'incorrect shape for components_scaled',
)
# Summary sums of slices of each result, compared against hard-coded expected values
Test.assertTrue(
    np.allclose(np.abs(np.sum(components_scaled[:5, :])), 0.283150995232),
    'incorrect value for components_scaled',
)
Test.assertTrue(
    np.allclose(np.abs(np.sum(scaled_scores.take(3))), 0.0285507449251),
    'incorrect value for scaled_scores',
)
Test.assertTrue(
    np.allclose(np.sum(eigenvalues_scaled[:5]), 0.206987501564),
    'incorrect value for eigenvalues_scaled',
)
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 7: Top two components as images
# MAGIC
# MAGIC Now, we'll view the scores for the top two components as images. Note that we reshape the vectors by the dimensions of the original image, 230 x 202.
# MAGIC These graphs map the values for the single component to a grayscale image. This provides us with a visual representation which we can use to see the overall structure of the zebrafish brain and to identify where high and low values occur. However, using this representation, there is a substantial amount of useful information that is difficult to interpret. In the next visualization, we'll see how we can improve interpretability by combining the two principal components into a single image using a color mapping.
# COMMAND ----------
import matplotlib.cm as cm

# Stack the collected score arrays row-wise into a single 2-D array
scores_scaled = np.vstack(scaled_scores.collect())
# Column 0 reshaped to 230 x 202 and then transposed
image_one_scaled = np.transpose(np.reshape(scores_scaled[:, 0], (230, 202)))

# generate layout and plot data
fig, ax = prepare_plot(
    np.arange(0, 10, 1),
    np.arange(0, 10, 1),
    figsize=(9.0, 7.2),
    hide_labels=True,
)
ax.grid(False)
ax.set_title('Top Principal Component', color='#888888')
image = plt.imshow(
    image_one_scaled,
    interpolation='nearest',
    aspect='auto',
    cmap=cm.gray,
)
display(fig)
# COMMAND ----------
# Column 1 of the stacked scores reshaped to 230 x 202 and then transposed
image_two_scaled = np.transpose(np.reshape(scores_scaled[:, 1], (230, 202)))

# generate layout and plot data
fig, ax = prepare_plot(
    np.arange(0, 10, 1),
    np.arange(0, 10, 1),
    figsize=(9.0, 7.2),
    hide_labels=True,
)
ax.grid(False)
ax.set_title('Second Principal Component', color='#888888')
image = plt.imshow(
    image_two_scaled,
    interpolation='nearest',
    aspect='auto',
    cmap=cm.gray,
)
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualization 8: Top two components as one image
# MAGIC
# MAGIC When we perform PCA and color neurons based on their location in the low-dimensional space, we can interpret areas with similar colors as exhibiting similar responses (at least in terms of the simple representation we recover with PCA). Below, the first graph shows how low-dimensional representations, which correspond to the first two principal components, are mapped to colors. The second graph shows the result of this color mapping using the zebrafish neural data.
# MAGIC
# MAGIC The second graph clearly exhibits patterns of neural similarity throughout different regions of the brain. However, when performing PCA on the full dataset, there are multiple reasons why neurons might have similar responses. The neurons might respond similarly to different stimulus directions, their responses might have similar temporal dynamics, or their response similarity could be influenced by both temporal and stimulus-specific factors. However, with our initial PCA analysis, we cannot pin down the underlying factors, and hence it is hard to interpret what "similarity" really means.
# COMMAND ----------
# MAGIC %md
# MAGIC Optional Details: Note that we use [polar coordinates](https://en.wikipedia.org/wiki/Polar_coordinate_system) to map our low-dimensional points to colors. Using polar coordinates provides us with an angle \\( (\phi) \\) and magnitude \\( (\rho) \\). We then use the well-known polar color space, [hue-saturation-value](https://en.wikipedia.org/wiki/HSL_and_HSV) (HSV), and map the angle to hue and the magnitude to value (brightness). This maps low magnitude points to black while allowing larger magnitude points to be differentiated by their angle. Additionally, the function `polar_transform` that maps low-dimensional representations to colors has an input parameter called `scale`, which we set to 2.0, and you can try lower values for the two graphs to see more nuanced mappings -- values near 1.0 are particularly interesting.
# COMMAND ----------
# Adapted from python-thunder's Colorize.transform where cmap='polar'.
# Checkout the library at: https://github.com/thunder-project/thunder and
# http://thunder-project.org/
def polar_transform(scale, img):
    """Map pairs of cartesian values to RGB colors via polar coordinates and HSV.

    Args:
        scale (float): Multiplier applied to the magnitude before the HSV conversion
            and again to the converted RGB values before clipping.
        img (array-like): Indexable as `img[0]` and `img[1]`, the two coordinate planes;
            after conversion with `np.asarray` it must have at least three dimensions,
            since its second and third sizes set the shape of the saturation plane.

    Returns:
        np.ndarray: RGB values clipped to the range [0, 1].
    """
    from matplotlib.colors import hsv_to_rgb

    img = np.asarray(img)
    first, second = img[0], img[1]

    # Angle shifted by pi/2, wrapped into [0, 2*pi) and normalized to [0, 1) for the hue
    full_turn = 2 * np.pi
    hue = ((np.arctan2(-first, -second) + np.pi / 2) % full_turn) / full_turn
    # Distance from the origin drives the HSV value channel
    magnitude = np.sqrt(first ** 2 + second ** 2)
    # Constant full saturation everywhere
    saturation = np.ones((img.shape[1], img.shape[2]))

    hsv = np.dstack((hue, saturation, scale * magnitude))
    rgb = hsv_to_rgb(hsv)
    return np.clip(rgb * scale, 0, 1)
# COMMAND ----------
# Show the polar mapping from principal component coordinates to colors.
# Largest absolute score in each component image sets the extent of the color legend
x1_abs_max = np.max(np.abs(image_one_scaled))
x2_abs_max = np.max(np.abs(image_two_scaled))
num_of_pixels = 300
# linspace always yields exactly num_of_pixels samples. np.arange with a float step can
# produce one element too many through rounding, which would make the fixed-size
# reshape below fail.
x1_vals = np.linspace(-x1_abs_max, x1_abs_max, num_of_pixels, endpoint=False)
x2_vals = np.linspace(x2_abs_max, -x2_abs_max, num_of_pixels, endpoint=False)
x2_vals.shape = (num_of_pixels, 1)
# x1 varies along columns, x2 along rows (descending from top to bottom)
x1_data = np.tile(x1_vals, (num_of_pixels, 1))
x2_data = np.tile(x2_vals, (1, num_of_pixels))
# Try changing the first parameter to lower values
polar_map = polar_transform(2.0, [x1_data, x2_data])
grid_range = np.arange(0, num_of_pixels + 25, 25)
fig, ax = prepare_plot(grid_range, grid_range, figsize=(9.0, 7.2), hide_labels=True)
image = plt.imshow(polar_map, interpolation='nearest', aspect='auto')
ax.set_xlabel('Principal component one')
ax.set_ylabel('Principal component two')
# Convert pixel positions of the grid lines into component values in [-max, max]
grid_marks = (2 * grid_range / float(num_of_pixels) - 1.0)
x1_marks = x1_abs_max * grid_marks
x2_marks = -x2_abs_max * grid_marks
# Materialize the labels as lists; a bare map() is a lazy iterator on Python 3
ax.get_xaxis().set_ticklabels(list(map('{0:.1f}'.format, x1_marks)))
ax.get_yaxis().set_ticklabels(list(map('{0:.1f}'.format, x2_marks)))
display(fig)
# COMMAND ----------
# Use the same transformation on the image data
# Try changing the first parameter to lower values
# Color every pixel from its pair of component scores
brainmap = polar_transform(2.0, [image_one_scaled, image_two_scaled])

# generate layout and plot data
fig, ax = prepare_plot(
    np.arange(0, 10, 1),
    np.arange(0, 10, 1),
    figsize=(9.0, 7.2),
    hide_labels=True,
)
ax.grid(False)
image = plt.imshow(brainmap, interpolation='nearest', aspect='auto')
display(fig)
# COMMAND ----------
# MAGIC %md
# MAGIC ## Part 4: Feature-based aggregation and PCA
# COMMAND ----------
# MAGIC %md
# MAGIC ### (4a) Aggregation using arrays
# MAGIC
# MAGIC In the analysis in Part 3, we performed PCA on the full time series data, trying to find global patterns across all 240 seconds of the time series. However, our analysis doesn't use the fact that different events happened during those 240 seconds. Specifically, during those 240 seconds, the zebrafish was presented with 12 different direction-specific visual patterns, with each one lasting for 20 seconds, for a total of 12 x 20 = 240 features. Stronger patterns are likely to emerge if we incorporate knowledge of our experimental setup into our analysis. As we'll see, we can isolate the impact of temporal response or direction-specific impact by appropriately aggregating our features.
# MAGIC
# MAGIC In order to aggregate the features we will use basic ideas from matrix multiplication. First, note that if we use `np.dot` with a two-dimensional array, then NumPy performs the equivalent matrix-multiply calculation. For example, `np.array([[1, 2, 3], [4, 5, 6]]).dot(np.array([2, 0, 1]))` produces `np.array([5, 14])`.
# MAGIC
# MAGIC \\[\begin{bmatrix} 1 & 2 & 3 \\\ 4 & 5 & 6 \end{bmatrix} \begin{bmatrix} 2 \\\ 0 \\\ 1 \end{bmatrix} = \begin{bmatrix} 5 \\\ 14 \end{bmatrix} \\]
# MAGIC
# MAGIC By setting up our multi-dimensional array properly we can multiply it by a vector to perform certain aggregation operations. For example, imagine we had a 3 dimensional vector, \\( \scriptsize \begin{bmatrix} 1 & 2 & 3 \end{bmatrix}^\top \\) and we wanted to create a 2 dimensional vector containing the sum of its first and last elements as one value and three times its second value as another value, i.e., \\( \scriptsize \begin{bmatrix} 4 & 6 \end{bmatrix}^\top \\). We can generate this result via matrix multiplication as follows: `np.array([[1, 0, 1], [0, 3, 0]]).dot(np.array([1, 2, 3]))`, which produces `np.array([4, 6])`.
# MAGIC
# MAGIC \\[\begin{bmatrix} 1 & 0 & 1 \\\ 0 & 3 & 0 \end{bmatrix} \begin{bmatrix} 1 \\\ 2 \\\ 3 \end{bmatrix} = \begin{bmatrix} 4 \\\ 6 \end{bmatrix} \\]
# MAGIC
# MAGIC For this exercise, you'll create several arrays that perform different types of aggregation. The aggregation is specified in the comments before each array. You should fill in the array values by hand. We'll automate array creation in the next two exercises.
# COMMAND ----------
vector = np.array([0., 1., 2., 3., 4., 5.])

# Create a multi-dimensional array that when multiplied (using .dot) against vector, results in
# a two element array where the first element is the sum of the 0, 2, and 4 indexed elements of
# vector and the second element is the sum of the 1, 3, and 5 indexed elements of vector.
# This should be a 2 row by 6 column array
sum_every_other = np.array([[1, 0, 1, 0, 1, 0],
                            [0, 1, 0, 1, 0, 1]])

# Create a multi-dimensional array that when multiplied (using .dot) against vector, results in a
# three element array where the first element is the sum of the 0 and 3 indexed elements of vector,
# the second element is the sum of the 1 and 4 indexed elements of vector, and the third element is
# the sum of the 2 and 5 indexed elements of vector.
# This should be a 3 row by 6 column array
sum_every_third = np.array([[1, 0, 0, 1, 0, 0],
                            [0, 1, 0, 0, 1, 0],
                            [0, 0, 1, 0, 0, 1]])

# Create a multi-dimensional array that can be used to sum the first three elements of vector and
# the last three elements of vector, which returns a two element array with those values when dotted
# with vector.
# This should be a 2 row by 6 column array
sum_by_three = np.array([[1, 1, 1, 0, 0, 0],
                         [0, 0, 0, 1, 1, 1]])

# Create a multi-dimensional array that sums the first two elements, second two elements, and
# last two elements of vector, which returns a three element array with those values when dotted
# with vector.
# This should be a 3 row by 6 column array
sum_by_two = np.array([[1, 1, 0, 0, 0, 0],
                       [0, 0, 1, 1, 0, 0],
                       [0, 0, 0, 0, 1, 1]])

# Each row of a matrix selects (with 1s) the entries of vector that are summed into one output
print('sum_every_other.dot(vector):\t{0}'.format(sum_every_other.dot(vector)))
print('sum_every_third.dot(vector):\t{0}'.format(sum_every_third.dot(vector)))
print('\nsum_by_three.dot(vector):\t{0}'.format(sum_by_three.dot(vector)))
print('sum_by_two.dot(vector): \t{0}'.format(sum_by_two.dot(vector)))
# COMMAND ----------
# TEST Aggregation using arrays (4a)
# Interleaved aggregations: shape first, then the product with `vector`
Test.assertEquals(
    sum_every_other.shape, (2, 6),
    'incorrect shape for sum_every_other',
)
Test.assertEquals(
    sum_every_third.shape, (3, 6),
    'incorrect shape for sum_every_third',
)
Test.assertTrue(
    np.allclose(sum_every_other.dot(vector), [6, 9]),
    'incorrect value for sum_every_other',
)
Test.assertTrue(
    np.allclose(sum_every_third.dot(vector), [3, 5, 7]),
    'incorrect value for sum_every_third',
)
# Contiguous-group aggregations: shape first, then the product with `vector`
Test.assertEquals(
    sum_by_three.shape, (2, 6),
    'incorrect shape for sum_by_three',
)
Test.assertEquals(
    sum_by_two.shape, (3, 6),
    'incorrect shape for sum_by_two',
)
Test.assertTrue(
    np.allclose(sum_by_three.dot(vector), [3, 12]),
    'incorrect value for sum_by_three',
)
Test.assertTrue(
    np.allclose(sum_by_two.dot(vector), [1, 5, 9]),
    'incorrect value for sum_by_two',
)
# COMMAND ----------
# MAGIC %md
# MAGIC ### (4b) Recreate with `np.tile` and `np.eye`
# MAGIC [np.tile](http://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html) is useful for repeating arrays in one or more dimensions. For example, `np.tile(np.array([[1, 2], [3, 4]]), 2)` produces `np.array([[1, 2, 1, 2], [3, 4, 3, 4]])`.
# MAGIC
# MAGIC \\[ np.tile( \begin{bmatrix} 1 & 2 \\\ 3 & 4 \end{bmatrix} , 2) \to \begin{bmatrix} 1 & 2 & 1& 2 \\\ 3 & 4 & 3 & 4 \end{bmatrix} \\]
# MAGIC
# MAGIC Recall that [np.eye](http://docs.scipy.org/doc/numpy/reference/generated/numpy.eye.html) can be used to create an identity array \\( (\mathbf{I_n}) \\). For example, `np.eye(3)` produces `np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])`.
# MAGIC
# MAGIC \\[ np.eye( 3 ) \to \begin{bmatrix} 1 & 0 & 0 \\\ 0 & 1 & 0 \\\ 0 & 0 & 1 \end{bmatrix} \\]
# MAGIC
# MAGIC In this exercise, recreate `sum_every_other` and `sum_every_third` using `np.tile` and `np.eye`.
# COMMAND ----------