# Picturing high dimensions {#intro}
High-dimensional data means that we have a large number of numeric features or variables, which can be considered as dimensions in a mathematical space. The variables can be of different types, such as categorical or temporal, but handling these types involves different techniques.
\index{dimensionality}
\index{variable}\index{feature}
\index{projection}
![Viewing high dimensions using low-dimensional displays is like playing shadow puppets, looking at the shadows to guess what the shape is.](images/shadow_puppets.png){#fig-shadow-puppets width=450 fig-alt="Three images, each with a hand or two hands, illustrating making shadows of a bird in flight, snail and dog."}
One approach to visualising high-dimensional data and models is to use linear projections, as done in a tour. You can think of projections of high-dimensional data like shadows (@fig-shadow-puppets). Unlike shadow puppets, though, the object stays fixed, and with multiple projections we can obtain a *view of the object from all sides*.
## Getting familiar with tours
```{r}
#| echo: false
library(mulgar)
library(ggplot2)
library(patchwork)
data("simple_clusters")
s_p <- ggplot(simple_clusters, aes(x=x1, y=x2)) +
geom_point(size=2, alpha=0.8, colour="#EC5C00") +
geom_abline(intercept=0, slope=1) +
annotate("text", x=2.0, y=2.2, label="(0.707, 0.707)", angle=45) +
annotate("text", x=2.2, y=2.0, label="most clustered", angle=45) +
geom_abline(intercept=0, slope=-1) +
annotate("text", x=-1.6, y=1.8, label="(0.707, -0.707)", angle=-45) +
annotate("text", x=-1.8, y=1.6, label="no clusters", angle=-45) +
geom_abline(intercept=0, slope=0) +
annotate("text", x=-1.6, y=0.15, label="(1, 0)") +
annotate("text", x=-1.4, y=-0.1, label="some clustering") +
xlim(-2, 2.5) + ylim(-2, 2.5) +
theme_minimal() +
theme(aspect.ratio=1)
```
```{r}
#| echo: false
#| message: false
library(tourr)
explain_t1 <- save_history(simple_clusters[,1:2],
grand_tour(d=1),
max_bases=9)
explain_t1[,,2] <- matrix(c(1/sqrt(2), 1/sqrt(2)),
ncol=1)
explain_t1[,,3] <- matrix(c(0, 1),
ncol=1)
explain_t1[,,4] <- matrix(c(-1/sqrt(2), 1/sqrt(2)),
ncol=1)
explain_t1[,,5] <- matrix(c(-1, 0),
ncol=1)
explain_t1[,,6] <- matrix(c(-1/sqrt(2), -1/sqrt(2)),
ncol=1)
explain_t1[,,7] <- matrix(c(0, -1),
ncol=1)
explain_t1[,,8] <- matrix(c(1/sqrt(2), -1/sqrt(2)),
ncol=1)
explain_t1[,,9] <- matrix(c(1, 0),
ncol=1)
```
```{r}
#| echo: false
#| eval: false
animate_dist(simple_clusters[,1:2],
planned_tour(explain_t1),
method="density", col="#EC5C00",
scale_density = TRUE,
half_range=0.8)
render_gif(simple_clusters[,1:2],
planned_tour(explain_t1),
display_dist(method="density",
col="#EC5C00",
density_max = 2,
scale_density = TRUE,
half_range=0.8),
gif_file = "gifs/explain_1d.gif",
apf = 1/100,
frames = 1000,
width = 400,
height = 300)
```
::: {.content-visible when-format="html"}
::: {#fig-explain-1D-html layout="[[40, 60]]"}
```{r}
#| echo: false
#| label: fig-explain-1D-data
#| fig-cap: 2D data
#| out-width: 100%
#| fig-width: 4
#| fig-height: 4
#| fig-alt: "Plot shows 2D scatterplot, with lines indicating three 1D projection vectors, and their coefficients. "
s_p
```
![1D grand tour of the 2D data](gifs/explain_1d.gif){#fig-explain-1D-tour width=290 fig-alt="The animation shows a sequence of 1D projections of the 2D data."}
How a tour can be used to explore high-dimensional data illustrated using (a) 2D data with two clusters and (b) a tour of 1D projections shown as a density plot. Imagine spinning a line around the centre of the data plot, with points projected orthogonally onto the line. With this data, when the line is at `x1=x2 (0.707, 0.707)` or `(-0.707, -0.707)` the clustering is the strongest. When it is at `x1=-x2 (0.707, -0.707)` there is no clustering.
:::
:::
```{r fig-explain-1D-pdf, eval=knitr::is_latex_output()}
#| echo: false
#| fig-cap: "How a tour can be used to explore high-dimensional data illustrated using (a) 2D data with two clusters and (b,c,d) 1D projections from a tour shown as a density plot. Imagine spinning a line around the centre of the data plot, with points projected orthogonally onto the line. With this data, when the line is at `x1=x2 (0.707, 0.707)` or `(-0.707, -0.707)` the clustering is the strongest. When it is at `x1=-x2 (0.707, -0.707)` there is no clustering. {{< fa play-circle >}}"
#| fig-width: 8
#| fig-height: 8
#| out-width: 100%
#| fig-env: "figure*"
p1 <- s_p + ggtitle("(a) 2D data")
d <- as.matrix(simple_clusters[,-3]) %*% matrix(explain_t1[,,9])
colnames(d) <- c("P1")
d <- as.data.frame(d)
p2 <- ggplot(d, aes(x=P1)) +
geom_density(fill="#EC5C00") +
xlim(c(-3,3)) + ylim(c(0, 0.5)) +
ggtitle("(b) (1, 0)") +
xlab("Projection") + ylab("") +
theme_minimal() +
theme(axis.text = element_blank(),
panel.grid.major = element_blank())
d <- as.matrix(simple_clusters[,-3]) %*% matrix(explain_t1[,,2])
colnames(d) <- c("P1")
d <- as.data.frame(d)
p3 <- ggplot(d, aes(x=P1)) +
geom_density(fill="#EC5C00") +
xlim(c(-4,4)) + ylim(c(0, 0.5)) +
ggtitle("(c) (0.707, 0.707)") +
xlab("Projection") + ylab("") +
theme_minimal() +
theme(axis.text = element_blank(),
panel.grid.major = element_blank())
d <- as.matrix(simple_clusters[,-3]) %*% matrix(explain_t1[,,8])
colnames(d) <- c("P1")
d <- as.data.frame(d)
p4 <- ggplot(d, aes(x=P1)) +
geom_density(fill="#EC5C00") +
xlim(c(-1.7,1.7)) + ylim(c(0, 1.2)) +
ggtitle("(d) (0.707, -0.707)") +
xlab("Projection") + ylab("") +
theme_minimal() +
theme(axis.text = element_blank(),
panel.grid.major = element_blank())
p1 + p2 + p3 + p4 + plot_layout(ncol=2)
```
`r ifelse(knitr::is_html_output(), '@fig-explain-1D-html', '@fig-explain-1D-pdf')` illustrates a tour for 2D data and 1D projections. The (grand) tour will generate all possible 1D projections of the data, and display each with a univariate plot like a histogram or density plot. For this data (the `simple_clusters` data), depending on the projection, the distribution might be clustered into two groups (bimodal), or there might be no clusters (unimodal). In this example, all projections are generated by rotating a line around the centre of the plot. Clustering can be seen in many of the projections, with the strongest being when the contribution of both variables is equal, and the projection is `(0.707, 0.707)` or `(-0.707, -0.707)`. (If you are curious about the number `0.707`, read the last section of this chapter.)
\index{projection!1D}
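Behind the scenes, each 1D projection is simply a matrix multiplication: the data matrix is multiplied by a unit column vector. A minimal sketch of this computation, using the `simple_clusters` data from the `mulgar` package (the plotting choices here are arbitrary):

```{r}
#| eval: false
# Project the 2D data onto the direction (0.707, 0.707) and
# look at the density of the projected values: two modes appear.
library(mulgar)
library(ggplot2)
data("simple_clusters")
a <- matrix(c(1, 1) / sqrt(2), ncol = 1)        # unit projection vector
prj <- as.matrix(simple_clusters[, 1:2]) %*% a  # n x 1 projected data
ggplot(data.frame(P1 = prj[, 1]), aes(x = P1)) +
  geom_density(fill = "#EC5C00")
```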
```{r}
#| echo: false
library(tourr)
library(geozoo)
set.seed(1351)
d <- torus(3, n=4304)$points
d <- apply(d, 2, function(x) (x-mean(x))/sd(x))
ang <- 15
d <- as.matrix(d) %*% matrix(c(cos(ang), 0, sin(ang), 0, 1, 0, -sin(ang), 0, cos(ang)), ncol=3, byrow=T)
colnames(d) <- paste0("x", 1:3)
d <- data.frame(d)
```
```{r}
#| echo: false
#| eval: false
animate_xy(d, little_tour(), aps=0.2)
explain_t2 <- save_history(d, little_tour(), 4)
animate_xy(d, planned_tour(explain_t2), half_range=0.7, axes="bottomleft")
render_gif(d,
planned_tour(explain_t2),
display_xy(col="#EC5C00",
half_range=0.7,
axes="bottomleft"),
gif_file = "gifs/explain_2d.gif",
apf = 1/75,
frames = 1000,
width = 400,
height = 300)
```
```{r}
#| echo: false
explain_prj <- matrix(c(cos(ang), 0, -sin(ang), 0, 1, 0, sin(ang), 0, cos(ang)), ncol=3, byrow=T)[,1:2]
d_prj <- render_proj(d, explain_prj,
position="bottomleft",
limits=1.5)
d_prj_p <- ggplot() +
geom_path(data=d_prj$circle, aes(x=c1, y=c2), colour="grey70") +
geom_segment(data=d_prj$axes, aes(x=x1, y=y1, xend=x2, yend=y2), colour="grey70") +
geom_text(data=d_prj$axes, aes(x=x2, y=y2, label=rownames(d_prj$axes)), colour="grey50") +
geom_point(data=d_prj$data_prj, aes(x=P1, y=P2),
col="#EC5C00", size=1.2) +
xlim(-1.3,1.3) + ylim(-1.3, 1.3) +
theme_bw() +
theme(aspect.ratio=1,
axis.text=element_blank(),
axis.title=element_blank(),
axis.ticks=element_blank(),
panel.grid=element_blank())
```
::: {.content-visible when-format="html"}
::: {#fig-explain-2D-html layout="[[57, 43]]"}
![2D tour of 3D data](gifs/explain_2d.gif){#fig-explain-2D-tour fig-alt="The animation shows a sequence of scatterplots of 2D projections of a 3D torus."}
```{r}
#| echo: false
#| label: fig-explain-2D-data
#| fig-cap: A projection revealing the hole
#| fig-width: 4
#| fig-height: 4
#| fig-alt: "A scatterplot of a single 2D projection where the donut hole is visible."
d_prj_p
```
How a tour can be used to explore high-dimensional data illustrated by showing a sequence of random 2D projections of 3D data (a). The data has a donut shape with the hole revealed in a single 2D projection (b). Data usually arrives with a given number of observations, and when we plot it like this using a scatterplot, it is like shadows of a transparent object.
:::
:::
```{r fig-explain-2D-pdf, eval=knitr::is_latex_output()}
#| echo: false
#| fig-cap: "How a tour can be used to explore high-dimensional data illustrated by showing a sequence of random 2D projections of 3D data (a). The data has a donut shape with the hole revealed in a single 2D projection (b). Data usually arrives with a given number of observations, and when we plot it like this using a scatterplot, it is like shadows of a transparent object. {{< fa play-circle >}}"
#| fig-width: 8
#| fig-height: 8
#| out-width: 100%
#| fig-env: "figure*"
set.seed(437)
explain_prj <- basis_random(3, 2)
d_prj <- render_proj(d, explain_prj,
position="bottomleft",
limits=1.5)
p5 <- ggplot() +
geom_path(data=d_prj$circle, aes(x=c1, y=c2), colour="grey70") +
geom_segment(data=d_prj$axes, aes(x=x1, y=y1, xend=x2, yend=y2), colour="grey70") +
geom_text(data=d_prj$axes, aes(x=x2, y=y2, label=rownames(d_prj$axes)), colour="grey50") +
geom_point(data=d_prj$data_prj, aes(x=P1, y=P2),
col="#EC5C00", size=1.2) +
xlim(-1.3,1.3) + ylim(-1.3, 1.3) +
ggtitle("(a) A random projection") +
theme_bw() +
theme(aspect.ratio=1,
axis.text=element_blank(),
axis.title=element_blank(),
axis.ticks=element_blank(),
panel.grid=element_blank())
p6 <- d_prj_p + ggtitle("(b) A projection revealing the hole")
p5 + p6 + plot_layout(ncol=2)
```
`r ifelse(knitr::is_html_output(), '@fig-explain-2D-html', '@fig-explain-2D-pdf')` illustrates a tour for 3D data using 2D projections. The data are points on the surface of a donut shape. By showing the projections using a scatterplot the donut looks transparent and we can see through the data. The donut shape can be inferred from watching many 2D projections, but some are more revealing than others. The projection shown in (b) is where the hole in the donut is clearly visible.
\index{projection!2D}
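To run a tour like this interactively on your own machine, `tourr::animate_xy()` displays a continuous sequence of 2D projections in the active graphics device. A minimal sketch using the same torus shape as above (the seed, sample size and colour are arbitrary choices):

```{r}
#| eval: false
# Generate points on a 3D torus with geozoo, then watch a 2D
# grand tour of the data shown as scatterplots.
library(tourr)
library(geozoo)
set.seed(1351)
d <- torus(3, n = 1000)$points
colnames(d) <- paste0("x", 1:3)
animate_xy(d, grand_tour(), axes = "bottomleft", col = "#EC5C00")
```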
## What's different about space beyond 2D?
The term "high-dimensional" in this book refers to the dimensionality of the Euclidean space. @fig-dimension-cubes shows a way to imagine this. It shows a sequence of cube wireframes, ranging from one-dimensional (1D) through to five-dimensional (5D), where beyond 2D is a linear projection of the cube. As the dimension increases, a new orthogonal axis is added. For cubes, this is achieved by doubling the cube: a 2D cube consists of two 1D cubes, a 3D cube consists of two 2D cubes, and so forth. This is a great way to think about the space being examined by the visual methods, and also all of the machine learning methods mentioned, in this book.
\index{dimensionality}
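The doubling construction can be checked directly with the `cube.iterate()` function from the `geozoo` package, which is also used to draw @fig-dimension-cubes. A small sketch:

```{r}
#| eval: false
# A p-dimensional cube has 2^p corners; cube.iterate() returns
# the corner coordinates and the edges that connect them.
library(geozoo)
c3 <- cube.iterate(p = 3)
dim(c3$points)   # 8 corners, each with 3 coordinates
head(c3$edges)   # pairs of corner indices joined by an edge
```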
```{r}
#| label: fig-dimension-cubes
#| echo: false
#| fig-cap: "Space can be considered to be a high-dimensional cube. Here we have pictured a sequence of increasing dimension cubes, from 1D to 5D, as wireframes, it can be seen that as the dimension increase by one, the cube doubles."
#| fig-width: 8
#| fig-height: 3
#| fig-alt: "Wireframe diagrams show 1D, 2D, 3D, 4D and 5D cubes. Half of each cube is coloured orange to show how a new dimension expands from the previous one."
#| message: false
#| warning: false
# wire frame cubes
library(tidyverse)
library(ggthemes)
library(geozoo)
library(tourr)
library(patchwork)
library(ggrepel)
new_d_clr <- "#E87C00"
d_line_clr <- "#3B99B1"
set.seed(5)
c1 <- cube.iterate(p = 1)
c1$points <- as_tibble(c1$points)
c1$edges <- as_tibble(c1$edges)
c2 <- cube.iterate(p = 2)
c2$points <- as_tibble(c2$points)
c2$edges <- as_tibble(c2$edges)
c3 <- cube.iterate(p = 3)
proj <- basis_random(3,2)
c3$points <- c3$points %*% proj
colnames(c3$points) <- c("Var1", "Var2")
c3$points <- as_tibble(c3$points)
c3$edges <- as_tibble(c3$edges)
c4 <- cube.iterate(p = 4)
proj <- basis_random(4,2)
c4$points <- c4$points %*% proj
colnames(c4$points) <- c("Var1", "Var2")
c4$points <- as_tibble(c4$points)
c4$edges <- as_tibble(c4$edges)
c4$edges.sub <- tibble(from = c(1,1,1,2,2,3,3,4,5,5,6,7),
to = c(2,3,5,4,6,4,7,8,6,7,8,8))
c5 <- cube.iterate(p = 5)
proj <- basis_random(5, 2)
c5$points <- c5$points %*% proj
colnames(c5$points) <- c("Var1", "Var2")
c5$points <- as_tibble(c5$points)
c5$edges <- as_tibble(c5$edges)
c5$edges.sub <- tibble(from = c(1,1,1,1,2,2,2,3,3,3,4,4,5,5,5,6,6,7,7,8,9,9,9,10,10,11,11,12,13,13,14,15),
to = c(2,3,5,9,4,6,10,4,7,11,8,12,6,7,13,8,14,8,15,16,10,11,13,12,14,12,15,16,14,15,16,16))
# plot
# 1D
p1 <- ggplot() +
geom_point(data=c1$points, aes(x=Var1, y=1)) +
geom_segment(data=c1$edges,
aes(x=c1$points$Var1[c1$edges$from],
xend=c1$points$Var1[c1$edges$to],
y=1, yend=1),
linetype=3, colour = d_line_clr) +
geom_point(data=c1$points[1,], aes(x=Var1, y=1), colour = new_d_clr) +
ggtitle("1D") +
theme_void() +
theme(title = element_text(colour = "black", size = 24),
aspect.ratio = 1) +
xlim(c(-0.2, 1.2))
# 2D
p2 <- ggplot() +
geom_point(data=c2$points, aes(x=Var1, y=Var2)) +
geom_segment(data=c2$edges[c(1,4),],
aes(x=c2$points$Var1[from],
xend=c2$points$Var1[to],
y=c2$points$Var2[from],
yend=c2$points$Var2[to])) +
geom_segment(data=c2$edges[c(2,3),],
aes(x=c2$points$Var1[from],
xend=c2$points$Var1[to],
y=c2$points$Var2[from],
yend=c2$points$Var2[to]),
linetype = 3, colour = d_line_clr) + # dashed connectors
geom_point(data=c2$points[1:2,], aes(x=Var1, y=Var2),
colour = new_d_clr) +
geom_segment(data=c2$edges[1,],
aes(x=c2$points$Var1[from],
xend=c2$points$Var1[to],
y=c2$points$Var2[from],
yend=c2$points$Var2[to]),
colour = new_d_clr) +
ggtitle("2D") +
theme_void() +
theme(title = element_text(colour = "black", size = 24),
aspect.ratio = 1) +
xlim(c(-0.15, 1.15)) + ylim(c(-0.15, 1.15))
# 3D
c_in <- c(1,2,4,6,9,10,11,12)
c_out <- c(3,5,7,8)
p3 <- ggplot() +
geom_point(data=c3$points, aes(x=Var1, y=Var2)) +
geom_segment(data=c3$edges[c_in,],
aes(x=c3$points$Var1[from],
xend=c3$points$Var1[to],
y=c3$points$Var2[from],
yend=c3$points$Var2[to])) +
geom_segment(data=c3$edges[c_out,],
aes(x=c3$points$Var1[from],
xend=c3$points$Var1[to],
y=c3$points$Var2[from],
yend=c3$points$Var2[to]),
linetype = 3, colour = d_line_clr) +
geom_point(data=c3$points[1:4,], aes(x=Var1, y=Var2), colour = new_d_clr) +
geom_segment(data=c3$edges[c(1,2,4,6),],
aes(x=c3$points$Var1[from],
xend=c3$points$Var1[to],
y=c3$points$Var2[from],
yend=c3$points$Var2[to]), colour = new_d_clr) +
ggtitle("3D") +
theme_void() +
theme(title = element_text(colour = "black", size = 24),
aspect.ratio = 1)
# p3 + geom_text_repel(data=c3$points, aes(x=Var1, y=Var2, label = 1:nrow(c3$points)), size=5)
# 4D
c_out <- c(4, 7, 10, 12, 15, 17, 19, 20)
c_in <- c(1:nrow(c4$edges))[-c_out]
p4 <- ggplot() +
geom_point(data=c4$points, aes(x=Var1, y=Var2)) +
geom_segment(data=c4$edges[c_in,],
aes(x=c4$points$Var1[from],
xend=c4$points$Var1[to],
y=c4$points$Var2[from],
yend=c4$points$Var2[to])) +
geom_segment(data=c4$edges[c_out,],
aes(x=c4$points$Var1[from],
xend=c4$points$Var1[to],
y=c4$points$Var2[from],
yend=c4$points$Var2[to]),
linetype = 3, colour = d_line_clr) +
geom_point(data=c4$points[1:8,], aes(x=Var1, y=Var2), colour = new_d_clr) +
geom_segment(data=c4$edges.sub,
aes(x=c4$points$Var1[from],
xend=c4$points$Var1[to],
y=c4$points$Var2[from],
yend=c4$points$Var2[to]), colour = new_d_clr) +
ggtitle("4D") +
theme_void() +
theme(title = element_text(colour = "black", size = 24),
aspect.ratio = 1)
# p4 + geom_text_repel(data=c4$points, aes(x=Var1, y=Var2, label = 1:nrow(c4$points)), size=5)
# 5D
c_out <- c(5,9,13,16,20,23,26,28,32,35,38,
40,43,45,47,48)
c_in <- c(1:nrow(c5$edges))[-c_out]
p5 <- ggplot() +
geom_point(data=c5$points, aes(x=Var1, y=Var2)) +
geom_segment(data=c5$edges[c_in,],
aes(x=c5$points$Var1[from],
xend=c5$points$Var1[to],
y=c5$points$Var2[from],
yend=c5$points$Var2[to])) +
geom_segment(data=c5$edges[c_out,],
aes(x=c5$points$Var1[from],
xend=c5$points$Var1[to],
y=c5$points$Var2[from],
yend=c5$points$Var2[to]),
linetype = 3, colour = d_line_clr) +
geom_point(data=c5$points[1:16,], aes(x=Var1, y=Var2), colour = new_d_clr) +
geom_segment(data=c5$edges.sub,
aes(x=c5$points$Var1[from],
xend=c5$points$Var1[to],
y=c5$points$Var2[from],
yend=c5$points$Var2[to]), colour = new_d_clr) +
ggtitle("5D") +
theme_void() +
theme(title = element_text(colour = "black", size = 24),
aspect.ratio = 1)
# p5 + geom_text_repel(data=c5$points, aes(x=Var1, y=Var2, label = 1:nrow(c5$points)), size=5)
p1 + p2 + p3 + p4 + p5 +
plot_layout(ncol = 5)
```
Interestingly, the struggle with imagining high dimensions this way is described in a novel published in 1884 [@Ab1884] [^4]. Yes, more than 100 years ago! This is a story about characters living in a 2D world, being visited by an alien 3D character. It is also a social satire, serving the reader strong messages about gender inequity, although this provides the means to explain more intricacies in perceiving dimensions. There have been several movies made based on the book in recent decades (e.g. @Ma65, @JT07). Although purchasing the movies may be prohibitive, watching the trailers available for free online is sufficient to gain geometric intuition about understanding high-dimensional spaces while living in a low-dimensional world.
[^4]: Thanks to Barret Schloerke for directing co-author Cook to this history when he was an undergraduate student and we were starting the [geozoo](http://schloerke.com/geozoo/) project.
When we look at high-dimensional spaces from a low-dimensional space, we meet the "curse of dimensionality", a term introduced by @BellmanRichard1961 to express the difficulty of doing optimization in high dimensions because of the exponential growth in space as dimension increases. A way to imagine this is to look at the cubes in @fig-dimension-cubes: as you go from 1D to 2D, and 2D to 3D, the space expands a lot, and imagine how vast the space might get as more dimensions are added[^5]. The volume of the space grows exponentially with dimension, which makes it infeasible to sample enough points -- any sample will cover the space less densely as dimension increases. The effect is that most points will be far from the sample mean, on the edge of the sample space.
\index{dimensionality!curse of}
[^5]: "Space is big. Really big. You might think it's a long way to the pharmacy, but that’s peanuts to space." from Douglas Adams' [Hitchhiker's Guide to the Galaxy](https://en.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy#Stage_shows) always springs to mind when thinking about high dimensions!
For visualisation, the curse manifests in an opposite manner. Projecting from high to low dimensions creates a crowding or piling of points near the center of the distribution. This was noted by @diaconis1984. @fig-density illustrates this phenomenon. As dimension increases, the points crowd the centre, even with as few as ten dimensions. This is something that we may need to correct for when exploring high dimensions with low-dimensional projections.
\index{dimensionality!crowding}
```{r}
#| label: fig-density
#| fig-cap: "Illustration of data crowding in the low-dimensional projection as dimension increases, here from 3, 10, 100. Colour shows the number of points in each hexagon bin (pink is large, navy is small). As dimension increases the points concentrate near the centre."
#| out-width: 95%
#| fig-width: 6
#| fig-height: 2
#| fig-align: center
#| fig-alt: "Three hexagon binned plots. The plot on the left is relatively uniform in colour, and looks like a disk, and the plot on the right has a high concentration of pink hexagons in the center, and rings of green and navy blue around the outside. The middle plot is in between the two patterns."
#| message: false
#| warning: false
#| echo: false
library(colorspace)
set.seed(212)
n <- 10000
# sample points, only keep first two components for 2D projection
p3 <- geozoo::sphere.solid.random(3, n)$points[, c(1,2)]
p10 <- geozoo::sphere.solid.random(10, n)$points[, c(1,2)]
p100 <- geozoo::sphere.solid.random(100, n)$points[, c(1,2)]
colnames(p3) <- c("x", "y")
colnames(p10) <- c("x", "y")
colnames(p100) <- c("x", "y")
proj_points <- as_tibble(rbind(p3, p10, p100)) %>%
mutate(p = factor(c(rep("p = 3", n), rep("p = 10", n), rep("p = 100", n)), levels = c("p = 3", "p = 10", "p = 100")))
ggplot(proj_points, aes(x, y)) +
geom_hex(bins = 20, aes(fill=log(..count..))) +
scale_fill_continuous_sequential("Batlow", rev=FALSE) +
facet_wrap(~p, scales = "free") +
guides(fill = FALSE) +
theme_bw() +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text = element_blank(),
aspect.ratio = 1)
```
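The sampling behind @fig-density is straightforward to reproduce: draw points uniformly in a solid $p$-dimensional sphere with `geozoo::sphere.solid.random()` and keep the first two coordinates as a 2D projection. A quick numerical check of the crowding (sample sizes are arbitrary):

```{r}
#| eval: false
# As p grows, the projected points concentrate near the centre:
# compare distances from the origin in the 2D projections.
library(geozoo)
set.seed(212)
n <- 10000
p3   <- sphere.solid.random(3, n)$points[, 1:2]
p100 <- sphere.solid.random(100, n)$points[, 1:2]
summary(sqrt(rowSums(p3^2)))     # spread out towards the edge
summary(sqrt(rowSums(p100^2)))   # piled up near zero
```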
::: {.content-visible when-format="html"}
@fig-tour-intro-html shows 2D tours of two different 5D data sets. One has clusters (a) and the other has two outliers and a plane (b). Can you see these? One difference when viewing data with more than three dimensions using 2D projections is that the points seem to shrink towards the centre, and then expand out again. This is the effect of dimensionality, with different variance or spread in some directions.
::: {#fig-tour-intro-html layout-ncol=2}
![Clusters](gifs/clusters-intro.gif){#fig-tour-clusters width=250 fig-alt="Animation of sequences of 2D projections shown as scatterplots. You can see points moving in three different movement patterns, and sometimes they separate into clusters."}
![Outliers](gifs/outlier-intro.gif){#fig-tour-outliers width=250 fig-alt="Animation of sequences of 2D projections shown as scatterplots. You can see most points lie in a flat planar shape, and two points can be seen to move differently from the others and separate from the rest of the points in some projections."}
Two 5D datasets shown as tours of 2D projections. Can you see clusters of points in (a) and two outliers with a plane in (b)?
:::
:::
::: {.content-visible when-format="pdf"}
@fig-tour-intro-pdf shows 2D tours of two different 5D data sets. One has clusters (a) and the other has two outliers and a plane (b). Can you see these? One difference when viewing data with more than three dimensions using 2D projections is that the points seem to shrink towards the centre, and then expand out again. This is the effect of dimensionality, with different variance or spread in some directions.
::: {#fig-tour-intro-pdf layout-ncol=2}
![Clusters](images/clusters-intro.png){#fig-tour-clusters width=200}
![Outliers](images/outlier-intro.png){#fig-tour-outliers width=200}
Frames from 2D tours on two 5D datasets, with clusters of points in (a) and two outliers with a plane in (b). This figure is best viewed in the HTML version of the book. {{< fa play-circle >}}
:::
:::
## What can you learn?
There are two ways of detecting structure in tours:
- patterns in a single low-dimensional projection
- movement patterns
with the latter being especially useful when displaying the projected data as a scatterplot. @fig-example-structure shows examples of patterns we typically look for when making a scatterplot of data. These include clustering, linear and non-linear association, outliers, and barriers, where there is a sharp edge beyond which no observations are seen. Not shown, but it might also be possible to observe multiple modes, varying density of observations, L-shapes, discreteness, or uneven spread of points. The tour is especially useful if these patterns are only visible in combinations of variables.
```{r}
#| label: fig-example-structure
#| fig-width: 10
#| fig-height: 3
#| out-width: 100%
#| fig-cap: "Example structures that might be visible in a 2D projection that imply presence of structure in high dimensions. These include clusters, linear and non-linear association, outliers and barriers."
#| fig-alt: "Four scatterplots showing different types of patterns you might expect to see. Plot (a) has three elliptical clusters of points, roughly lying horizontal, making a geese flying pattern. Plot (b) has a nonlinear pattern looking like a horseshoe. Plot (c) has a strong negative linear association and a single outlier in the top right. Plot (d) has points lying only in the bottom triangle."
#| echo: false
library(mulgar)
library(ggplot2)
library(patchwork)
library(geozoo)
library(dplyr)
data("clusters")
data("plane")
data("plane_nonlin")
plane_outliers <- plane
plane_outliers[101,] <- c(2, 2, -2, 0, 0)
plane_outliers[102,] <- c(0, 0, 0,-2, -2)
set.seed(314)
barrier <- data.frame(x1=runif(176)) %>%
mutate(x2=runif(176, min=0, max=1-x1))
e1 <- ggplot(clusters[sample(1:300, 156),], aes(x=x3, y=x2)) +
geom_point(colour="#EC5C00", size=2.2, alpha=0.8) +
ggtitle("(a) gaps or clusters") +
theme_void() +
theme(aspect.ratio = 1,
panel.border = element_rect(fill=NA,
colour="black"),
plot.margin = margin(2, 5, 0, 5))
e2 <- ggplot(plane_nonlin, aes(x=x1, y=x2)) +
geom_point(colour="#EC5C00", size=2.2, alpha=0.8) +
ggtitle("(b) non-linear association") +
theme_void() +
theme(aspect.ratio = 1,
panel.border = element_rect(fill=NA,
colour="black"),
plot.margin = margin(0, 5, 0, 5))
e3 <- ggplot(plane_outliers, aes(x=x1, y=x2)) +
geom_point(colour="#EC5C00", size=2.2, alpha=0.8) +
ggtitle("(c) association + outlier") +
theme_void() +
theme(aspect.ratio = 1,
panel.border = element_rect(fill=NA,
colour="black"),
plot.margin = margin(0, 5, 0, 5))
e4 <- ggplot(barrier, aes(x=x1, y=x2)) +
geom_point(colour="#EC5C00", size=2.2, alpha=0.8) +
ggtitle("(d) barrier") +
theme_void() +
theme(aspect.ratio = 1,
panel.border = element_rect(fill=NA,
colour="black"),
plot.margin = margin(0, 5, 0, 5))
e1 + e2 + e3 + e4 + plot_layout(ncol=4)
```
```{r}
#| echo: false
#| eval: false
library(tourr)
set.seed(340)
render_gif(clusters[,1:5],
grand_tour(),
display_trails(col="#EC5C00",
axes="off",
cex=2,
half_range=0.8,
past=5),
rescale=TRUE,
gif_file = "gifs/trails-clusters.gif",
frames=200,
width=400,
height=400)
render_gif(clusters[,1:5],
grand_tour(),
display_xy(col="#EC5C00",
axes="bottomleft",
cex=2,
half_range=0.8),
rescale=TRUE,
gif_file = "gifs/clusters-intro.gif",
apf = 1/50,
frames=500,
width=400,
height=400)
render_gif(plane_outliers[,1:5],
grand_tour(),
display_trails(col="#EC5C00",
axes="off",
cex=2,
half_range=0.8),
rescale=TRUE,
gif_file = "gifs/trails-outlier.gif",
frames=200,
width=400,
height=400)
render_gif(plane_outliers[,1:5],
grand_tour(),
display_xy(col="#EC5C00",
axes="bottomleft",
cex=2,
half_range=0.8),
rescale=TRUE,
gif_file = "gifs/outlier-intro.gif",
apf = 1/50,
frames=500,
width=400,
height=400)
```
@fig-trails illustrates how the movement patterns of points, when using scatterplots to display 2D projections, indicate clustering (a) and outliers (b).
::: {#fig-trails layout-ncol=2 fig-align="center"}
![Clustering](images/trails-clusters.png){#fig-clusters-trails-static fig-alt="Frame from the animations shown earlier annotated to mark clustering movement. Movement pattern is indicated by a point and a line."}
![Outliers](images/trails-outlier.png){#fig-outlier-trails-static fig-alt="Frame from the animations shown earlier annotated to mark outliers movement. Movement pattern is indicated by a point and a line."}
The movement of points gives further clues about the structure of the data in high dimensions. In the data with clustering, often we can see a group of points moving differently from the others. Because there are three clusters, you should see three distinct movement patterns. It is similar with outliers, except these may be individual points moving alone, and differently from all others. This can be seen in the static plot: one point (top left) has a movement pattern upwards whereas most of the other observations near it are moving down towards the right.
:::
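To watch these movement patterns yourself, the `tourr` package provides `display_trails()`, which draws each point with a short trail of its recent positions (the same display is used behind the scenes for the trail images above). A minimal sketch using the `clusters` data from `mulgar`:

```{r}
#| eval: false
# Trails make movement visible: groups of points moving together
# suggest clusters, single points moving alone suggest outliers.
library(tourr)
library(mulgar)
data("clusters")
animate(clusters[, 1:5], grand_tour(),
        display_trails(col = "#EC5C00", axes = "bottomleft", past = 5))
```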
This type of visualisation is useful for many activities in dealing with high-dimensional data, including:
- exploring high-dimensional data.
- detecting if the data lives in a lower dimensional space than the number of variables.
- checking assumptions required for multivariate models to be applicable.
- check for potential problems in modeling such as multicollinearity among predictors.
- checking assumptions required for probabilities calculated for statistical hypothesis testing to be valid.
- diagnosing the fit of multivariate models.
::: {.content-visible when-format="html"}
::: info
With a tour we slowly rotate the viewing direction, which allows us to see many individual projections and to track movement patterns. Look for interesting structures such as clusters or outlying points.
:::
:::
::: {.content-visible when-format="pdf"}
\infobox{With a tour we slowly rotate the viewing direction, which allows us to see many individual projections and to track movement patterns. Look for interesting structures such as clusters or outlying points.}
:::
## A little history
Viewing high-dimensional data based on low-dimensional projections can probably be traced back to the early work on principal component analysis by @pearson-pca and @hotelling-pca, which was extended to known classes as part of discriminant analysis by @fisher1936.
With computer graphics, the capability of animating plots to show more than a single best projection became possible. The video library [@ASA23] is the best place to experience the earliest work. Kruskal's 1962 animation of multidimensional scaling showed the process of finding a good 2D representation of high dimensional data, although the views are not projections. Chang's 1970 video shows her rotating a high dimensional point cloud along coordinate axes to find a special projection where all the numbers align. The classic video that must be watched is PRIM9 [@PRIM9-video] where a variety of interactive and dynamic tools are used together to explore high dimensional physics data, documented in @tukey.
The methods in this book primarily emerge from @As85's grand tour method. The algorithm provided the first smooth and continuous sequence of low dimensional projections, and guaranteed that all possible low dimensional projections were likely to be shown. The algorithm was refined in @BA86b (and documented in detail in @BCAH05) to make it *efficiently* show all possible projections. Since then there have been numerous varieties of tour algorithms developed to focus on specific tasks in exploring high dimensional data, and these are documented in @tours2022.
This book is an evolution from @CS07. One of the difficulties in working on interactive and dynamic graphics research has been the rapid change in technology. Programming languages have changed a little (FORTRAN to C to Java to Python) but graphics toolkits and display devices have changed a lot! The tour software used in this book evolved from XGobi, which was written in C and used the X Window System, and which was then rewritten as GGobi using gtk. The video library has engaging videos of these software systems. There have been several other short-lived implementations, including orca [@orca], written in Java, and cranvas [@cranvas], written in R with a back-end provided by wrapper functions to `qt` libraries.
Although attempts were made with these ancestor systems to connect the data plots to a statistical analysis system, these were always limited. With the emergence of R, having graphics in the data analysis workflow has been much easier, albeit at the cost of the interactivity with graphics that the old systems had. We mostly use the R package `tourr` [@tourr] for examples in this book. It provides the machinery for running a tour, and has the flexibility that it can be ported, modified, and used as a regular element of data analysis.
## Exercises {-}
1. Randomly generate data points that are uniformly distributed in a hyper-cube of 3, 5 and 10 dimensions, with 500 points in each sample, using the `cube.solid.random()` function of the `geozoo` package. What differences do we expect to see? Now visualise each set in a grand tour and describe how they differ. Did this match your expectations?
2. Use the `geozoo` package to generate samples from different shapes and use them to get a better understanding of how shapes appear in a grand tour. You can start with exploring the conic spiral in 3D, a torus in 4D and points along the wire frame of a cube in 5D.
3. For each of the challenge data sets, `c1`, ..., `c7` from the `mulgar` package, use the grand tour to view and try to identify structure (outliers, clusters, non-linear relationships).
```{r}
#| eval: false
#| echo: false
# Answer to Q1
library(tourr)
library(geozoo)
set.seed(1234)
cube3 <- cube.solid.random(3, 500)$points
cube5 <- cube.solid.random(5, 500)$points
cube10 <- cube.solid.random(10, 500)$points
animate_xy(cube3, axes="bottomleft")
animate_xy(cube5, axes="bottomleft")
animate_xy(cube10, axes="bottomleft")
```
::: {.content-hidden when-format="pdf"}
::: {.hidden}
Answer 1. Each of the projections has a boxy shape, which gets less distinct as the dimension increases.
As the dimension increases, the points tend to concentrate in the centre of the plot window, with a smattering of points in the edges.
:::
:::
```{r}
#| eval: false
#| echo: false
# Answer to Q3
library(mulgar)
animate_xy(c1)
render_gif(c1,
grand_tour(),
display_xy(),
gif_file = "gifs/c1.gif",
frames=200,
start = basis_random(ncol(c1), 2),
width=400,
height=400)
# four small clusters, two big clusters
# linear dependence
animate_xy(c2)
render_gif(c2,
grand_tour(),
display_xy(),
gif_file = "gifs/c2.gif",
frames=200,
start = basis_random(ncol(c2), 2),
width=400,
height=400)
# Six spherical clusters
animate_xy(c3)
render_gif(c3,
grand_tour(),
display_xy(),
gif_file = "gifs/c3.gif",
frames=200,
start = basis_random(ncol(c3), 2),
width=400,
height=400)
# tetrahedron with lots of smaller triangles,
# barriers, linear dependence
animate_xy(c4)
render_gif(c4,
grand_tour(),
display_xy(),
gif_file = "gifs/c4.gif",
frames=200,
start = basis_random(ncol(c4), 2),
width=400,
height=400)
# Four linear connected pieces
animate_xy(c5)
render_gif(c5,
grand_tour(),
display_xy(),
gif_file = "gifs/c5.gif",
frames=200,
start = basis_random(ncol(c5), 2),
width=400,
height=400)
# Spiral in lower dimensional space
# Non-linear and linear dependence
animate_xy(c6)
render_gif(c6,
grand_tour(),
display_xy(),
gif_file = "gifs/c6.gif",
frames=200,
start = basis_random(ncol(c6), 2),
width=400,
height=400)
# Two curved clusters
animate_xy(c7)
render_gif(c7,
grand_tour(),
display_xy(),
gif_file = "gifs/c7.gif",
frames=200,
start = basis_random(ncol(c7), 2),
width=400,
height=400)
# spherical cluster, curve cluster and a lot of noise points
```