From 844de4e2e6a32fb4b51ed4a7080b923874d5f3b7 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 27 Jul 2024 15:39:48 -0700 Subject: [PATCH 01/26] Initial multi-threaded distance matrix calculation --- smallvis/DESCRIPTION | 3 + smallvis/NAMESPACE | 1 + smallvis/R/RcppExports.R | 11 ++ smallvis/R/cost.R | 200 ++++++++++++++++------------- smallvis/R/smallvis-package.R | 4 + smallvis/R/smallvis.R | 70 +++++++--- smallvis/R/sne.R | 174 ++++++++++++++----------- smallvis/man/smallvis.Rd | 9 +- smallvis/src/.gitignore | 3 + smallvis/src/RcppExports.cpp | 47 +++++++ smallvis/src/matrix.cpp | 116 +++++++++++++++++ smallvis/tests/testthat/test_api.R | 11 ++ 12 files changed, 460 insertions(+), 189 deletions(-) create mode 100644 smallvis/R/RcppExports.R create mode 100644 smallvis/R/smallvis-package.R create mode 100644 smallvis/src/.gitignore create mode 100644 smallvis/src/RcppExports.cpp create mode 100644 smallvis/src/matrix.cpp diff --git a/smallvis/DESCRIPTION b/smallvis/DESCRIPTION index 791e2e7..cb379a1 100644 --- a/smallvis/DESCRIPTION +++ b/smallvis/DESCRIPTION @@ -11,6 +11,7 @@ License: GPL (>= 2) Imports: methods, mize (>= 0.2), + Rcpp, Rfast, rnndescent, vizier @@ -24,3 +25,5 @@ Date: 2017-11-28 URL: http://github.com/jlmelville/smallvis BugReports: http://github.com/jlmelville/smallvis/issues RoxygenNote: 7.3.2 +LinkingTo: + Rcpp diff --git a/smallvis/NAMESPACE b/smallvis/NAMESPACE index f75ad9c..f44dae6 100644 --- a/smallvis/NAMESPACE +++ b/smallvis/NAMESPACE @@ -3,3 +3,4 @@ export(smallvis) export(smallvis_perpstep) export(smallvis_rep) +useDynLib(smallvis, .registration = TRUE) diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R new file mode 100644 index 0000000..923ffe2 --- /dev/null +++ b/smallvis/R/RcppExports.R @@ -0,0 +1,11 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +dist2_cpp <- function(input, n_threads = 1L) { + .Call(`_smallvis_dist2_cpp`, input, n_threads) +} + +dist_cpp <- function(input, n_threads = 1L) { + .Call(`_smallvis_dist_cpp`, input, n_threads) +} + diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R index 58d3874..c00e1ab 100644 --- a/smallvis/R/cost.R +++ b/smallvis/R/cost.R @@ -129,21 +129,21 @@ stop_exaggerating <- function(cost, exaggeration_factor) { largevis <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", gamma = 1, gr_eps = 0.1, normalize = TRUE, eps = 1e-9, row_weight = NULL, - n_threads = 0) { + use_cpp = FALSE, n_threads = 0) { if (!is.null(row_weight)) { row_normalize <- row_weight } else { row_normalize <- TRUE } - lreplace(tsne(perplexity), + lreplace(tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X = X, perplexity = perplexity, symmetrize = symmetrize, kernel = inp_kernel, normalize = normalize, verbose = verbose, row_normalize = row_normalize, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost$greps1 <- gr_eps - 1 cost @@ -181,20 +181,21 @@ largevis <- function(perplexity, inp_kernel = "gaussian", umap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", spread = 1, min_dist = 0.001, gr_eps = 0.1, eps = 1e-9, - row_weight = NULL, n_threads = 0) { + row_weight = NULL, n_threads = 0, use_cpp = FALSE) { if (!is.null(row_weight)) { row_normalize <- row_weight } else 
{ row_normalize <- FALSE } - lreplace(tsne(perplexity), + lreplace(tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = FALSE, row_normalize = row_normalize, n_threads = n_threads, - verbose = verbose, ret_extra = ret_extra) + verbose = verbose, ret_extra = ret_extra, + use_cpp = use_cpp) cost <- init_ab(cost, spread = spread, min_dist = min_dist, verbose = verbose) @@ -240,14 +241,15 @@ umap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", # UMAP with the output kernel fixed to the t-distribution tumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", - gr_eps = 0.1, eps = 1e-9, row_weight = NULL, n_threads = 0) { + gr_eps = 0.1, eps = 1e-9, row_weight = NULL, n_threads = 0, + use_cpp = FALSE) { if (!is.null(row_weight)) { row_normalize <- row_weight } else { row_normalize <- FALSE } - lreplace(umap(perplexity), + lreplace(umap(perplexity, n_threads = n_threads, use_cpp = use_cpp), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost$eps <- eps @@ -255,7 +257,8 @@ tumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = FALSE, row_normalize = row_normalize, n_threads = n_threads, - verbose = verbose, ret_extra = ret_extra) + verbose = verbose, ret_extra = ret_extra, + use_cpp = use_cpp) cost }, gr = function(cost, Y) { @@ -277,15 +280,15 @@ tumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", # t-UMAP where output and input affinities are normalized ntumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", - gr_eps = 0.1, eps = 1e-9, n_threads = 0) { - lreplace(tumap(perplexity), + gr_eps = 0.1, eps = 1e-9, n_threads = 0, use_cpp = FALSE) { + lreplace(tumap(perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost$eps <- eps cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, n_threads = n_threads, verbose = verbose, - ret_extra = ret_extra) + ret_extra = ret_extra, use_cpp = use_cpp) cost }, pfn = function(cost, Y) { @@ -329,15 +332,15 @@ ntumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", # Reverse KL divergence rklsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -379,15 +382,16 @@ rklsne <- function(perplexity, inp_kernel = "gaussian", # Jensen-Shannon divergence jssne <- function(perplexity, inp_kernel = "gaussian", 
symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel), + tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp == use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -439,15 +443,16 @@ jssne <- function(perplexity, inp_kernel = "gaussian", # Chi-squared divergence chsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel), + tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp == use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -499,15 +504,16 @@ chsne <- function(perplexity, inp_kernel = "gaussian", # Hellinger distance divergence hlsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel), + tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp == use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "symmetric", normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -590,7 +596,7 @@ absne <- function(perplexity, inp_kernel = "gaussian", lambda <- ifelse(lambda == 0, 1, sign(lambda)) * eps0 } lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = TRUE, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) if (verbose) { @@ -600,7 +606,7 @@ absne <- function(perplexity, inp_kernel = "gaussian", cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = TRUE) cost$inva4 <- 4 / alpha cost$minvab <- -1 / (alpha * beta) cost$inval <- 1 / (alpha * lambda) @@ -653,9 +659,9 @@ absne <- function(perplexity, inp_kernel = "gaussian", # alpha != 0, beta = 0 => lambda = alpha absneb0 <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", alpha = 1, - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, 
n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) if (verbose) { @@ -665,7 +671,7 @@ absneb0 <- function(perplexity, inp_kernel = "gaussian", cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps @@ -723,9 +729,10 @@ absneb0 <- function(perplexity, inp_kernel = "gaussian", # alpha = -beta != 0 => lambda = 0 absneamb <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", alpha = 1, - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) if (verbose) { @@ -735,7 +742,7 @@ absneamb <- function(perplexity, inp_kernel = "gaussian", cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$N <- nrow(cost$P) cost$N2 <- (cost$N - 1) * cost$N @@ -792,9 +799,10 @@ absneamb <- function(perplexity, inp_kernel = "gaussian", # alpha = 0, beta != 0 => lambda = beta absnea0 <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", beta = 1, - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) if (verbose) { @@ -803,7 +811,7 @@ absnea0 <- function(perplexity, inp_kernel = "gaussian", cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps @@ -865,9 +873,9 @@ absnea0 <- function(perplexity, inp_kernel = "gaussian", # alpha = 0, beta = 0 => lambda = 0 absne00 <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) if (verbose) { @@ -876,7 +884,7 @@ absne00 <- function(perplexity, inp_kernel = "gaussian", cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -926,7 +934,7 @@ absne00 <- function(perplexity, inp_kernel = "gaussian", # alpha-beta divergence abssne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", alpha = 1, lambda = 1, - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, 
use_cpp = FALSE) { beta <- lambda - alpha eps0 <- 1e-5 @@ -941,9 +949,8 @@ abssne <- function(perplexity, inp_kernel = "gaussian", lambda <- ifelse(lambda == 0, 1, sign(lambda)) * eps0 } lreplace( - tsne(perplexity = perplexity), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c(), - n_threads = n_threads) { + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), + init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) if (verbose) { tsmessage("Using ABSSNE with alpha = ", formatC(alpha), @@ -952,7 +959,7 @@ abssne <- function(perplexity, inp_kernel = "gaussian", cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$inva4 <- 4 / alpha cost$minvab <- -1 / (alpha * beta) cost$inval <- 1 / (alpha * lambda) @@ -1004,15 +1011,15 @@ abssne <- function(perplexity, inp_kernel = "gaussian", # global-SNE gsne <- function(perplexity, lambda = 1, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { + use_cpp = FALSE, n_threads = 0) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps # Phat isn't affected by early exaggeration, so we cache it once only @@ -1020,7 +1027,7 @@ gsne <- function(perplexity, lambda = 1, inp_kernel = "gaussian", Phat <- as.matrix(X) } else { - Phat <- safe_dist2(X) + Phat <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) } Phat <- Phat + 1 @@ -1079,23 +1086,25 @@ gsne <- function(perplexity, lambda = 1, inp_kernel = "gaussian", # Distance Preserving Methods --------------------------------------------- mmds_init <- function(cost, X, max_iter, eps = .Machine$double.eps, - verbose = FALSE, ret_extra = c()) { + verbose = FALSE, ret_extra = c(), use_cpp = FALSE, + n_threads = 1) { tsmessage("Calculating pairwise distances") if (methods::is(X, "dist")) { cost$R <- as.matrix(X) } else { - cost$R <- sqrt(safe_dist2(X)) + cost$R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) } cost$eps <- eps cost } # Metric MDS, minimizing strain. 
-mmds <- function(eps = .Machine$double.eps) { +mmds <- function(eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 1) { list( init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra) + cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra, + use_cpp = use_cpp, n_threads = n_threads) cost }, pfn = function(cost, Y) { @@ -1109,7 +1118,7 @@ mmds <- function(eps = .Machine$double.eps) { cost }, update = function(cost, Y) { - cost$D <- sqrt(safe_dist2(Y)) + cost$D <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads) cost }, sentinel = "D", @@ -1128,12 +1137,13 @@ mmds <- function(eps = .Machine$double.eps) { ) } -smmds <- function(eps = .Machine$double.eps) { +smmds <- function(eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 1) { lreplace( - mmds(), + mmds(use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, eps = .Machine$double.eps, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra) + cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra, + use_cpp = use_cpp, n_threads = n_threads) cost$R2 <- cost$R * cost$R cost$R <- NULL cost @@ -1165,17 +1175,18 @@ smmds <- function(eps = .Machine$double.eps) { res }, update = function(cost, Y) { - cost$D2 <- dist2(Y) + cost$D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) cost }, sentinel = "D2" ) } -sammon <- function(eps = .Machine$double.eps) { - lreplace(mmds(), +sammon <- function(eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 1) { + lreplace(mmds(eps = eps, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra) + cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra, + use_cpp = use_cpp, n_threads = n_threads) cost$rsum_inv <- 1 / sum(cost$R) cost }, @@ -1193,12 +1204,13 @@ sammon <- function(eps = .Machine$double.eps) { } -gmmds <- function(k, eps = .Machine$double.eps, n_threads = 0) { +gmmds <- function(k, eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 0) { lreplace( - mmds(), + mmds(use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost$R <- geodesic(X, k, n_threads = n_threads, verbose = verbose) + cost$R <- geodesic(X, k, n_threads = n_threads, use_cpp = use_cpp, + verbose = verbose) cost$eps <- eps cost }, @@ -1220,13 +1232,14 @@ gmmds <- function(k, eps = .Machine$double.eps, n_threads = 0) { # Define neighborhoods using a radius based on a fraction (f) of all input # distances (sorted by increasing length), don't correct non-neighborhood # distances unless they smaller than the input distance -ballmmds <- function(f = 0.1, eps = .Machine$double.eps) { +ballmmds <- function(f = 0.1, eps = .Machine$double.eps, use_cpp = FALSE, + n_threads = 1) { lreplace( - mmds(), + mmds(use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost = cost, X = X, max_iter = max_iter, eps = eps, verbose = verbose, - ret_extra = ret_extra) - + cost <- mmds_init(cost = cost, X = X, max_iter = max_iter, eps = eps, + verbose = verbose, ret_extra = ret_extra, + use_cpp = use_cpp, n_threads = n_threads) rs <- cost$R[upper.tri(cost$R)] rmax <- Rfast::nth(rs, max(1, round(f * length(rs)))) if (verbose) { @@ -1274,7 +1287,7 @@ ballmmds <- function(f = 0.1, eps = .Machine$double.eps) { res }, 
    update = function(cost, Y) {
-      cost$D <- sqrt(safe_dist2(Y))
+      cost$D <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads)
       cost
     }
   )
 }
@@ -1282,12 +1295,14 @@
 # Create the symmetrized knn graph, don't correct non-neighborhood distances
 # unless they smaller than the input distance
-knnmmds <- function(k, eps = .Machine$double.eps, n_threads = 0) {
+knnmmds <- function(k, eps = .Machine$double.eps, use_cpp = FALSE,
+                    n_threads = 0) {
   lreplace(
-    mmds(),
+    mmds(use_cpp = use_cpp, n_threads = n_threads),
     init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) {
       cost <- mmds_init(cost = cost, X = X, max_iter = max_iter, eps = eps,
-                        verbose = verbose, ret_extra = ret_extra)
+                        verbose = verbose, ret_extra = ret_extra,
+                        use_cpp = use_cpp, n_threads = n_threads)
       knn <- knn_graph(X = X, k = k, n_threads = n_threads, verbose = verbose)
       # symmetrize
       cost$knn <- pmax(knn, t(knn))
@@ -1310,6 +1325,7 @@ knnmmds <- function(k, eps = .Machine$double.eps, n_threads = 0) {
       eps <- cost$eps
       R <- cost$R
       D <- cost$D
+
       K <- -4 * (R - D) / (D + eps)
       knn <- cost$knn
@@ -1332,7 +1348,7 @@ knnmmds <- function(k, eps = .Machine$double.eps, n_threads = 0) {
       res
     },
     update = function(cost, Y) {
-      cost$D <- sqrt(safe_dist2(Y))
+      cost$D <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads)
       cost
     }
   )
@@ -1349,7 +1365,7 @@ knnmmds <- function(k, eps = .Machine$double.eps, n_threads = 0) {
 # squared input distances. Otherwise, no weighting is applied.
 ee <- function(perplexity, lambda = 100, neg_weights = TRUE,
                inp_kernel = "gaussian", symmetrize = "symmetric",
-               eps = .Machine$double.eps, n_threads = 0) {
+               eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 0) {
   list(
     init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) {
       symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
@@ -1358,7 +1374,7 @@ ee <- function(perplexity, lambda = 100, neg_weights = TRUE,
         R <- X
       }
       else {
-        R <- sqrt(safe_dist2(X))
+        R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads)
       }
       cost$Vn <- R / sum(R)
     }
@@ -1368,7 +1384,7 @@ ee <- function(perplexity, lambda = 100, neg_weights = TRUE,
       cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel,
                        symmetrize = symmetrize, normalize = TRUE,
                        verbose = verbose, ret_extra = ret_extra,
-                       n_threads = n_threads)
+                       n_threads = n_threads, use_cpp = use_cpp)
       cost$eps <- eps
       cost
     },
@@ -1423,17 +1439,17 @@ ee <- function(perplexity, lambda = 100, neg_weights = TRUE,
 # default lambda = 0.9 from "Majorization-Minimization for Manifold Embedding"
 # Yang, Peltonen, Kaski 2015
 nerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian",
-                 eps = .Machine$double.eps, n_threads = 0) {
+                 eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) {
   lambda2 <- 2 * lambda
   oml <- 1 - lambda
   oml2 <- 2 * oml
   lreplace(
-    tsne(perplexity = perplexity),
+    tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads),
     init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) {
       cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel,
                        symmetrize = "none", normalize = FALSE,
                        verbose = verbose, ret_extra = ret_extra,
-                       n_threads = n_threads)
+                       n_threads = n_threads, use_cpp = use_cpp)
       cost$eps <- eps
       cost
     },
@@ -1546,7 +1562,7 @@ snerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian",
 # kappa = 0 behaves like ASNE
 # kappa = 1 behaves like NeRV with lambda = 0. Yes that's confusing.
jse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { eps0 <- 1e-5 kappa <- max(kappa, eps0) kappa <- min(kappa, 1 - eps0) @@ -1557,12 +1573,12 @@ jse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian", om_kappa_inv <- 1 / om_kappa lreplace( - ssne(perplexity = perplexity), + ssne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "none", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -1666,14 +1682,15 @@ sjse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian", rsrnerv <- function(perplexity, lambda = 0.9, eps = .Machine$double.eps, - n_threads = 0) { - lreplace(nerv(perplexity = perplexity, lambda = lambda), + n_threads = 0, use_cpp = FALSE) { + lreplace(nerv(perplexity = perplexity, lambda = lambda, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { cost <- sne_init(cost, X, perplexity = perplexity, symmetrize = "symmetric", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) P <- cost$P P <- P / rowSums(P) cost$P <- P @@ -1684,14 +1701,16 @@ rsrnerv <- function(perplexity, lambda = 0.9, eps = .Machine$double.eps, ) } -rsrjse <- function(perplexity, kappa = 0.5, eps = .Machine$double.eps, n_threads = 0) { - lreplace(jse(perplexity = perplexity, kappa = kappa), +rsrjse <- function(perplexity, kappa = 0.5, eps = .Machine$double.eps, + n_threads = 0, use_cpp = FALSE) { + lreplace(jse(perplexity = perplexity, kappa = kappa, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { cost <- sne_init(cost, X, perplexity = perplexity, symmetrize = "symmetric", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) P <- cost$P P <- P / rowSums(P) cost$P <- P @@ -1705,19 +1724,20 @@ rsrjse <- function(perplexity, kappa = 0.5, eps = .Machine$double.eps, n_threads # NeRV with input bandwidths transferred to the output kernel, as in the # original paper. 
bnerv <- function(perplexity, lambda = 0.9, eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lambda2 <- 2 * lambda oml <- 1 - lambda oml2 <- 2 * oml lreplace( - nerv(perplexity = perplexity, lambda = lambda), + nerv(perplexity = perplexity, lambda = lambda, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { ret_extra <- unique(c(ret_extra, 'beta')) cost <- sne_init(cost, X, perplexity = perplexity, symmetrize = "none", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost$lambda2b <- lambda2 * cost$beta cost$oml2b <- oml2 * cost$beta diff --git a/smallvis/R/smallvis-package.R b/smallvis/R/smallvis-package.R new file mode 100644 index 0000000..37a033d --- /dev/null +++ b/smallvis/R/smallvis-package.R @@ -0,0 +1,4 @@ +## usethis namespace: start +#' @useDynLib smallvis, .registration = TRUE +## usethis namespace: end +NULL \ No newline at end of file diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R index 0ac0a98..55b2665 100644 --- a/smallvis/R/smallvis.R +++ b/smallvis/R/smallvis.R @@ -524,7 +524,12 @@ #' to those value which are returned when this value is \code{TRUE}. See the #' \code{Value} section for details. #' @param n_threads Number of threads to use in multi-threaded code. Default is -#' 0, which means no multi-threading. +#' 0, which means no multi-threading. Mainly affects the calculation of things +#' like distance matrices if you set \code{use_cpp = TRUE}. Otherwise, only +#' methods that need to calculate nearest neighbors will be affected. +#' @param use_cpp If \code{TRUE} use multi-threaded C++ code to calculate some +#' matrices. Default is \code{FALSE}. This won't speed up all steps and you +#' will want to use this in conjunction with \code{n_threads}. #' @param eps Set epsilon for avoiding division-by-zero errors. 
Default is #' \code{.Machine$double.eps}, but if you see inconsistent convergence results #' with optimizer that should be reducing the cost each iteration, then try @@ -855,6 +860,7 @@ smallvis <- function(X, k = 2, scale = "absmax", tol_wait = 15, ret_extra = FALSE, n_threads = 0, + use_cpp = FALSE, eps = .Machine$double.eps, verbose = TRUE) { @@ -892,20 +898,20 @@ smallvis <- function(X, k = 2, scale = "absmax", largevis = largevis(perplexity = perplexity, n_threads = n_threads, eps = eps), tumap = tumap(perplexity = perplexity, n_threads = n_threads, eps = eps), ntumap = ntumap(perplexity = perplexity, n_threads = n_threads, eps = eps), - mmds = mmds(eps = eps), - gmmds = gmmds(k = perplexity, n_threads = n_threads, eps = eps), + mmds = mmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp), + gmmds = gmmds(k = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), asne = asne(perplexity = perplexity, n_threads = n_threads, eps = eps), ssne = ssne(perplexity = perplexity, n_threads = n_threads, eps = eps), wtsne = wtsne(perplexity = perplexity, n_threads = n_threads, eps = eps), wssne = wssne(perplexity = perplexity, n_threads = n_threads, eps = eps), hssne = hssne(perplexity = perplexity, n_threads = n_threads, eps = eps), - ee = ee(perplexity = perplexity, n_threads = n_threads, eps = eps), + ee = ee(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), nerv = nerv(perplexity = perplexity, n_threads = n_threads, eps = eps), snerv = snerv(perplexity = perplexity, n_threads = n_threads, eps = eps), jse = jse(perplexity = perplexity, n_threads = n_threads, eps = eps), sjse = sjse(perplexity = perplexity, n_threads = n_threads, eps = eps), - smmds = smmds(eps = eps), - sammon = sammon(eps = eps), + smmds = smmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp), + sammon = sammon(n_threads = n_threads, eps = eps, use_cpp = use_cpp), tasne = tasne(perplexity = perplexity, n_threads = n_threads, eps = eps), trmsne = trmsne(perplexity = perplexity, n_threads = n_threads, eps = eps), trsrsne = trsrsne(perplexity = perplexity, n_threads = n_threads, eps = eps), @@ -918,8 +924,8 @@ smallvis <- function(X, k = 2, scale = "absmax", basne = basne(perplexity = perplexity, n_threads = n_threads, eps = eps), btasne = btasne(perplexity = perplexity, n_threads = n_threads, eps = eps), bnerv = bnerv(perplexity = perplexity, n_threads = n_threads, eps = eps), - ballmmds = ballmmds(eps = eps), - knnmmds = knnmmds(k = perplexity, n_threads = n_threads, eps = eps), + ballmmds = ballmmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp), + knnmmds = knnmmds(k = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), dhssne = dhssne(perplexity = perplexity, n_threads = n_threads, eps = eps), tsneu = tsneu(perplexity = perplexity, n_threads = n_threads, eps = eps), pstsne = pstsne(perplexity = perplexity, n_threads = n_threads, eps = eps), @@ -930,7 +936,7 @@ smallvis <- function(X, k = 2, scale = "absmax", absne = absne(perplexity = perplexity, n_threads = n_threads, eps = eps), chsne = chsne(perplexity = perplexity, n_threads = n_threads, eps = eps), hlsne = hlsne(perplexity = perplexity, n_threads = n_threads, eps = eps), - gsne = gsne(perplexity = perplexity, n_threads = n_threads, eps = eps), + gsne = gsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), rklsne = rklsne(perplexity = perplexity, n_threads = n_threads, eps = eps), jssne = jssne(perplexity = perplexity, n_threads = n_threads, eps = eps), abssne = 
abssne(perplexity = perplexity, n_threads = n_threads, eps = eps), @@ -1153,7 +1159,8 @@ smallvis <- function(X, k = 2, scale = "absmax", cost_fn = cost_fn, itercosts = itercosts, start_time = start_time, optionals = ret_optionals, pca = ifelse(pca && !whiten, initial_dims, 0), - whiten = ifelse(pca && whiten, initial_dims, 0))) + whiten = ifelse(pca && whiten, initial_dims, 0), + use_cpp = use_cpp, n_threads = n_threads)) } opt_stages <- c() @@ -1387,7 +1394,8 @@ smallvis <- function(X, k = 2, scale = "absmax", exaggeration_factor, late_exaggeration_factor, optionals = ret_optionals, pca = ifelse(pca && !whiten, initial_dims, 0), - whiten = ifelse(pca && whiten, initial_dims, 0)) + whiten = ifelse(pca && whiten, initial_dims, 0), + use_cpp = use_cpp, n_threads = n_threads) res } @@ -1883,13 +1891,13 @@ make_smallvis_cb <- function(df) { palette <- NULL function(Y, iter, cost = NULL) { if (is.null(palette)) { - palette <- vizier:::make_palette(ncolors = nrow(Y), color_scheme = rainbow) + palette <- vizier:::color_helper(df, color_scheme = rainbow)$palette } title <- paste0("iter: ", iter) if (!(is.null(cost) || is.na(cost))) { title <- paste0(title, " cost = ", formatC(cost)) } - vizier::embed_plot(Y, df, title = title, colors = palette) + vizier::embed_plot(Y, df, title = title, color_scheme = palette) } } @@ -1909,7 +1917,7 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = stop_lying_iter = NULL, start_late_lying_iter = NULL, opt_input = NULL, opt_res = NULL, exaggeration_factor = 1, late_exaggeration_factor = 1, - optionals = c()) { + optionals = c(), use_cpp = FALSE, n_threads = 1) { attr(Y, "dimnames") <- NULL if (ret_extra) { end_time <- Sys.time() @@ -2015,11 +2023,11 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = res$DX <- X } else { - res$DX <- sqrt(safe_dist2(X)) + res$DX <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) } } else if (o == "dy") { - res$DY <- sqrt(safe_dist2(Y)) + res$DY <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads) } if (o == "x") { @@ -2258,7 +2266,8 @@ shannon <- function(D2, beta) { ) } -x2aff_sigma <- function(X, sigma = 1e-3, verbose = FALSE) { +x2aff_sigma <- function(X, sigma = 1e-3, verbose = FALSE, use_cpp = FALSE, + n_threads = 1) { x_is_dist <- methods::is(X, "dist") if (x_is_dist) { D <- X @@ -2267,7 +2276,7 @@ x2aff_sigma <- function(X, sigma = 1e-3, verbose = FALSE) { D <- D * D } else { - D <- safe_dist2(X) + D <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) } beta <- 1 / (sigma * sigma) sres <- shannon(D, beta) @@ -2360,11 +2369,12 @@ knn_graph <- function(X, k, n_threads, verbose) { # Given data X and k nearest neighbors, return a geodisic distance matrix # Disconnections are treated by using the Euclidean distance. 
-geodesic <- function(X, k, fill = TRUE, n_threads = 0, verbose = FALSE) { +geodesic <- function(X, k, fill = TRUE, use_cpp = FALSE, n_threads = 0, + verbose = FALSE) { tsmessage("Calculating geodesic distances with k = ", k) - # The hard work is done by Rfast's implementation of Floyd's algorithm R <- knn_dist(X, k, n_threads = n_threads, verbose = verbose) + # The hard work is done by Rfast's implementation of Floyd's algorithm G <- Rfast::floyd(R) if (any(is.infinite(G)) && fill) { tsmessage("k = ", k, " resulted in disconnections: filling with Euclidean distances") @@ -2372,7 +2382,7 @@ geodesic <- function(X, k, fill = TRUE, n_threads = 0, verbose = FALSE) { R <- as.matrix(X) } else { - R <- sqrt(safe_dist2(X)) + R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) } G[is.infinite(G)] <- R[is.infinite(G)] } @@ -2529,6 +2539,24 @@ dist2 <- function(X) { D2 + sweep(X %*% t(X) * -2, 2, t(D2), `+`) } +calc_d2 <- function(X, use_cpp = FALSE, n_threads = 1) { + if (use_cpp) { + dist2_cpp(X, n_threads = n_threads) + } + else { + safe_dist2(X) + } +} + +calc_d <- function(X, use_cpp = FALSE, n_threads = 1) { + if (use_cpp) { + dist_cpp(X, n_threads = n_threads) + } + else { + sqrt(safe_dist2(X)) + } +} + # Squared Euclidean distances, ensuring no small -ve distances can occur safe_dist2 <- function(X) { D2 <- dist2(X) diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R index 882bff3..741aeb4 100644 --- a/smallvis/R/sne.R +++ b/smallvis/R/sne.R @@ -83,7 +83,7 @@ kl_cost <- function(cost, Y) { # t-SNE tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", normalize = TRUE, row_normalize = TRUE, - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { list( init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { @@ -92,7 +92,7 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", symmetrize = symmetrize, normalize = normalize, row_normalize = row_normalize, verbose = verbose, ret_extra = ret_extra, - n_threads = 0) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -147,10 +147,11 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", # In \emph{International Conference on Artificial Intelligence and Statistics} (pp. 67-74). ssne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lreplace( tsne(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, eps = eps, n_threads = n_threads), + symmetrize = symmetrize, eps = eps, n_threads = n_threads, + use_cpp = use_cpp), pfn = kl_costQ, gr = function(cost, Y) { cost <- cost$update(cost, Y) @@ -171,14 +172,14 @@ ssne <- function(perplexity, inp_kernel = "gaussian", # Stochastic neighbor embedding. # In \emph{Advances in neural information processing systems} (pp. 833-840). 
asne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0) { - lreplace(tsne(perplexity), + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { + lreplace(tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "none", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -234,11 +235,11 @@ hssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian", # exists to demonstrate that constant beta doesn't have any meaningful effect # on the results. bhssne <- function(perplexity, alpha = 0.5, beta = 1, - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { alpha <- max(alpha, 1e-8) beta <- max(beta, 1e-8) lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), b4 = 4 * beta, ab = alpha * beta, apow = -1 / alpha, @@ -247,7 +248,7 @@ bhssne <- function(perplexity, alpha = 0.5, beta = 1, cost <- sne_init(cost, X, perplexity = perplexity, symmetrize = "symmetric", normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) # override input bandwidths with fixed beta (although this doesn't do much) if (!is.null(beta)) { cost$beta <- beta @@ -340,15 +341,16 @@ dhssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian", # (pp. 460-468). wtsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { - lreplace(tsne(perplexity = perplexity), + n_threads = 0, use_cpp = FALSE) { + lreplace(tsne(perplexity = perplexity, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) ret_extra <- c(ret_extra, "pdeg") - cost <- sne_init(cost, X, perplexity = perplexity, - kernel = inp_kernel, symmetrize = symmetrize, - normalize = TRUE, verbose = verbose, - ret_extra = ret_extra, n_threads = n_threads) + cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, + symmetrize = symmetrize, normalize = TRUE, + verbose = verbose, ret_extra = ret_extra, + n_threads = n_threads, use_cpp = use_cpp) # P matrix degree centrality: column sums deg <- cost$pdeg if (verbose) { @@ -381,15 +383,17 @@ wtsne <- function(perplexity, inp_kernel = "gaussian", wssne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { - lreplace(ssne(perplexity = perplexity), + n_threads = 0, use_cpp = FALSE) { + lreplace(ssne(perplexity = perplexity, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) ret_extra <- c(ret_extra, "pdeg") cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, - ret_extra = ret_extra, n_threads = n_threads) + ret_extra = ret_extra, n_threads = n_threads, + use_cpp = use_cpp) # P matrix degree centrality: column sums deg <- cost$pdeg if (verbose) { @@ -416,15 +420,16 @@ wssne <- function(perplexity, inp_kernel = "gaussian", # t-SNE but 
with the gradient defined in terms of un-normalized weights # Exists entirely as an academic exercise tsneu <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel), + tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { ret_extra = unique(c(ret_extra, "V")) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "symmetric", normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps @@ -447,15 +452,16 @@ tsneu <- function(perplexity, inp_kernel = "gaussian", # A pseudo-separable approximation of t-SNE, where the output weight sum is only # recalculated during the epoch pstsne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel), + tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { ret_extra = unique(c(ret_extra, "V")) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "symmetric", normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) # need to row-normalize and symmetrize affinities cost$V <- cost$V / rowSums(cost$V) @@ -503,16 +509,17 @@ pstsne <- function(perplexity, inp_kernel = "gaussian", # EE-like cost function in terms of I-Divergence # Scaled to give a gradient similar in form to t-SNE tee <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", - lambda = 0.01, eps = .Machine$double.eps, n_threads = 0) { + lambda = 0.01, eps = .Machine$double.eps, n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) ret_extra = unique(c(ret_extra, "V", "dint")) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) V <- cost$P cost$eps <- eps cost$invN <- 1 / sum(V) @@ -564,15 +571,15 @@ skdtsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0) { # Use the UMAP curve family in output kernel usne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", spread = 1, min_dist = 0.001, gr_eps = 0.1, - eps = .Machine$double.eps, n_threads = 0) { + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity), + tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = symmetrize, normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = 
n_threads, use_cpp = use_cpp) cost <- init_ab(cost, spread = spread, min_dist = min_dist, verbose = verbose) cost$eps <- eps cost @@ -600,14 +607,14 @@ usne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", # UMAP cross entropy cost instead of KL divergence cetsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0) { - lreplace(tsne(perplexity), + n_threads = 0, use_cpp = FALSE) { + lreplace(tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "symmetric", normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost @@ -660,14 +667,15 @@ cetsne <- function(perplexity, inp_kernel = "gaussian", # t-SNE with input kernel bandwidths transferred to output btsne <- function(perplexity, inp_kernel = "gaussian", beta = NULL, - eps = .Machine$double.eps, n_threads = 0) { - lreplace(tsne(perplexity = perplexity, inp_kernel = inp_kernel), + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { + lreplace(tsne(perplexity = perplexity, inp_kernel = inp_kernel, + use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { ret_extra <- unique(c(ret_extra, 'beta')) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "symmetric", normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) # override input bandwidths with fixed beta (although this doesn't do much) if (!is.null(beta)) { cost$beta <- beta @@ -694,14 +702,15 @@ btsne <- function(perplexity, inp_kernel = "gaussian", beta = NULL, # SSNE with input kernel bandwidths transferred to output bssne <- function(perplexity, inp_kernel = "gaussian", beta = NULL, - eps = .Machine$double.eps, n_threads = 0) { - lreplace(ssne(perplexity = perplexity, inp_kernel = inp_kernel), + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { + lreplace(ssne(perplexity = perplexity, inp_kernel = inp_kernel, + use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { ret_extra <- unique(c(ret_extra, 'beta')) cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "symmetric", normalize = TRUE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) if (!is.null(beta)) { cost$beta <- beta } @@ -724,16 +733,16 @@ bssne <- function(perplexity, inp_kernel = "gaussian", beta = NULL, # ASNE with input kernel bandwidths transferred to output basne <- function(perplexity, beta = NULL, eps = .Machine$double.eps, - n_threads = 0) { + n_threads = 0, use_cpp = FALSE) { lreplace( - asne(perplexity = perplexity), + asne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { ret_extra <- unique(c(ret_extra, 'beta')) cost <- sne_init(cost, X, perplexity = perplexity, symmetrize = "none", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) # override input bandwidths with fixed beta (although this doesn't do much) if (!is.null(beta)) { cost$beta <- beta @@ -779,14 +788,15 @@ btasne <- 
function(perplexity, beta = NULL, eps = .Machine$double.eps, ) } -tasne <- function(perplexity, n_threads = 0) { - lreplace(tsne(perplexity = perplexity), +tasne <- function(perplexity, n_threads = 0, use_cpp = FALSE) { + lreplace(tsne(perplexity = perplexity, use_cpp = use_cpp, + n_threads = n_threads), init = function(cost, X, max_iter, eps = .Machine$double.eps, verbose = FALSE, ret_extra = c()) { cost <- sne_init(cost, X, perplexity = perplexity, symmetrize = "none", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -814,13 +824,14 @@ tasne <- function(perplexity, n_threads = 0) { # ASNE but with the t-distributed kernel tasne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0) { - lreplace(tsne(perplexity = perplexity), + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { + lreplace(tsne(perplexity = perplexity, n_threads = n_threads, + use_cpp = use_cpp), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, inp_kernel <- inp_kernel, + cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, symmetrize = "none", normalize = FALSE, verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) + n_threads = n_threads, use_cpp = use_cpp) cost$eps <- eps cost }, @@ -846,23 +857,26 @@ tasne <- function(perplexity, inp_kernel = "gaussian", # t-RM-SNE # t-SNE without symmetrization of P (but still pair-normalizing) # row-normalize, then matrix normalize -trmsne <- function(perplexity, inp_kernel = "gaussian", eps = .Machine$double.eps, n_threads = 0) { - lreplace(tsne(perplexity = perplexity, inp_kernel = inp_kernel), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "none", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads) - cost$eps <- eps - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - W <- cost$W - cost$G <- k2g(Y, 2 * W * (cost$P - W / cost$Z), symmetrize = TRUE) - cost - } +trmsne <- function(perplexity, inp_kernel = "gaussian", + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { + lreplace( + tsne(perplexity = perplexity, inp_kernel = inp_kernel, + n_threads = n_threads, use_cpp = use_cpp), + init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { + cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, + symmetrize = "none", normalize = TRUE, + verbose = verbose, ret_extra = ret_extra, + n_threads = n_threads, use_cpp = use_cpp) + cost$eps <- eps + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + + W <- cost$W + cost$G <- k2g(Y, 2 * W * (cost$P - W / cost$Z), symmetrize = TRUE) + cost + } ) } @@ -870,14 +884,16 @@ trmsne <- function(perplexity, inp_kernel = "gaussian", eps = .Machine$double.ep # t-SNE but without row-normalizing or symmetrizing, just matrix normalization # Not recommended tmsne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0) { - lreplace(trmsne(perplexity = perplexity, inp_kernel = inp_kernel), + eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { + lreplace(trmsne(perplexity = perplexity, inp_kernel = inp_kernel, + eps = eps, n_threads = n_threads, use_cpp = use_cpp), init = function(cost, X, max_iter, verbose = FALSE, ret_extra = 
c()) {
       cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel,
                        symmetrize = "none", row_normalize = FALSE,
                        normalize = TRUE, verbose = verbose,
-                       ret_extra = ret_extra, n_threads = n_threads)
+                       ret_extra = ret_extra, n_threads = n_threads,
+                       use_cpp = use_cpp)
       cost$eps <- eps
       cost
     }
   )
 }
@@ -886,14 +902,15 @@
 # RSR row-normalize, symmetrize, then row-normalize again
 # Might work a tiny bit better than t-ASNE?
-trsrsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0) {
-  lreplace(tasne(perplexity),
+trsrsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0,
+                    use_cpp = FALSE) {
+  lreplace(tasne(perplexity, use_cpp = use_cpp, n_threads = n_threads),
     init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) {
       cost <- sne_init(cost, X, perplexity = perplexity,
                        symmetrize = "symmetric", normalize = FALSE,
                        verbose = verbose, ret_extra = ret_extra,
-                       n_threads = n_threads)
+                       n_threads = n_threads, use_cpp = use_cpp)
       P <- cost$P
       P <- P / rowSums(P)
       cost$P <- P
@@ -905,14 +922,15 @@
-arsrsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0) {
-  lreplace(asne(perplexity),
+arsrsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0,
+                    use_cpp = FALSE) {
+  lreplace(asne(perplexity, use_cpp = use_cpp, n_threads = n_threads),
     init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) {
       cost <- sne_init(cost, X, perplexity = perplexity,
                        symmetrize = "symmetric", normalize = FALSE,
                        verbose = verbose, ret_extra = ret_extra,
-                       n_threads = n_threads)
+                       n_threads = n_threads, use_cpp = use_cpp)
       P <- cost$P
       P <- P / rowSums(P)
       cost$P <- P
@@ -970,6 +988,7 @@ sne_init <- function(cost,
                      row_normalize = TRUE,
                      normalize = TRUE,
                      n_threads = 0,
+                     use_cpp = FALSE,
                      verbose = FALSE,
                      ret_extra = c()) {
@@ -1017,7 +1036,8 @@
   } else if (tolower(kernel) == "sigma") {
     tsmessage("Using fixed sigma = ", formatC(perplexity))
-    x2ares <- x2aff_sigma(X, sigma = perplexity, verbose = verbose)
+    x2ares <- x2aff_sigma(X, sigma = perplexity, n_threads = n_threads,
+                          use_cpp = use_cpp, verbose = verbose)
     P <- x2ares$W
   } else {
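All of the sne.R constructors above funnel use_cpp and n_threads through sne_init and into the calc_d()/calc_d2() helpers added in smallvis.R, so the compiled and pure-R paths should agree to rounding error. A quick interactive check once the package is rebuilt (a sketch only; dist2_cpp and safe_dist2 are internal functions, hence the ::: access):

    # Compare the pure-R squared-distance matrix with the threaded C++ one.
    X <- as.matrix(iris[, 1:4])
    d2_r   <- smallvis:::safe_dist2(X)
    d2_cpp <- smallvis:::dist2_cpp(X, n_threads = 2)
    max(abs(d2_r - d2_cpp))  # expect numerical noise only, ~1e-12 or smaller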
diff --git a/smallvis/man/smallvis.Rd b/smallvis/man/smallvis.Rd
index 7a4e383..b3afb4c 100644
--- a/smallvis/man/smallvis.Rd
+++ b/smallvis/man/smallvis.Rd
@@ -37,6 +37,7 @@ smallvis(
   tol_wait = 15,
   ret_extra = FALSE,
   n_threads = 0,
+  use_cpp = FALSE,
   eps = .Machine$double.eps,
   verbose = TRUE
 )
@@ -195,7 +196,13 @@ to those value which are returned when this value is \code{TRUE}. See the
 \code{Value} section for details.}
 
 \item{n_threads}{Number of threads to use in multi-threaded code. Default is
-0, which means no multi-threading.}
+0, which means no multi-threading. Mainly affects the calculation of things
+like distance matrices if you set \code{use_cpp = TRUE}. Otherwise, only
+methods that need to calculate nearest neighbors will be affected.}
+
+\item{use_cpp}{If \code{TRUE} use multi-threaded C++ code to calculate some
+matrices. Default is \code{FALSE}. This won't speed up all steps and you
+will want to use this in conjunction with \code{n_threads}.}
 
 \item{eps}{Set epsilon for avoiding division-by-zero errors. Default is
 \code{.Machine$double.eps}, but if you see inconsistent convergence results
diff --git a/smallvis/src/.gitignore b/smallvis/src/.gitignore
new file mode 100644
index 0000000..22034c4
--- /dev/null
+++ b/smallvis/src/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.so
+*.dll
diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp
new file mode 100644
index 0000000..8c05cba
--- /dev/null
+++ b/smallvis/src/RcppExports.cpp
@@ -0,0 +1,47 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+#ifdef RCPP_USE_GLOBAL_ROSTREAM
+Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
+#endif
+
+// dist2_cpp
+NumericMatrix dist2_cpp(NumericMatrix input, std::size_t n_threads);
+RcppExport SEXP _smallvis_dist2_cpp(SEXP inputSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericMatrix >::type input(inputSEXP);
+    Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(dist2_cpp(input, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
+// dist_cpp
+NumericMatrix dist_cpp(NumericMatrix input, std::size_t n_threads);
+RcppExport SEXP _smallvis_dist_cpp(SEXP inputSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericMatrix >::type input(inputSEXP);
+    Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(dist_cpp(input, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
+
+static const R_CallMethodDef CallEntries[] = {
+    {"_smallvis_dist2_cpp", (DL_FUNC) &_smallvis_dist2_cpp, 2},
+    {"_smallvis_dist_cpp", (DL_FUNC) &_smallvis_dist_cpp, 2},
+    {NULL, NULL, 0}
+};
+
+RcppExport void R_init_smallvis(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
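RcppExports.cpp above and R/RcppExports.R are both generated files: the CallEntries registration table is what the `.Call()` wrappers in R/RcppExports.R resolve against. A sketch of the standard Rcpp round trip after editing the hand-written matrix.cpp below (devtools is a convenience assumption; R CMD INSTALL from the package root works too):

    # Regenerate the glue code after changing src/matrix.cpp, then reinstall.
    Rcpp::compileAttributes()  # rewrites R/RcppExports.R and src/RcppExports.cpp
    devtools::install()        # or R CMD INSTALL smallvis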
+  std::size_t d = input.ncol();
+
+  std::vector<double> transposed_data(n * d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      transposed_data[i * d + j] = input(i, j);
+    }
+  }
+
+  std::vector<double> dist_matrix(n * n, 0.0);
+
+  std::size_t chunk_size = (n + n_threads - 1) / n_threads;
+
+  std::vector<std::thread> threads;
+  for (std::size_t t = 0; t < n_threads; ++t) {
+    std::size_t start_row = t * chunk_size;
+    std::size_t end_row = std::min(start_row + chunk_size, n);
+    threads.emplace_back(d2, std::cref(transposed_data), std::ref(dist_matrix),
+                         start_row, end_row, n, d);
+  }
+  for (auto &thread : threads) {
+    thread.join();
+  }
+
+  NumericMatrix result(n, n);
+  std::copy(dist_matrix.begin(), dist_matrix.end(), result.begin());
+
+  return result;
+}
+
+// [[Rcpp::export]]
+NumericMatrix dist_cpp(NumericMatrix input, std::size_t n_threads = 1) {
+  if (n_threads == 0) {
+    n_threads = 1;
+  }
+  std::size_t n = input.nrow();
+  std::size_t d = input.ncol();
+
+  std::vector<double> transposed_data(n * d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      transposed_data[i * d + j] = input(i, j);
+    }
+  }
+
+  std::vector<double> dist_matrix(n * n, 0.0);
+
+  std::size_t chunk_size = (n + n_threads - 1) / n_threads;
+
+  std::vector<std::thread> threads;
+  for (std::size_t t = 0; t < n_threads; ++t) {
+    std::size_t start_row = t * chunk_size;
+    std::size_t end_row = std::min(start_row + chunk_size, n);
+    threads.emplace_back(dist, std::cref(transposed_data),
+                         std::ref(dist_matrix), start_row, end_row, n, d);
+  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
+
+  NumericMatrix result(n, n);
+  std::copy(dist_matrix.begin(), dist_matrix.end(), result.begin());
+
+  return result;
+}
diff --git a/smallvis/tests/testthat/test_api.R b/smallvis/tests/testthat/test_api.R
index 44ddc92..a1db4f1 100644
--- a/smallvis/tests/testthat/test_api.R
+++ b/smallvis/tests/testthat/test_api.R
@@ -168,6 +168,17 @@ test_that("smmds", {
   expect_equal(final_cost(res), 0.3038, tolerance = 1e-4)
 })
 
+test_that("smmds-cpp", {
+  res <- smallvis(iris10, Y_init = iris10_Y, method = "smmds", eta = 0.001,
+                  epoch_callback = NULL, verbose = FALSE, ret_extra = TRUE,
+                  use_cpp = TRUE)
+  expect_equal(res$Y, c2y(-0.4952, 0.3507, 0.3716, 0.5528, -0.502, -1.449, 0.1852, -0.2768,
+                          1.042, 0.2206, 0.1035, 0.4062, -0.1173, -0.07106, -0.1869, -0.02864,
+                          -0.459, 0.09674, -0.1043, 0.3608), tolerance = 1e-3)
+  expect_equal(final_cost(res), 0.3038, tolerance = 1e-4)
+})
+
+
 test_that("gmmds", {
   res <- smallvis(iris10, Y_init = iris10_Y, method = "gmmds", eta = 0.1,
                   perplexity = 3,

From 5b901dfa2eb4f70b187523446d33b91599ac5d15 Mon Sep 17 00:00:00 2001
From: James Melville <jlmelville@gmail.com>
Date: Sat, 27 Jul 2024 17:13:15 -0700
Subject: [PATCH 02/26] more C++ dist/t-weight experiments

---
 smallvis/R/RcppExports.R     |  4 ++
 smallvis/R/cost.R            | 58 ++++++++++++-----------
 smallvis/R/smallvis.R        | 89 ++++++++++++++++++++----------------
 smallvis/R/sne.R             | 68 +++++++++++++++------------
 smallvis/src/RcppExports.cpp | 13 ++++++
 smallvis/src/matrix.cpp      | 37 +++++++++++++++
 6 files changed, 173 insertions(+), 96 deletions(-)

diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R
index 923ffe2..5efe2ae 100644
--- a/smallvis/R/RcppExports.R
+++ b/smallvis/R/RcppExports.R
@@ -9,3 +9,7 @@ dist_cpp <- function(input, n_threads = 1L) {
   .Call(`_smallvis_dist_cpp`, input, n_threads)
 }
 
+tweight_cpp <- function(dist_matrix, n_threads) {
+  .Call(`_smallvis_tweight_cpp`, dist_matrix, n_threads)
+}
+
diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R
index c00e1ab..ed58500 100644
--- a/smallvis/R/cost.R
+++ b/smallvis/R/cost.R
@@ -160,13 +160,12 @@ largevis <- function(perplexity, inp_kernel = "gaussian",
     },
     gr = function(cost, Y) {
       cost <- cost_update(cost, Y)
-      W <- cost$W
       cost$G <- k2g(Y, 4 * W * (cost$P - ((gamma * W) / (1 + cost$greps1 * W))))
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
@@ -225,7 +224,7 @@ umap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap",
       cost
     },
     update = function(cost, Y) {
-      D2 <- dist2(Y)
+      D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       D2[D2 < 0] <- 0
 
       W <- 1 / (1 + cost$a * D2 ^ cost$b)
@@ -268,7 +267,7 @@ tumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
@@ -308,7 +307,7 @@ ntumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap",
     },
     update = function(cost, Y) {
       P <- cost$P
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
@@ -363,7 +362,7 @@ rklsne <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       sumW <- sum(W)
@@ -418,7 +417,7 @@ jssne <- function(perplexity, inp_kernel = "gaussian",
       eps <- cost$eps
       P <- cost$P
 
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       sumW <- sum(W)
@@ -483,7 +482,7 @@ chsne <- function(perplexity, inp_kernel = "gaussian",
     update = function(cost, Y) {
       P <- cost$P
 
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       Z <- sum(W)
@@ -546,7 +545,7 @@ hlsne <- function(perplexity, inp_kernel = "gaussian",
     update = function(cost, Y) {
       P <- cost$P
 
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       Z <- sum(W)
@@ -567,7 +566,7 @@ hlsne <- function(perplexity, inp_kernel = "gaussian",
 # alpha-beta divergence
 absne <- function(perplexity, inp_kernel = "gaussian",
                   symmetrize = "symmetric", alpha = 1, lambda = 1,
-                  eps = .Machine$double.eps, n_threads = 0) {
+                  eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) {
   beta <- lambda - alpha
 
   eps0 <- 1e-5
@@ -634,7 +633,7 @@ absne <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       Z <- sum(W)
@@ -706,7 +705,7 @@ absneb0 <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       Z <- sum(W)
@@ -776,7 +775,7 @@ absneamb <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       Z <- sum(W)
@@ -845,7 +844,7 @@ absnea0 <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       Z <- sum(W)
@@ -913,7 +912,7 @@ absne00 <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       Z <- sum(W)
@@ -990,7 +989,8 @@ abssne <- function(perplexity, inp_kernel = "gaussian",
     update = function(cost, Y) {
       eps <- cost$eps
 
-      Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE)$Q
+      Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE,
+                use_cpp = use_cpp, n_threads = n_threads)$Q
       cost$PaQb <- cost$Pa * powm(Q, beta, eps)
       cost$PaQbc <- colSums(cost$PaQb)
@@ -1066,7 +1066,7 @@ gsne <- function(perplexity, lambda = 1, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       What <- 1 + W
       W <- 1 / What
       diag(W) <- 0
@@ -1416,7 +1416,7 @@ ee <- function(perplexity, lambda = 100, neg_weights = TRUE,
       res
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- exp(-W)
       diag(W) <- 0
       cost$W <- W
@@ -1483,7 +1483,8 @@ nerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian",
     update = function(cost, Y) {
       eps <- cost$eps
 
-      Q <- expQ(Y, eps, is_symmetric = TRUE)$Q
+      Q <- expQ(Y, eps, is_symmetric = TRUE, use_cpp = use_cpp,
+                n_threads = n_threads)$Q
       cost$Q <- Q
 
       # Reverse KL gradient
@@ -1502,7 +1503,7 @@ nerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian",
 
 snerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian",
                   symmetrize = "symmetric", eps = .Machine$double.eps,
-                  n_threads = 0) {
+                  n_threads = 0, use_cpp = FALSE) {
   lambda4 <- 4 * lambda
   oml <- 1 - lambda
   oml4 <- 4 * oml
@@ -1539,7 +1540,8 @@ snerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian",
     update = function(cost, Y) {
       eps <- cost$eps
 
-      Q <- expQ(Y, eps, is_symmetric = TRUE, matrix_normalize = TRUE)$Q
+      Q <- expQ(Y, eps, is_symmetric = TRUE, matrix_normalize = TRUE,
+                use_cpp = use_cpp, n_threads = n_threads)$Q
       cost$Q <- Q
 
       # Reverse KL gradient
@@ -1610,7 +1612,8 @@ jse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian",
     update = function(cost, Y) {
       eps <- cost$eps
 
-      Q <- expQ(Y, eps = eps, is_symmetric = TRUE)$Q
+      Q <- expQ(Y, eps = eps, is_symmetric = TRUE, use_cpp = use_cpp,
+                n_threads = n_threads)$Q
 
       Z <- kappa * cost$P + om_kappa * Q
       Z[Z < eps] <- eps
@@ -1630,7 +1633,7 @@ jse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian",
 
 sjse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian",
                  symmetrize = "symmetric", eps = .Machine$double.eps,
-                 n_threads = 0) {
+                 n_threads = 0, use_cpp = FALSE) {
   eps0 <- 1e-5
   kappa <- max(kappa, eps0)
   kappa <- min(kappa, 1 - eps0)
@@ -1642,7 +1645,8 @@ sjse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian",
   lreplace(
     ssne(perplexity = perplexity, inp_kernel = inp_kernel,
-         symmetrize = symmetrize, eps = eps, n_threads = n_threads),
+         symmetrize = symmetrize, eps = eps, n_threads = n_threads,
+         use_cpp = use_cpp),
     pfn = function(cost, Y) {
       cost <- cost_update(cost, Y)
       eps <- cost$eps
@@ -1663,7 +1667,8 @@ sjse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian",
     update = function(cost, Y) {
       eps <- cost$eps
 
-      Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE)$Q
+      Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE,
+                use_cpp = use_cpp, n_threads = n_threads)$Q
       Z <- kappa * cost$P + om_kappa * Q
       Z[Z < eps] <- eps
       diag(Z) <- 0
@@ -1758,7 +1763,8 @@ bnerv <- function(perplexity, lambda = 0.9, eps = .Machine$double.eps,
     update = function(cost, Y) {
       eps <- cost$eps
 
-      Q <- expQ(Y, eps, beta = cost$beta, is_symmetric = FALSE)$Q
+      Q <- expQ(Y, eps, beta = cost$beta, is_symmetric = FALSE,
+                use_cpp = use_cpp, n_threads = n_threads)$Q
       cost$Q <- Q
 
       # Reverse KL gradient
diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R
index 55b2665..a3e7f27 100644
--- a/smallvis/R/smallvis.R
+++ b/smallvis/R/smallvis.R
@@ -893,54 +893,54 @@ smallvis <- function(X, k = 2, scale = "absmax",
   if (is.character(method)) {
     method <- match.arg(tolower(method), method_names)
     cost_fn <- switch(method,
-      tsne = tsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      umap = umap(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      largevis = largevis(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      tumap = tumap(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      ntumap = ntumap(perplexity = perplexity, n_threads = n_threads, eps = eps),
+      tsne = tsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      umap = umap(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      largevis = largevis(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      tumap = tumap(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      ntumap = ntumap(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       mmds = mmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       gmmds = gmmds(k = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
-      asne = asne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      ssne = ssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      wtsne = wtsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      wssne = wssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      hssne = hssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
+      asne = asne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      ssne = ssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      wtsne = wtsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      wssne = wssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      hssne = hssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       ee = ee(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
-      nerv = nerv(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      snerv = snerv(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      jse = jse(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      sjse = sjse(perplexity = perplexity, n_threads = n_threads, eps = eps),
+      nerv = nerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      snerv = snerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      jse = jse(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      sjse = sjse(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       smmds = smmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       sammon = sammon(n_threads = n_threads, eps = eps, use_cpp = use_cpp),
-      tasne = tasne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      trmsne = trmsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      trsrsne = trsrsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      tmsne = tmsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      arsrsne = arsrsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      rsrjse = rsrjse(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      rsrnerv = rsrnerv(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      btsne = btsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      bssne = bssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      basne = basne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      btasne = btasne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      bnerv = bnerv(perplexity = perplexity, n_threads = n_threads, eps = eps),
+      tasne = tasne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      trmsne = trmsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      trsrsne = trsrsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      tmsne = tmsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      arsrsne = arsrsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      rsrjse = rsrjse(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      rsrnerv = rsrnerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      btsne = btsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      bssne = bssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      basne = basne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      btasne = btasne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      bnerv = bnerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       ballmmds = ballmmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       knnmmds = knnmmds(k = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
-      dhssne = dhssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      tsneu = tsneu(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      pstsne = pstsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      skdtsne = skdtsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      usne = usne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      cetsne = cetsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      tee = tee(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      absne = absne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      chsne = chsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      hlsne = hlsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
+      dhssne = dhssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      tsneu = tsneu(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      pstsne = pstsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      skdtsne = skdtsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      usne = usne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      cetsne = cetsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      tee = tee(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      absne = absne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      chsne = chsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      hlsne = hlsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       gsne = gsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
-      rklsne = rklsne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      jssne = jssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      abssne = abssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
-      bhssne = bhssne(perplexity = perplexity, n_threads = n_threads, eps = eps),
+      rklsne = rklsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      jssne = jssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      abssne = abssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
+      bhssne = bhssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp),
       stop("BUG: someone forgot to implement option: '", method, "'")
     )
   }
@@ -2564,6 +2564,15 @@ safe_dist2 <- function(X) {
   D2
 }
 
+calc_tweight <- function(D2, use_cpp = FALSE, n_threads = 1) {
+  if (use_cpp) {
+    tweight_cpp(D2, n_threads = n_threads)
+  }
+  else {
+    1 / (1 + D2)
+  }
+}
+
 # 2-norm of a vector or matrix
 norm2 <- function(X) {
   sqrt(sum(X * X))
diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R
index 741aeb4..af0ebc9 100644
--- a/smallvis/R/sne.R
+++ b/smallvis/R/sne.R
@@ -14,10 +14,12 @@ exp_shift <- function(X) {
 }
 
 expQ <- function(Y, eps = .Machine$double.eps, beta = NULL,
-                  A = NULL,
-                  is_symmetric = FALSE,
-                  matrix_normalize = FALSE) {
-  W <- dist2(Y)
+                 A = NULL,
+                 is_symmetric = FALSE,
+                 matrix_normalize = FALSE,
+                 use_cpp = FALSE,
+                 n_threads = 1) {
+  W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
 
   if (!is.null(beta)) {
     W <- exp_shift(-W * beta)
@@ -126,8 +128,9 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric",
       res
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
-      W <- 1 / (1 + W)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      # not faster to use threading
+      W <- calc_tweight(W, use_cpp = FALSE, n_threads = n_threads)
       diag(W) <- 0
 
       cost$Z <- sum(W)
@@ -160,8 +163,8 @@ ssne <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      cost$Q <- expQ(Y, cost$eps, is_symmetric = TRUE,
-                     matrix_normalize = TRUE)$Q
+      cost$Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE,
+                     use_cpp = use_cpp, n_threads = n_threads)$Q
       cost
     },
     sentinel = "Q"
@@ -191,7 +194,8 @@ asne <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      cost$Q <- expQ(Y, eps = cost$eps, is_symmetric = FALSE)$Q
+      cost$Q <- expQ(Y, eps = cost$eps, is_symmetric = FALSE,
+                     use_cpp = use_cpp, n_threads = n_threads)$Q
       cost
     },
     sentinel = "Q"
@@ -204,7 +208,7 @@ asne <- function(perplexity, inp_kernel = "gaussian",
 # In \emph{Advances in neural information processing systems} (pp. 2169-2177).
 hssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian",
                   symmetrize = "symmetric", eps = .Machine$double.eps,
-                  n_threads = 0) {
+                  n_threads = 0, use_cpp = FALSE) {
   alpha <- max(alpha, 1e-8)
   apow <- -1 / alpha
   lreplace(
@@ -219,7 +223,7 @@ hssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       # to include bandwidth
       # W <- (alpha * beta * W + 1) ^ (-1 / alpha)
       W <- powm(alpha * W + 1, apow, cost$eps)
@@ -264,7 +268,7 @@ bhssne <- function(perplexity, alpha = 0.5, beta = 1,
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- powm(cost$ab * W + 1, cost$apow, cost$eps)
       diag(W) <- 0
 
@@ -278,7 +282,7 @@ bhssne <- function(perplexity, alpha = 0.5, beta = 1,
 # A version of HSSNE where alpha is allowed to vary at every epoch
 dhssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian",
                    symmetrize = "symmetric", eps = .Machine$double.eps,
-                   n_threads = 0) {
+                   n_threads = 0, use_cpp = FALSE) {
   alpha_min <- 1e-8
   alpha <- max(alpha, alpha_min)
   lreplace(
@@ -322,7 +326,7 @@ dhssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian",
     },
     update = function(cost, Y) {
       alpha <- cost$alpha
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- powm(alpha * W + 1, cost$apow, cost$eps)
       diag(W) <- 0
 
@@ -370,7 +374,7 @@ wtsne <- function(perplexity, inp_kernel = "gaussian",
     update = function(cost, Y) {
       M <- cost$M
 
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- M / (1 + W)
       diag(W) <- 0
 
@@ -411,7 +415,8 @@ wssne <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      cost$Q <- expQ(Y, cost$eps, A = cost$M, matrix_normalize = TRUE)$Q
+      cost$Q <- expQ(Y, cost$eps, A = cost$M, matrix_normalize = TRUE,
+                     use_cpp = use_cpp, n_threads = n_threads)$Q
       cost
     }
   )
@@ -488,7 +493,7 @@ pstsne <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
 
@@ -536,7 +541,7 @@ tee <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
       cost$W <- W
@@ -563,9 +568,10 @@ tee <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric",
 # UMAP/t-SNE Hybrids ------------------------------------------------------
 
 # Calculate P via normalized smooth knn-distances
-skdtsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0) {
+skdtsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0,
+                    use_cpp = FALSE) {
   tsne(perplexity = perplexity, inp_kernel = "skd", symmetrize = "umap",
-       eps = eps, n_threads = n_threads)
+       eps = eps, n_threads = n_threads, use_cpp = use_cpp)
 }
 
 # Use the UMAP curve family in output kernel
@@ -585,7 +591,7 @@ usne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric",
       cost
     },
     update = function(cost, Y) {
-      D2 <- dist2(Y)
+      D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       D2[D2 < 0] <- 0
 
       W <- 1 / (1 + cost$a * D2 ^ cost$b)
@@ -643,7 +649,7 @@ cetsne <- function(perplexity, inp_kernel = "gaussian",
     },
     update = function(cost, Y) {
       P <- cost$P
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
@@ -689,7 +695,7 @@ btsne <- function(perplexity, inp_kernel = "gaussian", beta = NULL,
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + (cost$beta * W))
       diag(W) <- 0
 
@@ -725,7 +731,8 @@ bssne <- function(perplexity, inp_kernel = "gaussian", beta = NULL,
       cost
     },
     update = function(cost, Y) {
-      cost$Q <- expQ(Y, cost$eps, beta = cost$beta, matrix_normalize = TRUE)$Q
+      cost$Q <- expQ(Y, cost$eps, beta = cost$beta, matrix_normalize = TRUE,
+                     use_cpp = use_cpp, n_threads = n_threads)$Q
       cost
     }
   )
@@ -752,7 +759,8 @@ basne <- function(perplexity, beta = NULL, eps = .Machine$double.eps,
       cost
     },
     update = function(cost, Y) {
-      cost$Q <- expQ(Y, eps = cost$eps, beta = cost$beta, is_symmetric = FALSE)$Q
+      cost$Q <- expQ(Y, eps = cost$eps, beta = cost$beta, is_symmetric = FALSE,
+                     use_cpp = use_cpp, n_threads = n_threads)$Q
       cost
     },
     gr = function(cost, Y) {
@@ -765,9 +773,9 @@ basne <- function(perplexity, beta = NULL, eps = .Machine$double.eps,
 
 # t-ASNE with input kernel bandwidths transferred to output
 btasne <- function(perplexity, beta = NULL, eps = .Machine$double.eps,
-                   n_threads = 0) {
+                   n_threads = 0, use_cpp = FALSE) {
   lreplace(basne(perplexity = perplexity, beta = beta, eps = eps,
-                 n_threads = n_threads),
+                 n_threads = n_threads, use_cpp = use_cpp),
     pfn = kl_cost,
     gr = function(cost, Y) {
       cost <- cost_update(cost, Y)
@@ -777,7 +785,7 @@ btasne <- function(perplexity, beta = NULL, eps = .Machine$double.eps,
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + cost$beta * W)
       diag(W) <- 0
 
@@ -808,7 +816,7 @@ tasne <- function(perplexity, n_threads = 0, use_cpp = FALSE) {
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
 
@@ -843,7 +851,7 @@ tasne <- function(perplexity, inp_kernel = "gaussian",
       cost
     },
     update = function(cost, Y) {
-      W <- dist2(Y)
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
       W <- 1 / (1 + W)
       diag(W) <- 0
 
diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp
index 8c05cba..8f8e016 100644
--- a/smallvis/src/RcppExports.cpp
+++ b/smallvis/src/RcppExports.cpp
@@ -34,10 +34,23 @@ BEGIN_RCPP
   return rcpp_result_gen;
 END_RCPP
 }
+// tweight_cpp
+NumericMatrix tweight_cpp(NumericMatrix dist_matrix, int n_threads);
+RcppExport SEXP _smallvis_tweight_cpp(SEXP dist_matrixSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericMatrix >::type dist_matrix(dist_matrixSEXP);
+    Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(tweight_cpp(dist_matrix, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_smallvis_dist2_cpp", (DL_FUNC) &_smallvis_dist2_cpp, 2},
     {"_smallvis_dist_cpp", (DL_FUNC) &_smallvis_dist_cpp, 2},
+    {"_smallvis_tweight_cpp", (DL_FUNC) &_smallvis_tweight_cpp, 2},
     {NULL, NULL, 0}
 };
 
diff --git a/smallvis/src/matrix.cpp b/smallvis/src/matrix.cpp
index 5ce07d3..c954e70 100644
--- a/smallvis/src/matrix.cpp
+++ b/smallvis/src/matrix.cpp
@@ -42,6 +42,13 @@ void dist(const std::vector<double> &data, std::vector<double> &dist_matrix,
   }
 }
 
+void tweight(const std::vector<double> &dist_matrix,
+             std::vector<double> &transformed_matrix, int start, int end) {
+  for (int idx = start; idx < end; ++idx) {
+    transformed_matrix[idx] = 1.0 / (1.0 + dist_matrix[idx]);
+  }
+}
+
 // [[Rcpp::export]]
 NumericMatrix dist2_cpp(NumericMatrix input, std::size_t n_threads = 1) {
   if (n_threads == 0) {
@@ -114,3 +121,33 @@ NumericMatrix dist_cpp(NumericMatrix input, std::size_t n_threads = 1) {
 
   return result;
 }
+
+// [[Rcpp::export]]
+NumericMatrix tweight_cpp(NumericMatrix dist_matrix, int n_threads) {
+  int n = dist_matrix.nrow();
+  int total_elements = n * n;
+
+  std::vector<double> dist_matrix_vec(dist_matrix.begin(), dist_matrix.end());
+
+  std::vector<double> transformed_matrix(total_elements, 0.0);
+
+  int chunk_size = (total_elements + n_threads - 1) / n_threads;
+
+  std::vector<std::thread> threads;
+  for (int t = 0; t < n_threads; ++t) {
+    int start = t * chunk_size;
+    int end = std::min(start + chunk_size, total_elements);
+    threads.emplace_back(tweight, std::cref(dist_matrix_vec),
+                         std::ref(transformed_matrix), start, end);
+  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
+
+  NumericMatrix result(n, n);
+  std::copy(transformed_matrix.begin(), transformed_matrix.end(),
+            result.begin());
+
+  return result;
+}

From a6f3ecd7ce6f3202f245e2e22ecbe19986b15d14 Mon Sep 17 00:00:00 2001
From: James Melville <jlmelville@gmail.com>
Date: Sat, 27 Jul 2024 17:41:46 -0700
Subject: [PATCH 03/26] combine D2/W

---
 smallvis/R/RcppExports.R     |  8 +++--
 smallvis/R/smallvis.R        | 14 ++++++--
 smallvis/R/sne.R             |  4 +--
 smallvis/src/RcppExports.cpp | 19 +++++++++--
 smallvis/src/matrix.cpp      | 62 ++++++++++++++++++++++++++++++++++--
 5 files changed, 94 insertions(+), 13 deletions(-)

diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R
index 5efe2ae..7ad2d3c 100644
--- a/smallvis/R/RcppExports.R
+++ b/smallvis/R/RcppExports.R
@@ -9,7 +9,11 @@ dist_cpp <- function(input, n_threads = 1L) {
   .Call(`_smallvis_dist_cpp`, input, n_threads)
 }
 
-tweight_cpp <- function(dist_matrix, n_threads) {
-  .Call(`_smallvis_tweight_cpp`, dist_matrix, n_threads)
+tweight_cpp <- function(input, n_threads = 1L) {
+  .Call(`_smallvis_tweight_cpp`, input, n_threads)
+}
+
+d2_to_tweight_cpp <- function(dist_matrix, n_threads) {
+  .Call(`_smallvis_d2_to_tweight_cpp`, dist_matrix, n_threads)
 }
 
diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R
index a3e7f27..b739642 100644
--- a/smallvis/R/smallvis.R
+++ b/smallvis/R/smallvis.R
@@ -2564,15 +2564,25 @@ safe_dist2 <- function(X) {
   D2
 }
 
-calc_tweight <- function(D2, use_cpp = FALSE, n_threads = 1) {
+calc_d2tweight <- function(D2, use_cpp = FALSE, n_threads = 1) {
   if (use_cpp) {
-    tweight_cpp(D2, n_threads = n_threads)
+    d2_to_tweight_cpp(D2, n_threads = n_threads)
   }
   else {
     1 / (1 + D2)
   }
 }
 
+calc_tweight <- function(X, use_cpp = FALSE, n_threads = 1) {
+  if (use_cpp) {
+    tweight_cpp(X, n_threads = n_threads)
+  }
+  else {
+    D2 <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads)
+    1 / (1 + D2)
+  }
+}
+
 # 2-norm of a vector or matrix
 norm2 <- function(X) {
   sqrt(sum(X * X))
diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R
index af0ebc9..59302e4 100644
--- a/smallvis/R/sne.R
+++ b/smallvis/R/sne.R
@@ -128,9 +128,7 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric",
       res
     },
     update = function(cost, Y) {
-      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
-      # not faster to use threading
-      W <- calc_tweight(W, use_cpp = FALSE, n_threads = n_threads)
+      W <- calc_tweight(Y, use_cpp = use_cpp, n_threads = n_threads)
       diag(W) <- 0
 
       cost$Z <- sum(W)
diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp
index 8f8e016..9107744 100644
--- a/smallvis/src/RcppExports.cpp
+++ b/smallvis/src/RcppExports.cpp
@@ -35,14 +35,26 @@ BEGIN_RCPP
 END_RCPP
 }
 // tweight_cpp
-NumericMatrix tweight_cpp(NumericMatrix dist_matrix, int n_threads);
-RcppExport SEXP _smallvis_tweight_cpp(SEXP dist_matrixSEXP, SEXP n_threadsSEXP) {
+NumericMatrix tweight_cpp(NumericMatrix input, std::size_t n_threads);
+RcppExport SEXP _smallvis_tweight_cpp(SEXP inputSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericMatrix >::type input(inputSEXP);
+    Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(tweight_cpp(input, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
+// d2_to_tweight_cpp
+NumericMatrix d2_to_tweight_cpp(NumericMatrix dist_matrix, int n_threads);
+RcppExport SEXP _smallvis_d2_to_tweight_cpp(SEXP dist_matrixSEXP, SEXP n_threadsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< NumericMatrix >::type dist_matrix(dist_matrixSEXP);
     Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP);
-    rcpp_result_gen = Rcpp::wrap(tweight_cpp(dist_matrix, n_threads));
+    rcpp_result_gen = Rcpp::wrap(d2_to_tweight_cpp(dist_matrix, n_threads));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -51,6 +63,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_smallvis_dist2_cpp", (DL_FUNC) &_smallvis_dist2_cpp, 2},
     {"_smallvis_dist_cpp", (DL_FUNC) &_smallvis_dist_cpp, 2},
     {"_smallvis_tweight_cpp", (DL_FUNC) &_smallvis_tweight_cpp, 2},
+    {"_smallvis_d2_to_tweight_cpp", (DL_FUNC) &_smallvis_d2_to_tweight_cpp, 2},
     {NULL, NULL, 0}
 };
 
diff --git a/smallvis/src/matrix.cpp b/smallvis/src/matrix.cpp
index c954e70..bb10230 100644
--- a/smallvis/src/matrix.cpp
+++ b/smallvis/src/matrix.cpp
@@ -6,6 +6,24 @@
 
 using namespace Rcpp;
 
+void tweight(const std::vector<double> &data, std::vector<double> &dist_matrix,
+             std::size_t start_row, std::size_t end_row, std::size_t n,
+             std::size_t d) {
+  for (std::size_t i = start_row; i < end_row; ++i) {
+    const std::size_t i_d = i * d;
+    const std::size_t i_n = i * n;
+    for (std::size_t j = 0; j < n; ++j) {
+      const std::size_t j_d = j * d;
+      double dist = 0.0;
+      for (std::size_t k = 0; k < d; ++k) {
+        double diff = data[i_d + k] - data[j_d + k];
+        dist += diff * diff;
+      }
+      dist_matrix[i_n + j] = 1.0 / (1.0 + dist);
+    }
+  }
+}
+
 void d2(const std::vector<double> &data, std::vector<double> &dist_matrix,
         std::size_t start_row, std::size_t end_row, std::size_t n,
         std::size_t d) {
@@ -42,7 +60,7 @@ void dist(const std::vector<double> &data, std::vector<double> &dist_matrix,
   }
 }
 
-void tweight(const std::vector<double> &dist_matrix,
+void d2_to_tweight(const std::vector<double> &dist_matrix,
              std::vector<double> &transformed_matrix, int start, int end) {
   for (int idx = start; idx < end; ++idx) {
     transformed_matrix[idx] = 1.0 / (1.0 + dist_matrix[idx]);
@@ -122,8 +140,46 @@ NumericMatrix dist_cpp(NumericMatrix input, std::size_t n_threads = 1) {
   return result;
 }
 
+
+// [[Rcpp::export]]
+NumericMatrix tweight_cpp(NumericMatrix input, std::size_t n_threads = 1) {
+  if (n_threads == 0) {
+    n_threads = 1;
+  }
+  std::size_t n = input.nrow();
+  std::size_t d = input.ncol();
+
+  std::vector<double> transposed_data(n * d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      transposed_data[i * d + j] = input(i, j);
+    }
+  }
+
+  std::vector<double> dist_matrix(n * n, 0.0);
+
+  std::size_t chunk_size = (n + n_threads - 1) / n_threads;
+
+  std::vector<std::thread> threads;
+  for (std::size_t t = 0; t < n_threads; ++t) {
+    std::size_t start_row = t * chunk_size;
+    std::size_t end_row = std::min(start_row + chunk_size, n);
+    threads.emplace_back(tweight, std::cref(transposed_data),
+                         std::ref(dist_matrix), start_row, end_row, n, d);
+  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
+
+  NumericMatrix result(n, n);
+  std::copy(dist_matrix.begin(), dist_matrix.end(), result.begin());
+
+  return result;
+}
+
 // [[Rcpp::export]]
-NumericMatrix tweight_cpp(NumericMatrix dist_matrix, int n_threads) {
+NumericMatrix d2_to_tweight_cpp(NumericMatrix dist_matrix, int n_threads) {
   int n = dist_matrix.nrow();
   int total_elements = n * n;
@@ -137,7 +193,7 @@ NumericMatrix d2_to_tweight_cpp(NumericMatrix dist_matrix, int n_threads) {
   for (int t = 0; t < n_threads; ++t) {
     int start = t * chunk_size;
     int end = std::min(start + chunk_size, total_elements);
-    threads.emplace_back(tweight, std::cref(dist_matrix_vec),
+    threads.emplace_back(d2_to_tweight, std::cref(dist_matrix_vec),
                          std::ref(transformed_matrix), start, end);
   }
 

From c73379eba7114ac126d59263a22a08b576a96449 Mon Sep 17 00:00:00 2001
From: James Melville <jlmelville@gmail.com>
Date: Sat, 27 Jul 2024 18:05:40 -0700
Subject: [PATCH 04/26] check fixes

---
 smallvis/DESCRIPTION          | 2 +-
 smallvis/NAMESPACE            | 1 +
 smallvis/R/smallvis-package.R | 1 +
 smallvis/R/smallvis.R         | 8 ++++----
 smallvis/man/smallvis.Rd      | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/smallvis/DESCRIPTION b/smallvis/DESCRIPTION
index cb379a1..349fa2f 100644
--- a/smallvis/DESCRIPTION
+++ b/smallvis/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: smallvis
 Type: Package
 Title: Small Scale Neighborhood Embedding Algorithms
-Version: 0.0.0.9000
+Version: 0.0.1.9000
 Authors@R: person("James", "Melville", email = "jlmelville@gmail.com", role =
     c("aut", "cre"))
 Description: Neighborhood embedding methods for small scale visualization, such
diff --git a/smallvis/NAMESPACE b/smallvis/NAMESPACE
index f44dae6..c801172 100644
--- a/smallvis/NAMESPACE
+++ b/smallvis/NAMESPACE
@@ -3,4 +3,5 @@
 export(smallvis)
 export(smallvis_perpstep)
 export(smallvis_rep)
+importFrom(Rcpp,sourceCpp)
 useDynLib(smallvis, .registration = TRUE)
diff --git a/smallvis/R/smallvis-package.R b/smallvis/R/smallvis-package.R
index 37a033d..73cd7c4 100644
--- a/smallvis/R/smallvis-package.R
+++ b/smallvis/R/smallvis-package.R
@@ -1,4 +1,5 @@
 ## usethis namespace: start
 #' @useDynLib smallvis, .registration = TRUE
+#' @importFrom Rcpp sourceCpp
 ## usethis namespace: end
 NULL
\ No newline at end of file
diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R
index b739642..e34476a 100644
--- a/smallvis/R/smallvis.R
+++ b/smallvis/R/smallvis.R
@@ -621,7 +621,7 @@
 #' tsne_iris <- smallvis(iris, perplexity = 50, verbose = TRUE)
 #'
 #' # Can use a custom epoch_callback for visualization
-#' colors = rainbow(length(unique(iris$Species)))
+#' colors = grDevices::rainbow(length(unique(iris$Species)))
 #' names(colors) = unique(iris$Species)
 #' ecb = function(x, y) {
 #'   plot(x, t = 'n')
@@ -993,7 +993,7 @@ smallvis <- function(X, k = 2, scale = "absmax",
     }
   }
 
-  if (class(pca) == "character" && pca == "whiten") {
+  if (methods::is(pca, "character") && pca == "whiten") {
    pca <- TRUE
    whiten <- TRUE
   }
@@ -1891,7 +1891,7 @@ make_smallvis_cb <- function(df) {
   palette <- NULL
   function(Y, iter, cost = NULL) {
     if (is.null(palette)) {
-      palette <- vizier:::color_helper(df, color_scheme = rainbow)$palette
+      palette <- vizier:::color_helper(df, color_scheme = grDevices::rainbow)$palette
     }
     title <- paste0("iter: ", iter)
     if (!(is.null(cost) || is.na(cost))) {
@@ -2687,7 +2687,7 @@ find_ab_params <- function(spread = 1, min_dist = 0.001) {
       stats::nls(yv ~ 1 / (1 + a * xv ^ (2 * b)),
                 start = list(a = 1, b = 1))$m$getPars()
     }, silent = TRUE)
 
-  if (class(result) == "try-error") {
+  if (methods::is(result, "try-error")) {
     stop("Can't find a, b for provided spread/min_dist values")
   }
   result
diff --git a/smallvis/man/smallvis.Rd b/smallvis/man/smallvis.Rd
index b3afb4c..cc4f19b 100644
--- a/smallvis/man/smallvis.Rd
+++ b/smallvis/man/smallvis.Rd
@@ -714,7 +714,7 @@ function with these optimizers.
 tsne_iris <- smallvis(iris, perplexity = 50, verbose = TRUE)
 
 # Can use a custom epoch_callback for visualization
-colors = rainbow(length(unique(iris$Species)))
+colors = grDevices::rainbow(length(unique(iris$Species)))
 names(colors) = unique(iris$Species)
 ecb = function(x, y) {
   plot(x, t = 'n')

From 954bbc6d8119a05675ddf756a65b302a9d0a64a4 Mon Sep 17 00:00:00 2001
From: James Melville <jlmelville@gmail.com>
Date: Sun, 28 Jul 2024 14:44:41 -0700
Subject: [PATCH 05/26] C++ multi-threaded t-SNE grad

---
 smallvis/DESCRIPTION                 |  2 +-
 smallvis/R/RcppExports.R             |  4 ++
 smallvis/R/sne.R                     |  8 ++-
 smallvis/src/RcppExports.cpp         | 16 +++++
 smallvis/src/matrix.cpp              | 96 ++++++++++++++++++++++++----
 smallvis/tests/testthat/helper_api.R |  4 +-
 smallvis/tests/testthat/test_api.R   |  9 +++
 7 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/smallvis/DESCRIPTION b/smallvis/DESCRIPTION
index 349fa2f..7470b56 100644
--- a/smallvis/DESCRIPTION
+++ b/smallvis/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: smallvis
 Type: Package
 Title: Small Scale Neighborhood Embedding Algorithms
-Version: 0.0.1.9000
+Version: 0.0.1.9001
 Authors@R: person("James", "Melville", email = "jlmelville@gmail.com", role =
     c("aut", "cre"))
 Description: Neighborhood embedding methods for small scale visualization, such
diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R
index 7ad2d3c..05e26d5 100644
--- a/smallvis/R/RcppExports.R
+++ b/smallvis/R/RcppExports.R
@@ -17,3 +17,7 @@ d2_to_tweight_cpp <- function(dist_matrix, n_threads) {
   .Call(`_smallvis_d2_to_tweight_cpp`, dist_matrix, n_threads)
 }
 
+tsne_grad_cpp <- function(P, W, Z, Y, n_threads) {
+  .Call(`_smallvis_tsne_grad_cpp`, P, W, Z, Y, n_threads)
+}
+
diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R
index 59302e4..02647fc 100644
--- a/smallvis/R/sne.R
+++ b/smallvis/R/sne.R
@@ -112,8 +112,12 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric",
     gr = function(cost, Y) {
       cost <- cost_update(cost, Y)
       P <- cost$P
-      cost$G <- k2g(Y, 4 * cost$W * (P - cost$W / cost$Z))
-
+      if (use_cpp) {
+        cost$G <- tsne_grad_cpp(P, cost$W, cost$Z, Y, n_threads = n_threads)
+      }
+      else {
+        cost$G <- k2g(Y, 4 * cost$W * (P - cost$W / cost$Z))
+      }
       cost
     },
     export = function(cost, val) {
diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp
index 9107744..5ac503a 100644
--- a/smallvis/src/RcppExports.cpp
+++ b/smallvis/src/RcppExports.cpp
@@ -58,12 +58,28 @@ BEGIN_RCPP
   return rcpp_result_gen;
 END_RCPP
 }
+// tsne_grad_cpp
+NumericMatrix tsne_grad_cpp(const NumericMatrix& P, const NumericMatrix& W, double Z, const NumericMatrix& Y, std::size_t n_threads);
+RcppExport SEXP _smallvis_tsne_grad_cpp(SEXP PSEXP, SEXP WSEXP, SEXP ZSEXP, SEXP YSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const NumericMatrix& >::type P(PSEXP);
+    Rcpp::traits::input_parameter< const NumericMatrix& >::type W(WSEXP);
+    Rcpp::traits::input_parameter< double >::type Z(ZSEXP);
+    Rcpp::traits::input_parameter< const NumericMatrix& >::type Y(YSEXP);
+    Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(tsne_grad_cpp(P, W, Z, Y, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_smallvis_dist2_cpp", (DL_FUNC) &_smallvis_dist2_cpp, 2},
     {"_smallvis_dist_cpp", (DL_FUNC) &_smallvis_dist_cpp, 2},
     {"_smallvis_tweight_cpp", (DL_FUNC) &_smallvis_tweight_cpp, 2},
     {"_smallvis_d2_to_tweight_cpp", (DL_FUNC) &_smallvis_d2_to_tweight_cpp, 2},
+    {"_smallvis_tsne_grad_cpp", (DL_FUNC) &_smallvis_tsne_grad_cpp, 5},
     {NULL, NULL, 0}
 };
 
diff --git a/smallvis/src/matrix.cpp b/smallvis/src/matrix.cpp
index bb10230..5febb97 100644
--- a/smallvis/src/matrix.cpp
+++ b/smallvis/src/matrix.cpp
@@ -6,9 +6,32 @@
 
 using namespace Rcpp;
 
+void tsne_grad(const std::vector<double> &P, const std::vector<double> &W,
+               double Z, const std::vector<double> &Y,
+               std::vector<double> &gradient, std::size_t start,
+               std::size_t end, std::size_t n, std::size_t d) {
+
+  const double Z4 = 4.0 / Z;
+
+  for (std::size_t i = start; i < end; ++i) {
+    const std::size_t i_d = i * d;
+    const std::size_t i_n = i * n;
+    for (std::size_t j = 0; j < n; ++j) {
+      if (i == j) {
+        continue;
+      }
+      const std::size_t ij = i_n + j;
+      double k_ij = Z4 * W[ij] * (Z * P[ij] - W[ij]);
+      for (std::size_t k = 0; k < d; ++k) {
+        gradient[i_d + k] += k_ij * (Y[i_d + k] - Y[j * d + k]);
+      }
+    }
+  }
+}
+
 void tweight(const std::vector<double> &data, std::vector<double> &dist_matrix,
-             std::size_t start_row, std::size_t end_row, std::size_t n,
-             std::size_t d) {
+             std::size_t start_row, std::size_t end_row, std::size_t n,
+             std::size_t d) {
   for (std::size_t i = start_row; i < end_row; ++i) {
     const std::size_t i_d = i * d;
     const std::size_t i_n = i * n;
@@ -61,7 +84,8 @@ void dist(const std::vector<double> &data, std::vector<double> &dist_matrix,
 }
 
 void d2_to_tweight(const std::vector<double> &dist_matrix,
-                   std::vector<double> &transformed_matrix, int start, int end) {
+                   std::vector<double> &transformed_matrix, int start,
+                   int end) {
   for (int idx = start; idx < end; ++idx) {
     transformed_matrix[idx] = 1.0 / (1.0 + dist_matrix[idx]);
@@ -140,7 +164,6 @@ NumericMatrix dist_cpp(NumericMatrix input, std::size_t n_threads = 1) {
   return result;
 }
 
-
 // [[Rcpp::export]]
 NumericMatrix tweight_cpp(NumericMatrix input, std::size_t n_threads = 1) {
   if (n_threads == 0) {
@@ -148,18 +171,18 @@ NumericMatrix tweight_cpp(NumericMatrix input, std::size_t n_threads = 1) {
   }
   std::size_t n = input.nrow();
   std::size_t d = input.ncol();
-  
+
   std::vector<double> transposed_data(n * d);
   for (std::size_t i = 0; i < n; ++i) {
     for (std::size_t j = 0; j < d; ++j) {
       transposed_data[i * d + j] = input(i, j);
     }
   }
-  
+
   std::vector<double> dist_matrix(n * n, 0.0);
-  
+
   std::size_t chunk_size = (n + n_threads - 1) / n_threads;
-  
+
   std::vector<std::thread> threads;
   for (std::size_t t = 0; t < n_threads; ++t) {
     std::size_t start_row = t * chunk_size;
@@ -167,19 +190,22 @@ NumericMatrix tweight_cpp(NumericMatrix input, std::size_t n_threads = 1) {
     threads.emplace_back(tweight, std::cref(transposed_data),
                          std::ref(dist_matrix), start_row, end_row, n, d);
   }
-  
+
   for (auto &thread : threads) {
     thread.join();
   }
-  
+
   NumericMatrix result(n, n);
   std::copy(dist_matrix.begin(), dist_matrix.end(), result.begin());
-  
+
   return result;
 }
 
 // [[Rcpp::export]]
 NumericMatrix d2_to_tweight_cpp(NumericMatrix dist_matrix, int n_threads) {
+  if (n_threads == 0) {
+    n_threads = 1;
+  }
   int n = dist_matrix.nrow();
   int total_elements = n * n;
 
@@ -207,3 +233,51 @@ NumericMatrix d2_to_tweight_cpp(NumericMatrix dist_matrix, int n_threads) {
 
   return result;
 }
+
+// [[Rcpp::export]]
+NumericMatrix tsne_grad_cpp(const NumericMatrix &P,
+                            const NumericMatrix &W, double Z,
+                            const NumericMatrix &Y,
+                            std::size_t n_threads) {
+  std::size_t n = Y.nrow();
+  std::size_t d = Y.ncol();
+
+  std::vector<double> P_vec(P.begin(), P.end());
+  std::vector<double> W_vec(W.begin(), W.end());
+
+  std::vector<double> Y_vec(n * d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      Y_vec[i * d + j] = Y(i, j);
+    }
+  }
+
+  std::vector<double> gradient_vec(n * d, 0.0);
+
+  if (n_threads > 1) {
+    std::size_t chunk_size = (n + n_threads - 1) / n_threads;
+    std::vector<std::thread> threads;
+    for (std::size_t t = 0; t < n_threads; ++t) {
+      std::size_t start_row = t * chunk_size;
+      std::size_t end_row = std::min(start_row + chunk_size, n);
+      threads.emplace_back(tsne_grad, std::cref(P_vec), std::cref(W_vec), Z,
+                           std::cref(Y_vec), std::ref(gradient_vec), start_row,
+                           end_row, n, d);
+    }
+    for (auto &thread : threads) {
+      thread.join();
+    }
+  } else {
+    tsne_grad(P_vec, W_vec, Z, Y_vec, gradient_vec, 0, n, n, d);
+  }
+
+  NumericMatrix gradient(n, d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      gradient(i, j) = gradient_vec[i * d + j];
+    }
+  }
+
+  return gradient;
+}
+
diff --git a/smallvis/tests/testthat/helper_api.R b/smallvis/tests/testthat/helper_api.R
index 46e9317..6b679ad 100644
--- a/smallvis/tests/testthat/helper_api.R
+++ b/smallvis/tests/testthat/helper_api.R
@@ -1,8 +1,8 @@
-expect_api <- function(method, Y, cost, X = iris10) {
+expect_api <- function(method, Y, cost, X = iris10, use_cpp = FALSE) {
   res <- smallvis(X, Y_init = iris10_Y, method = method, eta = 0.1,
                   perplexity = 5, epoch_callback = NULL, verbose = FALSE,
-                  ret_extra = TRUE)
+                  ret_extra = TRUE, use_cpp = use_cpp)
   expect_equal(res$Y, c2y(Y), tolerance = 1e-3, info = paste0(method[[1]], " Y"))
   expect_equal(final_cost(res), cost, tolerance = 1e-4, info = paste0(method[[1]], " cost"))
 }
diff --git a/smallvis/tests/testthat/test_api.R b/smallvis/tests/testthat/test_api.R
index a1db4f1..1ae7e7e 100644
--- a/smallvis/tests/testthat/test_api.R
+++ b/smallvis/tests/testthat/test_api.R
@@ -512,6 +512,15 @@ test_that("Miscellany", {
                       0.04283, 1.227, -0.2283, -2.662, -1.389, 2.117, 1.291,
                       -1.949, -0.4529, 2.002),
             X = ui10, cost = 0.1442)
+  expect_api(method = "tsne", Y = c(-4.617, 2.008, -0.8907, 4.044, 0.6146, -3.478, 0.4454,
+                                    1.967, 4.82, -4.913, 0.5231, 0.837, 0.02866, -1.871, -1.438,
+                                    1.273, 0.6842, -1.849, -0.6084, 2.42), X = ui10,
+             cost = 0.02485)
+  expect_api(method = "tsne", Y = c(-4.617, 2.008, -0.8907, 4.044, 0.6146, -3.478, 0.4454,
+                                    1.967, 4.82, -4.913, 0.5231, 0.837, 0.02866, -1.871, -1.438,
+                                    1.273, 0.6842, -1.849, -0.6084, 2.42), X = ui10, use_cpp = TRUE,
+             cost = 0.02485)
+
 })
 
 test_that("repeated runs", {

From 4705f6893f4814469c646049cc292897ae7f6930 Mon Sep 17 00:00:00 2001
From: James Melville <jlmelville@gmail.com>
Date: Sun, 28 Jul 2024 18:39:09 -0700
Subject: [PATCH 06/26] rename source file

---
 smallvis/src/{matrix.cpp => gradients.cpp} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename smallvis/src/{matrix.cpp => gradients.cpp} (100%)

diff --git a/smallvis/src/matrix.cpp b/smallvis/src/gradients.cpp
similarity index 100%
rename from smallvis/src/matrix.cpp
rename to smallvis/src/gradients.cpp

From f5c1bd98368d7117f22fe9a617249bbe7b3a12e3 Mon Sep 17 00:00:00 2001
From: James Melville <jlmelville@gmail.com>
Date: Sun, 28 Jul 2024 18:40:34 -0700
Subject: [PATCH 07/26] perplexity calibration in multi-threaded C++

---
 smallvis/R/RcppExports.R     |   4 +
 smallvis/R/smallvis.R        |  11 +--
 smallvis/R/sne.R             |  10 ++-
 smallvis/man/smallvis.Rd     |  11 +--
 smallvis/src/RcppExports.cpp |  16 ++++
 smallvis/src/perplexity.cpp  | 158 +++++++++++++++++++++++++++++++++++
 6 files changed, 197 insertions(+), 13 deletions(-)
 create mode 100644 smallvis/src/perplexity.cpp

diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R
index 05e26d5..44391e2 100644
--- a/smallvis/R/RcppExports.R
+++ b/smallvis/R/RcppExports.R
@@ -21,3 +21,7 @@ tsne_grad_cpp <- function(P, W, Z, Y, n_threads) {
   .Call(`_smallvis_tsne_grad_cpp`, P, W, Z, Y, n_threads)
 }
 
+find_beta_cpp <- function(X, perplexity = 15, tol = 1e-5, max_tries = 50L, n_threads = 1L) {
+  .Call(`_smallvis_find_beta_cpp`, X, perplexity, tol, max_tries, n_threads)
+}
+
diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R
index e34476a..b1f2dc3 100644
--- a/smallvis/R/smallvis.R
+++ b/smallvis/R/smallvis.R
@@ -525,11 +525,12 @@
 #' \code{Value} section for details.
 #' @param n_threads Number of threads to use in multi-threaded code. Default is
 #' 0, which means no multi-threading. Mainly affects the calculation of things
-#' like distance matrices if you set \code{use_cpp = TRUE}. Otherwise, only
-#' methods that need to calculate nearest neighbors will be affected.
-#' @param use_cpp If \code{TRUE} use multi-threaded C++ code to calculate some
-#' matrices. Default is \code{FALSE}. This won't speed up all steps and you
-#' will want to use this in conjunction with \code{n_threads}.
+#' like distance matrices and perplexity calibration if you set
+#' \code{use_cpp = TRUE}. Otherwise, only methods that need to calculate
+#' nearest neighbors will be affected.
+#' @param use_cpp If \code{TRUE} use multi-threaded C++ code for some
+#' calculations. Default is \code{FALSE}. This won't speed up all steps and
+#' you will want to use this in conjunction with \code{n_threads}.
 #' @param eps Set epsilon for avoiding division-by-zero errors. Default is
 #' \code{.Machine$double.eps}, but if you see inconsistent convergence results
 #' with optimizer that should be reducing the cost each iteration, then try
diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R
index 02647fc..e58c611 100644
--- a/smallvis/R/sne.R
+++ b/smallvis/R/sne.R
@@ -1056,9 +1056,13 @@ sne_init <- function(cost,
     }
     tsmessage("Commencing calibration for perplexity = ",
               format_perps(perplexity))
-    x2ares <- x2aff(X, perplexity, tol = 1e-5, kernel = kernel,
-                    verbose = verbose)
-    P <- x2ares$W
+    if (use_cpp) {
+      P <- find_beta_cpp(X, perplexity, tol = 1e-5, n_threads = n_threads)$W
+    }
+    else {
+      x2ares <- x2aff(X, perplexity, tol = 1e-5, kernel = kernel, verbose = verbose)
+      P <- x2ares$W
+    }
   }
 
   P <- scale_affinities(P,
diff --git a/smallvis/man/smallvis.Rd b/smallvis/man/smallvis.Rd
index cc4f19b..284de3c 100644
--- a/smallvis/man/smallvis.Rd
+++ b/smallvis/man/smallvis.Rd
@@ -197,12 +197,13 @@ to those values which are returned when this value is \code{TRUE}. See the
 \item{n_threads}{Number of threads to use in multi-threaded code. Default is
 0, which means no multi-threading. Mainly affects the calculation of things
-like distance matrices if you set \code{use_cpp = TRUE}. Otherwise, only
-methods that need to calculate nearest neighbors will be affected.}
+like distance matrices and perplexity calibration if you set
+\code{use_cpp = TRUE}. Otherwise, only methods that need to calculate
+nearest neighbors will be affected.}
 
-\item{use_cpp}{If \code{TRUE} use multi-threaded C++ code to calculate some
-matrices. Default is \code{FALSE}. This won't speed up all steps and you
-will want to use this in conjunction with \code{n_threads}.}
+\item{use_cpp}{If \code{TRUE} use multi-threaded C++ code for some
+calculations. Default is \code{FALSE}. This won't speed up all steps and
+you will want to use this in conjunction with \code{n_threads}.}
 
 \item{eps}{Set epsilon for avoiding division-by-zero errors. Default is
 \code{.Machine$double.eps}, but if you see inconsistent convergence results
diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp
index 5ac503a..372b9ad 100644
--- a/smallvis/src/RcppExports.cpp
+++ b/smallvis/src/RcppExports.cpp
@@ -73,6 +73,21 @@ BEGIN_RCPP
   return rcpp_result_gen;
 END_RCPP
 }
+// find_beta_cpp
+List find_beta_cpp(const NumericMatrix& X, double perplexity, double tol, int max_tries, std::size_t n_threads);
+RcppExport SEXP _smallvis_find_beta_cpp(SEXP XSEXP, SEXP perplexitySEXP, SEXP tolSEXP, SEXP max_triesSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const NumericMatrix& >::type X(XSEXP);
+    Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP);
+    Rcpp::traits::input_parameter< double >::type tol(tolSEXP);
+    Rcpp::traits::input_parameter< int >::type max_tries(max_triesSEXP);
+    Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(find_beta_cpp(X, perplexity, tol, max_tries, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_smallvis_dist2_cpp", (DL_FUNC) &_smallvis_dist2_cpp, 2},
@@ -80,6 +95,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_smallvis_tweight_cpp", (DL_FUNC) &_smallvis_tweight_cpp, 2},
     {"_smallvis_d2_to_tweight_cpp", (DL_FUNC) &_smallvis_d2_to_tweight_cpp, 2},
     {"_smallvis_tsne_grad_cpp", (DL_FUNC) &_smallvis_tsne_grad_cpp, 5},
+    {"_smallvis_find_beta_cpp", (DL_FUNC) &_smallvis_find_beta_cpp, 5},
     {NULL, NULL, 0}
 };
 
diff --git a/smallvis/src/perplexity.cpp b/smallvis/src/perplexity.cpp
new file mode 100644
index 0000000..b261897
--- /dev/null
+++ b/smallvis/src/perplexity.cpp
@@ -0,0 +1,158 @@
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <thread>
+#include <vector>
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+void find_beta(const std::vector<double> &data, std::size_t n, std::size_t d,
+               double perplexity, double logU, double tol, int max_tries,
+               std::vector<double> &W, std::vector<double> &beta, int &bad_perp,
+               std::size_t start_row, std::size_t end_row) {
+
+  for (std::size_t i = start_row; i < end_row; ++i) {
+    const std::size_t i_d = i * d;
+    const std::size_t idx = i * n;
+
+    std::vector<double> D2i(n);
+    double sum_d2i = 0.0;
+
+    // D2
+    for (std::size_t j = 0; j < n; ++j) {
+      const std::size_t j_d = j * d;
+      double dist = 0.0;
+      for (std::size_t k = 0; k < d; ++k) {
+        double diff = data[i_d + k] - data[j_d + k];
+        dist += diff * diff;
+      }
+      sum_d2i += dist;
+      D2i[j] = dist;
+    }
+
+    double betamin = -std::numeric_limits<double>::infinity();
+    double betamax = std::numeric_limits<double>::infinity();
+
+    // Initial guess for beta: 0.5 * perplexity / mean(D2i)
+    beta[i] = (0.5 * perplexity * n) / sum_d2i;
+
+    double Z = 0.0;
+    double entropy = 0.0;
+    for (std::size_t j = 0; j < n; ++j) {
+      if (i == j) {
+        W[idx + j] = 0.0;
+        continue;
+      }
+      W[idx + j] = exp(-D2i[j] * beta[i]);
+      entropy += D2i[j] * W[idx + j];
+      Z += W[idx + j];
+    }
+    if (Z == 0.0) {
+      entropy = 0.0;
+    } else {
+      entropy = (entropy / Z) * beta[i] + log(Z);
+    }
+
+    double Hdiff = entropy - logU;
+
+    int tries = 0;
+    while (fabs(Hdiff) > tol && tries < max_tries) {
+      if (Hdiff > 0) {
+        betamin = beta[i];
+        if (std::isinf(betamax)) {
+          beta[i] *= 2;
+        } else {
+          beta[i] = (beta[i] + betamax) / 2;
+        }
+      } else {
+        betamax = beta[i];
+        if (std::isinf(betamin)) {
+          beta[i] /= 2;
+        } else {
+          beta[i] = (beta[i] + betamin) / 2;
+        }
+      }
+
+      Z = 0.0;
+      entropy = 0.0;
+      for (std::size_t j = 0; j < n; ++j) {
+        if (i == j) {
+          W[idx + j] = 0.0;
+          continue;
+        }
+        W[idx + j] = exp(-D2i[j] * beta[i]);
+        entropy += D2i[j] * W[idx + j];
+        Z += W[idx + j];
+      }
+      if (Z == 0.0) {
+        entropy = 0.0;
+      } else {
+        entropy = (entropy / Z) * beta[i] + log(Z);
+      }
+
+      Hdiff = entropy - logU;
+      tries++;
+    }
+
+    if (fabs(Hdiff) > tol) {
+      bad_perp++;
+      std::fill(W.begin() + idx, W.begin() + idx + n, 0.0);
+      std::vector<int> knn_idx(n);
+      std::iota(knn_idx.begin(), knn_idx.end(), 0);
+      std::partial_sort(
+          knn_idx.begin(),
+          knn_idx.begin() + std::max(static_cast<int>(floor(perplexity)), 1),
+          knn_idx.end(), [&D2i](int a, int b) { return D2i[a] < D2i[b]; });
+      for (int k = 0; k < std::max(static_cast<int>(floor(perplexity)), 1);
+           ++k) {
+        W[idx + knn_idx[k]] = 1.0;
+      }
+    }
+  }
+}
+
+// [[Rcpp::export]]
+List find_beta_cpp(const NumericMatrix &X, double perplexity = 15,
+                   double tol = 1e-5, int max_tries = 50,
+                   std::size_t n_threads = 1) {
+
+  std::size_t n = X.nrow();
+  std::size_t d = X.ncol();
+  double logU = log(perplexity);
+
+  std::vector<double> X_vec(n * d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      X_vec[i * d + j] = X(i, j);
+    }
+  }
+
+  std::vector<double> W(n * n, 0.0);
+  std::vector<double> beta(n, 0.0);
+  int bad_perp = 0;
+  if (n_threads > 1) {
+    std::size_t chunk_size = (n + n_threads - 1) / n_threads;
+    std::vector<std::thread> threads;
+    for (std::size_t t = 0; t < n_threads; ++t) {
+      std::size_t start_row = t * chunk_size;
+      std::size_t end_row = std::min(start_row + chunk_size, n);
+      threads.emplace_back(find_beta, std::cref(X_vec), n, d, perplexity, logU,
+                           tol, max_tries, std::ref(W), std::ref(beta),
+                           std::ref(bad_perp), start_row, end_row);
+    }
+    for (auto &thread : threads) {
+      thread.join();
+    }
+  } else {
+    find_beta(X_vec, n, d, perplexity, logU, tol, max_tries, W, beta, bad_perp,
+              0, n);
+  }
+
+  NumericMatrix W_mat(n, n);
+  std::copy(W.begin(), W.end(), W_mat.begin());
+  return List::create(Named("W") = transpose(W_mat), Named("beta") = beta,
+                      Named("bad_perp") = bad_perp);
+}

From a83817a92efc22bcd9b231486ba62b5653d63f09 Mon Sep 17 00:00:00 2001
From: James Melville <jlmelville@gmail.com>
Date: Sun, 28 Jul 2024 18:46:39 -0700
Subject: [PATCH 08/26] Fix 1/k knn if perplexity calibration fails

---
 smallvis/R/smallvis.R | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R
index b1f2dc3..05dad45 100644
--- a/smallvis/R/smallvis.R
+++ b/smallvis/R/smallvis.R
@@ -2211,13 +2211,15 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss",
       tries <- tries + 1
     }
 
     if (abs(Hdiff) > tol) {
+      # Put a weight of 1/perplexity on the perplexity-nearest neighbors
      bad_perp <- bad_perp + 1
      knn_idx <- order(Di, decreasing = FALSE)[1:max(floor(perplexity), 1)]
      knn_idx[knn_idx >= i] <- knn_idx[knn_idx >= i] + 1
-      Wi <- rep(0, length(Di))
-      Wi[knn_idx] <- 1
+      Wi <- rep(0, n)
+      Wi[knn_idx] <- 1 / floor(perplexity)
      intd[i] <- 0
+      W[i, ] <- Wi
     }
     else {
       # if we didn't supply estimates for beta manually, then initialize guess for
@@ -2227,8 +2229,9 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss",
         beta[i + 1] <- beta[i]
       }
       intd[i] <- intd_x2aff(Di, beta[i], Wi, sumWi, H)
+      W[i, -i] <- Wi
} - W[i, -i] <- Wi + } sigma <- sqrt(1 / beta) From 491d3722f2806e09c9c527861d9d37fda6d94512 Mon Sep 17 00:00:00 2001 From: James Melville Date: Wed, 31 Jul 2024 22:17:16 -0700 Subject: [PATCH 09/26] add multi-threaded MMDS gradient --- smallvis/R/RcppExports.R | 4 ++ smallvis/R/cost.R | 7 +++- smallvis/src/RcppExports.cpp | 16 ++++++++ smallvis/src/gradients.cpp | 72 ++++++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 1 deletion(-) diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R index 44391e2..3728f9e 100644 --- a/smallvis/R/RcppExports.R +++ b/smallvis/R/RcppExports.R @@ -21,6 +21,10 @@ tsne_grad_cpp <- function(P, W, Z, Y, n_threads) { .Call(`_smallvis_tsne_grad_cpp`, P, W, Z, Y, n_threads) } +mmds_grad_cpp <- function(R, D, Y, eps, n_threads) { + .Call(`_smallvis_mmds_grad_cpp`, R, D, Y, eps, n_threads) +} + find_beta_cpp <- function(X, perplexity = 15, tol = 1e-5, max_tries = 50L, n_threads = 1L) { .Call(`_smallvis_find_beta_cpp`, X, perplexity, tol, max_tries, n_threads) } diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R index ed58500..db81356 100644 --- a/smallvis/R/cost.R +++ b/smallvis/R/cost.R @@ -1114,7 +1114,12 @@ mmds <- function(eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 1) { }, gr = function(cost, Y) { cost <- cost_update(cost, Y) - cost$G <- k2g(Y, -4 * (cost$R - cost$D) / (cost$D + cost$eps)) + if (use_cpp) { + cost$G <- mmds_grad_cpp(cost$R, cost$D, Y, eps = eps, n_threads = n_threads) + } + else { + cost$G <- k2g(Y, -4 * (cost$R - cost$D) / (cost$D + cost$eps)) + } cost }, update = function(cost, Y) { diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp index 372b9ad..93960dc 100644 --- a/smallvis/src/RcppExports.cpp +++ b/smallvis/src/RcppExports.cpp @@ -73,6 +73,21 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// mmds_grad_cpp +NumericMatrix mmds_grad_cpp(const NumericMatrix& R, const NumericMatrix& D, const NumericMatrix& Y, double eps, std::size_t n_threads); +RcppExport SEXP _smallvis_mmds_grad_cpp(SEXP RSEXP, SEXP DSEXP, SEXP YSEXP, SEXP epsSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericMatrix& >::type R(RSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type D(DSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type Y(YSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(mmds_grad_cpp(R, D, Y, eps, n_threads)); + return rcpp_result_gen; +END_RCPP +} // find_beta_cpp List find_beta_cpp(const NumericMatrix& X, double perplexity, double tol, int max_tries, std::size_t n_threads); RcppExport SEXP _smallvis_find_beta_cpp(SEXP XSEXP, SEXP perplexitySEXP, SEXP tolSEXP, SEXP max_triesSEXP, SEXP n_threadsSEXP) { @@ -95,6 +110,7 @@ static const R_CallMethodDef CallEntries[] = { {"_smallvis_tweight_cpp", (DL_FUNC) &_smallvis_tweight_cpp, 2}, {"_smallvis_d2_to_tweight_cpp", (DL_FUNC) &_smallvis_d2_to_tweight_cpp, 2}, {"_smallvis_tsne_grad_cpp", (DL_FUNC) &_smallvis_tsne_grad_cpp, 5}, + {"_smallvis_mmds_grad_cpp", (DL_FUNC) &_smallvis_mmds_grad_cpp, 5}, {"_smallvis_find_beta_cpp", (DL_FUNC) &_smallvis_find_beta_cpp, 5}, {NULL, NULL, 0} }; diff --git a/smallvis/src/gradients.cpp b/smallvis/src/gradients.cpp index 5febb97..055202f 100644 --- a/smallvis/src/gradients.cpp +++ b/smallvis/src/gradients.cpp @@ -6,6 +6,28 @@ using namespace Rcpp; +void 
mmds_grad(const std::vector<double> &R, const std::vector<double> &D,
+               const std::vector<double> &Y, double eps,
+               std::vector<double> &gradient, std::size_t start,
+               std::size_t end, std::size_t n, std::size_t d) {
+
+  for (std::size_t i = start; i < end; ++i) {
+    const std::size_t i_d = i * d;
+    const std::size_t i_n = i * n;
+    for (std::size_t j = 0; j < n; ++j) {
+      if (i == j) {
+        continue;
+      }
+      const std::size_t ij = i_n + j;
+      double k_ij = 4.0 * (D[ij] - R[ij]) / (D[ij] + eps);
+      for (std::size_t k = 0; k < d; ++k) {
+        gradient[i_d + k] += k_ij * (Y[i_d + k] - Y[j * d + k]);
+      }
+    }
+  }
+}
+
+
 void tsne_grad(const std::vector<double> &P, const std::vector<double> &W,
                double Z, const std::vector<double> &Y,
                std::vector<double> &gradient, std::size_t start,
@@ -281,3 +303,53 @@ NumericMatrix tsne_grad_cpp(const NumericMatrix &P,
 
   return gradient;
 }
+// [[Rcpp::export]]
+NumericMatrix mmds_grad_cpp(const NumericMatrix &R,
+                            const NumericMatrix &D,
+                            const NumericMatrix &Y,
+                            double eps,
+                            std::size_t n_threads) {
+  std::size_t n = Y.nrow();
+  std::size_t d = Y.ncol();
+
+  std::vector<double> R_vec(R.begin(), R.end());
+  std::vector<double> D_vec(D.begin(), D.end());
+
+  std::vector<double> Y_vec(n * d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      Y_vec[i * d + j] = Y(i, j);
+    }
+  }
+
+  std::vector<double> gradient_vec(n * d, 0.0);
+
+  if (n_threads > 1) {
+    std::size_t chunk_size = (n + n_threads - 1) / n_threads;
+    std::vector<std::thread> threads;
+    for (std::size_t t = 0; t < n_threads; ++t) {
+      std::size_t start_row = t * chunk_size;
+      std::size_t end_row = std::min(start_row + chunk_size, n);
+      threads.emplace_back(mmds_grad, std::cref(R_vec), std::cref(D_vec),
+                           std::cref(Y_vec), eps, std::ref(gradient_vec), start_row,
+                           end_row, n, d);
+    }
+    for (auto &thread : threads) {
+      thread.join();
+    }
+  } else {
+    mmds_grad(R_vec, D_vec, Y_vec, eps, gradient_vec, 0, n, n, d);
+  }
+
+  NumericMatrix gradient(n, d);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < d; ++j) {
+      gradient(i, j) = gradient_vec[i * d + j];
+    }
+  }
+
+  return gradient;
+}
+
+

From 98e0c6f7102ca6de20e02e56b2a02e528673d63e Mon Sep 17 00:00:00 2001
From: James Melville
Date: Sat, 3 Aug 2024 11:15:50 -0700
Subject: [PATCH 10/26] move perplexity

---
 smallvis/R/perplexity.R | 434 ++++++++++++++++++++++++++++++++++++++++
 smallvis/R/smallvis.R   | 433 ---------------------------------------
 2 files changed, 434 insertions(+), 433 deletions(-)
 create mode 100644 smallvis/R/perplexity.R

diff --git a/smallvis/R/perplexity.R b/smallvis/R/perplexity.R
new file mode 100644
index 0000000..9dfe5d0
--- /dev/null
+++ b/smallvis/R/perplexity.R
@@ -0,0 +1,434 @@
+# Perplexity Calibration --------------------------------------------------
+
+# Calculates the input affinities from X, such that each normalized row of the
+# affinity matrix has the specified perplexity (within the supplied tolerance).
+# Returns a list containing the affinities, beta values and intrinsic
+# dimensionalities.
+# NB set default kernel to "exp" to get results closer to the tsne package.
+# This differs from the procedure in the t-SNE paper by exponentially weighting
+# the distances, rather than the squared distances.
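+#
+# A minimal usage sketch (not part of the package API; iris is just a
+# hypothetical stand-in for real input data):
+#   X <- as.matrix(iris[, 1:4])
+#   res <- x2aff(X, perplexity = 30)
+#   dim(res$W)               # n x n affinity matrix with a zero diagonal
+#   sigma <- sqrt(1 / res$beta)  # per-point Gaussian bandwidths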
+x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", + verbose = FALSE, guesses = NULL) { + x_is_dist <- methods::is(X, "dist") + if (x_is_dist) { + D <- X + n <- attr(D, "Size") + + D <- as.matrix(D) + if (kernel == "gauss") { + D <- D * D + } + } + else { + XX <- rowSums(X * X) + n <- nrow(X) + } + + nperps <- length(perplexity) + if (nperps > 1 && nperps != n) { + stop("Must provide one perplexity per point") + } + + if (!is.null(guesses) && length(guesses) != n) { + stop("Initial guess vector must match number of observations in X") + } + + W <- matrix(0, n, n) + intd <- rep(0, n) + if (!is.null(guesses)) { + beta <- guesses + } + else { + beta <- rep(1, n) + } + if (nperps == 1) { + logU <- log(perplexity) + } + else { + perps <- perplexity + } + bad_perp <- 0 + + for (i in 1:n) { + if (nperps > 1) { + perplexity <- perps[i] + logU <- log(perplexity) + } + betamin <- -Inf + betamax <- Inf + + if (x_is_dist) { + Di <- D[i, -i] + } + else { + Di <- (XX[i] + XX - 2 * colSums(tcrossprod(X[i, ], X)))[-i] + Di[Di < 0] <- 0 + if (kernel == "exp") { + Di <- sqrt(Di) + } + } + + # If we haven't been provided with guesses, then try the initialization used + # for all points in ELKI according to Schubert & Gertz in "Intrinsic + # t-Stochastic Neighbor Embedding for Visualization and Outlier Detection: A + # Remedy Against the Curse of Dimensionality?" Using the last optimized beta + # seems to be better most of the time based on my testing though, so we'll + # only use it for the first point. + if (is.null(guesses) && i == 1) { + beta[1] <- 0.5 * perplexity / mean(Di) + } + + sres <- shannon(Di, beta[i]) + H <- sres$H + Wi <- sres$W + sumWi <- sres$Z + + Hdiff <- H - logU + tries <- 0 + while (abs(Hdiff) > tol && tries < 50) { + if (Hdiff > 0) { + betamin <- beta[i] + if (is.infinite(betamax)) { + beta[i] <- beta[i] * 2 + } else { + beta[i] <- (beta[i] + betamax) / 2 + } + } else { + betamax <- beta[i] + if (is.infinite(betamin)) { + beta[i] <- beta[i] / 2 + } else { + beta[i] <- (beta[i] + betamin) / 2 + } + } + + sres <- shannon(Di, beta[i]) + H <- sres$H + Wi <- sres$W + sumWi <- sres$Z + + Hdiff <- H - logU + tries <- tries + 1 + } + if (abs(Hdiff) > tol) { + # Put a weight of 1/perplexity on the perplexity-nearest neighbors + bad_perp <- bad_perp + 1 + knn_idx <- order(Di, decreasing = FALSE)[1:max(floor(perplexity), 1)] + knn_idx[knn_idx >= i] <- knn_idx[knn_idx >= i] + 1 + Wi <- rep(0, n) + Wi[knn_idx] <- 1 / floor(perplexity) + + intd[i] <- 0 + W[i, ] <- Wi + } + else { + # if we didn't supply estimates for beta manually, then initialize guess for + # next point with optimized beta for this point: doesn't save many + # iterations, but why not? 
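+      # (the optimal beta tends to vary smoothly from point to point, so the
+      # previous solution is usually a good starting guess)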
+      if (is.null(guesses) && i < n) {
+        beta[i + 1] <- beta[i]
+      }
+      intd[i] <- intd_x2aff(Di, beta[i], Wi, sumWi, H)
+      W[i, -i] <- Wi
+    }
+
+  }
+  sigma <- sqrt(1 / beta)
+
+  if (bad_perp > 0) {
+    tsmessage("Warning: ", bad_perp, " perplexity calibrations failed!")
+    warning(bad_perp, " perplexity calibrations failed")
+  }
+
+  if (verbose) {
+    summarize(sigma, "sigma summary", verbose = verbose)
+    summarize(intd, "Dint", verbose = verbose)
+  }
+  list(W = W, beta = beta, dint = intd)
+}
+
+# Calculates affinities based on exponential weighting of D2 with beta
+# and returns a list containing:
+# W, the affinities; Z, the sum of the affinities; H, the Shannon entropy
+# This routine relies specifically on the input weights being exp(-beta * D2)
+# and calculates the Shannon entropy as log(Z) + beta * sum(W * D2) / Z
+# where Z is the sum of W.
+shannon <- function(D2, beta) {
+  W <- exp(-D2 * beta)
+  Z <- sum(W)
+
+  if (Z == 0) {
+    H <- 0
+  }
+  else {
+    H <- log(Z) + beta * sum(D2 * W) / Z
+  }
+  list(
+    W = W,
+    Z = Z,
+    H = H
+  )
+}
+
+x2aff_sigma <- function(X, sigma = 1e-3, verbose = FALSE, use_cpp = FALSE,
+                        n_threads = 1) {
+  x_is_dist <- methods::is(X, "dist")
+  if (x_is_dist) {
+    D <- X
+
+    D <- as.matrix(D)
+    D <- D * D
+  }
+  else {
+    D <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads)
+  }
+  beta <- 1 / (sigma * sigma)
+  sres <- shannon(D, beta)
+  W <- sres$W
+  diag(W) <- 0
+
+  list(W = W, beta = beta)
+}
+
+# Create a symmetrized distance matrix based on the k-nearest neighbors
+# Non-neighbor distances are set to Inf
+knn_dist <- function(X, k, n_threads, verbose) {
+  if (methods::is(X, "dist")) {
+    # If it's already a distance matrix, find k-smallest distance per column
+    # (ignoring self-distances of zero) and set everything larger to Inf
+    # (potentially more than k finite distances in the event of ties, not
+    # going to worry about that)
+    D <- as.matrix(X)
+    n <- nrow(D)
+    if (k > n - 1) {
+      stop("k must not be > n - 1")
+    }
+    kdists <- Rfast::colnth(D, rep(k + 1, n))
+    for (i in 1:n) {
+      Di <- D[, i]
+      Di[Di > kdists[i]] <- Inf
+      D[, i] <- Di
+    }
+  }
+  else {
+    # Find the k-nearest indexes and distances of X, and set the corresponding
+    # distance matrix elements
+    n <- nrow(X)
+    if (k > n - 1) {
+      stop("k must not be > n - 1")
+    }
+    tsmessage("Finding ", k + 1, " nearest neighbors")
+    knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads)
+    knn$idx <- knn$idx[, 2:(k + 1)]
+    knn$dist <- knn$dist[, 2:(k + 1)]
+
+    D <- matrix(Inf, nrow = n, ncol = n)
+    diag(D) <- 0
+    for (i in 1:n) {
+      D[i, knn$idx[i, ]] <- knn$dist[i, ]
+    }
+  }
+
+  # symmetrize
+  pmin(D, t(D))
+}
+
+# Create the knn graph: D[i, j] = 1 if j is one of i's k-nearest neighbors.
+# i is NOT considered a neighbor of itself.
+# No symmetrization is carried out.
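+# e.g. for three points on a line at 0, 1 and 3 with k = 1 (a sketch with a
+# hypothetical toy input):
+#   A <- knn_graph(dist(c(0, 1, 3)), k = 1, n_threads = 0, verbose = FALSE)
+#   # A[1, 2] == 1, A[2, 1] == 1 and A[3, 2] == 1, but A[2, 3] == 0:
+#   # point 3's nearest neighbor is point 2, not vice versa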
+# Used by knnmmds and knn kernel for SNE
+knn_graph <- function(X, k, n_threads, verbose) {
+  if (methods::is(X, "dist")) {
+    D <- as.matrix(X)
+    n <- nrow(D)
+    if (k > n - 1) {
+      stop("k must not be > n - 1")
+    }
+    kdists <- Rfast::colnth(D, rep(k + 1, n))
+    A <- matrix(0, nrow = n, ncol = n)
+    for (i in 1:n) {
+      Di <- D[i, ]
+      # mark i's k nearest neighbors: non-zero distances no larger than the
+      # (k + 1)-th smallest (which includes the self-distance of zero)
+      A[i, Di > 0 & Di <= kdists[i]] <- 1
+    }
+    D <- A
+  }
+  else {
+    # Find the k-nearest indexes and distances of X, and set the corresponding
+    # distance matrix elements
+    n <- nrow(X)
+    if (k > n - 1) {
+      stop("k must not be > n - 1")
+    }
+
+    tsmessage("Finding ", k + 1, " nearest neighbors")
+    knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads)
+    knn$idx <- knn$idx[, 2:(k + 1)]
+
+    D <- matrix(0, nrow = n, ncol = n)
+    for (i in 1:n) {
+      D[i, knn$idx[i, ]] <- 1
+    }
+  }
+  D
+}
+
+# Given data X and k nearest neighbors, return a geodesic distance matrix
+# Disconnections are treated by using the Euclidean distance.
+geodesic <- function(X, k, fill = TRUE, use_cpp = FALSE, n_threads = 0,
+                     verbose = FALSE) {
+  tsmessage("Calculating geodesic distances with k = ", k)
+
+  R <- knn_dist(X, k, n_threads = n_threads, verbose = verbose)
+  # The hard work is done by Rfast's implementation of Floyd's algorithm
+  G <- Rfast::floyd(R)
+  if (any(is.infinite(G)) && fill) {
+    tsmessage("k = ", k, " resulted in disconnections: filling with Euclidean distances")
+    if (methods::is(X, "dist")) {
+      R <- as.matrix(X)
+    }
+    else {
+      R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads)
+    }
+    G[is.infinite(G)] <- R[is.infinite(G)]
+  }
+  G
+}
+
+# Multiscale perplexities: P is an average over the results of multiple
+# perplexities
+# as described by de Bodt et al in
+# Perplexity-free t-SNE and twice Student tt-SNE (2018)
+msp <- function(X, perplexities = NULL, tol = 1e-5,
+                symmetrize = "symmetric", row_normalize = TRUE,
+                normalize = TRUE,
+                verbose = FALSE, guesses = NULL) {
+  if (methods::is(X, "dist")) {
+    n <- attr(X, "Size")
+  }
+  else {
+    n <- nrow(X)
+  }
+
+  if (is.null(perplexities)) {
+    perplexities <- idp_perps(n)
+  }
+  tsmessage("Calculating multi-scale P with perplexities from ",
+            formatC(perplexities[1]), " to ", formatC(last(perplexities)))
+
+  res <- NULL
+  for (perplexity in perplexities) {
+    tsmessage("Commencing calibration for perplexity = ",
+              format_perps(perplexity))
+    x2a_res <- x2aff(X = X, perplexity = perplexity, tol = tol, kernel = "gauss",
+                     verbose = verbose, guesses = guesses)
+    P <- x2a_res$W
+    Q <- scale_affinities(P,
+                          symmetrize = "symmetric", row_normalize = TRUE,
+                          normalize = TRUE)
+    if (is.null(res)) {
+      res$P <- Q
+    }
+    else {
+      res$P <- res$P + Q
+    }
+  }
+
+  if (length(perplexities) > 1) {
+    res$P <- res$P / length(perplexities)
+  }
+
+  if (is.logical(row_normalize)) {
+    tsmessage("Effective perplexity of multiscale P approx = ",
+              formatC(stats::median(perpp(res$P))))
+  }
+
+  res
+}
+
+# Use the Intrinsic Dimensionality Perplexity (IDP)
+# Scan through the provided perplexities and use the result which maximizes
+# the mean correlation dimension (which is an estimate for the intrinsic
+# dimensionality). Stops at the first maximum found.
+idp <- function(X, perplexities = NULL, tol = 1e-5, + verbose = FALSE, guesses = NULL) { + if (methods::is(X, "dist")) { + n <- attr(X, "Size") + } + else { + n <- nrow(X) + } + + if (is.null(perplexities)) { + perplexities <- idp_perps(n) + } + if (verbose) { + tsmessage("Searching for intrinsic dimensionality with perplexities from ", + formatC(perplexities[1]), " to ", formatC(last(perplexities))) + } + + corr_dim_max <- -Inf + idp <- 0 + idp_res <- NULL + for (perplexity in perplexities) { + if (verbose) { + tsmessage("Commencing calibration for perplexity = ", + format_perps(perplexity)) + } + x2a_res <- x2aff(X = X, perplexity = perplexity, tol = tol, kernel = "gauss", + verbose = verbose, guesses = guesses) + corr_dim <- mean(x2a_res$dint) + if (corr_dim <= corr_dim_max) { + break + } + else { + corr_dim_max <- corr_dim + idp <- perplexity + idp_res <- x2a_res + } + } + if (idp <= 0) { + stop("Unable to find an IDP: all correlation dimensions were -ve") + } + if (verbose) { + tsmessage("Found IDP at perplexity = ", formatC(idp), + " intrinsic dimensionality = ", formatC(corr_dim_max)) + } + + idp_res$idp <- idp + idp_res +} + +# Come up with a set of candidate perplexities for finding the Intrinsic +# Dimensionality Perplexity. Use powers of 2 up to around half the data set +# size, or a perplexity of 128, whichever is smaller. Tries to provide a balance +# of coverage of useful perplexities vs time consumption. +idp_perps <- function(n) { + max_u <- min(128, max(2, ceiling(n / 2))) + max_uexp <- floor(log2(max_u)) + min_uexp <- min(2, max_uexp) + 2 ^ (min_uexp:max_uexp) +} + +# Is the perplexity argument a string or a list with the first element is a +# string? +perp_method <- function(perplexity) { + method <- "" + if (is.character(perplexity) || is.list(perplexity)) { + if (is.list(perplexity)) { + method <- perplexity[[1]] + } + else { + method <- perplexity + } + } + tolower(method) +} + +# If the user provided a list like ("idp", c(10, 20, 30)), extract the list +# of numbers as the candidate list of perplexities. Otherwise, return NULL +user_idp_perps <- function(perplexity) { + perplexities <- NULL + if (is.list(perplexity) && length(perplexity) == 2) { + perplexities <- perplexity[[2]] + } + perplexities +} \ No newline at end of file diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R index 05dad45..1eb4e43 100644 --- a/smallvis/R/smallvis.R +++ b/smallvis/R/smallvis.R @@ -2098,440 +2098,7 @@ pca_whiten <- function(X, ncol = min(dim(X)), eps = 1e-5, verbose = FALSE) { } -# Perplexity Calibration -------------------------------------------------- - -# Calculates the input affinities from X, such that each normalized row of the -# affinity matrix has the specified perplexity (within the supplied tolerance). -# Returns a list containing the affinities, beta values and intrinsic -# dimensionalities. -# NB set default kernel to "exp" to get results closer to the tsne package. -# This differs from the procedure in the t-SNE paper by exponentially weighting -# the distances, rather than the squared distances. 
-x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", - verbose = FALSE, guesses = NULL) { - x_is_dist <- methods::is(X, "dist") - if (x_is_dist) { - D <- X - n <- attr(D, "Size") - - D <- as.matrix(D) - if (kernel == "gauss") { - D <- D * D - } - } - else { - XX <- rowSums(X * X) - n <- nrow(X) - } - - nperps <- length(perplexity) - if (nperps > 1 && nperps != n) { - stop("Must provide one perplexity per point") - } - - if (!is.null(guesses) && length(guesses) != n) { - stop("Initial guess vector must match number of observations in X") - } - - W <- matrix(0, n, n) - intd <- rep(0, n) - if (!is.null(guesses)) { - beta <- guesses - } - else { - beta <- rep(1, n) - } - if (nperps == 1) { - logU <- log(perplexity) - } - else { - perps <- perplexity - } - bad_perp <- 0 - - for (i in 1:n) { - if (nperps > 1) { - perplexity <- perps[i] - logU <- log(perplexity) - } - betamin <- -Inf - betamax <- Inf - - if (x_is_dist) { - Di <- D[i, -i] - } - else { - Di <- (XX[i] + XX - 2 * colSums(tcrossprod(X[i, ], X)))[-i] - Di[Di < 0] <- 0 - if (kernel == "exp") { - Di <- sqrt(Di) - } - } - - # If we haven't been provided with guesses, then try the initialization used - # for all points in ELKI according to Schubert & Gertz in "Intrinsic - # t-Stochastic Neighbor Embedding for Visualization and Outlier Detection: A - # Remedy Against the Curse of Dimensionality?" Using the last optimized beta - # seems to be better most of the time based on my testing though, so we'll - # only use it for the first point. - if (is.null(guesses) && i == 1) { - beta[1] <- 0.5 * perplexity / mean(Di) - } - - sres <- shannon(Di, beta[i]) - H <- sres$H - Wi <- sres$W - sumWi <- sres$Z - - Hdiff <- H - logU - tries <- 0 - while (abs(Hdiff) > tol && tries < 50) { - if (Hdiff > 0) { - betamin <- beta[i] - if (is.infinite(betamax)) { - beta[i] <- beta[i] * 2 - } else { - beta[i] <- (beta[i] + betamax) / 2 - } - } else { - betamax <- beta[i] - if (is.infinite(betamin)) { - beta[i] <- beta[i] / 2 - } else { - beta[i] <- (beta[i] + betamin) / 2 - } - } - - sres <- shannon(Di, beta[i]) - H <- sres$H - Wi <- sres$W - sumWi <- sres$Z - - Hdiff <- H - logU - tries <- tries + 1 - } - if (abs(Hdiff) > tol) { - # Put a weight of 1/perplexity on the perplexity-nearest neighbors - bad_perp <- bad_perp + 1 - knn_idx <- order(Di, decreasing = FALSE)[1:max(floor(perplexity), 1)] - knn_idx[knn_idx >= i] <- knn_idx[knn_idx >= i] + 1 - Wi <- rep(0, n) - Wi[knn_idx] <- 1 / floor(perplexity) - - intd[i] <- 0 - W[i, ] <- Wi - } - else { - # if we didn't supply estimates for beta manually, then initialize guess for - # next point with optimized beta for this point: doesn't save many - # iterations, but why not? - if (is.null(guesses) && i < n) { - beta[i + 1] <- beta[i] - } - intd[i] <- intd_x2aff(Di, beta[i], Wi, sumWi, H) - W[i, -i] <- Wi - } - - } - sigma <- sqrt(1 / beta) - if (bad_perp > 0) { - tsmessage("Warning: ", bad_perp, " perplexity calibrations failed!") - warning(bad_perp, " perplexity calibrations failed") - } - - if (verbose) { - summarize(sigma, "sigma summary", verbose = verbose) - summarize(intd, "Dint", verbose = verbose) - } - list(W = W, beta = beta, dint = intd) -} - -# Calculates affinites based on exponential weighting of D2 with beta -# and returns a list containing: -# W, the affinities; Z, the sum of the affinities; H, the Shannon entropy -# This routine relies specifically on input weights being = exp(-beta * D) -# and calculates the Shannon entropy as log(Z) + beta * sum(W * D) / Z -# where Z is the sum of W. 
-shannon <- function(D2, beta) { - W <- exp(-D2 * beta) - Z <- sum(W) - - if (Z == 0) { - H <- 0 - } - else { - H <- log(Z) + beta * sum(D2 * W) / Z - } - list( - W = W, - Z = Z, - H = H - ) -} - -x2aff_sigma <- function(X, sigma = 1e-3, verbose = FALSE, use_cpp = FALSE, - n_threads = 1) { - x_is_dist <- methods::is(X, "dist") - if (x_is_dist) { - D <- X - - D <- as.matrix(D) - D <- D * D - } - else { - D <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) - } - beta <- 1 / (sigma * sigma) - sres <- shannon(D, beta) - W <- sres$W - diag(W) <- 0 - - list(W = W, beta = beta) -} - -# Create a symmetrized distance matrix based on the k-nearest neighbors -# Non-neighbor distances are set to Inf -knn_dist <- function(X, k, n_threads, verbose) { - if (methods::is(X, "dist")) { - # If it's already a distance matrix, find k-smallest distance per column - # (ignoring self-distances of zero) and set everything larger to Inf - # (potentially more than k finite distances in the event of ties, not - # going to worry about that) - D <- as.matrix(X) - n <- nrow(D) - if (k > n - 1) { - stop("k must be not be > n - 1") - } - kdists <- Rfast::colnth(D, rep(k + 1, n)) - for (i in 1:n) { - Di <- D[, i] - Di[Di > kdists[i]] <- Inf - D[, i] <- Di - } - } - else { - # Find the k-nearest indexes and distances of X, and set the corresponding - # distance matrix elements - n <- nrow(X) - if (k > n - 1) { - stop("k must be not be > n - 1") - } - tsmessage("Finding ", k + 1, " nearest neighbors") - knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads) - knn$idx <- knn$idx[, 2:(k + 1)] - knn$dist <- knn$dist[, 2:(k + 1)] - - D <- matrix(Inf, nrow = n, ncol = n) - diag(D) <- 0 - for (i in 1:n) { - D[i, knn$idx[i, ]] <- knn$dist[i, ] - } - } - - # symmetrize - pmin(D, t(D)) -} - -# Create the knn graph: D[i, j] = 1 if j is one of i's k-nearest neighbors. -# i is NOT considered a neighbor of itself. -# No symmetrization is carried out. -# Used by knnmmds and knn kernel for SNE -knn_graph <- function(X, k, n_threads, verbose) { - if (methods::is(X, "dist")) { - D <- as.matrix(X) - n <- nrow(D) - if (k > n - 1) { - stop("k must be not be > n - 1") - } - kdists <- Rfast::colnth(D, rep(k + 1, n)) - for (i in 1:n) { - Di <- D[, i] - Di[Di > kdists[i]] <- 0 - D[, i] <- 1 - } - } - else { - # Find the k-nearest indexes and distances of X, and set the corresponding - # distance matrix elements - n <- nrow(X) - if (k > n - 1) { - stop("k must be not be > n - 1") - } - - tsmessage("Finding ", k + 1, " nearest neighbors") - knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads) - knn$idx <- knn$idx[, 2:(k + 1)] - - D <- matrix(0, nrow = n, ncol = n) - for (i in 1:n) { - D[i, knn$idx[i, ]] <- 1 - } - } - D -} - -# Given data X and k nearest neighbors, return a geodisic distance matrix -# Disconnections are treated by using the Euclidean distance. 
-geodesic <- function(X, k, fill = TRUE, use_cpp = FALSE, n_threads = 0, - verbose = FALSE) { - tsmessage("Calculating geodesic distances with k = ", k) - - R <- knn_dist(X, k, n_threads = n_threads, verbose = verbose) - # The hard work is done by Rfast's implementation of Floyd's algorithm - G <- Rfast::floyd(R) - if (any(is.infinite(G)) && fill) { - tsmessage("k = ", k, " resulted in disconnections: filling with Euclidean distances") - if (methods::is(X, "dist")) { - R <- as.matrix(X) - } - else { - R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) - } - G[is.infinite(G)] <- R[is.infinite(G)] - } - G -} - -# Multiscale perplexities: P is an average over the results of multiple -# perplexities -# as described by de Bodt et al in -# Perplexity-free t-SNE and twice Student tt-SNE (2018) -msp <- function(X, perplexities = NULL, tol = 1e-5, - symmetrize = "symmetric", row_normalize = TRUE, - normalize = TRUE, - verbose = FALSE, guesses = NULL) { - if (methods::is(X, "dist")) { - n <- attr(X, "Size") - } - else { - n <- nrow(X) - } - - if (is.null(perplexities)) { - perplexities <- idp_perps(n) - } - tsmessage("Calculating multi-scale P with perplexities from ", - formatC(perplexities[1]), " to ", formatC(last(perplexities))) - - res <- NULL - for (perplexity in perplexities) { - tsmessage("Commencing calibration for perplexity = ", - format_perps(perplexity)) - x2a_res <- x2aff(X = X, perplexity = perplexity, tol = tol, kernel = "gauss", - verbose = verbose, guesses = guesses) - P <- x2a_res$W - Q <- scale_affinities(P, - symmetrize = "symmetric", row_normalize = TRUE, - normalize = TRUE) - if (is.null(res)) { - res$P <- Q - } - else { - res$P <- res$P + Q - } - } - - if (length(perplexities) > 1) { - res$P <- res$P / length(perplexities) - } - - if (is.logical(row_normalize)) { - tsmessage("Effective perplexity of multiscale P approx = ", - formatC(stats::median(perpp(res$P)))) - } - - res -} - -# Use the Intrinsic Dimensionality Perplexity (IDP) -# Scan through the provided perplexities and use the result which maximizes -# the mean correlation dimension (which is an estimate for the intrinsic -# dimensionality). Stops at the first maxmimum found. -idp <- function(X, perplexities = NULL, tol = 1e-5, - verbose = FALSE, guesses = NULL) { - if (methods::is(X, "dist")) { - n <- attr(X, "Size") - } - else { - n <- nrow(X) - } - - if (is.null(perplexities)) { - perplexities <- idp_perps(n) - } - if (verbose) { - tsmessage("Searching for intrinsic dimensionality with perplexities from ", - formatC(perplexities[1]), " to ", formatC(last(perplexities))) - } - - corr_dim_max <- -Inf - idp <- 0 - idp_res <- NULL - for (perplexity in perplexities) { - if (verbose) { - tsmessage("Commencing calibration for perplexity = ", - format_perps(perplexity)) - } - x2a_res <- x2aff(X = X, perplexity = perplexity, tol = tol, kernel = "gauss", - verbose = verbose, guesses = guesses) - corr_dim <- mean(x2a_res$dint) - if (corr_dim <= corr_dim_max) { - break - } - else { - corr_dim_max <- corr_dim - idp <- perplexity - idp_res <- x2a_res - } - } - if (idp <= 0) { - stop("Unable to find an IDP: all correlation dimensions were -ve") - } - if (verbose) { - tsmessage("Found IDP at perplexity = ", formatC(idp), - " intrinsic dimensionality = ", formatC(corr_dim_max)) - } - - idp_res$idp <- idp - idp_res -} - -# Come up with a set of candidate perplexities for finding the Intrinsic -# Dimensionality Perplexity. Use powers of 2 up to around half the data set -# size, or a perplexity of 128, whichever is smaller. 
Tries to provide a balance -# of coverage of useful perplexities vs time consumption. -idp_perps <- function(n) { - max_u <- min(128, max(2, ceiling(n / 2))) - max_uexp <- floor(log2(max_u)) - min_uexp <- min(2, max_uexp) - 2 ^ (min_uexp:max_uexp) -} - -# Is the perplexity argument a string or a list with the first element is a -# string? -perp_method <- function(perplexity) { - method <- "" - if (is.character(perplexity) || is.list(perplexity)) { - if (is.list(perplexity)) { - method <- perplexity[[1]] - } - else { - method <- perplexity - } - } - tolower(method) -} - -# If the user provided a list like ("idp", c(10, 20, 30)), extract the list -# of numbers as the candidate list of perplexities. Otherwise, return NULL -user_idp_perps <- function(perplexity) { - perplexities <- NULL - if (is.list(perplexity) && length(perplexity) == 2) { - perplexities <- perplexity[[2]] - } - perplexities -} # Utility Functions ------------------------------------------------------- From 88b2a641ecaa4fcf05bde2007829ff70b93a3bb1 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 11:23:29 -0700 Subject: [PATCH 11/26] move utils --- smallvis/R/smallvis.R | 146 ------------------------------------------ smallvis/R/util.R | 142 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 146 deletions(-) create mode 100644 smallvis/R/util.R diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R index 1eb4e43..3c6c1c6 100644 --- a/smallvis/R/smallvis.R +++ b/smallvis/R/smallvis.R @@ -2097,152 +2097,6 @@ pca_whiten <- function(X, ncol = min(dim(X)), eps = 1e-5, verbose = FALSE) { sweep(pca$scores, 2, sqrt(pca$lambda + eps), "/") } - - - -# Utility Functions ------------------------------------------------------- - -# Create matrix of squared Euclidean distances -# For low dimension, X %*% t(X) seems to a bit faster than tcrossprod(X) -# Small -ve distances are possible -dist2 <- function(X) { - D2 <- rowSums(X * X) - D2 + sweep(X %*% t(X) * -2, 2, t(D2), `+`) -} - -calc_d2 <- function(X, use_cpp = FALSE, n_threads = 1) { - if (use_cpp) { - dist2_cpp(X, n_threads = n_threads) - } - else { - safe_dist2(X) - } -} - -calc_d <- function(X, use_cpp = FALSE, n_threads = 1) { - if (use_cpp) { - dist_cpp(X, n_threads = n_threads) - } - else { - sqrt(safe_dist2(X)) - } -} - -# Squared Euclidean distances, ensuring no small -ve distances can occur -safe_dist2 <- function(X) { - D2 <- dist2(X) - D2[D2 < 0] <- 0 - D2 -} - -calc_d2tweight <- function(D2, use_cpp = FALSE, n_threads = 1) { - if (use_cpp) { - d2_to_tweight_cpp(D2, n_threads = n_threads) - } - else { - 1 / (1 + D2) - } -} - -calc_tweight <- function(X, use_cpp = FALSE, n_threads = 1) { - if (use_cpp) { - tweight_cpp(X, n_threads = n_threads) - } - else { - D2 <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) - 1 / (1 + D2) - } -} - -# 2-norm of a vector or matrix -norm2 <- function(X) { - sqrt(sum(X * X)) -} - -# Simple time stamp -stime <- function() { - format(Sys.time(), "%T") -} - -# message with a time stamp -tsmessage <- function(..., domain = NULL, appendLF = TRUE, force = FALSE, - time_stamp = TRUE) { - verbose <- get0("verbose", envir = sys.parent()) - - if (force || (!is.null(verbose) && verbose)) { - msg <- "" - if (time_stamp) { - msg <- paste0(stime(), " ") - } - message(msg, ..., domain = domain, appendLF = appendLF) - utils::flush.console() - } -} - -# merge lists, where anything non-NULL in l is kept -# e.g. 
-# all(unlist(update_list(list(a = 1, b = 2), list(a = 10, c = 3))) == -# unlist(list(a = 1, b = 2, c = 3))) -lmerge <- function(l, l2) { - for (name in names(l2)) { - if (is.null(l[[name]])) { - l[[name]] <- l2[[name]] - } - } - l -} - -# replaces the contents of l with the named arguments -# e.g. -# all(lreplace(c(a = 1, b = 2), a = 10, c = 3) == -# c(a = 10, b = 2, c = 3)) -lreplace <- function(l, ...) { - varargs <- list(...) - for (i in names(varargs)) { - l[[i]] <- varargs[[i]] - } - l -} - -# relative tolerance between x and y -reltol <- function(x, y) { - abs(x - y) / min(abs(x), abs(y)) -} - -# Check if a value is non-null and true -nnat <- function(x) { - !is.null(x) && is.logical(x) && x -} - -# log vector information -summarize <- function(X, msg = "", verbose = FALSE) { - summary_X <- summary(X, digits = max(3, getOption("digits") - 3)) - tsmessage(msg, ": ", paste(names(summary_X), ":", summary_X, "|", - collapse = "")) -} - -# Format perplexity as a string. Could be a scalar or a vector. In the latter -# case, just list the first two values and then ellipses -format_perps <- function(perplexity) { - if (length(perplexity) > 1) { - paste0(formatC(perplexity[1]), ", ", - formatC(perplexity[2]), "...") - } - else { - formatC(perplexity) - } -} - -# last item of a vector -last <- function(x) { - x[length(x)] -} - -# remove NULL items from a list -remove_nulls <- function(l) { - l[!vapply(l, is.null, logical(1))] -} - # UMAP ------------------------------------------------------------------- # Fits a kernel for the output distances of the form w = 1 / (1 + a dsq ^ b) diff --git a/smallvis/R/util.R b/smallvis/R/util.R new file mode 100644 index 0000000..9d0efc6 --- /dev/null +++ b/smallvis/R/util.R @@ -0,0 +1,142 @@ +# Utility Functions ------------------------------------------------------- + +# Create matrix of squared Euclidean distances +# For low dimension, X %*% t(X) seems to a bit faster than tcrossprod(X) +# Small -ve distances are possible +dist2 <- function(X) { + D2 <- rowSums(X * X) + D2 + sweep(X %*% t(X) * -2, 2, t(D2), `+`) +} + +calc_d2 <- function(X, use_cpp = FALSE, n_threads = 1) { + if (use_cpp) { + dist2_cpp(X, n_threads = n_threads) + } + else { + safe_dist2(X) + } +} + +calc_d <- function(X, use_cpp = FALSE, n_threads = 1) { + if (use_cpp) { + dist_cpp(X, n_threads = n_threads) + } + else { + sqrt(safe_dist2(X)) + } +} + +# Squared Euclidean distances, ensuring no small -ve distances can occur +safe_dist2 <- function(X) { + D2 <- dist2(X) + D2[D2 < 0] <- 0 + D2 +} + +calc_d2tweight <- function(D2, use_cpp = FALSE, n_threads = 1) { + if (use_cpp) { + d2_to_tweight_cpp(D2, n_threads = n_threads) + } + else { + 1 / (1 + D2) + } +} + +calc_tweight <- function(X, use_cpp = FALSE, n_threads = 1) { + if (use_cpp) { + tweight_cpp(X, n_threads = n_threads) + } + else { + D2 <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) + 1 / (1 + D2) + } +} + +# 2-norm of a vector or matrix +norm2 <- function(X) { + sqrt(sum(X * X)) +} + +# Simple time stamp +stime <- function() { + format(Sys.time(), "%T") +} + +# message with a time stamp +tsmessage <- function(..., domain = NULL, appendLF = TRUE, force = FALSE, + time_stamp = TRUE) { + verbose <- get0("verbose", envir = sys.parent()) + + if (force || (!is.null(verbose) && verbose)) { + msg <- "" + if (time_stamp) { + msg <- paste0(stime(), " ") + } + message(msg, ..., domain = domain, appendLF = appendLF) + utils::flush.console() + } +} + +# merge lists, where anything non-NULL in l is kept +# e.g. 
+# all(unlist(update_list(list(a = 1, b = 2), list(a = 10, c = 3))) == +# unlist(list(a = 1, b = 2, c = 3))) +lmerge <- function(l, l2) { + for (name in names(l2)) { + if (is.null(l[[name]])) { + l[[name]] <- l2[[name]] + } + } + l +} + +# replaces the contents of l with the named arguments +# e.g. +# all(lreplace(c(a = 1, b = 2), a = 10, c = 3) == +# c(a = 10, b = 2, c = 3)) +lreplace <- function(l, ...) { + varargs <- list(...) + for (i in names(varargs)) { + l[[i]] <- varargs[[i]] + } + l +} + +# relative tolerance between x and y +reltol <- function(x, y) { + abs(x - y) / min(abs(x), abs(y)) +} + +# Check if a value is non-null and true +nnat <- function(x) { + !is.null(x) && is.logical(x) && x +} + +# log vector information +summarize <- function(X, msg = "", verbose = FALSE) { + summary_X <- summary(X, digits = max(3, getOption("digits") - 3)) + tsmessage(msg, ": ", paste(names(summary_X), ":", summary_X, "|", + collapse = "")) +} + +# Format perplexity as a string. Could be a scalar or a vector. In the latter +# case, just list the first two values and then ellipses +format_perps <- function(perplexity) { + if (length(perplexity) > 1) { + paste0(formatC(perplexity[1]), ", ", + formatC(perplexity[2]), "...") + } + else { + formatC(perplexity) + } +} + +# last item of a vector +last <- function(x) { + x[length(x)] +} + +# remove NULL items from a list +remove_nulls <- function(l) { + l[!vapply(l, is.null, logical(1))] +} \ No newline at end of file From fca3880c587939cf7e2faf0f23c7bc60c37266b0 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 11:25:20 -0700 Subject: [PATCH 12/26] move init code --- smallvis/R/init.R | 159 ++++++++++++++++++++++++++++++++++++++++++ smallvis/R/smallvis.R | 156 ----------------------------------------- 2 files changed, 159 insertions(+), 156 deletions(-) create mode 100644 smallvis/R/init.R diff --git a/smallvis/R/init.R b/smallvis/R/init.R new file mode 100644 index 0000000..2c83c43 --- /dev/null +++ b/smallvis/R/init.R @@ -0,0 +1,159 @@ +# Output Initialization --------------------------------------------------- + +# Initialization of the output coordinates +init_out <- function(Y_init, X, n, ndim, pca_preprocessed, verbose = FALSE) { + switch(Y_init, + pca = { + tsmessage("Initializing from PCA scores") + pca_init(X, ndim, pca_preprocessed, verbose) + }, + rand = { + tsmessage("Initializing from random Gaussian") + matrix(stats::rnorm(ndim * n, sd = 1), n) + } + ) +} + +pca_init <- function(X, ndim, pca_preprocessed, verbose = FALSE) { + # If we've already done PCA, we can just take the first two columns + if (pca_preprocessed) { + X[, 1:2] + } + else { + pca_scores(X, ncol = ndim, verbose = verbose) + } +} + +# Laplacian Eigenmap (Belkin & Niyogi, 2002) +# Original formulation solves the generalized eigenvalue problem of the +# unnormalized graph Laplacian: L v = lambda D v, where L = D - A +# and uses the bottom eigenvectors v that result +# (ignoring the constant eigenvector associated with the smallest eigenvalue). +# +# This is equivalent to using the top eigenvectors from the usual +# eigendecomposition of a row-normalized Laplacian P = D^-1 A: P v = lambda' v +# so we don't need to depend on an external package for generalized eigenvalues. +# Note that while the eigenvectors are the same, the eigenvalues are +# different: lambda' = 1 - lambda, but we don't use them with Laplacian +# Eigenmaps anyway. +# +# As we only need to calculate the top ndim + 1 eigenvectors (i.e. 
normally 3) +# it's incredibly wasteful to calculate all of them. Therefore, if the +# RSpectra library is available, we use that instead, which allows for only the +# top eigenvectors to be extracted. Otherwise, use the slower eigen routine. +# A must be symmetric and positive semi definite, but not necessarily +# normalized in any specific way. +laplacian_eigenmap <- function(A, ndim = 2, use_RSpectra = TRUE) { + # Equivalent to: D <- diag(colSums(A)); M <- solve(D) %*% A + # This effectively row-normalizes A: colSums is normally faster than rowSums + # and because A is symmetric, they're equivalent + M <- A / colSums(A) + if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, + warn.conflicts = FALSE)) { + tsmessage("Using RSpectra for eigenvectors") + Re(RSpectra::eigs(M, k = ndim + 1)$vectors[, 2:(ndim + 1)]) + } + else { + tsmessage("Using eigen for eigenvectors") + eigen(M, symmetric = FALSE)$vectors[, 2:(ndim + 1)] + } +} + +# Use a normalized Laplacian. The UMAP approach, taken from version 0.2.1. +normalized_spectral_init <- function(A, ndim = 2, use_RSpectra = TRUE) { + n <- nrow(A) + # Normalized Laplacian: clear and close to UMAP code, but very slow in R + # I <- diag(1, nrow = n, ncol = n) + # D <- diag(1 / sqrt(colSums(A))) + # L <- I - D %*% A %*% D + + # A lot faster (order of magnitude when n = 1000) + Dsq <- sqrt(colSums(A)) + L <- -t(A / Dsq) / Dsq + diag(L) <- 1 + diag(L) + + if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, + warn.conflicts = FALSE)) { + tsmessage("Using RSpectra for eigenvectors") + k <- ndim + 1 + ncv <- max(2 * k + 1, floor(sqrt(n))) + opt <- list( + ncv = ncv, + maxitr = 5 * n, + tol = 1e-4 + ) + res <- RSpectra::eigs(L, k = k, which = "SM", opt = opt) + vec_indices <- rev(order(res$values, decreasing = TRUE)[1:ndim]) + res <- Re(res$vectors[, vec_indices]) + } + else { + tsmessage("Using eigen for eigenvectors") + res <- eigen(L, symmetric = FALSE) + vec_indices <- order(res$values, decreasing = FALSE)[2:(ndim + 1)] + res <- Re(res$vectors[, vec_indices]) + } + res +} + +is_spectral_init <- function(init) { + tolower(init) %in% c("laplacian", "normlaplacian") +} + +# Rescale embedding so that the standard deviation is the specified value. +# Default gives initialization like t-SNE, but not random. +shrink_coords <- function(X, sdev = 1e-4) { + scale(X, scale = apply(X, 2, stats::sd) / sdev) +} + +# PCA --------------------------------------------------------------------- + + +# Calculates a matrix containing the first ncol columns of the PCA scores. 
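+# (e.g. pca_scores(X, ncol = 2) gives the 2D scores that pca_init above
+# uses for the "pca" initialization).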
+# Returns the score matrix unless ret_extra is TRUE, in which case a list +# is returned also containing the eigenvalues +pca_scores <- function(X, ncol = min(dim(X)), verbose = FALSE, + ret_extra = FALSE) { + if (methods::is(X, "dist")) { + res_mds <- stats::cmdscale(X, x.ret = TRUE, eig = TRUE, k = ncol) + + if (ret_extra || verbose) { + lambda <- res_mds$eig + varex <- sum(lambda[1:ncol]) / sum(lambda) + tsmessage("PCA (using classical MDS): ", ncol, " components explained ", + formatC(varex * 100), "% variance") + } + scores <- res_mds$points + } + else { + X <- scale(X, center = TRUE, scale = FALSE) + # do SVD on X directly rather than forming covariance matrix + s <- svd(X, nu = ncol, nv = 0) + D <- diag(c(s$d[1:ncol])) + if (verbose || ret_extra) { + # calculate eigenvalues of covariance matrix from singular values + lambda <- (s$d ^ 2) / (nrow(X) - 1) + varex <- sum(lambda[1:ncol]) / sum(lambda) + tsmessage("PCA: ", ncol, " components explained ", formatC(varex * 100), + "% variance") + } + scores <- s$u %*% D + } + + if (ret_extra) { + list( + scores = scores, + lambda = lambda[1:ncol] + ) + } + else { + scores + } +} + +# Whiten the data by PCA. This both reduces the dimensionality, but also +# scales the scores by the inverse square root of the equivalent eigenvalue +# so that the variance of each column is 1. +pca_whiten <- function(X, ncol = min(dim(X)), eps = 1e-5, verbose = FALSE) { + pca <- pca_scores(X, ncol = ncol, verbose = verbose, ret_extra = TRUE) + sweep(pca$scores, 2, sqrt(pca$lambda + eps), "/") +} diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R index 3c6c1c6..d2932a6 100644 --- a/smallvis/R/smallvis.R +++ b/smallvis/R/smallvis.R @@ -1746,112 +1746,7 @@ pca_preprocess <- function(X, pca, whiten, initial_dims, verbose = FALSE) { X } -# Output Initialization --------------------------------------------------- - -# Initialization of the output coordinates -init_out <- function(Y_init, X, n, ndim, pca_preprocessed, verbose = FALSE) { - switch(Y_init, - pca = { - tsmessage("Initializing from PCA scores") - pca_init(X, ndim, pca_preprocessed, verbose) - }, - rand = { - tsmessage("Initializing from random Gaussian") - matrix(stats::rnorm(ndim * n, sd = 1), n) - } - ) -} -pca_init <- function(X, ndim, pca_preprocessed, verbose = FALSE) { - # If we've already done PCA, we can just take the first two columns - if (pca_preprocessed) { - X[, 1:2] - } - else { - pca_scores(X, ncol = ndim, verbose = verbose) - } -} - -# Laplacian Eigenmap (Belkin & Niyogi, 2002) -# Original formulation solves the generalized eigenvalue problem of the -# unnormalized graph Laplacian: L v = lambda D v, where L = D - A -# and uses the bottom eigenvectors v that result -# (ignoring the constant eigenvector associated with the smallest eigenvalue). -# -# This is equivalent to using the top eigenvectors from the usual -# eigendecomposition of a row-normalized Laplacian P = D^-1 A: P v = lambda' v -# so we don't need to depend on an external package for generalized eigenvalues. -# Note that while the eigenvectors are the same, the eigenvalues are -# different: lambda' = 1 - lambda, but we don't use them with Laplacian -# Eigenmaps anyway. -# -# As we only need to calculate the top ndim + 1 eigenvectors (i.e. normally 3) -# it's incredibly wasteful to calculate all of them. Therefore, if the -# RSpectra library is available, we use that instead, which allows for only the -# top eigenvectors to be extracted. Otherwise, use the slower eigen routine. 
-# A must be symmetric and positive semi definite, but not necessarily -# normalized in any specific way. -laplacian_eigenmap <- function(A, ndim = 2, use_RSpectra = TRUE) { - # Equivalent to: D <- diag(colSums(A)); M <- solve(D) %*% A - # This effectively row-normalizes A: colSums is normally faster than rowSums - # and because A is symmetric, they're equivalent - M <- A / colSums(A) - if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, - warn.conflicts = FALSE)) { - tsmessage("Using RSpectra for eigenvectors") - Re(RSpectra::eigs(M, k = ndim + 1)$vectors[, 2:(ndim + 1)]) - } - else { - tsmessage("Using eigen for eigenvectors") - eigen(M, symmetric = FALSE)$vectors[, 2:(ndim + 1)] - } -} - -# Use a normalized Laplacian. The UMAP approach, taken from version 0.2.1. -normalized_spectral_init <- function(A, ndim = 2, use_RSpectra = TRUE) { - n <- nrow(A) - # Normalized Laplacian: clear and close to UMAP code, but very slow in R - # I <- diag(1, nrow = n, ncol = n) - # D <- diag(1 / sqrt(colSums(A))) - # L <- I - D %*% A %*% D - - # A lot faster (order of magnitude when n = 1000) - Dsq <- sqrt(colSums(A)) - L <- -t(A / Dsq) / Dsq - diag(L) <- 1 + diag(L) - - if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, - warn.conflicts = FALSE)) { - tsmessage("Using RSpectra for eigenvectors") - k <- ndim + 1 - ncv <- max(2 * k + 1, floor(sqrt(n))) - opt <- list( - ncv = ncv, - maxitr = 5 * n, - tol = 1e-4 - ) - res <- RSpectra::eigs(L, k = k, which = "SM", opt = opt) - vec_indices <- rev(order(res$values, decreasing = TRUE)[1:ndim]) - res <- Re(res$vectors[, vec_indices]) - } - else { - tsmessage("Using eigen for eigenvectors") - res <- eigen(L, symmetric = FALSE) - vec_indices <- order(res$values, decreasing = FALSE)[2:(ndim + 1)] - res <- Re(res$vectors[, vec_indices]) - } - res -} - -is_spectral_init <- function(init) { - tolower(init) %in% c("laplacian", "normlaplacian") -} - -# Rescale embedding so that the standard deviation is the specified value. -# Default gives initialization like t-SNE, but not random. -shrink_coords <- function(X, sdev = 1e-4) { - scale(X, scale = apply(X, 2, stats::sd) / sdev) -} # Epoch Functions --------------------------------------------------------- @@ -2044,58 +1939,7 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = } } -# PCA --------------------------------------------------------------------- - - -# Calculates a matrix containing the first ncol columns of the PCA scores. 
-# Returns the score matrix unless ret_extra is TRUE, in which case a list -# is returned also containing the eigenvalues -pca_scores <- function(X, ncol = min(dim(X)), verbose = FALSE, - ret_extra = FALSE) { - if (methods::is(X, "dist")) { - res_mds <- stats::cmdscale(X, x.ret = TRUE, eig = TRUE, k = ncol) - - if (ret_extra || verbose) { - lambda <- res_mds$eig - varex <- sum(lambda[1:ncol]) / sum(lambda) - tsmessage("PCA (using classical MDS): ", ncol, " components explained ", - formatC(varex * 100), "% variance") - } - scores <- res_mds$points - } - else { - X <- scale(X, center = TRUE, scale = FALSE) - # do SVD on X directly rather than forming covariance matrix - s <- svd(X, nu = ncol, nv = 0) - D <- diag(c(s$d[1:ncol])) - if (verbose || ret_extra) { - # calculate eigenvalues of covariance matrix from singular values - lambda <- (s$d ^ 2) / (nrow(X) - 1) - varex <- sum(lambda[1:ncol]) / sum(lambda) - tsmessage("PCA: ", ncol, " components explained ", formatC(varex * 100), - "% variance") - } - scores <- s$u %*% D - } - - if (ret_extra) { - list( - scores = scores, - lambda = lambda[1:ncol] - ) - } - else { - scores - } -} -# Whiten the data by PCA. This both reduces the dimensionality, but also -# scales the scores by the inverse square root of the equivalent eigenvalue -# so that the variance of each column is 1. -pca_whiten <- function(X, ncol = min(dim(X)), eps = 1e-5, verbose = FALSE) { - pca <- pca_scores(X, ncol = ncol, verbose = verbose, ret_extra = TRUE) - sweep(pca$scores, 2, sqrt(pca$lambda + eps), "/") -} # UMAP ------------------------------------------------------------------- From 27fcc5ea4c62362ed7dae60288523c7ee8fa7c94 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 11:27:55 -0700 Subject: [PATCH 13/26] move umap code --- smallvis/R/cost.R | 149 ---------------- smallvis/R/smallvis.R | 183 -------------------- smallvis/R/umap.R | 386 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 386 insertions(+), 332 deletions(-) create mode 100644 smallvis/R/umap.R diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R index db81356..d1a6768 100644 --- a/smallvis/R/cost.R +++ b/smallvis/R/cost.R @@ -176,155 +176,6 @@ largevis <- function(perplexity, inp_kernel = "gaussian", ) } -# UMAP -------------------------------------------------------------------- - -umap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", - spread = 1, min_dist = 0.001, gr_eps = 0.1, eps = 1e-9, - row_weight = NULL, n_threads = 0, use_cpp = FALSE) { - if (!is.null(row_weight)) { - row_normalize <- row_weight - } - else { - row_normalize <- FALSE - } - lreplace(tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = FALSE, - row_normalize = row_normalize, n_threads = n_threads, - verbose = verbose, ret_extra = ret_extra, - use_cpp = use_cpp) - - cost <- init_ab(cost, spread = spread, min_dist = min_dist, - verbose = verbose) - - cost$eps <- eps - cost - }, - cache_input = function(cost) { - P <- cost$P - eps <- cost$eps - cost$Cp <- colSums(P * log(P + eps) + (1 - P) * log1p(-P + eps)) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - P <- cost$P - eps <- cost$eps - W <- cost$W - cost$pcost <- colSums(-P * log(W + eps) - (1 - P) * log1p(-W + eps)) + cost$Cp - cost - }, - gr = 
function(cost, Y) { - cost <- cost_update(cost, Y) - - cost$G <- k2g(Y, 4 * (cost$b / (cost$D2 + cost$eps + gr_eps)) * (cost$P - cost$W)) - cost - }, - update = function(cost, Y) { - D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - D2[D2 < 0] <- 0 - - W <- 1 / (1 + cost$a * D2 ^ cost$b) - diag(W) <- 0 - - cost$W <- W - cost$D2 <- D2 - cost - }, - export = cost_export - ) -} - -# UMAP with the output kernel fixed to the t-distribution -tumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", - gr_eps = 0.1, eps = 1e-9, row_weight = NULL, n_threads = 0, - use_cpp = FALSE) { - if (!is.null(row_weight)) { - row_normalize <- row_weight - } - else { - row_normalize <- FALSE - } - lreplace(umap(perplexity, n_threads = n_threads, use_cpp = use_cpp), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost$eps <- eps - - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = FALSE, - row_normalize = row_normalize, n_threads = n_threads, - verbose = verbose, ret_extra = ret_extra, - use_cpp = use_cpp) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - cost$G <- k2g(Y, 4 * (cost$W / ((1 - cost$W) + cost$eps + gr_eps)) * (cost$P - cost$W)) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - - cost$W <- W - cost - } - ) -} - -# t-UMAP where output and input affinities are normalized -ntumap <- function(perplexity, inp_kernel = "skd", symmetrize = "umap", - gr_eps = 0.1, eps = 1e-9, n_threads = 0, use_cpp = FALSE) { - lreplace(tumap(perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost$eps <- eps - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - n_threads = n_threads, verbose = verbose, - ret_extra = ret_extra, use_cpp = use_cpp) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - P <- cost$P - eps <- cost$eps - Q <- cost$Q - - cost$pcost <- colSums(-P * logm(Q, eps) - (1 - P) * log1p(-Q + eps)) + cost$Cp - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$G <- k2g(Y, 4 * cost$W * (cost$C - cost$sumC * cost$Q)) - cost - }, - update = function(cost, Y) { - P <- cost$P - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - - Q <- W / sum(W) - C <- (P - Q) / (1 - Q) - sumC <- sum(C) - - cost$W <- W - cost$Q <- Q - cost$C <- C - cost$sumC <- sumC - - cost - }, - export = cost_export - ) -} # f-Divergences ----------------------------------------------------------- diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R index d2932a6..4a3989f 100644 --- a/smallvis/R/smallvis.R +++ b/smallvis/R/smallvis.R @@ -1938,186 +1938,3 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = Y } } - - - -# UMAP ------------------------------------------------------------------- - -# Fits a kernel for the output distances of the form w = 1 / (1 + a dsq ^ b) -# where dsq is the squared Euclidean distance. -# Standard t-SNE function is a = 1, b = 1. -# Default UMAP values are a = 1.929, b = 0.7915. 
-find_ab_params <- function(spread = 1, min_dist = 0.001) { - xv <- seq(from = 0, to = spread * 3, length.out = 300) - yv <- rep(0, length(xv)) - yv[xv < min_dist] <- 1 - yv[xv >= min_dist] <- exp(-(xv[xv >= min_dist] - min_dist) / spread) - result <- try({ - stats::nls(yv ~ 1 / (1 + a * xv ^ (2 * b)), - start = list(a = 1, b = 1))$m$getPars() - }, silent = TRUE) - if (methods::is(result, "try-error")) { - stop("Can't find a, b for provided spread/min_dist values") - } - result -} - -# The UMAP equivalent of perplexity calibration in x2aff. k is continuous rather -# than integral and so is analogous to perplexity. -# Some differences: -# 1. The target value is the log2 of k, not the Shannon entropy associated -# with the desired perplexity. -# 2. Input weights are exponential, rather than Gaussian, with respect to the -# distances. The distances are also centered with respect to the smoothed -# distance to the nearest (non-zero distance) neighbor. A non-integral -# 'local_connectivity' value can result in this shortest distance between an -# interpolated value between two distances. -# 3. The weights are not normalized. Their raw sum is compared to the target -# value. -# 4. Distances beyond the k-nearest neighbors are not used in the calibration. -# The equivalent weights are set to 0. -# 5. Weights associated with distances shorter than the smoothed nearest -# neighbor distance are clamped to 1. -# This code has been converted from the original Python and may not be very -# idiomatic (or vectorizable). -# tol is SMOOTH_K_TOLERANCE in the Python code. -smooth_knn_distances <- - function(X, - k, - n_iter = 64, - local_connectivity = 1.0, - bandwidth = 1.0, - tol = 1e-5, - min_k_dist_scale = 1e-3, - cardinality = log2(k), - n_threads = 0, - verbose = FALSE) { - - tsmessage("Commencing smooth kNN distance calibration for k = ", formatC(k)) - - if (methods::is(X, "dist")) { - X <- as.matrix(X) - nn_idx <- t(apply(X, 2, order))[, 1:k] - nn_dist <- matrix(0, nrow = nrow(X), ncol = k) - for (i in 1:nrow(nn_idx)) { - nn_dist[i, ] <- X[i, nn_idx[i, ]] - } - } - else { - tsmessage("Finding ", k + 1, " nearest neighbors") - knn <- rnndescent::brute_force_knn(X, k = k, n_threads = n_threads) - knn$idx <- knn$idx[, 2:k] - knn$dist <- knn$dist[, 2:k] - - nn_idx <- matrix(nrow = nrow(X), ncol = k) - nn_idx[, 1] <- 1:nrow(nn_idx) - nn_idx[, 2:ncol(nn_idx)] <- knn$idx - nn_dist <- matrix(0, nrow = nrow(X), ncol = k) - nn_dist[, 2:ncol(nn_dist)] <- knn$dist - } - - n <- nrow(nn_dist) - target <- cardinality * bandwidth - rho <- rep(0, n) - sigma <- rep(0, n) - P <- matrix(0, nrow = n, ncol = n) - mean_distances <- NULL - - for (i in 1:n) { - lo <- 0.0 - hi <- Inf - mid <- 1.0 - - ith_distances <- nn_dist[i, ] - non_zero_dists <- ith_distances[ith_distances > 0.0] - if (length(non_zero_dists) >= local_connectivity) { - index <- floor(local_connectivity) - interpolation <- local_connectivity - index - if (index > 0) { - if (interpolation <= tol) { - rho[i] <- non_zero_dists[index] - } - else { - rho[i] <- non_zero_dists[index] + interpolation * - (non_zero_dists[index + 1] - non_zero_dists[index]) - } - } - else { - rho[i] <- interpolation * non_zero_dists[1] - } - } else if (length(non_zero_dists) > 0) { - rho[i] <- max(non_zero_dists) - } - else { - rho[i] <- 0.0 - } - - for (iter in 1:n_iter) { - psum <- 0.0 - for (j in 2:ncol(nn_dist)) { - dist <- max(0, (nn_dist[i, j] - rho[i])) - psum <- psum + exp(-(dist / mid)) - } - val <- psum - - if (abs(val - target) < tol) { - break - } - - if (val > target) { - 
hi <- mid - mid <- (lo + hi) / 2.0 - } - else { - lo <- mid - if (is.infinite(hi)) { - mid <- mid * 2 - } - else { - mid <- (lo + hi) / 2.0 - } - } - } - sigma[i] <- mid - - if (rho[i] > 0.0) { - sigma[i] <- max(sigma[i], min_k_dist_scale * mean(ith_distances)) - } - else { - if (is.null(mean_distances)) { - mean_distances <- mean(nn_dist) - } - sigma[i] <- max(sigma[i], min_k_dist_scale * mean_distances) - } - - prow <- exp(-(nn_dist[i, ] - rho[i]) / (sigma[i] * bandwidth)) - prow[nn_dist[i, ] - rho[i] <= 0] <- 1 - P[i, nn_idx[i, ]] <- prow - } - diag(P) <- 0 - - if (verbose) { - summarize(sigma, "sigma summary", verbose = verbose) - } - list(sigma = sigma, rho = rho, P = P) - } - - -# set_op_mix_ratio = between 0 and 1 mixes in fuzzy set intersection -# set to 0 for intersection only -fuzzy_set_union <- function(X, set_op_mix_ratio = 1) { - XX <- X * t(X) - set_op_mix_ratio * (X + t(X) - XX) + (1 - set_op_mix_ratio) * XX -} - -init_ab <- function(cost, spread = 1, min_dist = 0.001, verbose = FALSE) { - ab_params <- find_ab_params(spread = spread, min_dist = min_dist) - a <- ab_params[1] - b <- ab_params[2] - if (verbose) { - message("Umap curve parameters = ", formatC(a), ", ", formatC(b)) - } - cost$a <- a - cost$b <- b - cost -} diff --git a/smallvis/R/umap.R b/smallvis/R/umap.R new file mode 100644 index 0000000..ef8eacc --- /dev/null +++ b/smallvis/R/umap.R @@ -0,0 +1,386 @@ +# UMAP -------------------------------------------------------------------- + +umap <- function(perplexity, + inp_kernel = "skd", + symmetrize = "umap", + spread = 1, + min_dist = 0.001, + gr_eps = 0.1, + eps = 1e-9, + row_weight = NULL, + n_threads = 0, + use_cpp = FALSE) { + if (!is.null(row_weight)) { + row_normalize <- row_weight + } else { + row_normalize <- FALSE + } + lreplace( + tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = FALSE, + row_normalize = row_normalize, + n_threads = n_threads, + verbose = verbose, + ret_extra = ret_extra, + use_cpp = use_cpp + ) + + cost <- init_ab(cost, + spread = spread, + min_dist = min_dist, + verbose = verbose + ) + + cost$eps <- eps + cost + }, + cache_input = function(cost) { + P <- cost$P + eps <- cost$eps + cost$Cp <- colSums(P * log(P + eps) + (1 - P) * log1p(-P + eps)) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + + P <- cost$P + eps <- cost$eps + W <- cost$W + cost$pcost <- colSums(-P * log(W + eps) - (1 - P) * log1p(-W + eps)) + cost$Cp + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + + cost$G <- k2g(Y, 4 * (cost$b / (cost$D2 + cost$eps + gr_eps)) * (cost$P - cost$W)) + cost + }, + update = function(cost, Y) { + D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + D2[D2 < 0] <- 0 + + W <- 1 / (1 + cost$a * D2^cost$b) + diag(W) <- 0 + + cost$W <- W + cost$D2 <- D2 + cost + }, + export = cost_export + ) +} + +# UMAP with the output kernel fixed to the t-distribution +tumap <- function(perplexity, + inp_kernel = "skd", + symmetrize = "umap", + gr_eps = 0.1, + eps = 1e-9, + row_weight = NULL, + n_threads = 0, + use_cpp = FALSE) { + if (!is.null(row_weight)) { + row_normalize <- row_weight + } else { + row_normalize <- FALSE + } + lreplace( + umap(perplexity, n_threads = n_threads, use_cpp = use_cpp), + init = function(cost, 
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
+      cost$eps <- eps
+
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        kernel = inp_kernel,
+        symmetrize = symmetrize,
+        normalize = FALSE,
+        row_normalize = row_normalize,
+        n_threads = n_threads,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        use_cpp = use_cpp
+      )
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      cost$G <- k2g(Y, 4 * (cost$W / ((1 - cost$W) + cost$eps + gr_eps)) * (cost$P - cost$W))
+      cost
+    },
+    update = function(cost, Y) {
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      W <- 1 / (1 + W)
+      diag(W) <- 0
+
+      cost$W <- W
+      cost
+    }
+  )
+}
+
+# t-UMAP where output and input affinities are normalized
+ntumap <- function(perplexity,
+                   inp_kernel = "skd",
+                   symmetrize = "umap",
+                   gr_eps = 0.1,
+                   eps = 1e-9,
+                   n_threads = 0,
+                   use_cpp = FALSE) {
+  lreplace(
+    tumap(perplexity, use_cpp = use_cpp, n_threads = n_threads),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
+      cost$eps <- eps
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        kernel = inp_kernel,
+        symmetrize = symmetrize,
+        normalize = TRUE,
+        n_threads = n_threads,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        use_cpp = use_cpp
+      )
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      P <- cost$P
+      eps <- cost$eps
+      Q <- cost$Q
+
+      cost$pcost <- colSums(-P * logm(Q, eps) - (1 - P) * log1p(-Q + eps)) + cost$Cp
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$G <- k2g(Y, 4 * cost$W * (cost$C - cost$sumC * cost$Q))
+      cost
+    },
+    update = function(cost, Y) {
+      P <- cost$P
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      W <- 1 / (1 + W)
+      diag(W) <- 0
+
+      Q <- W / sum(W)
+      C <- (P - Q) / (1 - Q)
+      sumC <- sum(C)
+
+      cost$W <- W
+      cost$Q <- Q
+      cost$C <- C
+      cost$sumC <- sumC
+
+      cost
+    },
+    export = cost_export
+  )
+}
+
+# Fits a kernel for the output distances of the form w = 1 / (1 + a dsq ^ b)
+# where dsq is the squared Euclidean distance.
+# Standard t-SNE function is a = 1, b = 1.
+# Default UMAP values are a = 1.929, b = 0.7915.
+find_ab_params <- function(spread = 1, min_dist = 0.001) {
+  xv <- seq(
+    from = 0,
+    to = spread * 3,
+    length.out = 300
+  )
+  yv <- rep(0, length(xv))
+  yv[xv < min_dist] <- 1
+  yv[xv >= min_dist] <- exp(-(xv[xv >= min_dist] - min_dist) / spread)
+  result <- try(
+    {
+      stats::nls(yv ~ 1 / (1 + a * xv^(2 * b)), start = list(a = 1, b = 1))$m$getPars()
+    },
+    silent = TRUE
+  )
+  if (methods::is(result, "try-error")) {
+    stop("Can't find a, b for provided spread/min_dist values")
+  }
+  result
+}
+
+# The UMAP equivalent of perplexity calibration in x2aff. k is continuous rather
+# than integral and so is analogous to perplexity.
+# Some differences:
+# 1. The target value is the log2 of k, not the Shannon entropy associated
+#    with the desired perplexity.
+# 2. Input weights are exponential, rather than Gaussian, with respect to the
+#    distances. The distances are also centered with respect to the smoothed
+#    distance to the nearest (non-zero distance) neighbor. A non-integral
+#    'local_connectivity' value can result in this shortest distance being an
+#    interpolated value between two distances.
+# 3. The weights are not normalized. Their raw sum is compared to the target
+#    value.
+# 4. 
Distances beyond the k-nearest neighbors are not used in the calibration. +# The equivalent weights are set to 0. +# 5. Weights associated with distances shorter than the smoothed nearest +# neighbor distance are clamped to 1. +# This code has been converted from the original Python and may not be very +# idiomatic (or vectorizable). +# tol is SMOOTH_K_TOLERANCE in the Python code. +smooth_knn_distances <- + function(X, + k, + n_iter = 64, + local_connectivity = 1.0, + bandwidth = 1.0, + tol = 1e-5, + min_k_dist_scale = 1e-3, + cardinality = log2(k), + n_threads = 0, + verbose = FALSE) { + tsmessage("Commencing smooth kNN distance calibration for k = ", formatC(k)) + + if (methods::is(X, "dist")) { + X <- as.matrix(X) + nn_idx <- t(apply(X, 2, order))[, 1:k] + nn_dist <- matrix(0, nrow = nrow(X), ncol = k) + for (i in 1:nrow(nn_idx)) { + nn_dist[i, ] <- X[i, nn_idx[i, ]] + } + } else { + tsmessage("Finding ", k + 1, " nearest neighbors") + knn <- rnndescent::brute_force_knn(X, k = k, n_threads = n_threads) + knn$idx <- knn$idx[, 2:k] + knn$dist <- knn$dist[, 2:k] + + nn_idx <- matrix(nrow = nrow(X), ncol = k) + nn_idx[, 1] <- 1:nrow(nn_idx) + nn_idx[, 2:ncol(nn_idx)] <- knn$idx + nn_dist <- matrix(0, nrow = nrow(X), ncol = k) + nn_dist[, 2:ncol(nn_dist)] <- knn$dist + } + + n <- nrow(nn_dist) + target <- cardinality * bandwidth + rho <- rep(0, n) + sigma <- rep(0, n) + P <- matrix(0, nrow = n, ncol = n) + mean_distances <- NULL + + for (i in 1:n) { + lo <- 0.0 + hi <- Inf + mid <- 1.0 + + ith_distances <- nn_dist[i, ] + non_zero_dists <- ith_distances[ith_distances > 0.0] + if (length(non_zero_dists) >= local_connectivity) { + index <- floor(local_connectivity) + interpolation <- local_connectivity - index + if (index > 0) { + if (interpolation <= tol) { + rho[i] <- non_zero_dists[index] + } else { + rho[i] <- non_zero_dists[index] + interpolation * + (non_zero_dists[index + 1] - non_zero_dists[index]) + } + } else { + rho[i] <- interpolation * non_zero_dists[1] + } + } else if (length(non_zero_dists) > 0) { + rho[i] <- max(non_zero_dists) + } else { + rho[i] <- 0.0 + } + + for (iter in 1:n_iter) { + psum <- 0.0 + for (j in 2:ncol(nn_dist)) { + dist <- max(0, (nn_dist[i, j] - rho[i])) + psum <- psum + exp(-(dist / mid)) + } + val <- psum + + if (abs(val - target) < tol) { + break + } + + if (val > target) { + hi <- mid + mid <- (lo + hi) / 2.0 + } else { + lo <- mid + if (is.infinite(hi)) { + mid <- mid * 2 + } else { + mid <- (lo + hi) / 2.0 + } + } + } + sigma[i] <- mid + + if (rho[i] > 0.0) { + sigma[i] <- max(sigma[i], min_k_dist_scale * mean(ith_distances)) + } else { + if (is.null(mean_distances)) { + mean_distances <- mean(nn_dist) + } + sigma[i] <- max(sigma[i], min_k_dist_scale * mean_distances) + } + + prow <- exp(-(nn_dist[i, ] - rho[i]) / (sigma[i] * bandwidth)) + prow[nn_dist[i, ] - rho[i] <= 0] <- 1 + P[i, nn_idx[i, ]] <- prow + } + diag(P) <- 0 + + if (verbose) { + summarize(sigma, "sigma summary", verbose = verbose) + } + list(sigma = sigma, rho = rho, P = P) + } + + +# set_op_mix_ratio = between 0 and 1 mixes in fuzzy set intersection +# set to 0 for intersection only +fuzzy_set_union <- function(X, set_op_mix_ratio = 1) { + XX <- X * t(X) + set_op_mix_ratio * (X + t(X) - XX) + (1 - set_op_mix_ratio) * XX +} + +init_ab <- function(cost, + spread = 1, + min_dist = 0.001, + verbose = FALSE) { + ab_params <- find_ab_params(spread = spread, min_dist = min_dist) + a <- ab_params[1] + b <- ab_params[2] + if (verbose) { + message("Umap curve parameters = ", formatC(a), ", ", 
formatC(b)) + } + cost$a <- a + cost$b <- b + cost +} From 6436bd2b7433c93dee6f9767ebefe5c120c27bee Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 11:30:11 -0700 Subject: [PATCH 14/26] formatting --- smallvis/R/init.R | 112 +++++++++++-------- smallvis/R/perplexity.R | 233 +++++++++++++++++++++++----------------- smallvis/R/util.R | 46 ++++---- 3 files changed, 228 insertions(+), 163 deletions(-) diff --git a/smallvis/R/init.R b/smallvis/R/init.R index 2c83c43..2272da4 100644 --- a/smallvis/R/init.R +++ b/smallvis/R/init.R @@ -1,16 +1,21 @@ # Output Initialization --------------------------------------------------- # Initialization of the output coordinates -init_out <- function(Y_init, X, n, ndim, pca_preprocessed, verbose = FALSE) { +init_out <- function(Y_init, + X, + n, + ndim, + pca_preprocessed, + verbose = FALSE) { switch(Y_init, - pca = { - tsmessage("Initializing from PCA scores") - pca_init(X, ndim, pca_preprocessed, verbose) - }, - rand = { - tsmessage("Initializing from random Gaussian") - matrix(stats::rnorm(ndim * n, sd = 1), n) - } + pca = { + tsmessage("Initializing from PCA scores") + pca_init(X, ndim, pca_preprocessed, verbose) + }, + rand = { + tsmessage("Initializing from random Gaussian") + matrix(stats::rnorm(ndim * n, sd = 1), n) + } ) } @@ -18,8 +23,7 @@ pca_init <- function(X, ndim, pca_preprocessed, verbose = FALSE) { # If we've already done PCA, we can just take the first two columns if (pca_preprocessed) { X[, 1:2] - } - else { + } else { pca_scores(X, ncol = ndim, verbose = verbose) } } @@ -43,37 +47,38 @@ pca_init <- function(X, ndim, pca_preprocessed, verbose = FALSE) { # top eigenvectors to be extracted. Otherwise, use the slower eigen routine. # A must be symmetric and positive semi definite, but not necessarily # normalized in any specific way. -laplacian_eigenmap <- function(A, ndim = 2, use_RSpectra = TRUE) { +laplacian_eigenmap <- function(A, + ndim = 2, + use_RSpectra = TRUE) { # Equivalent to: D <- diag(colSums(A)); M <- solve(D) %*% A # This effectively row-normalizes A: colSums is normally faster than rowSums # and because A is symmetric, they're equivalent M <- A / colSums(A) - if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, - warn.conflicts = FALSE)) { + if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, warn.conflicts = FALSE)) { tsmessage("Using RSpectra for eigenvectors") Re(RSpectra::eigs(M, k = ndim + 1)$vectors[, 2:(ndim + 1)]) - } - else { + } else { tsmessage("Using eigen for eigenvectors") eigen(M, symmetric = FALSE)$vectors[, 2:(ndim + 1)] } } # Use a normalized Laplacian. The UMAP approach, taken from version 0.2.1. 
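
# For orientation before the implementation that follows: the fast form used
# there computes the normalized Laplacian L = I - D^(-1/2) A D^(-1/2) with
# row/column scaling instead of forming diagonal matrices. A standalone
# check on a toy symmetric affinity matrix (base R only; not package code):
set.seed(42)
A <- matrix(runif(25), nrow = 5)
A <- (A + t(A)) / 2 # symmetrize
diag(A) <- 0
Dsq <- sqrt(colSums(A))
L_fast <- -t(A / Dsq) / Dsq
diag(L_fast) <- 1 + diag(L_fast)
L_slow <- diag(5) - diag(1 / Dsq) %*% A %*% diag(1 / Dsq)
stopifnot(isTRUE(all.equal(L_fast, L_slow)))
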
-normalized_spectral_init <- function(A, ndim = 2, use_RSpectra = TRUE) { +normalized_spectral_init <- function(A, + ndim = 2, + use_RSpectra = TRUE) { n <- nrow(A) # Normalized Laplacian: clear and close to UMAP code, but very slow in R # I <- diag(1, nrow = n, ncol = n) # D <- diag(1 / sqrt(colSums(A))) # L <- I - D %*% A %*% D - + # A lot faster (order of magnitude when n = 1000) Dsq <- sqrt(colSums(A)) L <- -t(A / Dsq) / Dsq diag(L) <- 1 + diag(L) - - if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, - warn.conflicts = FALSE)) { + + if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, warn.conflicts = FALSE)) { tsmessage("Using RSpectra for eigenvectors") k <- ndim + 1 ncv <- max(2 * k + 1, floor(sqrt(n))) @@ -82,11 +87,14 @@ normalized_spectral_init <- function(A, ndim = 2, use_RSpectra = TRUE) { maxitr = 5 * n, tol = 1e-4 ) - res <- RSpectra::eigs(L, k = k, which = "SM", opt = opt) + res <- RSpectra::eigs(L, + k = k, + which = "SM", + opt = opt + ) vec_indices <- rev(order(res$values, decreasing = TRUE)[1:ndim]) res <- Re(res$vectors[, vec_indices]) - } - else { + } else { tsmessage("Using eigen for eigenvectors") res <- eigen(L, symmetric = FALSE) vec_indices <- order(res$values, decreasing = FALSE)[2:(ndim + 1)] @@ -111,41 +119,52 @@ shrink_coords <- function(X, sdev = 1e-4) { # Calculates a matrix containing the first ncol columns of the PCA scores. # Returns the score matrix unless ret_extra is TRUE, in which case a list # is returned also containing the eigenvalues -pca_scores <- function(X, ncol = min(dim(X)), verbose = FALSE, +pca_scores <- function(X, + ncol = min(dim(X)), + verbose = FALSE, ret_extra = FALSE) { if (methods::is(X, "dist")) { - res_mds <- stats::cmdscale(X, x.ret = TRUE, eig = TRUE, k = ncol) - + res_mds <- stats::cmdscale(X, + x.ret = TRUE, + eig = TRUE, + k = ncol + ) + if (ret_extra || verbose) { lambda <- res_mds$eig varex <- sum(lambda[1:ncol]) / sum(lambda) - tsmessage("PCA (using classical MDS): ", ncol, " components explained ", - formatC(varex * 100), "% variance") + tsmessage( + "PCA (using classical MDS): ", + ncol, + " components explained ", + formatC(varex * 100), + "% variance" + ) } scores <- res_mds$points - } - else { + } else { X <- scale(X, center = TRUE, scale = FALSE) # do SVD on X directly rather than forming covariance matrix s <- svd(X, nu = ncol, nv = 0) D <- diag(c(s$d[1:ncol])) if (verbose || ret_extra) { # calculate eigenvalues of covariance matrix from singular values - lambda <- (s$d ^ 2) / (nrow(X) - 1) + lambda <- (s$d^2) / (nrow(X) - 1) varex <- sum(lambda[1:ncol]) / sum(lambda) - tsmessage("PCA: ", ncol, " components explained ", formatC(varex * 100), - "% variance") + tsmessage( + "PCA: ", + ncol, + " components explained ", + formatC(varex * 100), + "% variance" + ) } scores <- s$u %*% D } - + if (ret_extra) { - list( - scores = scores, - lambda = lambda[1:ncol] - ) - } - else { + list(scores = scores, lambda = lambda[1:ncol]) + } else { scores } } @@ -153,7 +172,14 @@ pca_scores <- function(X, ncol = min(dim(X)), verbose = FALSE, # Whiten the data by PCA. This both reduces the dimensionality, but also # scales the scores by the inverse square root of the equivalent eigenvalue # so that the variance of each column is 1. 
-pca_whiten <- function(X, ncol = min(dim(X)), eps = 1e-5, verbose = FALSE) { - pca <- pca_scores(X, ncol = ncol, verbose = verbose, ret_extra = TRUE) +pca_whiten <- function(X, + ncol = min(dim(X)), + eps = 1e-5, + verbose = FALSE) { + pca <- pca_scores(X, + ncol = ncol, + verbose = verbose, + ret_extra = TRUE + ) sweep(pca$scores, 2, sqrt(pca$lambda + eps), "/") } diff --git a/smallvis/R/perplexity.R b/smallvis/R/perplexity.R index 9dfe5d0..3e76e31 100644 --- a/smallvis/R/perplexity.R +++ b/smallvis/R/perplexity.R @@ -7,48 +7,49 @@ # NB set default kernel to "exp" to get results closer to the tsne package. # This differs from the procedure in the t-SNE paper by exponentially weighting # the distances, rather than the squared distances. -x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", - verbose = FALSE, guesses = NULL) { +x2aff <- function(X, + perplexity = 15, + tol = 1e-5, + kernel = "gauss", + verbose = FALSE, + guesses = NULL) { x_is_dist <- methods::is(X, "dist") if (x_is_dist) { D <- X n <- attr(D, "Size") - + D <- as.matrix(D) if (kernel == "gauss") { D <- D * D } - } - else { + } else { XX <- rowSums(X * X) n <- nrow(X) } - + nperps <- length(perplexity) if (nperps > 1 && nperps != n) { stop("Must provide one perplexity per point") } - + if (!is.null(guesses) && length(guesses) != n) { stop("Initial guess vector must match number of observations in X") } - + W <- matrix(0, n, n) intd <- rep(0, n) if (!is.null(guesses)) { beta <- guesses - } - else { + } else { beta <- rep(1, n) } if (nperps == 1) { logU <- log(perplexity) - } - else { + } else { perps <- perplexity } bad_perp <- 0 - + for (i in 1:n) { if (nperps > 1) { perplexity <- perps[i] @@ -56,18 +57,17 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", } betamin <- -Inf betamax <- Inf - + if (x_is_dist) { Di <- D[i, -i] - } - else { + } else { Di <- (XX[i] + XX - 2 * colSums(tcrossprod(X[i, ], X)))[-i] Di[Di < 0] <- 0 if (kernel == "exp") { Di <- sqrt(Di) } } - + # If we haven't been provided with guesses, then try the initialization used # for all points in ELKI according to Schubert & Gertz in "Intrinsic # t-Stochastic Neighbor Embedding for Visualization and Outlier Detection: A @@ -77,12 +77,12 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", if (is.null(guesses) && i == 1) { beta[1] <- 0.5 * perplexity / mean(Di) } - + sres <- shannon(Di, beta[i]) H <- sres$H Wi <- sres$W sumWi <- sres$Z - + Hdiff <- H - logU tries <- 0 while (abs(Hdiff) > tol && tries < 50) { @@ -101,12 +101,12 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", beta[i] <- (beta[i] + betamin) / 2 } } - + sres <- shannon(Di, beta[i]) H <- sres$H Wi <- sres$W sumWi <- sres$Z - + Hdiff <- H - logU tries <- tries + 1 } @@ -114,14 +114,13 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", # Put a weight of 1/perplexity on the perplexity-nearest neighbors bad_perp <- bad_perp + 1 knn_idx <- order(Di, decreasing = FALSE)[1:max(floor(perplexity), 1)] - knn_idx[knn_idx >= i] <- knn_idx[knn_idx >= i] + 1 + knn_idx[knn_idx >= i] <- knn_idx[knn_idx >= i] + 1 Wi <- rep(0, n) Wi[knn_idx] <- 1 / floor(perplexity) - + intd[i] <- 0 W[i, ] <- Wi - } - else { + } else { # if we didn't supply estimates for beta manually, then initialize guess for # next point with optimized beta for this point: doesn't save many # iterations, but why not? 
@@ -131,15 +130,14 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", intd[i] <- intd_x2aff(Di, beta[i], Wi, sumWi, H) W[i, -i] <- Wi } - } sigma <- sqrt(1 / beta) - + if (bad_perp > 0) { tsmessage("Warning: ", bad_perp, " perplexity calibrations failed!") warning(bad_perp, " perplexity calibrations failed") } - + if (verbose) { summarize(sigma, "sigma summary", verbose = verbose) summarize(intd, "Dint", verbose = verbose) @@ -156,37 +154,34 @@ x2aff <- function(X, perplexity = 15, tol = 1e-5, kernel = "gauss", shannon <- function(D2, beta) { W <- exp(-D2 * beta) Z <- sum(W) - + if (Z == 0) { H <- 0 - } - else { + } else { H <- log(Z) + beta * sum(D2 * W) / Z } - list( - W = W, - Z = Z, - H = H - ) + list(W = W, Z = Z, H = H) } -x2aff_sigma <- function(X, sigma = 1e-3, verbose = FALSE, use_cpp = FALSE, +x2aff_sigma <- function(X, + sigma = 1e-3, + verbose = FALSE, + use_cpp = FALSE, n_threads = 1) { x_is_dist <- methods::is(X, "dist") if (x_is_dist) { D <- X - + D <- as.matrix(D) D <- D * D - } - else { + } else { D <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) } beta <- 1 / (sigma * sigma) sres <- shannon(D, beta) W <- sres$W diag(W) <- 0 - + list(W = W, beta = beta) } @@ -209,8 +204,7 @@ knn_dist <- function(X, k, n_threads, verbose) { Di[Di > kdists[i]] <- Inf D[, i] <- Di } - } - else { + } else { # Find the k-nearest indexes and distances of X, and set the corresponding # distance matrix elements n <- nrow(X) @@ -221,14 +215,14 @@ knn_dist <- function(X, k, n_threads, verbose) { knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads) knn$idx <- knn$idx[, 2:(k + 1)] knn$dist <- knn$dist[, 2:(k + 1)] - + D <- matrix(Inf, nrow = n, ncol = n) diag(D) <- 0 for (i in 1:n) { D[i, knn$idx[i, ]] <- knn$dist[i, ] } } - + # symmetrize pmin(D, t(D)) } @@ -250,19 +244,18 @@ knn_graph <- function(X, k, n_threads, verbose) { Di[Di > kdists[i]] <- 0 D[, i] <- 1 } - } - else { + } else { # Find the k-nearest indexes and distances of X, and set the corresponding # distance matrix elements n <- nrow(X) if (k > n - 1) { stop("k must be not be > n - 1") } - + tsmessage("Finding ", k + 1, " nearest neighbors") knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads) knn$idx <- knn$idx[, 2:(k + 1)] - + D <- matrix(0, nrow = n, ncol = n) for (i in 1:n) { D[i, knn$idx[i, ]] <- 1 @@ -273,19 +266,26 @@ knn_graph <- function(X, k, n_threads, verbose) { # Given data X and k nearest neighbors, return a geodisic distance matrix # Disconnections are treated by using the Euclidean distance. 
-geodesic <- function(X, k, fill = TRUE, use_cpp = FALSE, n_threads = 0, +geodesic <- function(X, + k, + fill = TRUE, + use_cpp = FALSE, + n_threads = 0, verbose = FALSE) { tsmessage("Calculating geodesic distances with k = ", k) - + R <- knn_dist(X, k, n_threads = n_threads, verbose = verbose) # The hard work is done by Rfast's implementation of Floyd's algorithm G <- Rfast::floyd(R) if (any(is.infinite(G)) && fill) { - tsmessage("k = ", k, " resulted in disconnections: filling with Euclidean distances") + tsmessage( + "k = ", + k, + " resulted in disconnections: filling with Euclidean distances" + ) if (methods::is(X, "dist")) { R <- as.matrix(X) - } - else { + } else { R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) } G[is.infinite(G)] <- R[is.infinite(G)] @@ -293,54 +293,73 @@ geodesic <- function(X, k, fill = TRUE, use_cpp = FALSE, n_threads = 0, G } -# Multiscale perplexities: P is an average over the results of multiple +# Multiscale perplexities: P is an average over the results of multiple # perplexities # as described by de Bodt et al in # Perplexity-free t-SNE and twice Student tt-SNE (2018) -msp <- function(X, perplexities = NULL, tol = 1e-5, - symmetrize = "symmetric", row_normalize = TRUE, +msp <- function(X, + perplexities = NULL, + tol = 1e-5, + symmetrize = "symmetric", + row_normalize = TRUE, normalize = TRUE, - verbose = FALSE, guesses = NULL) { + verbose = FALSE, + guesses = NULL) { if (methods::is(X, "dist")) { n <- attr(X, "Size") - } - else { + } else { n <- nrow(X) } - + if (is.null(perplexities)) { perplexities <- idp_perps(n) } - tsmessage("Calculating multi-scale P with perplexities from ", - formatC(perplexities[1]), " to ", formatC(last(perplexities))) - + tsmessage( + "Calculating multi-scale P with perplexities from ", + formatC(perplexities[1]), + " to ", + formatC(last(perplexities)) + ) + res <- NULL for (perplexity in perplexities) { - tsmessage("Commencing calibration for perplexity = ", - format_perps(perplexity)) - x2a_res <- x2aff(X = X, perplexity = perplexity, tol = tol, kernel = "gauss", - verbose = verbose, guesses = guesses) + tsmessage( + "Commencing calibration for perplexity = ", + format_perps(perplexity) + ) + x2a_res <- x2aff( + X = X, + perplexity = perplexity, + tol = tol, + kernel = "gauss", + verbose = verbose, + guesses = guesses + ) P <- x2a_res$W - Q <- scale_affinities(P, - symmetrize = "symmetric", row_normalize = TRUE, - normalize = TRUE) + Q <- scale_affinities( + P, + symmetrize = "symmetric", + row_normalize = TRUE, + normalize = TRUE + ) if (is.null(res)) { res$P <- Q - } - else { + } else { res$P <- res$P + Q } } - + if (length(perplexities) > 1) { res$P <- res$P / length(perplexities) } - + if (is.logical(row_normalize)) { - tsmessage("Effective perplexity of multiscale P approx = ", - formatC(stats::median(perpp(res$P)))) + tsmessage( + "Effective perplexity of multiscale P approx = ", + formatC(stats::median(perpp(res$P))) + ) } - + res } @@ -348,38 +367,51 @@ msp <- function(X, perplexities = NULL, tol = 1e-5, # Scan through the provided perplexities and use the result which maximizes # the mean correlation dimension (which is an estimate for the intrinsic # dimensionality). Stops at the first maxmimum found. 
-idp <- function(X, perplexities = NULL, tol = 1e-5, - verbose = FALSE, guesses = NULL) { +idp <- function(X, + perplexities = NULL, + tol = 1e-5, + verbose = FALSE, + guesses = NULL) { if (methods::is(X, "dist")) { n <- attr(X, "Size") - } - else { + } else { n <- nrow(X) } - + if (is.null(perplexities)) { perplexities <- idp_perps(n) } if (verbose) { - tsmessage("Searching for intrinsic dimensionality with perplexities from ", - formatC(perplexities[1]), " to ", formatC(last(perplexities))) + tsmessage( + "Searching for intrinsic dimensionality with perplexities from ", + formatC(perplexities[1]), + " to ", + formatC(last(perplexities)) + ) } - + corr_dim_max <- -Inf idp <- 0 idp_res <- NULL for (perplexity in perplexities) { if (verbose) { - tsmessage("Commencing calibration for perplexity = ", - format_perps(perplexity)) + tsmessage( + "Commencing calibration for perplexity = ", + format_perps(perplexity) + ) } - x2a_res <- x2aff(X = X, perplexity = perplexity, tol = tol, kernel = "gauss", - verbose = verbose, guesses = guesses) + x2a_res <- x2aff( + X = X, + perplexity = perplexity, + tol = tol, + kernel = "gauss", + verbose = verbose, + guesses = guesses + ) corr_dim <- mean(x2a_res$dint) if (corr_dim <= corr_dim_max) { break - } - else { + } else { corr_dim_max <- corr_dim idp <- perplexity idp_res <- x2a_res @@ -389,10 +421,14 @@ idp <- function(X, perplexities = NULL, tol = 1e-5, stop("Unable to find an IDP: all correlation dimensions were -ve") } if (verbose) { - tsmessage("Found IDP at perplexity = ", formatC(idp), - " intrinsic dimensionality = ", formatC(corr_dim_max)) + tsmessage( + "Found IDP at perplexity = ", + formatC(idp), + " intrinsic dimensionality = ", + formatC(corr_dim_max) + ) } - + idp_res$idp <- idp idp_res } @@ -405,7 +441,7 @@ idp_perps <- function(n) { max_u <- min(128, max(2, ceiling(n / 2))) max_uexp <- floor(log2(max_u)) min_uexp <- min(2, max_uexp) - 2 ^ (min_uexp:max_uexp) + 2^(min_uexp:max_uexp) } # Is the perplexity argument a string or a list with the first element is a @@ -415,8 +451,7 @@ perp_method <- function(perplexity) { if (is.character(perplexity) || is.list(perplexity)) { if (is.list(perplexity)) { method <- perplexity[[1]] - } - else { + } else { method <- perplexity } } @@ -431,4 +466,4 @@ user_idp_perps <- function(perplexity) { perplexities <- perplexity[[2]] } perplexities -} \ No newline at end of file +} diff --git a/smallvis/R/util.R b/smallvis/R/util.R index 9d0efc6..01d2c74 100644 --- a/smallvis/R/util.R +++ b/smallvis/R/util.R @@ -8,20 +8,22 @@ dist2 <- function(X) { D2 + sweep(X %*% t(X) * -2, 2, t(D2), `+`) } -calc_d2 <- function(X, use_cpp = FALSE, n_threads = 1) { +calc_d2 <- function(X, + use_cpp = FALSE, + n_threads = 1) { if (use_cpp) { dist2_cpp(X, n_threads = n_threads) - } - else { + } else { safe_dist2(X) } } -calc_d <- function(X, use_cpp = FALSE, n_threads = 1) { +calc_d <- function(X, + use_cpp = FALSE, + n_threads = 1) { if (use_cpp) { dist_cpp(X, n_threads = n_threads) - } - else { + } else { sqrt(safe_dist2(X)) } } @@ -33,20 +35,22 @@ safe_dist2 <- function(X) { D2 } -calc_d2tweight <- function(D2, use_cpp = FALSE, n_threads = 1) { +calc_d2tweight <- function(D2, + use_cpp = FALSE, + n_threads = 1) { if (use_cpp) { d2_to_tweight_cpp(D2, n_threads = n_threads) - } - else { + } else { 1 / (1 + D2) } } -calc_tweight <- function(X, use_cpp = FALSE, n_threads = 1) { +calc_tweight <- function(X, + use_cpp = FALSE, + n_threads = 1) { if (use_cpp) { tweight_cpp(X, n_threads = n_threads) - } - else { + } else { D2 <- 
calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) 1 / (1 + D2) } @@ -63,10 +67,13 @@ stime <- function() { } # message with a time stamp -tsmessage <- function(..., domain = NULL, appendLF = TRUE, force = FALSE, +tsmessage <- function(..., + domain = NULL, + appendLF = TRUE, + force = FALSE, time_stamp = TRUE) { verbose <- get0("verbose", envir = sys.parent()) - + if (force || (!is.null(verbose) && verbose)) { msg <- "" if (time_stamp) { @@ -115,18 +122,15 @@ nnat <- function(x) { # log vector information summarize <- function(X, msg = "", verbose = FALSE) { summary_X <- summary(X, digits = max(3, getOption("digits") - 3)) - tsmessage(msg, ": ", paste(names(summary_X), ":", summary_X, "|", - collapse = "")) + tsmessage(msg, ": ", paste(names(summary_X), ":", summary_X, "|", collapse = "")) } # Format perplexity as a string. Could be a scalar or a vector. In the latter # case, just list the first two values and then ellipses format_perps <- function(perplexity) { if (length(perplexity) > 1) { - paste0(formatC(perplexity[1]), ", ", - formatC(perplexity[2]), "...") - } - else { + paste0(formatC(perplexity[1]), ", ", formatC(perplexity[2]), "...") + } else { formatC(perplexity) } } @@ -139,4 +143,4 @@ last <- function(x) { # remove NULL items from a list remove_nulls <- function(l) { l[!vapply(l, is.null, logical(1))] -} \ No newline at end of file +} From d9a76f67952d751a01084e86ca2906f2c6164551 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 11:39:14 -0700 Subject: [PATCH 15/26] move distance-preserving code --- smallvis/R/cost.R | 277 ---------------------------- smallvis/R/mmds.R | 395 ++++++++++++++++++++++++++++++++++++++++ smallvis/R/perplexity.R | 29 +-- 3 files changed, 396 insertions(+), 305 deletions(-) create mode 100644 smallvis/R/mmds.R diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R index d1a6768..d929468 100644 --- a/smallvis/R/cost.R +++ b/smallvis/R/cost.R @@ -933,283 +933,6 @@ gsne <- function(perplexity, lambda = 1, inp_kernel = "gaussian", ) } - -# Distance Preserving Methods --------------------------------------------- - -mmds_init <- function(cost, X, max_iter, eps = .Machine$double.eps, - verbose = FALSE, ret_extra = c(), use_cpp = FALSE, - n_threads = 1) { - tsmessage("Calculating pairwise distances") - if (methods::is(X, "dist")) { - cost$R <- as.matrix(X) - } - else { - cost$R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) - } - cost$eps <- eps - cost -} - -# Metric MDS, minimizing strain. 
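
# Concretely, the cost minimized here is the residual sum of squares between
# input and output distances. A standalone evaluation on toy data (base R's
# dist() standing in for the package's calc_d helper):
set.seed(42)
X <- matrix(rnorm(20), nrow = 10) # input data
Y <- matrix(rnorm(20), nrow = 10) # candidate embedding
R <- as.matrix(dist(X))
D <- as.matrix(dist(Y))
sum((R - D)^2) # the cost that mmds drives down
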
-mmds <- function(eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 1) { - list( - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra, - use_cpp = use_cpp, n_threads = n_threads) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- colSums((cost$R - cost$D) ^ 2) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - if (use_cpp) { - cost$G <- mmds_grad_cpp(cost$R, cost$D, Y, eps = eps, n_threads = n_threads) - } - else { - cost$G <- k2g(Y, -4 * (cost$R - cost$D) / (cost$D + cost$eps)) - } - cost - }, - update = function(cost, Y) { - cost$D <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads) - cost - }, - sentinel = "D", - export = function(cost, val) { - res <- NULL - switch(val, - dx = { - res <- cost$R - }, - dy = { - res <- cost$D - } - ) - res - } - ) -} - -smmds <- function(eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 1) { - lreplace( - mmds(use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, eps = .Machine$double.eps, verbose = FALSE, - ret_extra = c()) { - cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra, - use_cpp = use_cpp, n_threads = n_threads) - cost$R2 <- cost$R * cost$R - cost$R <- NULL - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- colSums((cost$R2 - cost$D2) ^ 2) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$G <- k2g(Y, -8 * (cost$R2 - cost$D2)) - cost - }, - export = function(cost, val) { - res <- NULL - switch(val, - dx = { - res <- cost$R2 - res[res < 0] <- 0 - sqrt(res) - }, - dy = { - res <- cost$D2 - res[res < 0] <- 0 - sqrt(res) - } - ) - res - }, - update = function(cost, Y) { - cost$D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - cost - }, - sentinel = "D2" - ) -} - -sammon <- function(eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 1) { - lreplace(mmds(eps = eps, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost, X, max_iter, eps, verbose, ret_extra, - use_cpp = use_cpp, n_threads = n_threads) - cost$rsum_inv <- 1 / sum(cost$R) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- colSums((cost$R - cost$D) ^ 2 / (cost$R + cost$eps)) * cost$rsum_inv - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$G <- k2g(Y, -4 * cost$rsum_inv * (cost$R - cost$D) / (cost$R * cost$D + cost$eps)) - cost - } - ) -} - - -gmmds <- function(k, eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 0) { - lreplace( - mmds(use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, - ret_extra = c()) { - cost$R <- geodesic(X, k, n_threads = n_threads, use_cpp = use_cpp, - verbose = verbose) - cost$eps <- eps - cost - }, - export = function(cost, val) { - res <- NULL - switch(val, - geo = { - res <- cost$R - }, - dy = { - res <- cost$D - } - ) - res - } - ) -} - -# Define neighborhoods using a radius based on a fraction (f) of all input -# distances (sorted by increasing length), don't correct non-neighborhood -# distances unless they smaller than the input distance -ballmmds <- function(f = 0.1, eps = .Machine$double.eps, use_cpp = FALSE, - n_threads = 1) { - lreplace( - mmds(use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost = cost, 
X = X, max_iter = max_iter, eps = eps, - verbose = verbose, ret_extra = ret_extra, - use_cpp = use_cpp, n_threads = n_threads) - rs <- cost$R[upper.tri(cost$R)] - rmax <- Rfast::nth(rs, max(1, round(f * length(rs)))) - if (verbose) { - tsmessage("f = ", formatC(f), " rmax = ", formatC(rmax)) - } - cost$rmax <- rmax - - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - R <- cost$R - D <- cost$D - rmax <- cost$rmax - Ddiff <- R - D - Ddiff[R > rmax & D > R] <- 0 - cost$pcost <- colSums(Ddiff * Ddiff) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - eps <- cost$eps - R <- cost$R - D <- cost$D - - K <- -4 * (R - D) / (D + eps) - - rmax <- cost$rmax - K[R > rmax & D > R] <- 0 - - cost$G <- k2g(Y, K) - cost - }, - export = function(cost, val) { - res <- NULL - switch(val, - geo = { - res <- cost$R - }, - dy = { - res <- cost$D - } - ) - res - }, - update = function(cost, Y) { - cost$D <- calc_d(Y) - cost - } - ) -} - -# Create the symmetrized knn graph, don't correct non-neighborhood distances -# unless they smaller than the input distance -knnmmds <- function(k, eps = .Machine$double.eps, use_cpp = FALSE, - n_threads = 0) { - lreplace( - mmds(use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- mmds_init(cost = cost, X = X, max_iter = max_iter, eps = eps, - verbose = verbose, ret_extra = ret_extra, - use_cpp = use_cpp, n_threads = n_threads) - knn <- knn_graph(X = X, k = k, n_threads = n_threads, verbose = verbose) - # symmetrize - cost$knn <- pmax(knn, t(knn)) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - R <- cost$R - D <- cost$D - knn <- cost$knn - Ddiff <- R - D - Ddiff[knn == 0 & D > R] <- 0 - cost$pcost <- colSums(Ddiff * Ddiff) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - eps <- cost$eps - R <- cost$R - D <- cost$D - - K <- -4 * (R - D) / (D + eps) - - knn <- cost$knn - K[knn == 0 & D > R] <- 0 - - cost$G <- k2g(Y, K) - cost$D <- D - cost - }, - export = function(cost, val) { - res <- NULL - switch(val, - geo = { - res <- cost$R - }, - dy = { - res <- cost$D - } - ) - res - }, - update = function(cost, Y) { - cost$D <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads) - cost - } - ) -} - # Elastic Embedding ------------------------------------------------------- # Carreira-Perpinán, M. A. (2010, June). diff --git a/smallvis/R/mmds.R b/smallvis/R/mmds.R new file mode 100644 index 0000000..032df7a --- /dev/null +++ b/smallvis/R/mmds.R @@ -0,0 +1,395 @@ +# Distance Preserving Methods --------------------------------------------- + +mmds_init <- function(cost, + X, + max_iter, + eps = .Machine$double.eps, + verbose = FALSE, + ret_extra = c(), + use_cpp = FALSE, + n_threads = 1) { + tsmessage("Calculating pairwise distances") + if (methods::is(X, "dist")) { + cost$R <- as.matrix(X) + } else { + cost$R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) + } + cost$eps <- eps + cost +} + +# Metric MDS, minimizing strain. 
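
# The -4 * (R - D) / D factor in the gradient below combines
# d/dd (r - d)^2 = -2 * (r - d), the chain rule through d = |y_i - y_j|,
# and a factor of 2 because the symmetric cost sums each pair twice. A
# finite-difference check of the core identity (base R, toy numbers):
r <- 2 # fixed input distance
stress <- function(d) (r - d)^2
d <- 1.5
(stress(d + 1e-6) - stress(d - 1e-6)) / 2e-6 # numeric: ~ -1
-2 * (r - d) # analytic: -1
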
+mmds <- function(eps = .Machine$double.eps,
+                 use_cpp = FALSE,
+                 n_threads = 1) {
+  list(
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      cost <- mmds_init(
+        cost,
+        X,
+        max_iter,
+        eps,
+        verbose,
+        ret_extra,
+        use_cpp = use_cpp,
+        n_threads = n_threads
+      )
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$pcost <- colSums((cost$R - cost$D)^2)
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      if (use_cpp) {
+        cost$G <- mmds_grad_cpp(cost$R, cost$D, Y, eps = eps, n_threads = n_threads)
+      } else {
+        cost$G <- k2g(Y, -4 * (cost$R - cost$D) / (cost$D + cost$eps))
+      }
+      cost
+    },
+    update = function(cost, Y) {
+      cost$D <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads)
+      cost
+    },
+    sentinel = "D",
+    export = function(cost, val) {
+      res <- NULL
+      switch(val,
+        dx = {
+          res <- cost$R
+        },
+        dy = {
+          res <- cost$D
+        }
+      )
+      res
+    }
+  )
+}
+
+smmds <- function(eps = .Machine$double.eps,
+                  use_cpp = FALSE,
+                  n_threads = 1) {
+  lreplace(
+    mmds(use_cpp = use_cpp, n_threads = n_threads),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    eps = .Machine$double.eps,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      cost <- mmds_init(
+        cost,
+        X,
+        max_iter,
+        eps,
+        verbose,
+        ret_extra,
+        use_cpp = use_cpp,
+        n_threads = n_threads
+      )
+      cost$R2 <- cost$R * cost$R
+      cost$R <- NULL
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$pcost <- colSums((cost$R2 - cost$D2)^2)
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$G <- k2g(Y, -8 * (cost$R2 - cost$D2))
+      cost
+    },
+    export = function(cost, val) {
+      res <- NULL
+      switch(val,
+        dx = {
+          res <- cost$R2
+          res[res < 0] <- 0
+          sqrt(res)
+        },
+        dy = {
+          res <- cost$D2
+          res[res < 0] <- 0
+          sqrt(res)
+        }
+      )
+      res
+    },
+    update = function(cost, Y) {
+      cost$D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      cost
+    },
+    sentinel = "D2"
+  )
+}
+
+sammon <- function(eps = .Machine$double.eps,
+                   use_cpp = FALSE,
+                   n_threads = 1) {
+  lreplace(
+    mmds(
+      eps = eps,
+      use_cpp = use_cpp,
+      n_threads = n_threads
+    ),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      cost <- mmds_init(
+        cost,
+        X,
+        max_iter,
+        eps,
+        verbose,
+        ret_extra,
+        use_cpp = use_cpp,
+        n_threads = n_threads
+      )
+      cost$rsum_inv <- 1 / sum(cost$R)
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$pcost <- colSums((cost$R - cost$D)^2 / (cost$R + cost$eps)) * cost$rsum_inv
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$G <- k2g(Y, -4 * cost$rsum_inv * (cost$R - cost$D) / (cost$R * cost$D + cost$eps))
+      cost
+    }
+  )
+}
+
+
+gmmds <- function(k,
+                  eps = .Machine$double.eps,
+                  use_cpp = FALSE,
+                  n_threads = 0) {
+  lreplace(
+    mmds(use_cpp = use_cpp, n_threads = n_threads),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      cost$R <- geodesic(X,
+        k,
+        n_threads = n_threads,
+        use_cpp = use_cpp,
+        verbose = verbose
+      )
+      cost$eps <- eps
+      cost
+    },
+    export = function(cost, val) {
+      res <- NULL
+      switch(val,
+        geo = {
+          res <- cost$R
+        },
+        dy = {
+          res <- cost$D
+        }
+      )
+      res
+    }
+  )
+}
+
+# Define neighborhoods using a radius based on a fraction (f) of all input
+# distances (sorted by increasing length), don't correct non-neighborhood
+# distances unless they are smaller than the input distance
+ballmmds <- function(f = 0.1,
+                     eps = .Machine$double.eps,
+                     use_cpp = FALSE,
+                     n_threads = 1) {
+  lreplace(
+    mmds(use_cpp = use_cpp, n_threads = n_threads),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      cost <- mmds_init(
+        cost = cost,
+        X = X,
+        max_iter = max_iter,
+        eps = eps,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        use_cpp = use_cpp,
+        n_threads = n_threads
+      )
+      rs <- cost$R[upper.tri(cost$R)]
+      rmax <- Rfast::nth(rs, max(1, round(f * length(rs))))
+      if (verbose) {
+        tsmessage("f = ", formatC(f), " rmax = ", formatC(rmax))
+      }
+      cost$rmax <- rmax
+
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      R <- cost$R
+      D <- cost$D
+      rmax <- cost$rmax
+      Ddiff <- R - D
+      Ddiff[R > rmax & D > R] <- 0
+      cost$pcost <- colSums(Ddiff * Ddiff)
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      eps <- cost$eps
+      R <- cost$R
+      D <- cost$D
+
+      K <- -4 * (R - D) / (D + eps)
+
+      rmax <- cost$rmax
+      K[R > rmax & D > R] <- 0
+
+      cost$G <- k2g(Y, K)
+      cost
+    },
+    export = function(cost, val) {
+      res <- NULL
+      switch(val,
+        geo = {
+          res <- cost$R
+        },
+        dy = {
+          res <- cost$D
+        }
+      )
+      res
+    },
+    update = function(cost, Y) {
+      cost$D <- calc_d(Y)
+      cost
+    }
+  )
+}
+
+# Create the symmetrized knn graph, don't correct non-neighborhood distances
+# unless they are smaller than the input distance
+knnmmds <- function(k,
+                    eps = .Machine$double.eps,
+                    use_cpp = FALSE,
+                    n_threads = 0) {
+  lreplace(
+    mmds(use_cpp = use_cpp, n_threads = n_threads),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      cost <- mmds_init(
+        cost = cost,
+        X = X,
+        max_iter = max_iter,
+        eps = eps,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        use_cpp = use_cpp,
+        n_threads = n_threads
+      )
+      knn <- knn_graph(
+        X = X,
+        k = k,
+        n_threads = n_threads,
+        verbose = verbose
+      )
+      # symmetrize
+      cost$knn <- pmax(knn, t(knn))
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      R <- cost$R
+      D <- cost$D
+      knn <- cost$knn
+      Ddiff <- R - D
+      Ddiff[knn == 0 & D > R] <- 0
+      cost$pcost <- colSums(Ddiff * Ddiff)
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      eps <- cost$eps
+      R <- cost$R
+      D <- cost$D
+
+      K <- -4 * (R - D) / (D + eps)
+
+      knn <- cost$knn
+      K[knn == 0 & D > R] <- 0
+
+      cost$G <- k2g(Y, K)
+      cost$D <- D
+      cost
+    },
+    export = function(cost, val) {
+      res <- NULL
+      switch(val,
+        geo = {
+          res <- cost$R
+        },
+        dy = {
+          res <- cost$D
+        }
+      )
+      res
+    },
+    update = function(cost, Y) {
+      cost$D <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads)
+      cost
+    }
+  )
+}
+
+# Given data X and k nearest neighbors, return a geodesic distance matrix
+# Disconnections are treated by using the Euclidean distance. 
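
# To see that fallback in action: a disconnected point leaves Inf entries
# after Floyd-Warshall, which are then patched from the plain Euclidean
# matrix (standalone sketch; assumes Rfast, and the Euclidean values here
# are made-up toy numbers):
R <- matrix(Inf, nrow = 3, ncol = 3)
diag(R) <- 0
R[1, 2] <- R[2, 1] <- 1 # point 3 has no neighbors
G <- Rfast::floyd(R)
E <- matrix(c(0, 1, 4, 1, 0, 3, 4, 3, 0), nrow = 3) # toy Euclidean fallback
G[is.infinite(G)] <- E[is.infinite(G)]
G[1, 3] # 4: filled from the Euclidean matrix
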
+geodesic <- function(X, + k, + fill = TRUE, + use_cpp = FALSE, + n_threads = 0, + verbose = FALSE) { + tsmessage("Calculating geodesic distances with k = ", k) + + R <- knn_dist(X, k, n_threads = n_threads, verbose = verbose) + # The hard work is done by Rfast's implementation of Floyd's algorithm + G <- Rfast::floyd(R) + if (any(is.infinite(G)) && fill) { + tsmessage( + "k = ", + k, + " resulted in disconnections: filling with Euclidean distances" + ) + if (methods::is(X, "dist")) { + R <- as.matrix(X) + } else { + R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) + } + G[is.infinite(G)] <- R[is.infinite(G)] + } + G +} diff --git a/smallvis/R/perplexity.R b/smallvis/R/perplexity.R index 3e76e31..32cc502 100644 --- a/smallvis/R/perplexity.R +++ b/smallvis/R/perplexity.R @@ -264,34 +264,7 @@ knn_graph <- function(X, k, n_threads, verbose) { D } -# Given data X and k nearest neighbors, return a geodisic distance matrix -# Disconnections are treated by using the Euclidean distance. -geodesic <- function(X, - k, - fill = TRUE, - use_cpp = FALSE, - n_threads = 0, - verbose = FALSE) { - tsmessage("Calculating geodesic distances with k = ", k) - - R <- knn_dist(X, k, n_threads = n_threads, verbose = verbose) - # The hard work is done by Rfast's implementation of Floyd's algorithm - G <- Rfast::floyd(R) - if (any(is.infinite(G)) && fill) { - tsmessage( - "k = ", - k, - " resulted in disconnections: filling with Euclidean distances" - ) - if (methods::is(X, "dist")) { - R <- as.matrix(X) - } else { - R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) - } - G[is.infinite(G)] <- R[is.infinite(G)] - } - G -} + # Multiscale perplexities: P is an average over the results of multiple # perplexities From f060ce385c319542e6ae7dabd20c15ae8aeaa704 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 11:53:31 -0700 Subject: [PATCH 16/26] Split cost into separate method files --- smallvis/R/absne.R | 607 ++++++++++++++++++++ smallvis/R/cost.R | 1269 +---------------------------------------- smallvis/R/ee.R | 85 +++ smallvis/R/fdiv.R | 310 ++++++++++ smallvis/R/gsne.R | 92 +++ smallvis/R/jse.R | 206 +++++++ smallvis/R/largevis.R | 71 +++ smallvis/R/nerv.R | 281 +++++++++ 8 files changed, 1672 insertions(+), 1249 deletions(-) create mode 100644 smallvis/R/absne.R create mode 100644 smallvis/R/ee.R create mode 100644 smallvis/R/fdiv.R create mode 100644 smallvis/R/gsne.R create mode 100644 smallvis/R/jse.R create mode 100644 smallvis/R/largevis.R create mode 100644 smallvis/R/nerv.R diff --git a/smallvis/R/absne.R b/smallvis/R/absne.R new file mode 100644 index 0000000..93ce6d4 --- /dev/null +++ b/smallvis/R/absne.R @@ -0,0 +1,607 @@ +# ABSNE ------------------------------------------------------------------- + +# alpha-beta divergence +absne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + alpha = 1, + lambda = 1, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + beta <- lambda - alpha + + eps0 <- 1e-5 + if (abs(alpha) > eps0 && abs(lambda) < eps0) { + # alpha != 0, beta = -alpha (=> lambda == 0) + return( + absneamb( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + alpha = alpha, + eps = eps + ) + ) + } + if (abs(alpha) > eps0 && abs(beta) < eps0) { + # alpha != 0, beta = 0 (=> lambda = alpha) + return( + absneb0( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + alpha = alpha, + eps = eps + ) + ) + } + if (abs(alpha) < eps0 && abs(beta) > eps0) { + # alpha = 0, beta 
!= 0 (=> lambda = beta) + return( + absnea0( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + beta = beta, + eps = eps + ) + ) + } + if (abs(alpha) < eps0 && abs(beta) < eps0) { + # alpha = 0, beta = 0 (=> lambda = 0) + return( + absne00( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + eps = eps + ) + ) + } + + if (abs(lambda) < eps0) { + lambda <- ifelse(lambda == 0, 1, sign(lambda)) * eps0 + } + lreplace( + tsne( + perplexity = perplexity, + use_cpp = TRUE, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + if (verbose) { + tsmessage( + "Using ABSNE with alpha = ", + formatC(alpha), + " beta = ", + formatC(beta) + ) + } + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = TRUE + ) + cost$inva4 <- 4 / alpha + cost$minvab <- -1 / (alpha * beta) + cost$inval <- 1 / (alpha * lambda) + + cost$eps <- eps + cost + }, + cache_input = function(cost) { + P <- cost$P + eps <- cost$eps + cost$Pa <- powm(P, alpha, eps) + cost$Plc <- colSums(powm(P, lambda, eps)) / (beta * lambda) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$pcost <- cost$minvab * cost$PaQbc + cost$Plc + cost$inval * cost$Qlc + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + Q <- cost$Q + cost$G <- k2g(Y, cost$inva4 * cost$Z * Q * + (cost$PaQb - cost$Ql + Q * (cost$Qls - cost$PaQbs))) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + Z <- sum(W) + Q <- W / Z + + eps <- cost$eps + cost$PaQb <- cost$Pa * powm(Q, beta, eps) + cost$PaQbc <- colSums(cost$PaQb) + cost$PaQbs <- sum(cost$PaQbc) + + cost$Q <- Q + cost$Ql <- powm(Q, lambda, eps) + cost$Qlc <- colSums(cost$Ql) + cost$Qls <- sum(cost$Qlc) + + cost$Z <- Z + cost + } + ) +} + +# alpha != 0, beta = 0 => lambda = alpha +absneb0 <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + alpha = 1, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + if (verbose) { + tsmessage("Using ABSNE with alpha = ", formatC(alpha), " beta = 0") + } + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + + cost$eps <- eps + + cost$inva2 <- 1 / (alpha * alpha) + cost$invam4 <- 4 / alpha + cost + }, + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + Pa <- powm(P, alpha, eps) + cost$Pa <- Pa + cost$PlPac <- colSums(P * logm(Pa, eps)) + cost$Pac <- colSums(Pa) + cost$Pas <- sum(cost$Pac) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$pcost <- cost$inva2 * + (cost$PlPac - colSums(cost$Pa * logm(cost$Qa, cost$eps)) - + cost$Pac + cost$Qac) + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + Q <- cost$Q + + cost$G <- k2g(Y, cost$invam4 * Q * cost$Z * + (cost$Pa - 
cost$Qa + Q * (cost$Qas - cost$Pas))) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + Z <- sum(W) + Q <- W / Z + + cost$Q <- Q + cost$Z <- Z + + Qa <- powm(Q, alpha, cost$eps) + cost$Q <- Q + cost$Qa <- Qa + cost$Qac <- colSums(Qa) + cost$Qas <- sum(cost$Qac) + + cost + } + ) +} + +# alpha = -beta != 0 => lambda = 0 +absneamb <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + alpha = 1, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + if (verbose) { + tsmessage( + "Using ABSNE with alpha = ", + formatC(alpha), + " beta = -", + formatC(alpha) + ) + } + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + + cost$N <- nrow(cost$P) + cost$N2 <- (cost$N - 1) * cost$N + cost$eps <- eps + + cost$inva2 <- 1 / (alpha * alpha) + cost$invam4 <- 4 / alpha + cost + }, + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + Pa <- powm(P, alpha, eps) + cost$Pa <- Pa + Pa[Pa < eps] <- eps + cost$lPac <- colSums(logm(Pa, eps)) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$pcost <- cost$inva2 * (cost$lQac - cost$lPac + cost$PadivQac - cost$N) + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + Q <- cost$Q + + cost$G <- k2g(Y, cost$invam4 * Q * cost$Z * + (cost$PadivQa - 1 + Q * (cost$N2 - cost$PadivQas))) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + Z <- sum(W) + Q <- W / Z + + cost$Q <- Q + cost$Z <- Z + + Qa <- powm(Q, alpha, cost$eps) + cost$lQac <- colSums(logm(Qa, cost$eps)) + cost$PadivQa <- divm(cost$Pa, Qa) + cost$PadivQac <- colSums(cost$PadivQa) + cost$PadivQas <- sum(cost$PadivQac) + + cost + } + ) +} + +# alpha = 0, beta != 0 => lambda = beta +absnea0 <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + beta = 1, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + if (verbose) { + tsmessage("Using ABSNE with alpha = 0, beta = ", formatC(beta)) + } + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + + cost$eps <- eps + + cost$invb2 <- 1 / (beta * beta) + cost$invbm4 <- 4 / beta + cost + }, + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + Pb <- powm(P, beta, eps) + Pb[Pb < eps] <- eps + cost$Pbc <- colSums(Pb) + cost$lPb <- logm(Pb, eps) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + Qb <- cost$Qb + cost$pcost <- cost$invb2 * + (cost$QblQbc - cost$QblPbc + cost$Pbc - cost$Qbc) + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, 
Y) + Q <- cost$Q + + cost$G <- k2g(Y, cost$invbm4 * Q * cost$Z * + (cost$QblPb - cost$QblQb + Q * (cost$QblQbs - cost$QblPbs))) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + Z <- sum(W) + Q <- W / Z + + cost$Q <- Q + cost$Z <- Z + + Qb <- powm(Q, beta, cost$eps) + cost$Qbc <- colSums(Qb) + + cost$QblPb <- Qb * cost$lPb + cost$QblPbc <- colSums(cost$QblPb) + cost$QblPbs <- sum(cost$QblPb) + + cost$QblQb <- Qb * logm(Qb, cost$eps) + cost$QblQbc <- colSums(cost$QblQb) + cost$QblQbs <- sum(cost$QblQb) + + cost + } + ) +} + +# alpha = 0, beta = 0 => lambda = 0 +absne00 <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + if (verbose) { + tsmessage("Using ABSNE with alpha = 0, beta = 0") + } + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + cost + }, + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + lP <- logm(P, eps) + cost$lP <- lP + cost$lPs <- sum(lP) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + pcost <- cost$lP - cost$lQ + cost$pcost <- 0.5 * colSums(pcost * pcost) + + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + Q <- cost$Q + + cost$G <- k2g(Y, 4 * Q * cost$Z * + (cost$lP - cost$lQ + Q * (cost$lQs - cost$lPs))) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + Z <- sum(W) + Q <- W / Z + + cost$Q <- Q + cost$Z <- Z + + lQ <- logm(Q, cost$eps) + cost$lQ <- lQ + cost$lQs <- sum(lQ) + + cost + } + ) +} + +# alpha-beta divergence +abssne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + alpha = 1, + lambda = 1, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + beta <- lambda - alpha + + eps0 <- 1e-5 + + if (abs(alpha) < eps0) { + alpha <- ifelse(alpha == 0, 1, sign(alpha)) * eps0 + } + if (abs(beta) < eps0) { + beta <- ifelse(beta == 0, 1, sign(beta)) * eps0 + } + if (abs(lambda) < eps0) { + lambda <- ifelse(lambda == 0, 1, sign(lambda)) * eps0 + } + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + if (verbose) { + tsmessage( + "Using ABSSNE with alpha = ", + formatC(alpha), + " beta = ", + formatC(beta) + ) + } + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$inva4 <- 4 / alpha + cost$minvab <- -1 / (alpha * beta) + cost$inval <- 1 / (alpha * lambda) + cost$ibl <- 1 / (beta * lambda) + + cost$eps <- eps + cost + }, + cache_input = function(cost) { + P <- cost$P + eps <- cost$eps + P[P < eps] <- eps + cost$Pa <- powm(P, alpha, eps) + cost$Plc <- colSums(powm(P, lambda, 
eps)) * cost$ibl + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$pcost <- cost$minvab * cost$PaQbc + cost$Plc + cost$inval * cost$Qlc + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$G <- k2g(Y, cost$inva4 * + (cost$PaQb - cost$Ql + cost$Q * (cost$Qls - cost$PaQbs))) + cost + }, + update = function(cost, Y) { + eps <- cost$eps + + Q <- expQ( + Y, + cost$eps, + is_symmetric = TRUE, + matrix_normalize = TRUE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q + + cost$PaQb <- cost$Pa * powm(Q, beta, eps) + cost$PaQbc <- colSums(cost$PaQb) + cost$PaQbs <- sum(cost$PaQbc) + + cost$Q <- Q + cost$Ql <- powm(Q, lambda, eps) + cost$Qlc <- colSums(cost$Ql) + cost$Qls <- sum(cost$Qlc) + + cost + } + ) +} diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R index d929468..ec18e95 100644 --- a/smallvis/R/cost.R +++ b/smallvis/R/cost.R @@ -1,4 +1,4 @@ -# Generic Functions ------------------------------------------------------- +# Generic Weight/Cost/Gradient functions ---------------------------------- logm <- function(m, eps = .Machine$double.eps) { diag(m) <- eps @@ -16,7 +16,7 @@ divm <- function(m, n, eps = .Machine$double.eps) { powm <- function(m, n, eps = .Machine$double.eps) { diag(m) <- eps - m <- m ^ n + m <- m^n diag(m) <- 0 m } @@ -29,10 +29,19 @@ k2g <- function(Y, K, symmetrize = FALSE) { Y * colSums(K) - (K %*% Y) } -cost_init <- function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { +cost_init <- function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { if (!is.null(cost$init)) { - cost <- cost$init(cost, X, verbose = verbose, ret_extra = ret_extra, - max_iter = max_iter) + cost <- cost$init( + cost, + X, + verbose = verbose, + ret_extra = ret_extra, + max_iter = max_iter + ) } cost_cache_input(cost) } @@ -51,8 +60,7 @@ cost_point <- function(cost, Y) { cost_clear <- function(cost) { if (!is.null(cost$sentinel)) { cost[[cost$sentinel]] <- NULL - } - else if (is.null(cost$clear)) { + } else if (is.null(cost$clear)) { cost <- cost$clear(cost) } cost @@ -65,8 +73,7 @@ cost_update <- function(cost, Y) { if (is.null(cost[[cost$sentinel]])) { cost <- cost$update(cost, Y) } - } - else { + } else { cost <- cost$update(cost, Y) } cost @@ -78,15 +85,11 @@ cost_eval <- function(cost, Y, opt_res = NULL) { cost <- cost_point(cost, Y) pcosts <- cost$pcost cost_val <- sum(pcosts) - } - else { + } else { cost_val <- opt_res$f } - list( - cost = cost, - value = cost_val - ) + list(cost = cost, value = cost_val) } # Default export of values associated with a method @@ -96,8 +99,7 @@ cost_export <- function(cost, val) { res <- NULL if (!is.null(cost[[val]])) { res <- cost[[val]] - } - else if (!is.null(cost[[toupper(val)]])) { + } else if (!is.null(cost[[toupper(val)]])) { res <- cost[[toupper(val)]] } res @@ -107,7 +109,7 @@ cost_cache_input <- function(cost) { if (!is.null(cost$cache_input)) { cost <- cost$cache_input(cost) } - cost + cost } start_exaggerating <- function(cost, exaggeration_factor) { @@ -123,1234 +125,3 @@ stop_exaggerating <- function(cost, exaggeration_factor) { } cost_cache_input(cost) } - -# LargeVis ---------------------------------------------------------------- - -largevis <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", gamma = 1, gr_eps = 0.1, - normalize = TRUE, eps = 1e-9, row_weight = NULL, - use_cpp = FALSE, n_threads = 0) { - if (!is.null(row_weight)) { - row_normalize <- row_weight - } - else { - row_normalize <- TRUE - } - lreplace(tsne(perplexity, use_cpp = 
use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X = X, perplexity = perplexity, - symmetrize = symmetrize, kernel = inp_kernel, - normalize = normalize, verbose = verbose, - row_normalize = row_normalize, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost$greps1 <- gr_eps - 1 - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - P <- cost$P - eps <- cost$eps - W <- cost$W - - cost$pcost <- colSums(-P * logm(W, eps) - gamma * log1p(-W + eps)) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - W <- cost$W - cost$G <- k2g(Y, 4 * W * (cost$P - ((gamma * W) / (1 + cost$greps1 * W)))) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - - cost$W <- W - cost - }, - export = cost_export - ) -} - - -# f-Divergences ----------------------------------------------------------- - -# Reverse KL divergence -rklsne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - cache_input = function(cost) { - P <- cost$P - eps <- cost$eps - P[P < eps] <- eps - cost$lP <- logm(P, eps) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- cost$QlQPcs - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - cost$G <- k2g(Y, 4 * Q * Q * cost$sumW * (sum(cost$QlQPcs) - cost$lQP)) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - sumW <- sum(W) - Q <- W / sumW - - cost$lQP <- logm(Q, cost$eps) - cost$lP - cost$QlQPcs <- colSums(Q * cost$lQP) - cost$Q <- Q - cost$sumW <- sumW - - cost - } - ) -} - -# Jensen-Shannon divergence -jssne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp == use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - cache_input = function(cost) { - P <- cost$P - eps <- cost$eps - P[P < eps] <- eps - cost$PlP <- colSums(P * logm(P, eps)) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- 0.5 * (cost$PlP + cost$QlQZ - colSums(cost$P * logm(cost$Z, cost$eps))) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - - cost$G <- k2g(Y, 2 * Q * Q * cost$sumW * (cost$QlQZs - cost$lQZ)) - cost - }, - 
update = function(cost, Y) { - eps <- cost$eps - P <- cost$P - - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - sumW <- sum(W) - - Q <- W / sumW - Z <- 0.5 * (P + Q) - - QlQZ <- logm(Q / Z, eps) - cost$QlQZ <- colSums(Q * QlQZ) - cost$QlQZs <- sum(cost$QlQZ) - - cost$lQZ <- QlQZ - cost$sumW <- sumW - cost$Q <- Q - cost$Z <- Z - - cost - } - ) -} - -# Chi-squared divergence -chsne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp == use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - cache_input = function(cost) { - P <- cost$P - cost$P2 <- P * P - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - PQ <- cost$P - cost$Q - - cost$pcost <- colSums(PQ * PQ * cost$invQ) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - Q <- cost$Q - Z <- cost$Z - invQ <- cost$invQ - - P2Q <- cost$P2 * invQ - - cost$G <- k2g(Y, 4 * Q * Q * Z * (P2Q * invQ - sum(P2Q))) - cost - }, - update = function(cost, Y) { - P <- cost$P - - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - Z <- sum(W) - - Q <- W / Z - - cost$Q <- Q - cost$Z <- Z - invQ <- 1 / Q - diag(invQ) <- 0 - cost$invQ <- invQ - - cost - } - ) -} - -# Hellinger distance divergence -hlsne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp == use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "symmetric", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - cache_input = function(cost) { - cost$sP <- sqrt(cost$P) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - PQ <- cost$sP - cost$sQ - - cost$pcost <- colSums(PQ * PQ) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - sP <- cost$sP - Q <- cost$Q - Z <- cost$Z - sQ <- cost$sQ - - sPQ <- sum(sP * sQ) - PQ <- sP / sQ - diag(PQ) <- 0 - - cost$G <- k2g(Y, 4 * Q * Q * Z * (PQ - sPQ)) - cost - }, - update = function(cost, Y) { - P <- cost$P - - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - Z <- sum(W) - - Q <- W / Z - - cost$Q <- Q - cost$Z <- Z - cost$sQ <- sqrt(Q) - - cost - } - ) -} - -# ABSNE ------------------------------------------------------------------- - -# alpha-beta divergence -absne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", alpha = 1, lambda = 1, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - beta <- lambda - alpha - - eps0 <- 1e-5 - if (abs(alpha) > eps0 && abs(lambda) < eps0) { - # alpha != 0, beta = -alpha (=> lambda == 0) - 
return(absneamb(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, alpha = alpha, eps = eps)) - } - if (abs(alpha) > eps0 && abs(beta) < eps0) { - # alpha != 0, beta = 0 (=> lambda = alpha) - return(absneb0(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, alpha = alpha, eps = eps)) - } - if (abs(alpha) < eps0 && abs(beta) > eps0 ) { - # alpha = 0, beta != 0 (=> lambda = beta) - return(absnea0(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, beta = beta, eps = eps)) - } - if (abs(alpha) < eps0 && abs(beta) < eps0) { - # alpha = 0, beta = 0 (=> lambda = 0) - return(absne00(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, eps = eps)) - } - - if (abs(lambda) < eps0) { - lambda <- ifelse(lambda == 0, 1, sign(lambda)) * eps0 - } - lreplace( - tsne(perplexity = perplexity, use_cpp = TRUE, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - if (verbose) { - tsmessage("Using ABSNE with alpha = ", formatC(alpha), - " beta = ", formatC(beta)) - } - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = TRUE) - cost$inva4 <- 4 / alpha - cost$minvab <- -1 / (alpha * beta) - cost$inval <- 1 / (alpha * lambda) - - cost$eps <- eps - cost - }, - cache_input = function(cost) { - P <- cost$P - eps <- cost$eps - cost$Pa <- powm(P, alpha, eps) - cost$Plc <- colSums(powm(P, lambda, eps)) / (beta * lambda) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- cost$minvab * cost$PaQbc + cost$Plc + cost$inval * cost$Qlc - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - cost$G <- k2g(Y, cost$inva4 * cost$Z * Q * - (cost$PaQb - cost$Ql + Q * (cost$Qls - cost$PaQbs))) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - Z <- sum(W) - Q <- W / Z - - eps <- cost$eps - cost$PaQb <- cost$Pa * powm(Q, beta, eps) - cost$PaQbc <- colSums(cost$PaQb) - cost$PaQbs <- sum(cost$PaQbc) - - cost$Q <- Q - cost$Ql <- powm(Q, lambda, eps) - cost$Qlc <- colSums(cost$Ql) - cost$Qls <- sum(cost$Qlc) - - cost$Z <- Z - cost - } - ) -} - -# alpha != 0, beta = 0 => lambda = alpha -absneb0 <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", alpha = 1, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - if (verbose) { - tsmessage("Using ABSNE with alpha = ", formatC(alpha), - " beta = 0") - } - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - - cost$eps <- eps - - cost$inva2 <- 1 / (alpha * alpha) - cost$invam4 <- 4 / alpha - cost - }, - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - Pa <- powm(P, alpha, eps) - cost$Pa <- Pa - cost$PlPac <- colSums(P * logm(Pa, eps)) - cost$Pac <- colSums(Pa) - cost$Pas <- sum(cost$Pac) - cost - }, - pfn = function(cost, Y) { - cost 
<- cost_update(cost, Y) - cost$pcost <- cost$inva2 * - (cost$PlPac - colSums(cost$Pa * logm(cost$Qa, cost$eps)) - - cost$Pac + cost$Qac) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - - cost$G <- k2g(Y, cost$invam4 * Q * cost$Z * - (cost$Pa - cost$Qa + Q * (cost$Qas - cost$Pas))) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - Z <- sum(W) - Q <- W / Z - - cost$Q <- Q - cost$Z <- Z - - Qa <- powm(Q, alpha, cost$eps) - cost$Q <- Q - cost$Qa <- Qa - cost$Qac <- colSums(Qa) - cost$Qas <- sum(cost$Qac) - - cost - } - ) -} - -# alpha = -beta != 0 => lambda = 0 -absneamb <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", alpha = 1, - eps = .Machine$double.eps, n_threads = 0, - use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - if (verbose) { - tsmessage("Using ABSNE with alpha = ", formatC(alpha), - " beta = -", formatC(alpha)) - } - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - - cost$N <- nrow(cost$P) - cost$N2 <- (cost$N - 1) * cost$N - cost$eps <- eps - - cost$inva2 <- 1 / (alpha * alpha) - cost$invam4 <- 4 / alpha - cost - }, - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - Pa <- powm(P, alpha, eps) - cost$Pa <- Pa - Pa[Pa < eps] <- eps - cost$lPac <- colSums(logm(Pa, eps)) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- cost$inva2 * (cost$lQac - cost$lPac + cost$PadivQac - cost$N) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - - cost$G <- k2g(Y, cost$invam4 * Q * cost$Z * - (cost$PadivQa - 1 + Q * (cost$N2 - cost$PadivQas))) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - Z <- sum(W) - Q <- W / Z - - cost$Q <- Q - cost$Z <- Z - - Qa <- powm(Q, alpha, cost$eps) - cost$lQac <- colSums(logm(Qa, cost$eps)) - cost$PadivQa <- divm(cost$Pa, Qa) - cost$PadivQac <- colSums(cost$PadivQa) - cost$PadivQas <- sum(cost$PadivQac) - - cost - } - ) -} - -# alpha = 0, beta != 0 => lambda = beta -absnea0 <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", beta = 1, - eps = .Machine$double.eps, n_threads = 0, - use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - if (verbose) { - tsmessage("Using ABSNE with alpha = 0, beta = ", formatC(beta)) - } - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - - cost$eps <- eps - - cost$invb2 <- 1 / (beta * beta) - cost$invbm4 <- 4 / beta - cost - }, - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - Pb <- powm(P, beta, eps) - Pb[Pb < eps] <- eps - cost$Pbc <- colSums(Pb) - cost$lPb <- logm(Pb, eps) - cost - }, - pfn = function(cost, Y) { - cost 
<- cost_update(cost, Y) - Qb <- cost$Qb - cost$pcost <- cost$invb2 * - (cost$QblQbc - cost$QblPbc + cost$Pbc - cost$Qbc) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - - cost$G <- k2g(Y, cost$invbm4 * Q * cost$Z * - (cost$QblPb - cost$QblQb + Q * (cost$QblQbs - cost$QblPbs))) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - Z <- sum(W) - Q <- W / Z - - cost$Q <- Q - cost$Z <- Z - - Qb <- powm(Q, beta, cost$eps) - cost$Qbc <- colSums(Qb) - - cost$QblPb <- Qb * cost$lPb - cost$QblPbc <- colSums(cost$QblPb) - cost$QblPbs <- sum(cost$QblPb) - - cost$QblQb <- Qb * logm(Qb, cost$eps) - cost$QblQbc <- colSums(cost$QblQb) - cost$QblQbs <- sum(cost$QblQb) - - cost - } - ) -} - -# alpha = 0, beta = 0 => lambda = 0 -absne00 <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - if (verbose) { - tsmessage("Using ABSNE with alpha = 0, beta = 0") - } - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - lP <- logm(P, eps) - cost$lP <- lP - cost$lPs <- sum(lP) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - pcost <- cost$lP - cost$lQ - cost$pcost <- 0.5 * colSums(pcost * pcost) - - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - - cost$G <- k2g(Y, 4 * Q * cost$Z * - (cost$lP - cost$lQ + Q * (cost$lQs - cost$lPs))) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - Z <- sum(W) - Q <- W / Z - - cost$Q <- Q - cost$Z <- Z - - lQ <- logm(Q, cost$eps) - cost$lQ <- lQ - cost$lQs <- sum(lQ) - - cost - } - ) -} - -# alpha-beta divergence -abssne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", alpha = 1, lambda = 1, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - beta <- lambda - alpha - - eps0 <- 1e-5 - - if (abs(alpha) < eps0) { - alpha <- ifelse(alpha == 0, 1, sign(alpha)) * eps0 - } - if (abs(beta) < eps0) { - beta <- ifelse(beta == 0, 1, sign(beta)) * eps0 - } - if (abs(lambda) < eps0) { - lambda <- ifelse(lambda == 0, 1, sign(lambda)) * eps0 - } - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - if (verbose) { - tsmessage("Using ABSSNE with alpha = ", formatC(alpha), - " beta = ", formatC(beta)) - } - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$inva4 <- 4 / alpha - cost$minvab <- -1 / (alpha * beta) - cost$inval <- 1 / (alpha * lambda) - cost$ibl <- 1 / (beta * lambda) - - cost$eps <- eps - cost - }, - cache_input = function(cost) { - P <- cost$P - eps <- cost$eps 
- P[P < eps] <- eps - cost$Pa <- powm(P, alpha, eps) - cost$Plc <- colSums(powm(P, lambda, eps)) * cost$ibl - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$pcost <- cost$minvab * cost$PaQbc + cost$Plc + cost$inval * cost$Qlc - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$G <- k2g(Y, cost$inva4 * - (cost$PaQb - cost$Ql + cost$Q * (cost$Qls - cost$PaQbs))) - cost - }, - update = function(cost, Y) { - eps <- cost$eps - - Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE, - use_cpp = use_cpp, n_threads = n_threads)$Q - - cost$PaQb <- cost$Pa * powm(Q, beta, eps) - cost$PaQbc <- colSums(cost$PaQb) - cost$PaQbs <- sum(cost$PaQbc) - - cost$Q <- Q - cost$Ql <- powm(Q, lambda, eps) - cost$Qlc <- colSums(cost$Ql) - cost$Qls <- sum(cost$Qlc) - - cost - } - ) -} - -# Other Divergences ------------------------------------------------------- - -# global-SNE -gsne <- function(perplexity, lambda = 1, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - use_cpp = FALSE, n_threads = 0) { - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - - # Phat isn't affected by early exaggeration, so we cache it once only - if (methods::is(X, "dist")) { - Phat <- as.matrix(X) - } - else { - Phat <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) - } - - Phat <- Phat + 1 - diag(Phat) <- 0 - - Phat <- Phat / sum(Phat) - cost$Phat <- Phat - cost$phlogph <- colSums(Phat * logm(Phat, eps)) - - cost - }, - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - cost$plogp <- colSums(P * logm(P, eps)) - - cost$plamphat <- P - lambda * cost$Phat - - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - eps <- cost$eps - kl <- cost$plogp - colSums(cost$P * logm(cost$W / cost$Z, eps)) - klhat <- cost$phlogph - colSums(cost$Phat * logm(cost$What / cost$Zhat, eps)) - cost$pcost <- kl + lambda * klhat - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - qlampqhat <- (cost$W / cost$Z) - lambda * (cost$What / cost$Zhat) - cost$G <- k2g(Y, 4 * cost$W * (cost$plamphat - qlampqhat)) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - What <- 1 + W - W <- 1 / What - diag(W) <- 0 - cost$Z <- sum(W) - cost$W <- W - - diag(What) <- 0 - cost$What <- What - cost$Zhat <- sum(What) - - cost - } - ) -} - -# Elastic Embedding ------------------------------------------------------- - -# Carreira-Perpinán, M. A. (2010, June). -# The Elastic Embedding Algorithm for Dimensionality Reduction. -# In \emph{Proceedings of the 27th International Conference on Machine Learning (ICML-10)} (pp. 167-174). -# http://faculty.ucmerced.edu/mcarreira-perpinan/papers/icml10.pdf (PDF) -# lambda control the strength of repulsive vs attractive forces -# if neg_weights is true, the repulsive contribution is weighted based on the -# squared input distances. Otherwise, no weighting is applied. 
-ee <- function(perplexity, lambda = 100, neg_weights = TRUE, - inp_kernel = "gaussian", symmetrize = "symmetric", - eps = .Machine$double.eps, use_cpp = FALSE, n_threads = 0) { - list( - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - if (neg_weights) { - if (methods::is(X, "dist")) { - R <- X - } - else { - R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) - } - cost$Vn <- R / sum(R) - } - else { - cost$Vn <- 1 - } - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - Vp <- cost$P - Vn <- cost$Vn - W <- cost$W - cost$pcost <- colSums(-Vp * logm(W, cost$eps) + lambda * (Vn * W)) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - Vp <- cost$P - Vn <- cost$Vn - cost$G <- k2g(Y, 4 * (Vp - lambda * Vn * cost$W)) - cost - }, - export = function(cost, val) { - res <- NULL - if (!is.null(cost[[val]])) { - res <- cost[[val]] - } - else if (!is.null(cost[[toupper(val)]])) { - res <- cost[[toupper(val)]] - } - res - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- exp(-W) - diag(W) <- 0 - cost$W <- W - cost - }, - sentinel = "W" - ) -} - - -# JSE and NeRV ------------------------------------------------------------ - -# Venna, J., Peltonen, J., Nybo, K., Aidos, H., & Kaski, S. (2010). -# Information retrieval perspective to nonlinear dimensionality reduction for -# data visualization. -# \emph{Journal of Machine Learning Research}, \emph{11}, 451-490. 
-# -# Unlike original publication, won't transfer input precisions to output kernel -# lambda = 1 gives ASNE results -# default lambda = 0.9 from "Majorization-Minimization for Manifold Embedding" -# Yang, Peltonen, Kaski 2015 -nerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - lambda2 <- 2 * lambda - oml <- 1 - lambda - oml2 <- 2 * oml - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "none", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - cost$P <- P - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - P <- cost$P - - kl_fwd <- rowSums(P * cost$lPQ) - - cost$pcost <- lambda * kl_fwd + oml * cost$kl_rev - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - - # Total K including multiplying by 2 in gradient - K <- lambda2 * (cost$P - Q) + oml2 * Q * (cost$lPQ + cost$kl_rev) - - cost$G <- k2g(Y, K, symmetrize = TRUE) - cost - }, - update = function(cost, Y) { - eps <- cost$eps - - Q <- expQ(Y, eps, is_symmetric = TRUE, use_cpp = use_cpp, - n_threads = n_threads)$Q - cost$Q <- Q - - # Reverse KL gradient - cost$lPQ <- logm(cost$P / Q, eps) - # for KLrev we want Q * log(Q/P), so take -ve of log(P/Q) - cost$kl_rev <- rowSums(Q * -cost$lPQ) - cost - }, - sentinel = "Q", - export = function(cost, val) { - res <- cost_export(cost, val) - res - } - ) -} - -snerv <- function(perplexity, lambda = 0.9, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lambda4 <- 4 * lambda - oml <- 1 - lambda - oml4 <- 4 * oml - lreplace( - ssne(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, eps = eps, n_threads = n_threads), - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - cost$P <- P - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - P <- cost$P - - kl_fwd <- colSums(P * cost$lPQ) - - cost$pcost <- lambda * kl_fwd + oml * cost$kl_rev - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - Q <- cost$Q - - # Total K including multiplying by 4 in gradient - K <- lambda4 * (cost$P - Q) + oml4 * Q * (cost$lPQ + cost$QlPQs) - - cost$G <- k2g(Y, K, symmetrize = FALSE) - cost - }, - update = function(cost, Y) { - eps <- cost$eps - - Q <- expQ(Y, eps, is_symmetric = TRUE, matrix_normalize = TRUE, - use_cpp = use_cpp, n_threads = n_threads)$Q - cost$Q <- Q - - # Reverse KL gradient - cost$lPQ <- logm(cost$P / Q, eps) - cost$kl_rev <- colSums(Q * -cost$lPQ) - cost$QlPQs <- sum(cost$kl_rev) - cost - }, - export = function(cost, val) { - res <- cost_export(cost, val) - res - } - ) -} - -# Lee, J. A., Renard, E., Bernard, G., Dupont, P., & Verleysen, M. (2013). -# Type 1 and 2 mixtures of Kullback-Leibler divergences as cost functions in -# dimensionality reduction based on similarity preservation. -# \emph{Neurocomputing}, \emph{112}, 92-108. -# kappa = 0 behaves like ASNE -# kappa = 1 behaves like NeRV with lambda = 0. Yes that's confusing. 
-jse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - eps0 <- 1e-5 - kappa <- max(kappa, eps0) - kappa <- min(kappa, 1 - eps0) - - kappa_inv <- 1 / kappa - m2_kappa_inv <- -2 * kappa_inv - om_kappa <- 1 - kappa - om_kappa_inv <- 1 / om_kappa - - lreplace( - ssne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "none", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - cache_input = function(cost) { - eps <- cost$eps - P <- cost$P - P[P < eps] <- eps - cost$plogp <- rowSums(P * logm(P, eps)) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - eps <- cost$eps - - cost$pcost <- - om_kappa_inv * (cost$plogp - rowSums(cost$P * logm(cost$Z, eps))) + - kappa_inv * cost$QlQZc - - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - K <- m2_kappa_inv * cost$Q * (cost$lQZ - cost$QlQZc) - cost$G <- k2g(Y, K, - symmetrize = TRUE) - - cost - }, - update = function(cost, Y) { - eps <- cost$eps - - Q <- expQ(Y, eps = eps, is_symmetric = TRUE, use_cpp = use_cpp, - n_threads = n_threads)$Q - - Z <- kappa * cost$P + om_kappa * Q - Z[Z < eps] <- eps - diag(Z) <- 0 - - cost$Q <- Q - cost$Z <- Z - cost$lQZ <- logm(Q / Z, eps) - cost$QlQZc <- rowSums(Q * cost$lQZ) - - cost - }, - export = cost_export - ) -} - - -sjse <- function(perplexity, kappa = 0.5, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - eps0 <- 1e-5 - kappa <- max(kappa, eps0) - kappa <- min(kappa, 1 - eps0) - - kappa_inv <- 1 / kappa - m4_kappa_inv <- -4 * kappa_inv - om_kappa <- 1 - kappa - om_kappa_inv <- 1 / om_kappa - - lreplace( - ssne(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, eps = eps, n_threads = n_threads, - use_cpp = use_cpp), - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - eps <- cost$eps - - cost$pcost <- - om_kappa_inv * (cost$plogp - colSums(cost$P * logm(cost$Z, eps))) + - kappa_inv * cost$QlQZc - - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - K <- m4_kappa_inv * cost$Q * (cost$lQZ - cost$QlQZs) - cost$G <- k2g(Y, K, symmetrize = FALSE) - - cost - }, - update = function(cost, Y) { - eps <- cost$eps - - Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE, - use_cpp = use_cpp, n_threads = n_threads)$Q - Z <- kappa * cost$P + om_kappa * Q - Z[Z < eps] <- eps - diag(Z) <- 0 - - cost$Q <- Q - cost$Z <- Z - cost$lQZ <- logm(Q / Z, eps) - cost$QlQZc <- colSums(Q * cost$lQZ) - cost$QlQZs <- sum(cost$QlQZc) - - cost - }, - export = cost_export - ) -} - - -rsrnerv <- function(perplexity, lambda = 0.9, eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace(nerv(perplexity = perplexity, lambda = lambda, use_cpp = use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, - ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, - symmetrize = "symmetric", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - P <- cost$P - P <- P / rowSums(P) - cost$P <- P - - cost$eps <- eps - cost - } - ) -} - -rsrjse <- function(perplexity, kappa = 0.5, eps = .Machine$double.eps, - n_threads = 0, use_cpp = 
FALSE) {
-  lreplace(jse(perplexity = perplexity, kappa = kappa, use_cpp = use_cpp,
-               n_threads = n_threads),
-           init = function(cost, X, max_iter, verbose = FALSE,
-                           ret_extra = c()) {
-             cost <- sne_init(cost, X, perplexity = perplexity,
-                              symmetrize = "symmetric", normalize = FALSE,
-                              verbose = verbose, ret_extra = ret_extra,
-                              n_threads = n_threads, use_cpp = use_cpp)
-             P <- cost$P
-             P <- P / rowSums(P)
-             cost$P <- P
-
-             cost$eps <- eps
-             cost
-           }
-  )
-}
-
-# NeRV with input bandwidths transferred to the output kernel, as in the
-# original paper.
-bnerv <- function(perplexity, lambda = 0.9, eps = .Machine$double.eps,
-                  n_threads = 0, use_cpp = FALSE) {
-  lambda2 <- 2 * lambda
-  oml <- 1 - lambda
-  oml2 <- 2 * oml
-  lreplace(
-    nerv(perplexity = perplexity, lambda = lambda, use_cpp = use_cpp,
-         n_threads = n_threads),
-    init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) {
-      ret_extra <- unique(c(ret_extra, 'beta'))
-
-      cost <- sne_init(cost, X, perplexity = perplexity,
-                       symmetrize = "none", normalize = FALSE,
-                       verbose = verbose, ret_extra = ret_extra,
-                       n_threads = n_threads, use_cpp = use_cpp)
-      cost$eps <- eps
-      cost$lambda2b <- lambda2 * cost$beta
-      cost$oml2b <- oml2 * cost$beta
-      cost
-    },
-    gr = function(cost, Y) {
-      cost <- cost_update(cost, Y)
-      Q <- cost$Q
-
-      # Total K including multiplying by 2 * beta in gradient
-      K <- cost$lambda2b * (cost$P - Q) + cost$oml2b * Q * (cost$lPQ + cost$kl_rev)
-
-      cost$G <- k2g(Y, K, symmetrize = TRUE)
-      cost
-
-      cost
-    },
-    update = function(cost, Y) {
-      eps <- cost$eps
-
-      Q <- expQ(Y, eps, beta = cost$beta, is_symmetric = FALSE,
-                use_cpp = use_cpp, n_threads = n_threads)$Q
-      cost$Q <- Q
-
-      # Reverse KL gradient
-      cost$lPQ <- logm(cost$P / Q, eps)
-      cost$kl_rev <- rowSums(Q * -cost$lPQ)
-
-      cost
-    }
-  )
-}
diff --git a/smallvis/R/ee.R b/smallvis/R/ee.R
new file mode 100644
index 0000000..87be379
--- /dev/null
+++ b/smallvis/R/ee.R
@@ -0,0 +1,85 @@
+# Elastic Embedding -------------------------------------------------------
+
+# Carreira-Perpinán, M. A. (2010, June).
+# The Elastic Embedding Algorithm for Dimensionality Reduction.
+# In \emph{Proceedings of the 27th International Conference on Machine Learning (ICML-10)} (pp. 167-174).
+# http://faculty.ucmerced.edu/mcarreira-perpinan/papers/icml10.pdf (PDF)
+# lambda controls the strength of repulsive vs attractive forces
+# if neg_weights is true, the repulsive contribution is weighted based on the
+# squared input distances. Otherwise, no weighting is applied.
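+#
+# As an informal sketch (notation mine, not from the paper): with output
+# weights w_ij = exp(-d2_ij) as computed in update() below, the per-point
+# cost in pfn() amounts to
+#   sum_j ( -vp_ij * log(w_ij) + lambda * vn_ij * w_ij )
+# i.e. an attraction term over the positive (input) weights vp plus a
+# lambda-scaled repulsion term over the negative weights vn.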
+ee <- function(perplexity,
+               lambda = 100,
+               neg_weights = TRUE,
+               inp_kernel = "gaussian",
+               symmetrize = "symmetric",
+               eps = .Machine$double.eps,
+               use_cpp = FALSE,
+               n_threads = 0) {
+  list(
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
+      if (neg_weights) {
+        if (methods::is(X, "dist")) {
+          R <- X
+        } else {
+          R <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads)
+        }
+        cost$Vn <- R / sum(R)
+      } else {
+        cost$Vn <- 1
+      }
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        kernel = inp_kernel,
+        symmetrize = symmetrize,
+        normalize = TRUE,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        n_threads = n_threads,
+        use_cpp = use_cpp
+      )
+      cost$eps <- eps
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      Vp <- cost$P
+      Vn <- cost$Vn
+      W <- cost$W
+      cost$pcost <- colSums(-Vp * logm(W, cost$eps) + lambda * (Vn * W))
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      Vp <- cost$P
+      Vn <- cost$Vn
+      cost$G <- k2g(Y, 4 * (Vp - lambda * Vn * cost$W))
+      cost
+    },
+    export = function(cost, val) {
+      res <- NULL
+      if (!is.null(cost[[val]])) {
+        res <- cost[[val]]
+      } else if (!is.null(cost[[toupper(val)]])) {
+        res <- cost[[toupper(val)]]
+      }
+      res
+    },
+    update = function(cost, Y) {
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      W <- exp(-W)
+      diag(W) <- 0
+      cost$W <- W
+      cost
+    },
+    sentinel = "W"
+  )
+}
diff --git a/smallvis/R/fdiv.R b/smallvis/R/fdiv.R
new file mode 100644
index 0000000..3be1365
--- /dev/null
+++ b/smallvis/R/fdiv.R
@@ -0,0 +1,310 @@
+# f-Divergences -----------------------------------------------------------
+
+# Reverse KL divergence
+rklsne <- function(perplexity,
+                   inp_kernel = "gaussian",
+                   symmetrize = "symmetric",
+                   eps = .Machine$double.eps,
+                   n_threads = 0,
+                   use_cpp = FALSE) {
+  lreplace(
+    tsne(
+      perplexity = perplexity,
+      use_cpp = use_cpp,
+      n_threads = n_threads
+    ),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        kernel = inp_kernel,
+        symmetrize = symmetrize,
+        normalize = TRUE,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        n_threads = n_threads,
+        use_cpp = use_cpp
+      )
+      cost$eps <- eps
+      cost
+    },
+    cache_input = function(cost) {
+      P <- cost$P
+      eps <- cost$eps
+      P[P < eps] <- eps
+      cost$lP <- logm(P, eps)
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$pcost <- cost$QlQPcs
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      Q <- cost$Q
+      cost$G <- k2g(Y, 4 * Q * Q * cost$sumW * (sum(cost$QlQPcs) - cost$lQP))
+      cost
+    },
+    update = function(cost, Y) {
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      W <- 1 / (1 + W)
+      diag(W) <- 0
+      sumW <- sum(W)
+      Q <- W / sumW
+
+      cost$lQP <- logm(Q, cost$eps) - cost$lP
+      cost$QlQPcs <- colSums(Q * cost$lQP)
+      cost$Q <- Q
+      cost$sumW <- sumW
+
+      cost
+    }
+  )
+}
+
+# Jensen-Shannon divergence
+jssne <- function(perplexity,
+                  inp_kernel = "gaussian",
+                  symmetrize = "symmetric",
+                  eps = .Machine$double.eps,
+                  n_threads = 0,
+                  use_cpp = FALSE) {
+  lreplace(
+    tsne(
+      perplexity = perplexity,
+      inp_kernel = inp_kernel,
+      use_cpp = use_cpp,
+      n_threads = n_threads
+    ),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        kernel = inp_kernel,
+        symmetrize = symmetrize,
+        normalize = TRUE,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        n_threads = n_threads,
+        use_cpp = use_cpp
+      )
+      cost$eps <- eps
+      cost
+    },
+    cache_input = function(cost) {
+      P <- cost$P
+      eps <- cost$eps
+      P[P < eps] <- eps
+      cost$PlP <- colSums(P * logm(P, eps))
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      cost$pcost <- 0.5 * (cost$PlP + cost$QlQZ - colSums(cost$P * logm(cost$Z, cost$eps)))
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      Q <- cost$Q
+
+      cost$G <- k2g(Y, 2 * Q * Q * cost$sumW * (cost$QlQZs - cost$lQZ))
+      cost
+    },
+    update = function(cost, Y) {
+      eps <- cost$eps
+      P <- cost$P
+
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      W <- 1 / (1 + W)
+      diag(W) <- 0
+      sumW <- sum(W)
+
+      Q <- W / sumW
+      Z <- 0.5 * (P + Q)
+
+      QlQZ <- logm(Q / Z, eps)
+      cost$QlQZ <- colSums(Q * QlQZ)
+      cost$QlQZs <- sum(cost$QlQZ)
+
+      cost$lQZ <- QlQZ
+      cost$sumW <- sumW
+      cost$Q <- Q
+      cost$Z <- Z
+
+      cost
+    }
+  )
+}
+
+# Chi-squared divergence
+chsne <- function(perplexity,
+                  inp_kernel = "gaussian",
+                  symmetrize = "symmetric",
+                  eps = .Machine$double.eps,
+                  n_threads = 0,
+                  use_cpp = FALSE) {
+  lreplace(
+    tsne(
+      perplexity = perplexity,
+      inp_kernel = inp_kernel,
+      use_cpp = use_cpp,
+      n_threads = n_threads
+    ),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        kernel = inp_kernel,
+        symmetrize = symmetrize,
+        normalize = TRUE,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        n_threads = n_threads,
+        use_cpp = use_cpp
+      )
+      cost$eps <- eps
+      cost
+    },
+    cache_input = function(cost) {
+      P <- cost$P
+      cost$P2 <- P * P
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      PQ <- cost$P - cost$Q
+
+      cost$pcost <- colSums(PQ * PQ * cost$invQ)
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      Q <- cost$Q
+      Z <- cost$Z
+      invQ <- cost$invQ
+
+      P2Q <- cost$P2 * invQ
+
+      cost$G <- k2g(Y, 4 * Q * Q * Z * (P2Q * invQ - sum(P2Q)))
+      cost
+    },
+    update = function(cost, Y) {
+      P <- cost$P
+
+      W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads)
+      W <- 1 / (1 + W)
+      diag(W) <- 0
+      Z <- sum(W)
+
+      Q <- W / Z
+
+      cost$Q <- Q
+      cost$Z <- Z
+      invQ <- 1 / Q
+      diag(invQ) <- 0
+      cost$invQ <- invQ
+
+      cost
+    }
+  )
+}
+
+# Hellinger distance divergence
+hlsne <- function(perplexity,
+                  inp_kernel = "gaussian",
+                  symmetrize = "symmetric",
+                  eps = .Machine$double.eps,
+                  n_threads = 0,
+                  use_cpp = FALSE) {
+  lreplace(
+    tsne(
+      perplexity = perplexity,
+      inp_kernel = inp_kernel,
+      use_cpp = use_cpp,
+      n_threads = n_threads
+    ),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options())
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        kernel = inp_kernel,
+        symmetrize = "symmetric",
+        normalize = TRUE,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        n_threads = n_threads,
+        use_cpp = use_cpp
+      )
+      cost$eps <- eps
+      cost
+    },
+    cache_input = function(cost) {
+      cost$sP <- sqrt(cost$P)
+      cost
+    },
+    pfn = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+
+      PQ <- cost$sP - cost$sQ
+
+      cost$pcost <- colSums(PQ * PQ)
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      sP <- cost$sP
+      Q <- cost$Q
+      Z <- cost$Z
sQ <- cost$sQ + + sPQ <- sum(sP * sQ) + PQ <- sP / sQ + diag(PQ) <- 0 + + cost$G <- k2g(Y, 4 * Q * Q * Z * (PQ - sPQ)) + cost + }, + update = function(cost, Y) { + P <- cost$P + + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + Z <- sum(W) + + Q <- W / Z + + cost$Q <- Q + cost$Z <- Z + cost$sQ <- sqrt(Q) + + cost + } + ) +} diff --git a/smallvis/R/gsne.R b/smallvis/R/gsne.R new file mode 100644 index 0000000..f1dbc87 --- /dev/null +++ b/smallvis/R/gsne.R @@ -0,0 +1,92 @@ +# global-SNE +gsne <- function(perplexity, + lambda = 1, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + use_cpp = FALSE, + n_threads = 0) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + + # Phat isn't affected by early exaggeration, so we cache it once only + if (methods::is(X, "dist")) { + Phat <- as.matrix(X) + } else { + Phat <- calc_d2(X, use_cpp = use_cpp, n_threads = n_threads) + } + + Phat <- Phat + 1 + diag(Phat) <- 0 + + Phat <- Phat / sum(Phat) + cost$Phat <- Phat + cost$phlogph <- colSums(Phat * logm(Phat, eps)) + + cost + }, + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + cost$plogp <- colSums(P * logm(P, eps)) + + cost$plamphat <- P - lambda * cost$Phat + + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + + eps <- cost$eps + kl <- cost$plogp - colSums(cost$P * logm(cost$W / cost$Z, eps)) + klhat <- cost$phlogph - colSums(cost$Phat * logm(cost$What / cost$Zhat, eps)) + cost$pcost <- kl + lambda * klhat + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + + qlampqhat <- (cost$W / cost$Z) - lambda * (cost$What / cost$Zhat) + cost$G <- k2g(Y, 4 * cost$W * (cost$plamphat - qlampqhat)) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + What <- 1 + W + W <- 1 / What + diag(W) <- 0 + cost$Z <- sum(W) + cost$W <- W + + diag(What) <- 0 + cost$What <- What + cost$Zhat <- sum(What) + + cost + } + ) +} diff --git a/smallvis/R/jse.R b/smallvis/R/jse.R new file mode 100644 index 0000000..fbc84fd --- /dev/null +++ b/smallvis/R/jse.R @@ -0,0 +1,206 @@ +# JSE --------------------------------------------------------------------- + +# Lee, J. A., Renard, E., Bernard, G., Dupont, P., & Verleysen, M. (2013). +# Type 1 and 2 mixtures of Kullback-Leibler divergences as cost functions in +# dimensionality reduction based on similarity preservation. +# \emph{Neurocomputing}, \emph{112}, 92-108. +# kappa = 0 behaves like ASNE +# kappa = 1 behaves like NeRV with lambda = 0. Yes that's confusing. 
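+#
+# A gloss on the code below (my summary, not from the paper): update() forms
+# the mixture Z = kappa * P + (1 - kappa) * Q, and pfn() blends
+# KL(P||Z) / (1 - kappa) with KL(Q||Z) / kappa, which is why the kappa -> 0
+# and kappa -> 1 limits recover the ASNE- and NeRV-like behavior noted above.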
+jse <- function(perplexity, + kappa = 0.5, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + eps0 <- 1e-5 + kappa <- max(kappa, eps0) + kappa <- min(kappa, 1 - eps0) + + kappa_inv <- 1 / kappa + m2_kappa_inv <- -2 * kappa_inv + om_kappa <- 1 - kappa + om_kappa_inv <- 1 / om_kappa + + lreplace( + ssne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "none", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + cost + }, + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + cost$plogp <- rowSums(P * logm(P, eps)) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + eps <- cost$eps + + cost$pcost <- + om_kappa_inv * (cost$plogp - rowSums(cost$P * logm(cost$Z, eps))) + + kappa_inv * cost$QlQZc + + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + K <- m2_kappa_inv * cost$Q * (cost$lQZ - cost$QlQZc) + cost$G <- k2g(Y, K, symmetrize = TRUE) + + cost + }, + update = function(cost, Y) { + eps <- cost$eps + + Q <- expQ( + Y, + eps = eps, + is_symmetric = TRUE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q + + Z <- kappa * cost$P + om_kappa * Q + Z[Z < eps] <- eps + diag(Z) <- 0 + + cost$Q <- Q + cost$Z <- Z + cost$lQZ <- logm(Q / Z, eps) + cost$QlQZc <- rowSums(Q * cost$lQZ) + + cost + }, + export = cost_export + ) +} + + +sjse <- function(perplexity, + kappa = 0.5, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + eps0 <- 1e-5 + kappa <- max(kappa, eps0) + kappa <- min(kappa, 1 - eps0) + + kappa_inv <- 1 / kappa + m4_kappa_inv <- -4 * kappa_inv + om_kappa <- 1 - kappa + om_kappa_inv <- 1 / om_kappa + + lreplace( + ssne( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + eps = eps, + n_threads = n_threads, + use_cpp = use_cpp + ), + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + eps <- cost$eps + + cost$pcost <- + om_kappa_inv * (cost$plogp - colSums(cost$P * logm(cost$Z, eps))) + + kappa_inv * cost$QlQZc + + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + K <- m4_kappa_inv * cost$Q * (cost$lQZ - cost$QlQZs) + cost$G <- k2g(Y, K, symmetrize = FALSE) + + cost + }, + update = function(cost, Y) { + eps <- cost$eps + + Q <- expQ( + Y, + cost$eps, + is_symmetric = TRUE, + matrix_normalize = TRUE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q + Z <- kappa * cost$P + om_kappa * Q + Z[Z < eps] <- eps + diag(Z) <- 0 + + cost$Q <- Q + cost$Z <- Z + cost$lQZ <- logm(Q / Z, eps) + cost$QlQZc <- colSums(Q * cost$lQZ) + cost$QlQZs <- sum(cost$QlQZc) + + cost + }, + export = cost_export + ) +} + +rsrjse <- function(perplexity, + kappa = 0.5, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + jse( + perplexity = perplexity, + kappa = kappa, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + symmetrize = "symmetric", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + P <- cost$P + P <- P / rowSums(P) + cost$P <- P + + cost$eps <- 
eps + cost + } + ) +} diff --git a/smallvis/R/largevis.R b/smallvis/R/largevis.R new file mode 100644 index 0000000..2df54bd --- /dev/null +++ b/smallvis/R/largevis.R @@ -0,0 +1,71 @@ +# LargeVis ---------------------------------------------------------------- + +largevis <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + gamma = 1, + gr_eps = 0.1, + normalize = TRUE, + eps = 1e-9, + row_weight = NULL, + use_cpp = FALSE, + n_threads = 0) { + if (!is.null(row_weight)) { + row_normalize <- row_weight + } else { + row_normalize <- TRUE + } + lreplace( + tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + cost <- sne_init( + cost, + X = X, + perplexity = perplexity, + symmetrize = symmetrize, + kernel = inp_kernel, + normalize = normalize, + verbose = verbose, + row_normalize = row_normalize, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + cost$greps1 <- gr_eps - 1 + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + + P <- cost$P + eps <- cost$eps + W <- cost$W + + cost$pcost <- colSums(-P * logm(W, eps) - gamma * log1p(-W + eps)) + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + W <- cost$W + cost$G <- k2g(Y, 4 * W * (cost$P - ((gamma * W) / ( + 1 + cost$greps1 * W + )))) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + + cost$W <- W + cost + }, + export = cost_export + ) +} diff --git a/smallvis/R/nerv.R b/smallvis/R/nerv.R new file mode 100644 index 0000000..5202828 --- /dev/null +++ b/smallvis/R/nerv.R @@ -0,0 +1,281 @@ +# NeRV -------------------------------------------------------------------- + +# Venna, J., Peltonen, J., Nybo, K., Aidos, H., & Kaski, S. (2010). +# Information retrieval perspective to nonlinear dimensionality reduction for +# data visualization. +# \emph{Journal of Machine Learning Research}, \emph{11}, 451-490. 
+# +# Unlike original publication, won't transfer input precisions to output kernel +# lambda = 1 gives ASNE results +# default lambda = 0.9 from "Majorization-Minimization for Manifold Embedding" +# Yang, Peltonen, Kaski 2015 +nerv <- function(perplexity, + lambda = 0.9, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lambda2 <- 2 * lambda + oml <- 1 - lambda + oml2 <- 2 * oml + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "none", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + cost + }, + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + cost$P <- P + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + + P <- cost$P + + kl_fwd <- rowSums(P * cost$lPQ) + + cost$pcost <- lambda * kl_fwd + oml * cost$kl_rev + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + Q <- cost$Q + + # Total K including multiplying by 2 in gradient + K <- lambda2 * (cost$P - Q) + oml2 * Q * (cost$lPQ + cost$kl_rev) + + cost$G <- k2g(Y, K, symmetrize = TRUE) + cost + }, + update = function(cost, Y) { + eps <- cost$eps + + Q <- expQ( + Y, + eps, + is_symmetric = TRUE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q + cost$Q <- Q + + # Reverse KL gradient + cost$lPQ <- logm(cost$P / Q, eps) + # for KLrev we want Q * log(Q/P), so take -ve of log(P/Q) + cost$kl_rev <- rowSums(Q * -cost$lPQ) + cost + }, + sentinel = "Q", + export = function(cost, val) { + res <- cost_export(cost, val) + res + } + ) +} + +snerv <- function(perplexity, + lambda = 0.9, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lambda4 <- 4 * lambda + oml <- 1 - lambda + oml4 <- 4 * oml + lreplace( + ssne( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + eps = eps, + n_threads = n_threads + ), + cache_input = function(cost) { + eps <- cost$eps + P <- cost$P + P[P < eps] <- eps + cost$P <- P + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + + P <- cost$P + + kl_fwd <- colSums(P * cost$lPQ) + + cost$pcost <- lambda * kl_fwd + oml * cost$kl_rev + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + Q <- cost$Q + + # Total K including multiplying by 4 in gradient + K <- lambda4 * (cost$P - Q) + oml4 * Q * (cost$lPQ + cost$QlPQs) + + cost$G <- k2g(Y, K, symmetrize = FALSE) + cost + }, + update = function(cost, Y) { + eps <- cost$eps + + Q <- expQ( + Y, + eps, + is_symmetric = TRUE, + matrix_normalize = TRUE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q + cost$Q <- Q + + # Reverse KL gradient + cost$lPQ <- logm(cost$P / Q, eps) + cost$kl_rev <- colSums(Q * -cost$lPQ) + cost$QlPQs <- sum(cost$kl_rev) + cost + }, + export = function(cost, val) { + res <- cost_export(cost, val) + res + } + ) +} + +# NeRV with input bandwidths transferred to the output kernel, as in the +# original paper. 
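+#
+# In outline (my gloss on the code below): the per-point precisions beta
+# found during input calibration are requested via ret_extra = "beta" in
+# init() and passed to expQ() in update(), so the output weights behave like
+# exp(-beta_i * d2_ij) rather than the unit-bandwidth exp(-d2_ij) of nerv().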
+bnerv <- function(perplexity,
+                  lambda = 0.9,
+                  eps = .Machine$double.eps,
+                  n_threads = 0,
+                  use_cpp = FALSE) {
+  lambda2 <- 2 * lambda
+  oml <- 1 - lambda
+  oml2 <- 2 * oml
+  lreplace(
+    nerv(
+      perplexity = perplexity,
+      lambda = lambda,
+      use_cpp = use_cpp,
+      n_threads = n_threads
+    ),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      ret_extra <- unique(c(ret_extra, "beta"))
+
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        symmetrize = "none",
+        normalize = FALSE,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        n_threads = n_threads,
+        use_cpp = use_cpp
+      )
+      cost$eps <- eps
+      cost$lambda2b <- lambda2 * cost$beta
+      cost$oml2b <- oml2 * cost$beta
+      cost
+    },
+    gr = function(cost, Y) {
+      cost <- cost_update(cost, Y)
+      Q <- cost$Q
+
+      # Total K including multiplying by 2 * beta in gradient
+      K <- cost$lambda2b * (cost$P - Q) + cost$oml2b * Q * (cost$lPQ + cost$kl_rev)
+
+      cost$G <- k2g(Y, K, symmetrize = TRUE)
+      cost
+    },
+    update = function(cost, Y) {
+      eps <- cost$eps
+
+      Q <- expQ(
+        Y,
+        eps,
+        beta = cost$beta,
+        is_symmetric = FALSE,
+        use_cpp = use_cpp,
+        n_threads = n_threads
+      )$Q
+      cost$Q <- Q
+
+      # Reverse KL gradient
+      cost$lPQ <- logm(cost$P / Q, eps)
+      cost$kl_rev <- rowSums(Q * -cost$lPQ)
+
+      cost
+    }
+  )
+}
+
+rsrnerv <- function(perplexity,
+                    lambda = 0.9,
+                    eps = .Machine$double.eps,
+                    n_threads = 0,
+                    use_cpp = FALSE) {
+  lreplace(
+    nerv(
+      perplexity = perplexity,
+      lambda = lambda,
+      use_cpp = use_cpp,
+      n_threads = n_threads
+    ),
+    init = function(cost,
+                    X,
+                    max_iter,
+                    verbose = FALSE,
+                    ret_extra = c()) {
+      cost <- sne_init(
+        cost,
+        X,
+        perplexity = perplexity,
+        symmetrize = "symmetric",
+        normalize = FALSE,
+        verbose = verbose,
+        ret_extra = ret_extra,
+        n_threads = n_threads,
+        use_cpp = use_cpp
+      )
+      P <- cost$P
+      P <- P / rowSums(P)
+      cost$P <- P
+
+      cost$eps <- eps
+      cost
+    }
+  )
+}

From 4c851ab80400dabecd3bb4594bed503e9ba66bb4 Mon Sep 17 00:00:00 2001
From: James Melville
Date: Sat, 3 Aug 2024 12:30:38 -0700
Subject: [PATCH 17/26] reorganize some SNE functions

---
 smallvis/R/cost.R       |  94 +++++++++++-
 smallvis/R/ee.R         |  76 ++++++++++
 smallvis/R/perplexity.R | 204 ++++++++++++++++++++++
 smallvis/R/sne.R        | 316 ----------------------------------------
 4 files changed, 369 insertions(+), 321 deletions(-)

diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R
index ec18e95..524b29b 100644
--- a/smallvis/R/cost.R
+++ b/smallvis/R/cost.R
@@ -1,5 +1,13 @@
 # Generic Weight/Cost/Gradient functions ----------------------------------
 
+# Convert Force constant to Gradient
+k2g <- function(Y, K, symmetrize = FALSE) {
+  if (symmetrize) {
+    K <- K + t(K)
+  }
+  Y * colSums(K) - (K %*% Y)
+}
+
 logm <- function(m, eps = .Machine$double.eps) {
   diag(m) <- eps
   m <- log(m)
@@ -21,12 +29,88 @@ divm <- function(m, n, eps = .Machine$double.eps) {
 powm <- function(m, n, eps = .Machine$double.eps) {
   diag(m) <- eps
   m <- m^n
   diag(m) <- 0
   m
 }
 
-# Convert Force constant to Gradient
-k2g <- function(Y, K, symmetrize = FALSE) {
-  if (symmetrize) {
-    K <- K + t(K)
-  }
-  Y * colSums(K) - (K %*% Y)
-}
-
+# Calculates shifted exponential column-wise: exp(X - a)
+# where a is the column max.
+# This is the log-sum-exp trick to avoid numeric underflow:
+# log sum_i exp x_i = a + log sum_i exp(x_i - a)
+# => sum_i exp x_i = exp a * sum_i exp(x_i - a)
+# with a = max x_i
+# exp(max x_i) can still underflow so we don't return Z (the sum)
+# Use Q directly (exp a appears in numerator and denominator, so cancels).
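+#
+# A tiny numeric illustration (mine, not part of the package):
+#   x <- c(-1000, -999)       # e.g. -beta * d2 for far-apart points
+#   exp(x) / sum(exp(x))      # 0 / 0 -> NaN
+#   w <- exp(x - max(x))      # c(exp(-1), 1): no underflow
+#   w / sum(w)                # ~ c(0.269, 0.731)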
+# https://statmodeling.stat.columbia.edu/2016/06/11/log-sum-of-exponentials/ +# https://www.xarg.org/2016/06/the-log-sum-exp-trick-in-machine-learning/ +# http://wittawat.com/posts/log-sum_exp_underflow.html +exp_shift <- function(X) { + X <- exp(sweep(X, 2, apply(X, 2, max))) +} + +expQ <- function(Y, eps = .Machine$double.eps, beta = NULL, + A = NULL, + is_symmetric = FALSE, + matrix_normalize = FALSE, + use_cpp = FALSE, + n_threads = 1) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + + if (!is.null(beta)) { + W <- exp_shift(-W * beta) } - Y * colSums(K) - (K %*% Y) + else { + W <- exp_shift(-W) + } + + if (!is.null(A)) { + W <- A * W + } + diag(W) <- 0 + + if (matrix_normalize) { + Z <- sum(W) + } + else { + if (is_symmetric) { + Z <- colSums(W) + } + else { + Z <- rowSums(W) + } + } + # cost of division (vs storing 1/Z and multiplying) seems small + Q <- W / Z + + if (eps > 0) { + Q[Q < eps] <- eps + } + diag(Q) <- 0 + + list( + Q = Q, + Z = Z + ) +} + +# KL divergence using Q directly +kl_costQ <- function(cost, Y) { + cost <- cost_update(cost, Y) + + # P log(P / Q) = P log P - P log Q + cost$pcost <- cost$plogp - colSums(cost$P * logm(cost$Q, cost$eps)) + cost +} + +kl_costQr <- function(cost, Y) { + cost <- cost_update(cost, Y) + + # P log(P / Q) = P log P - P log Q + cost$pcost <- cost$plogp - rowSums(cost$P * logm(cost$Q, cost$eps)) + cost +} + +kl_cost <- function(cost, Y) { + cost <- cost_update(cost, Y) + # P log(P / Q) = P log P - P log Q + cost$pcost <- cost$plogp - colSums(cost$P * logm(cost$W / cost$Z, cost$eps)) + cost } cost_init <- function(cost, diff --git a/smallvis/R/ee.R b/smallvis/R/ee.R index 87be379..1d8911b 100644 --- a/smallvis/R/ee.R +++ b/smallvis/R/ee.R @@ -83,3 +83,79 @@ ee <- function(perplexity, sentinel = "W" ) } + + +# t-Distributed Elastic Embedding +# EE-like cost function in terms of I-Divergence +# Scaled to give a gradient similar in form to t-SNE +tee <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + lambda = 0.01, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + ret_extra <- unique(c(ret_extra, "V", "dint")) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + V <- cost$P + cost$eps <- eps + cost$invN <- 1 / sum(V) + cost$gradconst <- 4 * cost$invN + cost$lambda <- lambda + + V[V < eps] <- eps + cost$constV <- cost$invN * (colSums(V * logm(V, eps)) - lambda * colSums(V)) + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$G <- k2g(Y, cost$gradconst * cost$W * (cost$P - cost$W * cost$lambda)) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + cost$W <- W + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + + V <- cost$P + W <- cost$W + eps <- cost$eps + + cost$pcost <- cost$constV + + cost$invN * (cost$lambda * colSums(W) - colSums(V * logm(W, eps))) + cost + }, + exaggerate = function(cost, exaggeration_factor) { + cost$V <- cost$V * exaggeration_factor + cost + } + ) +} diff --git a/smallvis/R/perplexity.R 
b/smallvis/R/perplexity.R index 32cc502..d2e59f1 100644 --- a/smallvis/R/perplexity.R +++ b/smallvis/R/perplexity.R @@ -440,3 +440,207 @@ user_idp_perps <- function(perplexity) { } perplexities } + +# Some symmetrization options mean "actually, no symmetrization please". This +# function returns the ones that will actually produce a symmetric matrix, +# necessary for symmetric methods (e.g. tsne vs asne). +true_symmetrize_options <- function() { + c("symmetric", "average", "mutual", "umap", "fuzzy") +} + +scale_affinities <- function(P, + symmetrize = "symmetric", + row_normalize = TRUE, + normalize = TRUE) { + # row normalization before anything else + if (nnat(row_normalize)) { + if (symmetrize == "rowsymm") { + P <- 0.5 * (P + t(P)) + symmetrize <- "none" + } + P <- P / rowSums(P) + } else if (is.numeric(row_normalize)) { + P <- row_normalize * P / rowSums(P) + } + + # Symmetrize + P <- switch(symmetrize, + none = P, + symmetric = 0.5 * (P + t(P)), + average = 0.5 * (P + t(P)), + mutual = sqrt(P * t(P)), + umap = fuzzy_set_union(P), + fuzzy = fuzzy_set_union(P), + stop("unknown symmetrization: ", symmetrize) + ) + # Normalize + if (normalize) { + P <- P / sum(P) + } + P +} + +sne_init <- function(cost, + X, + perplexity, + kernel = "gaussian", + symmetrize = "symmetric", + row_normalize = TRUE, + normalize = TRUE, + n_threads = 0, + use_cpp = use_cpp, + verbose = FALSE, + ret_extra = c()) { + if (tolower(kernel) == "knn") { + if (is.character(perplexity) || is.list(perplexity)) { + stop("Can't use intrinsic dimensionality with knn kernel") + } + if (length(perplexity) > 1) { + stop("Can't use multiple perplexities with knn kernel") + } + tsmessage("Using knn kernel with k = ", formatC(perplexity)) + P <- knn_graph(X, + k = perplexity, + n_threads = n_threads, + verbose = verbose + ) + x2ares <- list(W = P) + } else if (tolower(kernel) == "skd") { + P <- smooth_knn_distances( + X, + k = perplexity, + tol = 1e-5, + n_threads = n_threads, + verbose = verbose + )$P + row_normalize <- FALSE + x2ares <- list(W = P) + } else if (perp_method(perplexity) == "idp") { + perplexities <- NULL + if (is.list(perplexity) && length(perplexity) == 2) { + perplexities <- perplexity[[2]] + } + + x2ares <- idp(X, + perplexities = perplexities, + tol = 1e-5, + verbose = verbose + ) + P <- x2ares$W + ret_extra <- unique(c(ret_extra, "idp")) + } else if (perp_method(perplexity) == "multiscale") { + perplexities <- NULL + if (is.list(perplexity) && length(perplexity) == 2) { + perplexities <- perplexity[[2]] + } + + mspres <- msp( + X, + perplexities = perplexities, + tol = 1e-5, + symmetrize = symmetrize, + row_normalize = row_normalize, + normalize = normalize, + verbose = verbose + ) + cost$P <- mspres$P + return(cost) + } else if (tolower(kernel) == "sigma") { + tsmessage("Using fixed sigma = ", formatC(perplexity)) + x2ares <- x2aff_sigma( + X, + sigma = perplexity, + n_threads = n_threads, + use_cpp = use_cpp, + verbose = verbose + ) + P <- x2ares$W + } else { + if (!is.numeric(perplexity)) { + stop("Unknown perplexity method, '", perplexity[[1]], "'") + } + tsmessage( + "Commencing calibration for perplexity = ", + format_perps(perplexity) + ) + if (use_cpp) { + P <- find_beta_cpp(X, perplexity, tol = 1e-5, n_threads = n_threads)$W + } else { + x2ares <- x2aff( + X, + perplexity, + tol = 1e-5, + kernel = kernel, + verbose = verbose + ) + P <- x2ares$W + } + } + + P <- scale_affinities( + P, + symmetrize = symmetrize, + row_normalize = row_normalize, + normalize = normalize + ) + cost$P <- P + + if 
(is.logical(row_normalize)) { + tsmessage( + "Effective perplexity of P approx = ", + formatC(stats::median(perpp(P))) + ) + } + + for (r in unique(tolower(ret_extra))) { + switch(r, + v = { + cost$V <- x2ares$W + }, + dint = { + if (!is.null(x2ares$dint)) { + cost$dint <- x2ares$dint + } + }, + beta = { + if (!is.null(x2ares$beta)) { + cost$beta <- x2ares$beta + } + }, + adegc = { + cost$adegc <- 0.5 * rowSums(x2ares$W) + colSums(x2ares$W) + }, + adegin = { + cost$adegin <- rowSums(x2ares$W) + }, + adegout = { + cost$adegout <- colSums(x2ares$W) + }, + pdeg = { + cost$pdeg <- colSums(P) + }, + idp = { + if (!is.null(x2ares$idp)) { + cost$idp <- x2ares$idp + } + } + ) + } + cost +} + +# The intrinsic dimensionality associated with a gaussian affinity vector +# Convenient only from in x2aff, where all these values are available +intd_x2aff <- function(D2, beta, W, Z, H, eps = .Machine$double.eps) { + P <- W / Z + -2 * beta * sum(D2 * P * (log(P + eps) + H)) +} + +shannonpr <- function(P, eps = .Machine$double.eps) { + P <- P / rowSums(P) + rowSums(-P * log(P + eps)) +} + +perpp <- function(P) { + exp(shannonpr(P)) +} diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R index e58c611..318432b 100644 --- a/smallvis/R/sne.R +++ b/smallvis/R/sne.R @@ -1,87 +1,3 @@ -# Calculates shifted exponential column-wise: exp(X - a) -# where a is the column max. -# This is the log-sum-exp trick to avoid numeric underflow: -# log sum_i exp x_i = a + log sum_i exp(x_i - a) -# => sum_i exp x_i = exp a * sum_i exp(x_i - a) -# with a = max x_i -# exp(max x_i) can still underflow so we don't return Z (the sum) -# Use Q directly (exp a appears in numerator and denominator, so cancels). -# https://statmodeling.stat.columbia.edu/2016/06/11/log-sum-of-exponentials/ -# https://www.xarg.org/2016/06/the-log-sum-exp-trick-in-machine-learning/ -# http://wittawat.com/posts/log-sum_exp_underflow.html -exp_shift <- function(X) { - X <- exp(sweep(X, 2, apply(X, 2, max))) -} - -expQ <- function(Y, eps = .Machine$double.eps, beta = NULL, - A = NULL, - is_symmetric = FALSE, - matrix_normalize = FALSE, - use_cpp = FALSE, - n_threads = 1) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - - if (!is.null(beta)) { - W <- exp_shift(-W * beta) - } - else { - W <- exp_shift(-W) - } - - if (!is.null(A)) { - W <- A * W - } - diag(W) <- 0 - - if (matrix_normalize) { - Z <- sum(W) - } - else { - if (is_symmetric) { - Z <- colSums(W) - } - else { - Z <- rowSums(W) - } - } - # cost of division (vs storing 1/Z and multiplying) seems small - Q <- W / Z - - if (eps > 0) { - Q[Q < eps] <- eps - } - diag(Q) <- 0 - - list( - Q = Q, - Z = Z - ) -} - -# KL divergence using Q directly -kl_costQ <- function(cost, Y) { - cost <- cost_update(cost, Y) - - # P log(P / Q) = P log P - P log Q - cost$pcost <- cost$plogp - colSums(cost$P * logm(cost$Q, cost$eps)) - cost -} - -kl_costQr <- function(cost, Y) { - cost <- cost_update(cost, Y) - - # P log(P / Q) = P log P - P log Q - cost$pcost <- cost$plogp - rowSums(cost$P * logm(cost$Q, cost$eps)) - cost -} - -kl_cost <- function(cost, Y) { - cost <- cost_update(cost, Y) - # P log(P / Q) = P log P - P log Q - cost$pcost <- cost$plogp - colSums(cost$P * logm(cost$W / cost$Z, cost$eps)) - cost -} - # t-SNE tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", normalize = TRUE, row_normalize = TRUE, @@ -512,61 +428,6 @@ pstsne <- function(perplexity, inp_kernel = "gaussian", ) } -# t-Distributed Elastic Embedding -# EE-like cost function in terms of I-Divergence -# Scaled to 
give a gradient similar in form to t-SNE -tee <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", - lambda = 0.01, eps = .Machine$double.eps, n_threads = 0, - use_cpp = FALSE) { - lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - ret_extra = unique(c(ret_extra, "V", "dint")) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - V <- cost$P - cost$eps <- eps - cost$invN <- 1 / sum(V) - cost$gradconst <- 4 * cost$invN - cost$lambda <- lambda - - V[V < eps] <- eps - cost$constV <- cost$invN * (colSums(V * logm(V, eps)) - lambda * colSums(V)) - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$G <- k2g(Y, cost$gradconst * cost$W * (cost$P - cost$W * cost$lambda)) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - cost$W <- W - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - V <- cost$P - W <- cost$W - eps <- cost$eps - - cost$pcost <- cost$constV + - cost$invN * (cost$lambda * colSums(W) - colSums(V * logm(W, eps))) - cost - }, - exaggerate = function(cost, exaggeration_factor) { - cost$V <- cost$V * exaggeration_factor - cost - } - ) -} - # UMAP/t-SNE Hybrids ------------------------------------------------------ # Calculate P via normalized smooth knn-distances @@ -951,181 +812,4 @@ arsrsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0, ) } -# Perplexity Calibration -------------------------------------------------- - -# Some symmetrization options mean "actually, no symmetrization please". This -# function returns the ones that will actually produce a symmetric matrix, -# necessary for symmetric methods (e.g. tsne vs asne). 
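Since the affinity post-processing below is moving wholesale into perplexity.R, a compact sketch of what the common t-SNE path through the moved scale_affinities() amounts to may help review; V is a hypothetical raw affinity matrix, and this mirrors, but is not, the package code:

    set.seed(42)
    V <- matrix(runif(9), nrow = 3)
    diag(V) <- 0
    P <- V / rowSums(V)    # row normalization: each row sums to 1
    P <- 0.5 * (P + t(P))  # symmetrize = "symmetric": average P_ij and P_ji
    P <- P / sum(P)        # normalize: the whole matrix sums to 1
    sum(P)                 # 1, as the KL-based costs expect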
-true_symmetrize_options <- function() { - c("symmetric", "average", "mutual", "umap", "fuzzy") -} - -scale_affinities <- function(P, symmetrize = "symmetric", row_normalize = TRUE, - normalize = TRUE) { - # row normalization before anything else - if (nnat(row_normalize)) { - if (symmetrize == "rowsymm") { - P <- 0.5 * (P + t(P)) - symmetrize <- "none" - } - P <- P / rowSums(P) - } - else if (is.numeric(row_normalize)) { - P <- row_normalize * P / rowSums(P) - } - - # Symmetrize - P <- switch(symmetrize, - none = P, - symmetric = 0.5 * (P + t(P)), - average = 0.5 * (P + t(P)), - mutual = sqrt(P * t(P)), - umap = fuzzy_set_union(P), - fuzzy = fuzzy_set_union(P), - stop("unknown symmetrization: ", symmetrize)) - # Normalize - if (normalize) { - P <- P / sum(P) - } - P -} - -sne_init <- function(cost, - X, - perplexity, - kernel = "gaussian", - symmetrize = "symmetric", - row_normalize = TRUE, - normalize = TRUE, - n_threads = 0, - use_cpp = use_cpp, - verbose = FALSE, - ret_extra = c()) { - - if (tolower(kernel) == "knn") { - if (is.character(perplexity) || is.list(perplexity)) { - stop("Can't use intrinsic dimensionality with knn kernel") - } - if (length(perplexity) > 1) { - stop("Can't use multiple perplexities with knn kernel") - } - tsmessage("Using knn kernel with k = ", formatC(perplexity)) - P <- knn_graph(X, k = perplexity, n_threads = n_threads, verbose = verbose) - x2ares <- list(W = P) - } - else if (tolower(kernel) == "skd") { - P <- smooth_knn_distances(X, k = perplexity, tol = 1e-5, - n_threads = n_threads, verbose = verbose)$P - row_normalize <- FALSE - x2ares <- list(W = P) - } - else if (perp_method(perplexity) == "idp") { - perplexities <- NULL - if (is.list(perplexity) && length(perplexity) == 2) { - perplexities <- perplexity[[2]] - } - - x2ares <- idp(X, perplexities = perplexities, tol = 1e-5, - verbose = verbose) - P <- x2ares$W - ret_extra <- unique(c(ret_extra, "idp")) - } - else if (perp_method(perplexity) == "multiscale") { - perplexities <- NULL - if (is.list(perplexity) && length(perplexity) == 2) { - perplexities <- perplexity[[2]] - } - - mspres <- msp(X, perplexities = perplexities, tol = 1e-5, - symmetrize = symmetrize, - row_normalize = row_normalize, - normalize = normalize, - verbose = verbose) - cost$P <- mspres$P - return(cost) - } - else if (tolower(kernel) == "sigma") { - tsmessage("Using fixed sigma = ", formatC(perplexity)) - x2ares <- x2aff_sigma(X, sigma = perplexity, n_threads = n_threads, - use_cpp = use_cpp, verbose = verbose) - P <- x2ares$W - } - else { - if (!is.numeric(perplexity)) { - stop("Unknown perplexity method, '", perplexity[[1]], "'") - } - tsmessage("Commencing calibration for perplexity = ", - format_perps(perplexity)) - if (use_cpp) { - P <- find_beta_cpp(X, perplexity, tol = 1e-5, n_threads = n_threads)$W - } - else { - x2ares <- x2aff(X, perplexity, tol = 1e-5, kernel = kernel, verbose = verbose) - P <- x2ares$W - } - } - - P <- scale_affinities(P, - symmetrize = symmetrize, - row_normalize = row_normalize, - normalize = normalize) - cost$P <- P - - if (is.logical(row_normalize)) { - tsmessage("Effective perplexity of P approx = ", - formatC(stats::median(perpp(P)))) - } - - for (r in unique(tolower(ret_extra))) { - switch(r, - v = { - cost$V <- x2ares$W - }, - dint = { - if (!is.null(x2ares$dint)) { - cost$dint <- x2ares$dint - } - }, - beta = { - if (!is.null(x2ares$beta)) { - cost$beta <- x2ares$beta - } - }, - adegc = { - cost$adegc <- 0.5 * rowSums(x2ares$W) + colSums(x2ares$W) - }, - adegin = { - cost$adegin <- 
rowSums(x2ares$W) - }, - adegout = { - cost$adegout <- colSums(x2ares$W) - }, - pdeg = { - cost$pdeg <- colSums(P) - }, - idp = { - if (!is.null(x2ares$idp)) { - cost$idp <- x2ares$idp - } - } - ) - } - cost -} - -# The intrinsic dimensionality associated with a gaussian affinity vector -# Convenient only from in x2aff, where all these values are available -intd_x2aff <- function(D2, beta, W, Z, H, eps = .Machine$double.eps) { - P <- W / Z - -2 * beta * sum(D2 * P * (log(P + eps) + H)) -} - -shannonpr <- function(P, eps = .Machine$double.eps) { - P <- P / rowSums(P) - rowSums(-P * log(P + eps)) -} - -perpp <- function(P) { - exp(shannonpr(P)) -} From 4daa1a773ee5f7c84eb2860805ad35e0dc9f967d Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 12:32:36 -0700 Subject: [PATCH 18/26] reformat --- smallvis/R/cost.R | 32 +- smallvis/R/sne.R | 1097 ++++++++++++++++++++++++++++++--------------- 2 files changed, 761 insertions(+), 368 deletions(-) diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R index 524b29b..de9219c 100644 --- a/smallvis/R/cost.R +++ b/smallvis/R/cost.R @@ -44,55 +44,51 @@ exp_shift <- function(X) { X <- exp(sweep(X, 2, apply(X, 2, max))) } -expQ <- function(Y, eps = .Machine$double.eps, beta = NULL, +expQ <- function(Y, + eps = .Machine$double.eps, + beta = NULL, A = NULL, is_symmetric = FALSE, matrix_normalize = FALSE, use_cpp = FALSE, n_threads = 1) { W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - + if (!is.null(beta)) { W <- exp_shift(-W * beta) - } - else { + } else { W <- exp_shift(-W) } - + if (!is.null(A)) { W <- A * W } diag(W) <- 0 - + if (matrix_normalize) { Z <- sum(W) - } - else { + } else { if (is_symmetric) { Z <- colSums(W) - } - else { + } else { Z <- rowSums(W) } } # cost of division (vs storing 1/Z and multiplying) seems small Q <- W / Z - + if (eps > 0) { Q[Q < eps] <- eps } diag(Q) <- 0 - - list( - Q = Q, - Z = Z - ) + + list(Q = Q, Z = Z) } # KL divergence using Q directly kl_costQ <- function(cost, Y) { cost <- cost_update(cost, Y) - + # P log(P / Q) = P log P - P log Q cost$pcost <- cost$plogp - colSums(cost$P * logm(cost$Q, cost$eps)) cost @@ -100,7 +96,7 @@ kl_costQ <- function(cost, Y) { kl_costQr <- function(cost, Y) { cost <- cost_update(cost, Y) - + # P log(P / Q) = P log P - P log Q cost$pcost <- cost$plogp - rowSums(cost$P * logm(cost$Q, cost$eps)) cost diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R index 318432b..e6acda2 100644 --- a/smallvis/R/sne.R +++ b/smallvis/R/sne.R @@ -1,16 +1,32 @@ # t-SNE -tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", - normalize = TRUE, row_normalize = TRUE, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { +tsne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + normalize = TRUE, + row_normalize = TRUE, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { list( - init = function(cost, X, max_iter, verbose = FALSE, + init = function(cost, + X, + max_iter, + verbose = FALSE, ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = normalize, - row_normalize = row_normalize, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = normalize, + row_normalize = row_normalize, + verbose = verbose, + 
ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) cost$eps <- eps cost }, @@ -30,8 +46,7 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", P <- cost$P if (use_cpp) { cost$G <- tsne_grad_cpp(P, cost$W, cost$Z, Y, n_threads = n_threads) - } - else { + } else { cost$G <- k2g(Y, 4 * cost$W * (P - cost$W / cost$Z)) } cost @@ -41,9 +56,10 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", if (is.null(res)) { switch(val, - q = { - res <- cost$W / cost$Z - }) + q = { + res <- cost$W / cost$Z + } + ) } res }, @@ -66,23 +82,37 @@ tsne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", # Cook, J., Sutskever, I., Mnih, A., & Hinton, G. E. (2007). # Visualizing similarity data with a mixture of maps. # In \emph{International Conference on Artificial Intelligence and Statistics} (pp. 67-74). -ssne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { +ssne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, eps = eps, n_threads = n_threads, - use_cpp = use_cpp), + tsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + eps = eps, + n_threads = n_threads, + use_cpp = use_cpp + ), pfn = kl_costQ, gr = function(cost, Y) { cost <- cost$update(cost, Y) cost$G <- k2g(Y, 4 * (cost$P - cost$Q), symmetrize = FALSE) - + cost }, update = function(cost, Y) { - cost$Q <- expQ(Y, cost$eps, is_symmetric = TRUE, matrix_normalize = TRUE, - use_cpp = use_cpp, n_threads = n_threads)$Q + cost$Q <- expQ( + Y, + cost$eps, + is_symmetric = TRUE, + matrix_normalize = TRUE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q cost }, sentinel = "Q" @@ -92,15 +122,30 @@ ssne <- function(perplexity, inp_kernel = "gaussian", # Hinton, G. E., & Roweis, S. T. (2002). # Stochastic neighbor embedding. # In \emph{Advances in neural information processing systems} (pp. 833-840). 
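The reformatted asne() below differs from ssne() above only in how Q is normalized. A minimal sketch of that distinction, leaving out the log-sum-exp shift that expQ() applies; Y here is a hypothetical embedding:

    set.seed(42)
    Y <- matrix(rnorm(10), ncol = 2)
    D2 <- as.matrix(dist(Y))^2
    W <- exp(-D2)
    diag(W) <- 0
    Q_asne <- W / rowSums(W) # point-wise: each point normalizes over its own neighbors
    Q_ssne <- W / sum(W)     # pair-wise: one normalizer shared by every pair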
-asne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - lreplace(tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, +asne <- function(perplexity, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), + init = function(cost, + X, + max_iter, + verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "none", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "none", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) cost$eps <- eps cost }, @@ -112,8 +157,13 @@ asne <- function(perplexity, inp_kernel = "gaussian", cost }, update = function(cost, Y) { - cost$Q <- expQ(Y, eps = cost$eps, is_symmetric = FALSE, - use_cpp = use_cpp, n_threads = n_threads)$Q + cost$Q <- expQ( + Y, + eps = cost$eps, + is_symmetric = FALSE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q cost }, sentinel = "Q" @@ -124,14 +174,23 @@ asne <- function(perplexity, inp_kernel = "gaussian", # Yang, Z., King, I., Xu, Z., & Oja, E. (2009). # Heavy-tailed symmetric stochastic neighbor embedding. # In \emph{Advances in neural information processing systems} (pp. 2169-2177). -hssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { +hssne <- function(perplexity, + alpha = 0.5, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { alpha <- max(alpha, 1e-8) apow <- -1 / alpha lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, eps = eps, n_threads = n_threads), + tsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + eps = eps, + n_threads = n_threads + ), gr = function(cost, Y) { cost <- cost_update(cost, Y) # to include bandwidth @@ -156,21 +215,40 @@ hssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian", # exists to demonstrate that constant beta doesn't have any meaningful effect # on the results. 
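For review, a sketch of the heavy-tailed output kernel this HSSNE family interpolates with, matching the powm(cost$ab * W + 1, cost$apow) expression used below; het_kernel is a hypothetical helper, not part of the package:

    # w = (1 + alpha * beta * d2)^(-1 / alpha); alpha = 1 gives the t-SNE
    # Cauchy kernel, alpha -> 0 approaches the SSNE gaussian exp(-beta * d2)
    het_kernel <- function(d2, alpha = 0.5, beta = 1) {
      (1 + alpha * beta * d2)^(-1 / alpha)
    }
    d2 <- 4
    het_kernel(d2, alpha = 1)    # 0.2 = 1 / (1 + d2)
    het_kernel(d2, alpha = 1e-8) # ~0.0183, close to exp(-d2)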
-bhssne <- function(perplexity, alpha = 0.5, beta = 1, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { +bhssne <- function(perplexity, + alpha = 0.5, + beta = 1, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { alpha <- max(alpha, 1e-8) beta <- max(beta, 1e-8) lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), b4 = 4 * beta, ab = alpha * beta, apow = -1 / alpha, - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - ret_extra <- unique(c(ret_extra, 'beta')) - cost <- sne_init(cost, X, perplexity = perplexity, - symmetrize = "symmetric", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + ret_extra <- unique(c(ret_extra, "beta")) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + symmetrize = "symmetric", + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) # override input bandwidths with fixed beta (although this doesn't do much) if (!is.null(beta)) { cost$beta <- beta @@ -189,7 +267,7 @@ bhssne <- function(perplexity, alpha = 0.5, beta = 1, W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) W <- powm(cost$ab * W + 1, cost$apow, cost$eps) diag(W) <- 0 - + cost$Z <- sum(W) cost$W <- W cost @@ -198,17 +276,29 @@ bhssne <- function(perplexity, alpha = 0.5, beta = 1, } # A version of HSSNE where alpha is allowed to vary at every epoch -dhssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { +dhssne <- function(perplexity, + alpha = 0.5, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { alpha_min <- 1e-8 alpha <- max(alpha, alpha_min) lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, - symmetrize = symmetrize, eps = eps, n_threads = n_threads), + tsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + symmetrize = symmetrize, + eps = eps, + n_threads = n_threads + ), gr = function(cost, Y) { cost <- cost_update(cost, Y) - cost$G <- k2g(Y, 4 * (cost$P - cost$W / cost$Z) * powm(cost$W, cost$alpha, cost$eps)) + cost$G <- k2g( + Y, + 4 * (cost$P - cost$W / cost$Z) * powm(cost$W, cost$alpha, cost$eps) + ) cost }, epoch = function(opt, cost, iter, Y, fn_val) { @@ -261,18 +351,37 @@ dhssne <- function(perplexity, alpha = 0.5, inp_kernel = "gaussian", # Optimization equivalence of divergences improves neighbor embedding. # In \emph{Proceedings of the 31st International Conference on Machine Learning (ICML-14)} # (pp. 460-468). 
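The weighted variants that follow scale the output kernel by the importance of each pair, per the Yang et al. reference above. A toy sketch of the importance matrix M that wtsne()/wssne() build in init; P is a hypothetical normalized affinity matrix:

    P <- matrix(c(
      0, 0.2, 0.1,
      0.2, 0, 0.15,
      0.1, 0.15, 0
    ), nrow = 3, byrow = TRUE)
    deg <- colSums(P)    # "pdeg": degree centrality of each observation
    M <- outer(deg, deg) # pairwise importance m_ij = deg_i * deg_j
    # the output weight for pair (i, j) is then scaled by m_ij, so
    # well-connected points carry more weight in the embedding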
-wtsne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace(tsne(perplexity = perplexity, use_cpp = use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { +wtsne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) ret_extra <- c(ret_extra, "pdeg") - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) # P matrix degree centrality: column sums deg <- cost$pdeg if (verbose) { @@ -303,56 +412,99 @@ wtsne <- function(perplexity, inp_kernel = "gaussian", ) } -wssne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace(ssne(perplexity = perplexity, use_cpp = use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - ret_extra <- c(ret_extra, "pdeg") - cost <- sne_init(cost, X, perplexity = perplexity, - kernel = inp_kernel, symmetrize = symmetrize, - normalize = TRUE, verbose = verbose, - ret_extra = ret_extra, n_threads = n_threads, - use_cpp = use_cpp) - # P matrix degree centrality: column sums - deg <- cost$pdeg - if (verbose) { - summarize(deg, "deg", verbose = verbose) - } - cost$M <- outer(deg, deg) - cost$invM <- 1 / cost$M - cost$eps <- eps - - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$G <- k2g(Y, 4 * (cost$P - cost$Q)) - cost - }, - update = function(cost, Y) { - cost$Q <- expQ(Y, cost$eps, A = cost$M, matrix_normalize = TRUE, - use_cpp = use_cpp, n_threads = n_threads)$Q - cost - } +wssne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + ssne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) + ret_extra <- c(ret_extra, "pdeg") + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + # P matrix degree centrality: column sums + deg <- cost$pdeg + if (verbose) { + summarize(deg, "deg", verbose = verbose) + } + cost$M <- outer(deg, deg) + cost$invM <- 1 / cost$M + cost$eps <- eps + + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$G <- k2g(Y, 4 * (cost$P - cost$Q)) + cost + }, + update = function(cost, Y) { + cost$Q <- expQ( + Y, + cost$eps, + A = cost$M, + matrix_normalize = TRUE, + use_cpp = use_cpp, 
+ n_threads = n_threads + )$Q + cost + } ) } # t-SNE but with the gradient defined in terms of un-normalized weights # Exists entirely as an academic exercise -tsneu <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { +tsneu <- function(perplexity, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp = use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - ret_extra = unique(c(ret_extra, "V")) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "symmetric", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + tsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + ret_extra <- unique(c(ret_extra, "V")) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "symmetric", + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) cost$eps <- eps @@ -374,17 +526,36 @@ tsneu <- function(perplexity, inp_kernel = "gaussian", # A pseudo-separable approximation of t-SNE, where the output weight sum is only # recalculated during the epoch -pstsne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { +pstsne <- function(perplexity, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, use_cpp = use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - ret_extra = unique(c(ret_extra, "V")) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "symmetric", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + tsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + ret_extra <- unique(c(ret_extra, "V")) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "symmetric", + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) # need to row-normalize and symmetrize affinities cost$V <- cost$V / rowSums(cost$V) @@ -401,7 +572,7 @@ pstsne <- function(perplexity, inp_kernel = "gaussian", P[P < eps] <- eps cost$plogp <- colSums(P * logm(P, eps)) cost$Z <- nrow(P) * nrow(P) - + cost }, gr = function(cost, Y) { @@ -431,25 +602,59 @@ pstsne <- function(perplexity, inp_kernel = "gaussian", # UMAP/t-SNE Hybrids ------------------------------------------------------ # Calculate P via normalized smooth knn-distances -skdtsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0, +skdtsne <- function(perplexity, + eps = .Machine$double.eps, + n_threads = 0, use_cpp = FALSE) { - tsne(perplexity = perplexity, inp_kernel = "skd", symmetrize = "umap", - eps = eps, n_threads = n_threads, use_cpp = use_cpp) + tsne( + perplexity = perplexity, + inp_kernel = "skd", + symmetrize = "umap", + eps = eps, + n_threads = n_threads, + 
use_cpp = use_cpp + ) } # Use the UMAP curve family in output kernel -usne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", - spread = 1, min_dist = 0.001, gr_eps = 0.1, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { +usne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + spread = 1, + min_dist = 0.001, + gr_eps = 0.1, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { symmetrize <- match.arg(tolower(symmetrize), true_symmetrize_options()) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = symmetrize, normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost <- init_ab(cost, spread = spread, min_dist = min_dist, verbose = verbose) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = symmetrize, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost <- init_ab(cost, + spread = spread, + min_dist = min_dist, + verbose = verbose + ) cost$eps <- eps cost }, @@ -457,7 +662,7 @@ usne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", D2 <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) D2[D2 < 0] <- 0 - W <- 1 / (1 + cost$a * D2 ^ cost$b) + W <- 1 / (1 + cost$a * D2^cost$b) diag(W) <- 0 cost$Z <- sum(W) @@ -474,60 +679,75 @@ usne <- function(perplexity, inp_kernel = "gaussian", symmetrize = "symmetric", } # UMAP cross entropy cost instead of KL divergence -cetsne <- function(perplexity, inp_kernel = "gaussian", - symmetrize = "symmetric", eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace(tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, - ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "symmetric", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - - cost$eps <- eps - cost - }, - cache_input = function(cost) { - P <- cost$P - eps <- cost$eps - P[P < eps] <- eps - cost$Cp <- colSums(P * logm(P, eps) + (1 - P) * log1p(-P)) - cost - }, - pfn = function(cost, Y) { - cost <- cost_update(cost, Y) - - P <- cost$P - eps <- cost$eps - Q <- cost$Q - - cost$pcost <- colSums(-P * logm(Q, eps) - (1 - P) * log1p(-Q + eps)) + cost$Cp - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - cost$G <- k2g(Y, 4 * cost$W * (cost$C - cost$sumC * cost$Q)) - cost - }, - update = function(cost, Y) { - P <- cost$P - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - - Q <- W / sum(W) - C <- (P - Q) / (1 - Q) - sumC <- sum(C) - - cost$W <- W - cost$Q <- Q - cost$C <- C - cost$sumC <- sumC - - cost - }, - export = cost_export +cetsne <- function(perplexity, + inp_kernel = "gaussian", + symmetrize = "symmetric", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne(perplexity, use_cpp = use_cpp, n_threads = n_threads), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) 
{ + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "symmetric", + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + + cost$eps <- eps + cost + }, + cache_input = function(cost) { + P <- cost$P + eps <- cost$eps + P[P < eps] <- eps + cost$Cp <- colSums(P * logm(P, eps) + (1 - P) * log1p(-P)) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + + P <- cost$P + eps <- cost$eps + Q <- cost$Q + + cost$pcost <- colSums(-P * logm(Q, eps) - (1 - P) * log1p(-Q + eps)) + cost$Cp + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$G <- k2g(Y, 4 * cost$W * (cost$C - cost$sumC * cost$Q)) + cost + }, + update = function(cost, Y) { + P <- cost$P + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + + Q <- W / sum(W) + C <- (P - Q) / (1 - Q) + sumC <- sum(C) + + cost$W <- W + cost$Q <- Q + cost$C <- C + cost$sumC <- sumC + + cost + }, + export = cost_export ) } @@ -535,16 +755,37 @@ cetsne <- function(perplexity, inp_kernel = "gaussian", # Bandwidth Experiments -------------------------------------------------- # t-SNE with input kernel bandwidths transferred to output -btsne <- function(perplexity, inp_kernel = "gaussian", beta = NULL, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - lreplace(tsne(perplexity = perplexity, inp_kernel = inp_kernel, - use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - ret_extra <- unique(c(ret_extra, 'beta')) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "symmetric", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) +btsne <- function(perplexity, + inp_kernel = "gaussian", + beta = NULL, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + ret_extra <- unique(c(ret_extra, "beta")) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "symmetric", + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) # override input bandwidths with fixed beta (although this doesn't do much) if (!is.null(beta)) { cost$beta <- beta @@ -570,49 +811,94 @@ btsne <- function(perplexity, inp_kernel = "gaussian", beta = NULL, } # SSNE with input kernel bandwidths transferred to output -bssne <- function(perplexity, inp_kernel = "gaussian", beta = NULL, - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - lreplace(ssne(perplexity = perplexity, inp_kernel = inp_kernel, - use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - ret_extra <- unique(c(ret_extra, 'beta')) - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "symmetric", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - if (!is.null(beta)) { - cost$beta <- beta - } - cost$eps <- eps - cost +bssne <- function(perplexity, + inp_kernel = "gaussian", + beta = NULL, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + ssne( + perplexity = perplexity, + 
inp_kernel = inp_kernel, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + ret_extra <- unique(c(ret_extra, "beta")) + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "symmetric", + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + if (!is.null(beta)) { + cost$beta <- beta + } + cost$eps <- eps + cost }, pfn = kl_costQ, gr = function(cost, Y) { cost <- cost$update(cost, Y) cost$G <- k2g(Y, 2 * cost$beta * (cost$P - cost$Q), symmetrize = TRUE) - + cost }, update = function(cost, Y) { - cost$Q <- expQ(Y, cost$eps, beta = cost$beta, matrix_normalize = TRUE, - use_cpp = use_cpp, n_threads = n_threads)$Q + cost$Q <- expQ( + Y, + cost$eps, + beta = cost$beta, + matrix_normalize = TRUE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q cost } ) } # ASNE with input kernel bandwidths transferred to output -basne <- function(perplexity, beta = NULL, eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { +basne <- function(perplexity, + beta = NULL, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { lreplace( - asne(perplexity = perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - ret_extra <- unique(c(ret_extra, 'beta')) - - cost <- sne_init(cost, X, perplexity = perplexity, - symmetrize = "none", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + asne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + ret_extra <- unique(c(ret_extra, "beta")) + + cost <- sne_init( + cost, + X, + perplexity = perplexity, + symmetrize = "none", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) # override input bandwidths with fixed beta (although this doesn't do much) if (!is.null(beta)) { cost$beta <- beta @@ -622,36 +908,50 @@ basne <- function(perplexity, beta = NULL, eps = .Machine$double.eps, cost }, update = function(cost, Y) { - cost$Q <- expQ(Y, eps = cost$eps, beta = cost$beta, is_symmetric = FALSE, - use_cpp = use_cpp, n_threads = n_threads)$Q + cost$Q <- expQ( + Y, + eps = cost$eps, + beta = cost$beta, + is_symmetric = FALSE, + use_cpp = use_cpp, + n_threads = n_threads + )$Q cost }, gr = function(cost, Y) { cost <- cost$update(cost, Y) cost$G <- k2g(Y, 2 * cost$beta * (cost$P - cost$Q), symmetrize = TRUE) cost - } + } ) } # t-ASNE with input kernel bandwidths transferred to output -btasne <- function(perplexity, beta = NULL, eps = .Machine$double.eps, - n_threads = 0, use_cpp = FALSE) { - lreplace(basne(perplexity = perplexity, beta = beta, eps = eps, - n_threads = n_threads, use_cpp = use_cpp), +btasne <- function(perplexity, + beta = NULL, + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + basne( + perplexity = perplexity, + beta = beta, + eps = eps, + n_threads = n_threads, + use_cpp = use_cpp + ), pfn = kl_cost, gr = function(cost, Y) { - cost <- cost_update(cost, Y) - W <- cost$W - cost$G <- k2g(Y, 2 * cost$beta * W * (cost$P - W / cost$Z), - symmetrize = TRUE) - cost + cost <- cost_update(cost, Y) + W <- cost$W + cost$G <- k2g(Y, 2 * cost$beta * W * (cost$P - W / cost$Z), symmetrize = TRUE) + cost }, update = function(cost, Y) { W <- calc_d2(Y, 
use_cpp = use_cpp, n_threads = n_threads) W <- 1 / (1 + cost$beta * W) diag(W) <- 0 - + cost$W <- W cost$Z <- rowSums(W) cost @@ -659,34 +959,50 @@ btasne <- function(perplexity, beta = NULL, eps = .Machine$double.eps, ) } -tasne <- function(perplexity, n_threads = 0, use_cpp = FALSE) { - lreplace(tsne(perplexity = perplexity, use_cpp = use_cpp, - n_threads = n_threads), - init = function(cost, X, max_iter, eps = .Machine$double.eps, - verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, - symmetrize = "none", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - cost$G <- k2g(Y, 2 * cost$W * (cost$P - cost$W / cost$Z), - symmetrize = TRUE) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - - cost$W <- W - cost$Z <- 1 / rowSums(W) - cost - } +tasne <- function(perplexity, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + use_cpp = use_cpp, + n_threads = n_threads + ), + init = function(cost, + X, + max_iter, + eps = .Machine$double.eps, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + symmetrize = "none", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + + cost$G <- k2g(Y, 2 * cost$W * (cost$P - cost$W / cost$Z), symmetrize = TRUE) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + + cost$W <- W + cost$Z <- 1 / rowSums(W) + cost + } ) } @@ -694,56 +1010,93 @@ tasne <- function(perplexity, n_threads = 0, use_cpp = FALSE) { # Normalization Experiments ----------------------------------------------- # ASNE but with the t-distributed kernel -tasne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - lreplace(tsne(perplexity = perplexity, n_threads = n_threads, - use_cpp = use_cpp), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "none", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - cost$eps <- eps - cost - }, - gr = function(cost, Y) { - cost <- cost_update(cost, Y) - - cost$G <- k2g(Y, 2 * cost$W * (cost$P - cost$W / cost$Z), - symmetrize = TRUE) - cost - }, - update = function(cost, Y) { - W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) - W <- 1 / (1 + W) - diag(W) <- 0 - - cost$W <- W - cost$Z <- rowSums(W) - cost - } +tasne <- function(perplexity, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + tsne( + perplexity = perplexity, + n_threads = n_threads, + use_cpp = use_cpp + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "none", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + + cost$G <- k2g(Y, 2 * cost$W * 
(cost$P - cost$W / cost$Z), symmetrize = TRUE) + cost + }, + update = function(cost, Y) { + W <- calc_d2(Y, use_cpp = use_cpp, n_threads = n_threads) + W <- 1 / (1 + W) + diag(W) <- 0 + + cost$W <- W + cost$Z <- rowSums(W) + cost + } ) } # t-RM-SNE # t-SNE without symmetrization of P (but still pair-normalizing) # row-normalize, then matrix normalize -trmsne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { +trmsne <- function(perplexity, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { lreplace( - tsne(perplexity = perplexity, inp_kernel = inp_kernel, - n_threads = n_threads, use_cpp = use_cpp), - init = function(cost, X, max_iter, verbose = FALSE, ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "none", normalize = TRUE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) + tsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + n_threads = n_threads, + use_cpp = use_cpp + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "none", + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) cost$eps <- eps cost }, gr = function(cost, Y) { cost <- cost_update(cost, Y) - + W <- cost$W cost$G <- k2g(Y, 2 * W * (cost$P - W / cost$Z), symmetrize = TRUE) cost @@ -754,62 +1107,106 @@ trmsne <- function(perplexity, inp_kernel = "gaussian", # t-M-SNE # t-SNE but without row-normalizing or symmetrizing, just matrix normalization # Not recommended -tmsne <- function(perplexity, inp_kernel = "gaussian", - eps = .Machine$double.eps, n_threads = 0, use_cpp = FALSE) { - lreplace(trmsne(perplexity = perplexity, inp_kernel = inp_kernel, - eps = eps, n_threads = n_threads, use_cpp = use_cpp), - init = function(cost, X, max_iter, verbose = FALSE, - ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, kernel = inp_kernel, - symmetrize = "none", row_normalize = FALSE, - normalize = TRUE, verbose = verbose, - ret_extra = ret_extra, n_threads = n_threads, - use_cpp = use_cpp) - cost$eps <- eps - cost - } +tmsne <- function(perplexity, + inp_kernel = "gaussian", + eps = .Machine$double.eps, + n_threads = 0, + use_cpp = FALSE) { + lreplace( + trmsne( + perplexity = perplexity, + inp_kernel = inp_kernel, + eps = eps, + n_threads = n_threads, + use_cpp = use_cpp + ), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "none", + row_normalize = FALSE, + normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + cost$eps <- eps + cost + } ) } # RSR row-normalize, symmetrize, then row-normalize again # Might work a tiny bit better than t-ASNE? 
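A compact sketch of the row-normalize / symmetrize / row-normalize ("RSR") affinity preparation the next two functions apply, assuming a hypothetical raw affinity matrix V:

    set.seed(42)
    V <- matrix(runif(9), nrow = 3)
    diag(V) <- 0
    P <- V / rowSums(V)   # R: row normalization (done inside sne_init)
    P <- 0.5 * (P + t(P)) # S: symmetrization
    P <- P / rowSums(P)   # R: row normalization again, in init below
    rowSums(P)            # each row sums to 1; P is no longer exactly symmetric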
-trsrsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0, +trsrsne <- function(perplexity, + eps = .Machine$double.eps, + n_threads = 0, use_cpp = FALSE) { - lreplace(tasne(perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, - ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, - symmetrize = "symmetric", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - P <- cost$P - P <- P / rowSums(P) - cost$P <- P - - cost$eps <- eps - cost - } + lreplace( + tasne(perplexity, use_cpp = use_cpp, n_threads = n_threads), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + symmetrize = "symmetric", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + P <- cost$P + P <- P / rowSums(P) + cost$P <- P + + cost$eps <- eps + cost + } ) } -arsrsne <- function(perplexity, eps = .Machine$double.eps, n_threads = 0, +arsrsne <- function(perplexity, + eps = .Machine$double.eps, + n_threads = 0, use_cpp = FALSE) { - lreplace(asne(perplexity, use_cpp = use_cpp, n_threads = n_threads), - init = function(cost, X, max_iter, verbose = FALSE, - ret_extra = c()) { - cost <- sne_init(cost, X, perplexity = perplexity, - symmetrize = "symmetric", normalize = FALSE, - verbose = verbose, ret_extra = ret_extra, - n_threads = n_threads, use_cpp = use_cpp) - P <- cost$P - P <- P / rowSums(P) - cost$P <- P - - cost$eps <- eps - cost - } + lreplace( + asne(perplexity, use_cpp = use_cpp, n_threads = n_threads), + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + symmetrize = "symmetric", + normalize = FALSE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = use_cpp + ) + P <- cost$P + P <- P / rowSums(P) + cost$P <- P + + cost$eps <- eps + cost + } ) } - - From 66d6391cd4563abf6070a7f76bf94a321bca2a39 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 12:45:50 -0700 Subject: [PATCH 19/26] breakup smallvis.R a bit more --- smallvis/R/epoch.R | 49 ++ smallvis/R/input.R | 76 +++ smallvis/R/iterated.R | 293 +++++++++ smallvis/R/smallvis.R | 1303 ++++++++++++++++++++--------------------- 4 files changed, 1060 insertions(+), 661 deletions(-) create mode 100644 smallvis/R/epoch.R create mode 100644 smallvis/R/input.R create mode 100644 smallvis/R/iterated.R diff --git a/smallvis/R/epoch.R b/smallvis/R/epoch.R new file mode 100644 index 0000000..15ac847 --- /dev/null +++ b/smallvis/R/epoch.R @@ -0,0 +1,49 @@ +# Epoch Functions --------------------------------------------------------- + +do_epoch <- function(opt, cost, iter, Y, fn_val) { + if (!is.null(cost$epoch)) { + res <- cost$epoch(opt, cost, iter, Y, fn_val) + if (!is.null(res$opt)) { + opt <- res$opt + } + if (!is.null(res$cost)) { + cost <- res$cost + } + } + + list(opt = opt, cost = cost) +} + +# Helper function for epoch callback, allowing user to supply callbacks with +# multiple arities. 
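Because do_callback() below dispatches on the number of formal arguments, user callbacks only need to accept the arguments they care about. A hypothetical pair of callbacks that would both be accepted (the names are illustrative only):

    cb_plot <- function(Y) { # arity 1: coordinates only
      graphics::plot(Y, pch = 20)
    }
    cb_trace <- function(Y, iter, cost) { # arity 3: also sees iteration and cost
      message("iter ", iter, " cost = ", formatC(cost))
    }
    # do_callback(cb_plot, Y, iter) invokes cb_plot(Y);
    # do_callback(cb_trace, Y, iter, cost) invokes cb_trace(Y, iter, cost)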
+do_callback <- function(cb, + Y, + iter, + cost = NULL, + cost_fn = NULL, + opt = NULL) { + nfs <- length(formals(cb)) + switch(nfs, + "1" = cb(Y), + "2" = cb(Y, iter), + "3" = cb(Y, iter, cost), + "4" = cb(Y, iter, cost, cost_fn), + "5" = cb(Y, iter, cost, cost_fn, opt) + ) +} + +# Create a callback for visualization +make_smallvis_cb <- function(df) { + force(df) + palette <- NULL + function(Y, iter, cost = NULL) { + if (is.null(palette)) { + palette <- vizier:::color_helper(df, color_scheme = grDevices::rainbow)$palette + } + title <- paste0("iter: ", iter) + if (!(is.null(cost) || is.na(cost))) { + title <- paste0(title, " cost = ", formatC(cost)) + } + vizier::embed_plot(Y, df, title = title, color_scheme = palette) + } +} diff --git a/smallvis/R/input.R b/smallvis/R/input.R new file mode 100644 index 0000000..0d2cd21 --- /dev/null +++ b/smallvis/R/input.R @@ -0,0 +1,76 @@ +# Input Preprocess -------------------------------------------------------- + +# Scale X according to various strategies +scale_input <- function(X, scale, verbose = FALSE) { + if (is.null(scale)) { + scale <- "none" + } + if (is.logical(scale)) { + if (scale) { + scale <- "scale" + } else { + scale <- "none" + } + } + scale <- match.arg(tolower(scale), c("none", "scale", "range", "absmax")) + + switch(scale, + range = { + tsmessage("Range scaling X") + X <- as.matrix(X) + X <- X - min(X) + X <- X / max(X) + }, + absmax = { + tsmessage("Normalizing by abs-max") + X <- base::scale(X, scale = FALSE) + X <- X / abs(max(X)) + }, + scale = { + tsmessage("Scaling to zero mean and unit variance") + X <- Filter(stats::var, X) + tsmessage("Kept ", ncol(X), " non-zero-variance columns") + X <- base::scale(X, scale = TRUE) + }, + none = { + X <- as.matrix(X) + } + ) + X +} + +# Reduce input dimensionality via PCA and also optionally whiten data +pca_preprocess <- function(X, pca, whiten, initial_dims, verbose = FALSE) { + # We won't do PCA if the rank of the input is less than the requested + # initial dimensionality + if (pca) { + pca <- min(nrow(X), ncol(X)) >= initial_dims + } + if (pca) { + if (whiten) { + tsmessage( + "Reducing initial dimensionality with PCA and ", + "whitening to ", + initial_dims, + " dims" + ) + X <- pca_whiten( + X = X, + ncol = initial_dims, + verbose = verbose + ) + } else { + tsmessage( + "Reducing initial dimensionality with PCA to ", + initial_dims, + " dims" + ) + X <- pca_scores( + X = X, + ncol = initial_dims, + verbose = verbose + ) + } + } + X +} diff --git a/smallvis/R/iterated.R b/smallvis/R/iterated.R new file mode 100644 index 0000000..ab1f7d0 --- /dev/null +++ b/smallvis/R/iterated.R @@ -0,0 +1,293 @@ +#' Best t-SNE Result From Multiple Initializations +#' +#' Run t-SNE multiple times from a random initialization, and return the +#' embedding with the lowest cost. +#' +#' This function ignores any value of \code{Y_init} you set, and uses +#' \code{Y_init = "rand"}. +#' +#' @param nrep Number of repeats. +#' @param keep_all If \code{TRUE}, then the return value is a list of lists, +#' indexed from 1 .. \code{nrep}, with each entry the result from each +#' \code{\link{smallvis}} run. Otherwise just the result with the lowest error +#' is returned. +#' @param ... Arguments to apply to each \code{\link{smallvis}} run. +#' @return The \code{\link{smallvis}} result with the lowest final cost, or +#' if \code{keep_all} is \code{TRUE} all results as a list, indexed as 1 .. +#' \code{nrep}. 
If \code{ret_extra} is not \code{FALSE}, then the final costs for all +#' \code{nrep} runs are also included in the return value list as a vector +#' called \code{all_costs}. In this case, if \code{keep_all} is \code{TRUE}, then +#' \code{all_costs} appears as an extra item on all results. Additionally, +#' each result will have an extra entry \code{best_rep}, giving the index of the +#' result with the lowest cost. +#' @examples +#' \dontrun{ +#' # Return best result out of five random initializations +#' tsne_iris_best <- smallvis_rep( +#' nrep = 5, X = iris, perplexity = 50, method = "tsne", +#' ret_extra = TRUE +#' ) +#' # How much do the costs vary between runs? +#' range(tsne_iris_best$all_costs) +#' # Display best embedding found +#' plot(tsne_iris_best$Y) +#' +#' # Keep all results +#' # First result is in tsne_iris_rep[[1]], second in tsne_iris_rep[[2]] etc. +#' tsne_iris_rep <- smallvis_rep( +#' nrep = 5, X = iris, perplexity = 50, method = "tsne", +#' ret_extra = TRUE, keep_all = TRUE +#' ) +#' # Index of result with smallest error is in special list item 'best_rep' +#' best_iris <- tsne_iris_rep[[tsne_iris_rep[[1]]$best_rep]] +#' } +#' @export +smallvis_rep <- function(nrep = 10, + keep_all = FALSE, + ...) { + if (nrep < 1) { + stop("nrep must be 1 or greater") + } + varargs <- list(...) + best_res <- NULL + best_cost <- Inf + all_costs <- c() + # Keep requested return type for final result + ret_extra <- varargs$ret_extra + + # always return extra so we can find the cost + if (!should_ret_extra(ret_extra)) { + varargs$ret_extra <- TRUE + } + + varargs$Y_init <- "rand" + ret <- list() + + for (i in 1:nrep) { + # If verbose is not explicitly set to FALSE, it's TRUE by default + if (nnat(varargs$verbose) || is.null(varargs$verbose)) { + tsmessage("Starting embedding # ", i, " of ", nrep) + } + res <- do.call(smallvis, varargs) + + if (keep_all) { + ret[[i]] <- res + } + + final_cost <- res$itercosts[length(res$itercosts)] + names(final_cost) <- NULL + all_costs <- c(all_costs, final_cost) + + if (!keep_all) { + if (final_cost < best_cost) { + best_cost <- final_cost + best_res <- res + } + } + } + + if (keep_all) { + # if keep_all is TRUE and we asked for extra return info + # also add the final costs and the index of the best result to each result + if (should_ret_extra(ret_extra)) { + best_rep <- which.min(all_costs) + for (i in 1:nrep) { + ret[[i]]$all_costs <- all_costs + ret[[i]]$best_rep <- best_rep + } + } + # otherwise just store the Y-coordinates of each result + else { + for (i in 1:nrep) { + ret[[i]] <- ret[[i]]$Y + } + } + } else { + # Only keeping one result + if (should_ret_extra(ret_extra)) { + # store info about other results on best result list + best_res$all_costs <- all_costs + ret <- best_res + } else { + ret <- best_res$Y + } + } + ret +} + +# If ret_extra is NULL or FALSE, we aren't returning extra info +# If ret_extra is TRUE or a vector, we are returning extra info +should_ret_extra <- function(ret_extra) { + !is.null(ret_extra) && + ((methods::is(ret_extra, "logical") && ret_extra) || + methods::is(ret_extra, "character")) +} + +#' Dimensionality Reduction With Perplexity Stepping +#' +#' Carry out dimensionality reduction of a (small) dataset using one of a +#' variety of neighbor embedding methods, using a decreasing value of +#' perplexity to avoid bad local minima. 
+#'
+#' This function uses ideas similar to those in the NeRV (Venna et al., 2010)
+#' and JSE (Lee et al., 2013) methods, where, to avoid local minima, the
+#' initial optimization steps use affinities with larger bandwidths (NeRV) or
+#' larger perplexity values (JSE). This implementation uses a series of
+#' decreasing perplexity values, as in JSE.
+#'
+#' For details on the arguments that can be passed to the dimensionality
+#' reduction routine, see the help text for \code{\link{smallvis}}.
+#'
+#' To avoid spending too much extra time in perplexity calibrations, the extra
+#' perplexities start at the power of 2 closest to, but not greater than,
+#' half the dataset size (in terms of number of objects). Further calibrations
+#' are then carried out halving the perplexity each time, until the perplexity
+#' specified by the user is reached. For example, for a dataset of 1500
+#' objects and a target perplexity of 40, the stepped perplexities would be
+#' 512, 256, 128 and 64.
+#'
+#' The number of iterations spent at the larger perplexity values is specified
+#' by the \code{step_iter} parameter. This total is divided evenly between the
+#' extra perplexities, e.g. if \code{step_iter = 250} and extra optimizations
+#' at perplexities of 1024, 512, 256, 128 and 64 are carried out, each will
+#' run for 50 iterations. To keep the number of iterations equivalent to
+#' that used by a single run of \code{\link{smallvis}}, the value of
+#' \code{step_iter} is subtracted from the value of \code{max_iter} before
+#' the optimization at the target perplexity is carried out, e.g. if
+#' \code{max_iter = 1000} and \code{step_iter = 250}, the final
+#' optimization will run for 750 iterations only.
+#'
+#' Any value of \code{tol}, \code{exaggeration_factor} and
+#' \code{stop_lying_iter} provided is used only with the final optimization.
+#'
+#' @param step_iter Number of iterations to carry out the perplexity
+#' stepping. Must be < the value of \code{max_iter}.
+#' @param ... Arguments to be passed to \code{\link{smallvis}}. See 'Details'
+#' for information on which arguments may be modified or ignored during certain
+#' parts of the embedding.
+#' @return The result of the final run of \code{\link{smallvis}} at the target
+#' perplexity.
+#' @examples
+#' \dontrun{
+#' # t-SNE on the iris data with L-BFGS optimization
+#' # The 1000 max_iter is split between 250 iterations at perplexity = 64
+#' # and then 750 iterations at perplexity = 40.
+#' iris_lbfgs_pstep <- smallvis_perpstep(
+#'   step_iter = 250, X = iris, scale = FALSE, verbose = TRUE, Y_init = "spca",
+#'   ret_extra = c("DX", "DY"), perplexity = 40, max_iter = 1000, opt = list("l-bfgs")
+#' )
+#' }
+#' @export
+#' @references
+#' Venna, J., Peltonen, J., Nybo, K., Aidos, H., & Kaski, S. (2010).
+#' Information retrieval perspective to nonlinear dimensionality reduction for
+#' data visualization.
+#' \emph{Journal of Machine Learning Research}, \emph{11}, 451-490.
+#'
+#' Lee, J. A., Renard, E., Bernard, G., Dupont, P., & Verleysen, M. (2013).
+#' Type 1 and 2 mixtures of Kullback-Leibler divergences as cost functions in
+#' dimensionality reduction based on similarity preservation.
+#' \emph{Neurocomputing}, \emph{112}, 92-108.
+smallvis_perpstep <- function(step_iter = 250, ...) {
+  varargs <- list(...)
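+  # Illustrative numbers only: with max_iter = 1000, step_iter = 250 and
+  # perplexity = 40 on 1500 observations, scale_perps() below yields the
+  # stepped perplexities 512, 256, 128 and 64, each optimized for
+  # floor(250 / 4) = 62 iterations, before 1000 - 250 = 750 iterations at
+  # the target perplexity of 40.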
+  max_iter <- varargs$max_iter
+  if (is.null(max_iter)) {
+    max_iter <- 1000
+  }
+  if (max_iter <= step_iter) {
+    stop("max_iter must be > step_iter")
+  }
+
+  target_perplexity <- varargs$perplexity
+  if (is.null(target_perplexity)) {
+    target_perplexity <- 30
+  }
+
+  X <- varargs$X
+  if (methods::is(X, "dist")) {
+    n <- attr(X, "Size")
+  } else {
+    n <- nrow(X)
+  }
+  perps <- scale_perps(n = n, target_perp = target_perplexity)
+
+  nperps <- length(perps)
+  # With no stepping perplexities, the final run gets the full max_iter
+  max_iter_target <- max_iter
+  if (nperps > 0) {
+    max_iter_step <- max(1, floor(step_iter / nperps))
+    max_iter_target <- max(1, max_iter - step_iter)
+
+    # Save/Modify some options between step iterations and final optimization
+    ret_extra <- varargs$ret_extra
+    varargs$ret_extra <- FALSE
+    varargs$max_iter <- max_iter_step
+    tol <- varargs$tol
+    varargs$tol <- 0
+    exaggeration_factor <- varargs$exaggeration_factor
+    varargs$exaggeration_factor <- 1
+
+    epoch <- varargs$epoch
+    varargs$epoch <- max_iter_step
+
+    # Loop over initial perplexities
+    res <- NULL
+    for (i in 1:nperps) {
+      if (nnat(varargs$verbose)) {
+        tsmessage(
+          "Optimizing at step perplexity ",
+          formatC(perps[i]),
+          " for ",
+          max_iter_step,
+          " iterations"
+        )
+      }
+      varargs$perplexity <- perps[i]
+      if (i > 1) {
+        varargs$Y_init <- res
+      }
+      res <- do.call(smallvis, varargs)
+    }
+
+    varargs$Y_init <- res
+    varargs$max_iter <- max_iter_target
+    # Put the old arguments back before final optimization
+    varargs$perplexity <- target_perplexity
+    if (!is.null(ret_extra)) {
+      varargs$ret_extra <- ret_extra
+    }
+    if (!is.null(tol)) {
+      varargs$tol <- tol
+    }
+    if (!is.null(epoch)) {
+      varargs$epoch <- epoch
+    }
+    if (!is.null(exaggeration_factor)) {
+      varargs$exaggeration_factor <- exaggeration_factor
+    }
+  }
+
+  if (nnat(varargs$verbose)) {
+    tsmessage(
+      "Optimizing at target perplexity ",
+      formatC(target_perplexity),
+      " for ",
+      max_iter_target,
+      " iterations"
+    )
+  }
+  do.call(smallvis, varargs)
+}
+
+# Utility function for perplexity step
+scale_perps <- function(n, target_perp) {
+  max_perp <- n / 2
+  max_perp <- 2^floor(log(max_perp, 2))
+  perp <- max_perp
+
+  # Initialize here so an empty vector (not an undefined object) is returned
+  # when target_perp is already >= the largest step perplexity
+  perps <- c()
+  if (max_perp > target_perp) {
+    while (perp > target_perp) {
+      perps <- c(perps, perp)
+      perp <- perp / 2
+    }
+  }
+
+  perps
+}
diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R
index 4a3989f..241e7a0 100644
--- a/smallvis/R/smallvis.R
+++ b/smallvis/R/smallvis.R
@@ -47,17 +47,17 @@
#' co-workers, use method \code{"bnerv"}.
#' \item \code{"jse"} The Jensen-Shannon Embedding method of Lee and
#' co-workers (2013).
-#' \item \code{"absne"} The alpha-beta SNE method of Narayan and co-workers
+#' \item \code{"absne"} The alpha-beta SNE method of Narayan and co-workers
#' (2015).
-#' \item \code{"chsne"} The chi-squared divergence version of t-SNE
+#' \item \code{"chsne"} The chi-squared divergence version of t-SNE
#' (Im and co-workers, 2018).
-#' \item \code{"hlsne"} The Hellinger distance divergence version of t-SNE
+#' \item \code{"hlsne"} The Hellinger distance divergence version of t-SNE
#' (Im and co-workers, 2018).
#' \item \code{"rklsne"} The reverse Kullback-Leibler divergence version of
#' t-SNE (Im and co-workers, 2018).
#' \item \code{"jssne"} The Jensen-Shannon divergence version of t-SNE (Im and
#' co-workers, 2018).
-#' \item \code{"gsne"}, The global SNE (g-SNE) method of Zhou and Sharpee
+#' \item \code{"gsne"} The global SNE (g-SNE) method of Zhou and Sharpee
#' (2018).
#' }
#'
@@ -73,9 +73,9 @@
#' used. To control these, instead of passing a name to the \code{method}
#' parameter, pass a list. The first element is the name of the method you wish
#' to use. Subsequent elements must be named values specifying the parameters.
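#' For example (an illustrative call, not taken from the original docs):
#' \code{smallvis(X, method = list("largevis", gamma = 7, gr_eps = 0.1))}
#' overrides the LargeVis-specific parameters described below.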
-#'
+#'
#' Some parameters are available for all (or nearly all) methods.
-#'
+#'
#' \itemize{
#' \item{\code{inp_kernel}} the input kernel function. Can be one of:
#' \code{"gauss"} (the default), \code{"exp"} or \code{"knn"}.
@@ -88,9 +88,9 @@
#' sparsification is carried out with this kernel, so there are no
#' memory or performance improvements to be had with this setting.
#' \code{"skd"} uses the smooth knn distances method as used by UMAP.
-#' \item{\code{symmetrize}} the type of symmetrization, used by symmetric
+#' \item{\code{symmetrize}} the type of symmetrization, used by symmetric
#' methods only. Can be one of:
-#' \code{"symmetric"} symmetric nearest neighbor style, by arithmetic
+#' \code{"symmetric"} symmetric nearest neighbor style, by arithmetic
#' averaging, as in t-SNE.
#' \code{"fuzzy"} symmetrization by fuzzy set union as used in UMAP.
#' \code{"mutual"} mutual nearest neighbor style as suggested by Schubert
@@ -101,10 +101,10 @@
#' \item \code{"LargeVis"}
#' \itemize{
#' \item{\code{gamma}} Weighting term for the repulsive versus attractive
-#' forces. Default is \code{1}. The implementation by the creators of
-#' LargeVis uses a default \code{gamma = 7}, but note that this is for
-#' stochastic gradient descent with limited sampling of the repulsive
-#' contributions so it's unlikely to be a good choice with the
+#' forces. Default is \code{1}. The implementation by the creators of
+#' LargeVis uses a default \code{gamma = 7}, but note that this is for
+#' stochastic gradient descent with limited sampling of the repulsive
+#' contributions, so it's unlikely to be a good choice with the
#' implementation used in this package.
#' \item{\code{gr_eps}} Epsilon used in the gradient to prevent
#' division by zero. Default is \code{0.1}.
@@ -163,9 +163,9 @@
#' }
#' \item \code{"ABSNE"}
#' \itemize{
-#' \item{\code{alpha}} Alpha value for the alpha-beta divergence. Set
+#' \item{\code{alpha}} Alpha value for the alpha-beta divergence. Set
#' \code{alpha < 1} to produce more, smaller, finer-grained clusters, and
-#' \code{alpha > 1} to produce fewer, larger clusters, with more emphasis
+#' \code{alpha > 1} to produce fewer, larger clusters, with more emphasis
#' on global structure. Default is \code{1.0}, to give t-SNE-like behavior.
#' \item{\code{lambda}} Sum of alpha + beta, where beta is the beta value
#' for the alpha-beta divergence. Set \code{lambda < 1} to increase cluster
@@ -174,8 +174,8 @@
#' }
#' \item \code{"gsne"}
#' \itemize{
-#' \item{\code{lambda}} Weighting factor to put increasing emphasis on
-#' preserving global similarities. Set to \code{0} to get t-SNE (no
+#' \item{\code{lambda}} Weighting factor to put increasing emphasis on
+#' preserving global similarities. Set to \code{0} to get t-SNE (no
#' extra emphasis on global structure), and to \code{1.0} to get equal
#' weighting between the local and global divergences. Default is \code{1.0}.
#' }
@@ -193,7 +193,7 @@
#' \item{A matrix}: which must have dimensions \code{n} by \code{k}, where
#' \code{n} is the number of rows in \code{X}.
#' \item{\code{"rand"}}: initialize from a Gaussian distribution with mean 0
-#' and standard deviation 1e-4, the default used by t-SNE. The standard
+#' and standard deviation 1e-4, the default used by t-SNE. The standard
#' deviation can be controlled with \code{Y_init_sdev} (see below).
#' \item{\code{"pca"}}: use the first \code{k} scores of the
#' PCA: columns are centered, but no scaling beyond that which is applied by
@@ -229,15 +229,15 @@
#' related to t-SNE (see Carreira-Perpinan, 2010, and Linderman and
#' Steinerberger, 2017): it may therefore be unnecessary to use the
#' \code{exaggeration_factor} setting.
-#'
+#'
#' The \code{Y_init_sdev} parameter, if provided, will scale the input
#' coordinates such that the standard deviation of each dimension is the
-#' provided value. The default is to do no scaling, except for
-#' \code{Y_init = "spca"} and \code{Y_init = "rand"} where a scaling to a
+#' provided value. The default is to do no scaling, except for
+#' \code{Y_init = "spca"} and \code{Y_init = "rand"} where a scaling to a
#' standard deviation of \code{1e-4} is used, as in t-SNE initialization.
#' \code{Y_init = "spca"} is effectively an alias for \code{Y_init = "pca",
#' Y_init_sdev = 1e-4}.
-#'
+#'
#' Depending on the embedding method, a particular initialization method may
#' result in initial coordinates with too small or large inter-point distances,
#' which can result in too large or small gradients, respectively. In turn this
@@ -282,21 +282,21 @@
#' \url{https://jlmelville.github.io/smallvis/idp.html}.
#'
#' @section Multiscale perplexities:
-#'
+#'
#' Another technique to combine multiple perplexities is to use the multiscale
-#' approach given by de Bodt and co-workers (2018). As with IDP, a series of
+#' approach given by de Bodt and co-workers (2018). As with IDP, a series of
#' candidate perplexities are used, but all the affinity matrices are used to
#' create an average matrix which is used as the final probability matrix.
-#' To use this method, set \code{perplexity = "multiscale"}. Default and custom
-#' list of perplexities to use can be provided in the same way as with IDP.
-#'
+#' To use this method, set \code{perplexity = "multiscale"}. Default and custom
+#' lists of perplexities to use can be provided in the same way as with IDP.
+#'
#' Note that previous work by this group described a slightly more complex
#' approach where the number of individual perplexity results are introduced
#' into the average sequentially over the course of the optimization, and
-#' the output probabilities are also generated by an averaging. Although also
+#' the output probabilities are also generated by an averaging. Although also
#' referred to as "multiscale", these variations are not implemented. Also,
-#' if \code{ret_extra = TRUE} is used, extra data associated with a specific
-#' perplexity (e.g. degree centrality, intrinsic dimensionality) will not
+#' if \code{ret_extra = TRUE} is used, extra data associated with a specific
+#' perplexity (e.g. degree centrality, intrinsic dimensionality) will not
#' be returned.
#'
#' @section Alternative optimizers:
@@ -419,7 +419,7 @@
#' @param perplexity The target perplexity for parameterizing the input
#' probabilities. For method \code{"umap"}, controls the neighborhood size
#' for parameterizing the smoothed k-nearest neighbor distances. See also the
-#' 'Intrinsic dimensionality perplexity' and 'Multiscale perplexities'
+#' 'Intrinsic dimensionality perplexity' and 'Multiscale perplexities'
#' sections.
#' @param max_iter Maximum number of iterations in the optimization.
#' @param pca If \code{TRUE}, apply PCA to reduce the dimensionality of
@@ -457,22 +457,22 @@
#' @param epoch After every \code{epoch} number of steps, calculates and
#' displays the cost value and calls \code{epoch_callback}, if supplied.
#' @param momentum Initial momentum value.
-#' @param final_momentum Final momentum value. If
+#' @param final_momentum Final momentum value. If
#' \code{late_exaggeration_factor > 1}, then during late exaggeration, the
#' momentum is switched back to \code{momentum} from this value.
#' @param mom_switch_iter Iteration at which the momentum will switch from
-#' \code{momentum} to \code{final_momentum}. If
+#' \code{momentum} to \code{final_momentum}. If
#' \code{exaggeration_factor > 1}, then this should occur at some point
#' after \code{stop_lying_iter} (default is 150 iterations after). If
#' the early exaggeration phase stops early, this value is treated as being
#' relative to when early exaggeration stops, to avoid wasting iterations
#' at the lower momentum value. For example, if \code{stop_lying_iter = 100}
#' and \code{mom_switch_iter = 250} (the defaults), but early exaggeration
-#' converges at iteration 50, the switch iteration will occur at iteration
+#' converges at iteration 50, the switch will occur at iteration
#' 150.
#' @param eta Learning rate value, a positive number. Or set to \code{"optsne"},
#' to use the formula suggested by Belkina and co-workers (2018) in their
-#' opt-SNE package (the size of the dataset divided by the
+#' opt-SNE package (the size of the dataset divided by the
#' \code{exaggeration_factor}).
#' @param min_gain Minimum gradient descent step size.
#' @param opt Optional list specifying alternative minimization method. See
@@ -510,8 +510,8 @@
#' number of iterations before monitoring the relative rate of change of the
#' cost function during early exaggeration.
#' @param ee_mon_buffer If \code{ee_mon_epoch} is non-\code{NULL}, then ignore
-#' this number of occurences of the relative rate of change of the cost
-#' function decreasing, which would otherwise signal termination of
+#' this number of occurrences of the relative rate of change of the cost
+#' function decreasing, which would otherwise signal termination of
#' the early exaggeration stage. This is to prevent erroneous termination of
#' early exaggeration under conditions when the cost can fluctuate noisily.
#' @param tol_wait Wait this number of iterations during standard optimization
@@ -525,7 +525,7 @@
#' \code{Value} section for details.
#' @param n_threads Number of threads to use in multi-threaded code. Default is
#' 0, which means no multi-threading. Mainly affects the calculation of things
-#' like distance matrices and perplexity calibration if you set
+#' like distance matrices and perplexity calibration if you set
#' \code{use_cpp = TRUE}. Otherwise, only methods that need to calculate
#' nearest neighbors will be affected.
#' @param use_cpp If \code{TRUE}, use multi-threaded C++ code for some
@@ -722,14 +722,14 @@
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
-#' Belkina, A. C., Ciccolella, C. O., Anno, R., Spidlen, J., Halpert, R., & Snyder-Cappione, J. (2018).
-#' Automated optimal parameters for T-distributed stochastic neighbor embedding improve visualization and allow analysis of large datasets.
+#' Belkina, A. C., Ciccolella, C. O., Anno, R., Spidlen, J., Halpert, R., & Snyder-Cappione, J. (2018).
+#' Automated optimal parameters for T-distributed stochastic neighbor embedding improve visualization and allow analysis of large datasets.
+#' \emph{bioRxiv}, 451690.
#' \url{https://www.biorxiv.org/content/10.1101/451690v2.abstract} #' -#' De Bodt, C., Mulders, D., Verleysen, M., & Lee, J. A. (2018). -#' Perplexity-free t-SNE and twice Student tt-SNE. -#' In \emph{European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning (ESANN 2018)} (pp. 123-128). +#' De Bodt, C., Mulders, D., Verleysen, M., & Lee, J. A. (2018). +#' Perplexity-free t-SNE and twice Student tt-SNE. +#' In \emph{European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning (ESANN 2018)} (pp. 123-128). #' \url{http://hdl.handle.net/2078.1/200844} #' #' Borg, I., & Groenen, P. J. (2005). @@ -748,13 +748,13 @@ #' Stochastic neighbor embedding. #' In \emph{Advances in neural information processing systems} (pp. 833-840). #' -#' Im, D. J., Verma, N., & Branson, K. (2018). -#' Stochastic Neighbor Embedding under f-divergences. +#' Im, D. J., Verma, N., & Branson, K. (2018). +#' Stochastic Neighbor Embedding under f-divergences. #' \emph{arXiv preprint} \emph{arXiv}:1811.01247. #' \url{https://arxiv.org/abs/1811.01247} #' #' Kobak, D., & Berens, P. (2018). -#' The art of using t-SNE for single-cell transcriptomics. +#' The art of using t-SNE for single-cell transcriptomics. #' \emph{bioRxiv}, 453449. #' \url{https://doi.org/10.1101/453449} #' @@ -778,13 +778,13 @@ #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' -#' Narayan, K. S., Punjani, A., & Abbeel, P. (2015, June). -#' Alpha-Beta Divergences Discover Micro and Macro Structures in Data. +#' Narayan, K. S., Punjani, A., & Abbeel, P. (2015, June). +#' Alpha-Beta Divergences Discover Micro and Macro Structures in Data. #' In \emph{Proceedings of the 32nd International Conference on Machine Learning (ICML-14)} #' (pp 796-804). #' \url{http://proceedings.mlr.press/v37/narayan15.html} #' -#' Schubert, E., & Gertz, M. (2017, October). +#' Schubert, E., & Gertz, M. (2017, October). #' Intrinsic t-stochastic neighbor embedding for visualization and outlier detection. #' In \emph{International Conference on Similarity Search and Applications} #' (pp. 188-203). Springer, Cham. @@ -832,39 +832,47 @@ #' In \emph{Proceedings of the 18th International Conference on Artificial Intelligence and Statistics (AISTATS 2015)} #' (pp. 1088-1097). #' -#' Zhou, Y., & Sharpee, T. (2018). -#' Using global t-SNE to preserve inter-cluster data structure. -#' \emph{bioRxiv}, 331611. +#' Zhou, Y., & Sharpee, T. (2018). +#' Using global t-SNE to preserve inter-cluster data structure. +#' \emph{bioRxiv}, 331611. 
#' \url{https://doi.org/10.1101/331611} #' #' @export -smallvis <- function(X, k = 2, scale = "absmax", - Y_init = "rand", Y_init_sdev = NULL, - perplexity = 30, max_iter = 1000, - pca = FALSE, initial_dims = 50, - method = "tsne", - epoch_callback = TRUE, - epoch = max(1, base::round(max_iter / 10)), - min_cost = 0, tol = 1e-7, g2tol = NULL, - momentum = 0.5, final_momentum = 0.8, - mom_switch_iter = stop_lying_iter + 150, - eta = 500, min_gain = 0.01, - opt = list("dbd"), - exaggeration_factor = 1, - stop_lying_iter = max(1, floor(max_iter / 10)), - late_exaggeration_factor = 1, - start_late_lying_iter = max_iter - max(1, floor(max_iter / 10)), - iter0_cost = FALSE, - ee_mon_epoch = NULL, - ee_mon_wait = 15, - ee_mon_buffer = 2, - tol_wait = 15, - ret_extra = FALSE, - n_threads = 0, - use_cpp = FALSE, - eps = .Machine$double.eps, - verbose = TRUE) { - +smallvis <- function(X, + k = 2, + scale = "absmax", + Y_init = "rand", + Y_init_sdev = NULL, + perplexity = 30, + max_iter = 1000, + pca = FALSE, + initial_dims = 50, + method = "tsne", + epoch_callback = TRUE, + epoch = max(1, base::round(max_iter / 10)), + min_cost = 0, + tol = 1e-7, + g2tol = NULL, + momentum = 0.5, + final_momentum = 0.8, + mom_switch_iter = stop_lying_iter + 150, + eta = 500, + min_gain = 0.01, + opt = list("dbd"), + exaggeration_factor = 1, + stop_lying_iter = max(1, floor(max_iter / 10)), + late_exaggeration_factor = 1, + start_late_lying_iter = max_iter - max(1, floor(max_iter / 10)), + iter0_cost = FALSE, + ee_mon_epoch = NULL, + ee_mon_wait = 15, + ee_mon_buffer = 2, + tol_wait = 15, + ret_extra = FALSE, + n_threads = 0, + use_cpp = FALSE, + eps = .Machine$double.eps, + verbose = TRUE) { if (is.logical(epoch_callback)) { if (epoch_callback) { epoch_callback <- make_smallvis_cb(X) @@ -876,73 +884,347 @@ smallvis <- function(X, k = 2, scale = "absmax", else if (is.function(epoch_callback)) { force(epoch_callback) } - + # The embedding method - method_names <- c("tsne", "largevis", "umap", "tumap", - "ntumap", "mmds", "gmmds", - "asne", "ssne", "wtsne", "wssne", - "hssne", "ee", "nerv", "snerv", "jse", "sjse", - "smmds", "sammon", - "tasne", "trmsne", "trsrsne", "tmsne", "arsrsne", - "rsrjse", "rsrnerv", - "btsne", "bssne", "basne", "btasne", "bnerv", - "ballmmds", "knnmmds", - "dhssne", "pstsne", "tsneu", - "skdtsne", "usne", "cetsne", - "tee", "absne", "chsne", "hlsne", "rklsne", "jssne", - "gsne", "abssne", "bhssne") + method_names <- c( + "tsne", + "largevis", + "umap", + "tumap", + "ntumap", + "mmds", + "gmmds", + "asne", + "ssne", + "wtsne", + "wssne", + "hssne", + "ee", + "nerv", + "snerv", + "jse", + "sjse", + "smmds", + "sammon", + "tasne", + "trmsne", + "trsrsne", + "tmsne", + "arsrsne", + "rsrjse", + "rsrnerv", + "btsne", + "bssne", + "basne", + "btasne", + "bnerv", + "ballmmds", + "knnmmds", + "dhssne", + "pstsne", + "tsneu", + "skdtsne", + "usne", + "cetsne", + "tee", + "absne", + "chsne", + "hlsne", + "rklsne", + "jssne", + "gsne", + "abssne", + "bhssne" + ) if (is.character(method)) { method <- match.arg(tolower(method), method_names) - cost_fn <- switch(method, - tsne = tsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - umap = umap(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - largevis = largevis(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - tumap = tumap(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - ntumap = ntumap(perplexity = perplexity, n_threads = 
n_threads, eps = eps, use_cpp = use_cpp), - mmds = mmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp), - gmmds = gmmds(k = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - asne = asne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - ssne = ssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - wtsne = wtsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - wssne = wssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - hssne = hssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - ee = ee(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - nerv = nerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - snerv = snerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - jse = jse(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - sjse = sjse(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - smmds = smmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp), - sammon = sammon(n_threads = n_threads, eps = eps, use_cpp = use_cpp), - tasne = tasne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - trmsne = trmsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - trsrsne = trsrsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - tmsne = tmsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - arsrsne = arsrsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - rsrjse = rsrjse(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - rsrnerv = rsrnerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - btsne = btsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - bssne = bssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - basne = basne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - btasne = btasne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - bnerv = bnerv(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - ballmmds = ballmmds(n_threads = n_threads, eps = eps, use_cpp = use_cpp), - knnmmds = knnmmds(k = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - dhssne = dhssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - tsneu = tsneu(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - pstsne = pstsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - skdtsne = skdtsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - usne = usne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - cetsne = cetsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - tee = tee(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - absne = absne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - chsne = chsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - hlsne = hlsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - gsne 
= gsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - rklsne = rklsne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - jssne = jssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - abssne = abssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - bhssne = bhssne(perplexity = perplexity, n_threads = n_threads, eps = eps, use_cpp = use_cpp), - stop("BUG: someone forgot to implement option: '", method, "'") + cost_fn <- switch( + method, + tsne = tsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + umap = umap( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + largevis = largevis( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tumap = tumap( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + ntumap = ntumap( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + mmds = mmds( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + gmmds = gmmds( + k = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + asne = asne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + ssne = ssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + wtsne = wtsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + wssne = wssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + hssne = hssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + ee = ee( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + nerv = nerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + snerv = snerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + jse = jse( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + sjse = sjse( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + smmds = smmds( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + sammon = sammon( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tasne = tasne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + trmsne = trmsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + trsrsne = trsrsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tmsne = tmsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + arsrsne = arsrsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + rsrjse = rsrjse( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + rsrnerv = rsrnerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + btsne = btsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + bssne = bssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + basne = basne( + perplexity = perplexity, + n_threads = n_threads, + eps = 
eps, + use_cpp = use_cpp + ), + btasne = btasne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + bnerv = bnerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + ballmmds = ballmmds( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + knnmmds = knnmmds( + k = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + dhssne = dhssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tsneu = tsneu( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + pstsne = pstsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + skdtsne = skdtsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + usne = usne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + cetsne = cetsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tee = tee( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + absne = absne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + chsne = chsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + hlsne = hlsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + gsne = gsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + rklsne = rklsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + jssne = jssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + abssne = abssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + bhssne = bhssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + stop("BUG: someone forgot to implement option: '", method, "'") ) } else { @@ -965,7 +1247,7 @@ smallvis <- function(X, k = 2, scale = "absmax", else { stop("Bad method parameter type") } - + param_names <- names(formals(fn)) if ("perplexity" %in% param_names) { methodlist$perplexity <- perplexity @@ -975,25 +1257,25 @@ smallvis <- function(X, k = 2, scale = "absmax", } cost_fn <- do.call(fn, methodlist) } - + if (exaggeration_factor != 1) { if (stop_lying_iter < 1) { stop("stop_lying_iter must be >= 1") } } - + if (late_exaggeration_factor != 1) { if (start_late_lying_iter < 1) { stop("start_late_lying_iter must be >= 1") } } - + if (exaggeration_factor != 1 && late_exaggeration_factor != 1) { if (start_late_lying_iter < stop_lying_iter) { stop("start_late_lying_iter must be >= stop_lying_iter") } } - + if (methods::is(pca, "character") && pca == "whiten") { pca <- TRUE whiten <- TRUE @@ -1005,18 +1287,18 @@ smallvis <- function(X, k = 2, scale = "absmax", stop("Initial PCA dimensionality must be larger than desired output ", "dimension") } - + start_time <- NULL ret_optionals <- c() if (methods::is(ret_extra, "character")) { ret_optionals <- ret_extra ret_extra <- TRUE } - + if (ret_extra) { start_time <- Sys.time() } - + if (methods::is(X, "dist")) { n <- attr(X, "Size") } @@ -1029,45 +1311,56 @@ smallvis <- function(X, k = 2, scale = "absmax", } X <- X[, indexes] } - + X <- scale_input(X, scale, verbose = verbose) X <- pca_preprocess(X, pca, whiten, initial_dims, verbose = 
verbose) n <- nrow(X) } - + # Check for NA if (any(is.na(X))) { stop("Input data contains NA: missing data is not allowed") } - + # Fail early as possible if matrix initializer is invalid if (methods::is(Y_init, "matrix")) { if (nrow(Y_init) != n || ncol(Y_init) != k) { stop("Y_init matrix does not match necessary configuration for X") } } - + # Optimizer if (opt[[1]] == "dbd" || opt[[1]] == "ndbd") { if (eta == "optsne") { eta <- n / exaggeration_factor tsmessage("Using opt-SNE learning rate = ", formatC(eta)) } - opt_list <- lmerge(opt, list(momentum = momentum, - final_momentum = final_momentum, - mom_switch_iter = mom_switch_iter, - eta = eta, min_gain = min_gain, - verbose = verbose)) + opt_list <- lmerge( + opt, + list( + momentum = momentum, + final_momentum = final_momentum, + mom_switch_iter = mom_switch_iter, + eta = eta, + min_gain = min_gain, + verbose = verbose + ) + ) } else { opt_list <- opt } opt <- opt_create(opt_list, verbose = verbose) - + # Initialize the cost function and create P - cost_fn <- cost_init(cost_fn, X, max_iter = max_iter, verbose = verbose, - ret_extra = ret_optionals) - + cost_fn <- cost_init( + cost_fn, + X, + max_iter = max_iter, + verbose = verbose, + ret_extra = ret_optionals + ) + # Output Initialization if (!is.null(Y_init)) { if (methods::is(Y_init, "matrix")) { @@ -1077,14 +1370,18 @@ smallvis <- function(X, k = 2, scale = "absmax", else { Y_init <- match.arg(tolower(Y_init), c("rand", "pca", "spca", "laplacian", "normlaplacian")) - + if (!is_spectral_init(Y_init)) { # We now handle scaling coordinates below, so spca is treated like pca non_spectral_init <- Y_init if (non_spectral_init == "spca") { non_spectral_init <- "pca" } - Y <- init_out(non_spectral_init, X, n, k, pca_preprocessed = pca, + Y <- init_out(non_spectral_init, + X, + n, + k, + pca_preprocessed = pca, verbose = verbose) } else { @@ -1095,7 +1392,7 @@ smallvis <- function(X, k = 2, scale = "absmax", A <- cost_fn$P if (!is.null(A)) { tsmessage("Using P for spectral initialization") - } + } } else { tsmessage("Using V for spectral initialization") @@ -1126,7 +1423,8 @@ smallvis <- function(X, k = 2, scale = "absmax", } } } - if (!is.null(Y_init_sdev) || Y_init == "spca" || Y_init == "rand") { + if (!is.null(Y_init_sdev) || + Y_init == "spca" || Y_init == "rand") { if (is.null(Y_init_sdev)) { Y_init_sdev <- 1e-4 } @@ -1134,34 +1432,48 @@ smallvis <- function(X, k = 2, scale = "absmax", Y <- shrink_coords(Y, Y_init_sdev) } } - + cost <- NA itercosts <- c() if (iter0_cost && (verbose || ret_extra)) { cost_eval_res <- cost_eval(cost_fn, Y) cost_fn <- cost_eval_res$cost cost <- cost_eval_res$value - + tsmessage("Iteration #0 error: ", formatC(cost)) - + if (ret_extra) { names(cost) <- 0 itercosts <- c(itercosts, cost) } cost_fn <- cost_clear(cost_fn) } - + # Display initialization if (!is.null(epoch_callback)) { do_callback(epoch_callback, Y, 0, cost, cost_fn, opt) } if (max_iter < 1) { - return(ret_value(Y, ret_extra, method, X, scale, Y_init, iter = 0, - cost_fn = cost_fn, itercosts = itercosts, - start_time = start_time, optionals = ret_optionals, - pca = ifelse(pca && !whiten, initial_dims, 0), - whiten = ifelse(pca && whiten, initial_dims, 0), - use_cpp = use_cpp, n_threads = n_threads)) + return( + ret_value( + Y, + ret_extra, + method, + X, + scale, + Y_init, + iter = 0, + cost_fn = cost_fn, + itercosts = itercosts, + start_time = start_time, + optionals = ret_optionals, + pca = ifelse(pca && !whiten, initial_dims, 0), + whiten = ifelse(pca && + whiten, initial_dims, 0), + use_cpp 
= use_cpp, + n_threads = n_threads + ) + ) } opt_stages <- c() @@ -1182,18 +1494,23 @@ smallvis <- function(X, k = 2, scale = "absmax", cost_fn <- start_exaggerating(cost_fn, exaggeration_factor) if (!is.null(ee_mon_epoch)) { epoch <- ee_mon_epoch - # prevent tolerance based convergence if we monitor EE + # prevent tolerance based convergence if we monitor EE ee_mon_tol <- 0 tol <- ee_mon_tol stop_lying_iter <- max_iter - tsmessage("Applying early exaggeration factor = ", exaggeration_factor, - ", epoch every ", epoch, " iterations") + tsmessage( + "Applying early exaggeration factor = ", + exaggeration_factor, + ", epoch every ", + epoch, + " iterations" + ) } } else { stop_lying_iter <- 0 } - + old_cost <- NULL tolval <- NULL tsmessage("Optimizing coordinates") @@ -1206,7 +1523,7 @@ smallvis <- function(X, k = 2, scale = "absmax", opt <- opt_res$opt cost_fn <- opt_res$cost_fn Y <- opt_res$Y - + if (!is.null(cost_fn$P) && iter == stop_lying_iter && exaggeration_factor != 1) { tsmessage("Switching off exaggeration at iter ", iter) @@ -1215,60 +1532,79 @@ smallvis <- function(X, k = 2, scale = "absmax", epoch <- opt_epoch tol <- opt_tol } - + if (!is.null(cost_fn$P) && iter == start_late_lying_iter && late_exaggeration_factor != 1) { - tsmessage("Starting late exaggeration = ", - formatC(late_exaggeration_factor), " at iter ", iter, - " until iter ", max_iter) + tsmessage( + "Starting late exaggeration = ", + formatC(late_exaggeration_factor), + " at iter ", + iter, + " until iter ", + max_iter + ) cost_fn <- start_exaggerating(cost_fn, late_exaggeration_factor) opt_stage_idx <- opt_stage_idx + 1 } - + if (nnat(opt$is_terminated)) { - tsmessage("Iteration #", iter, - " stopping early: optimizer reports convergence: ", - opt$terminate$what) + tsmessage( + "Iteration #", + iter, + " stopping early: optimizer reports convergence: ", + opt$terminate$what + ) max_iter <- iter break } # Recenter after each iteration Y <- sweep(Y, 2, colMeans(Y)) - + if ((epoch > 0 && iter %% epoch == 0) || iter == max_iter) { stop_early <- FALSE cost_eval_res <- cost_eval(cost_fn, Y, opt_res) cost_fn <- cost_eval_res$cost cost <- cost_eval_res$value - + if (!is.null(old_cost)) { tolval <- reltol(cost, old_cost) / epoch } if (verbose) { - tsmessage("Iteration #", iter, " error: ", - formatC(cost) - , " ||G||2 = ", formatC(norm2(opt_res$G)) - , appendLF = FALSE - ) + tsmessage( + "Iteration #", + iter, + " error: ", + formatC(cost) + , + " ||G||2 = ", + formatC(norm2(opt_res$G)) + , + appendLF = FALSE + ) if (!is.null(tolval)) { message(" tol = ", formatC(tolval), appendLF = FALSE) } if (!is.null(opt$counts)) { - message(" nf = ", opt$counts$fn, " ng = ", opt$counts$gr, + message(" nf = ", + opt$counts$fn, + " ng = ", + opt$counts$gr, appendLF = FALSE) } - + # special treatment for mize innards if (!is.null(opt$stages$gradient_descent$step_size$value)) { opt$stages$gradient_descent$step_size$value - message(" alpha = ", - formatC(opt$stages$gradient_descent$step_size$value), - appendLF = FALSE) + message( + " alpha = ", + formatC(opt$stages$gradient_descent$step_size$value), + appendLF = FALSE + ) } if (!is.null(old_cost) && cost > old_cost) { @@ -1285,18 +1621,20 @@ smallvis <- function(X, k = 2, scale = "absmax", if (!is.null(epoch_callback)) { do_callback(epoch_callback, Y, iter, cost, cost_fn, opt) } - + if (ret_extra) { names(cost) <- iter itercosts <- c(itercosts, cost) } - + # Early stopping tests if (!is.null(ee_mon_epoch)) { cdiff <- old_cost - cost cdiffrc <- cdiff / old_cost - if (length(cdiffrc) > 0 && 
opt_stages[opt_stage_idx] == "early") { - if (iter > ee_mon_wait && length(cdiffrc) > 0 && length(old_cdiffrc) > 0 && + if (length(cdiffrc) > 0 && + opt_stages[opt_stage_idx] == "early") { + if (iter > ee_mon_wait && + length(cdiffrc) > 0 && length(old_cdiffrc) > 0 && cdiffrc < old_cdiffrc) { if (ee_mon_buffer < 1) { stop_early <- TRUE @@ -1314,23 +1652,28 @@ smallvis <- function(X, k = 2, scale = "absmax", tsmessage("Stopping early: cost fell below min_cost") } - if (!nnat(opt$is_terminated) && !is.null(tolval) && - tolval < tol && cost <= old_cost && - (iter > stop_lying_iter + tol_wait - || opt_stages[opt_stage_idx] != "opt")) { + if (!nnat(opt$is_terminated) && !is.null(tolval) && + tolval < tol && cost <= old_cost && + (iter > stop_lying_iter + tol_wait + || opt_stages[opt_stage_idx] != "opt")) { stop_early <- TRUE - tsmessage("Stopping early: relative tolerance (", formatC(tol), ") met") + tsmessage("Stopping early: relative tolerance (", + formatC(tol), + ") met") } - + # Alternative tolerance grad 2norm doesn't need cost to decrease to stop # (Use this for certain settings with e.g. LargeVis where numerical issues # can cause the cost function to increase almost negligibly) g2tolval <- (norm2(opt_res$G)) - if (!nnat(opt$is_terminated) && !is.null(g2tol) && g2tolval < g2tol && - (iter > stop_lying_iter + tol_wait + if (!nnat(opt$is_terminated) && + !is.null(g2tol) && g2tolval < g2tol && + (iter > stop_lying_iter + tol_wait || opt_stages[opt_stage_idx] != "opt")) { stop_early <- TRUE - tsmessage("Stopping early: ||G||2 tolerance (", formatC(g2tol), ") met") + tsmessage("Stopping early: ||G||2 tolerance (", + formatC(g2tol), + ") met") } # Stop current stage early if we aren't making progress @@ -1349,27 +1692,28 @@ smallvis <- function(X, k = 2, scale = "absmax", mom_switch_iter <- stop_lying_iter + n_low_mom_iters opt$mom_switch_iter <- mom_switch_iter } - + # we know there is at least one more stage or we would have hit the # break earlier next_opt_stage <- opt_stages[opt_stage_idx + 1] - switch(next_opt_stage, - opt = tsmessage("Proceeding to main optimization stage"), - late = { - n_late_exagg_iters <- max_iter - start_late_lying_iter - start_late_lying_iter <- iter + 1 - max_iter <- start_late_lying_iter + n_late_exagg_iters - tsmessage("Proceeding to late exaggeration stage") - }, - stop("BUG: unknown optimization stage '", next_opt_stage, "'") + switch( + next_opt_stage, + opt = tsmessage("Proceeding to main optimization stage"), + late = { + n_late_exagg_iters <- max_iter - start_late_lying_iter + start_late_lying_iter <- iter + 1 + max_iter <- start_late_lying_iter + n_late_exagg_iters + tsmessage("Proceeding to late exaggeration stage") + }, + stop("BUG: unknown optimization stage '", next_opt_stage, "'") ) } if (nnat(opt$is_terminated)) { break } - + old_cost <- cost # Any special custom epoch stuff @@ -1378,7 +1722,7 @@ smallvis <- function(X, k = 2, scale = "absmax", cost_fn <- epoch_res$cost } } - + if (opt_stages[opt_stage_idx] == "early") { cost_fn <- stop_exaggerating(cost_fn, exaggeration_factor) } @@ -1388,413 +1732,33 @@ smallvis <- function(X, k = 2, scale = "absmax", # Recenter before output Y <- sweep(Y, 2, colMeans(Y)) - res <- ret_value(Y, ret_extra, method, X, scale, Y_init, iter, start_time, - cost_fn = cost_fn, opt_res$G, - perplexity, itercosts, - stop_lying_iter, start_late_lying_iter, opt_list, opt, - exaggeration_factor, late_exaggeration_factor, - optionals = ret_optionals, - pca = ifelse(pca && !whiten, initial_dims, 0), - whiten = ifelse(pca && 
whiten, initial_dims, 0), - use_cpp = use_cpp, n_threads = n_threads) - - res -} - -#' Best t-SNE Result From Multiple Initializations -#' -#' Run t-SNE multiple times from a random initialization, and return the -#' embedding with the lowest cost. -#' -#' This function ignores any value of \code{Y_init} you set, and uses -#' \code{Y_init = "rand"}. -#' -#' @param nrep Number of repeats. -#' @param keep_all If \code{TRUE}, then the return value is a list of lists, -#' indexed from 1 .. \code{nrep}, with each entry the result from each -#' \code{\link{smallvis}} run. Otherwise just the result with the lowest error -#' is returned. -#' @param ... Arguments to apply to each \code{\link{smallvis}} run. -#' @return The \code{\link{smallvis}} result with the lowest final cost, or -#' if \code{keep_all} is \code{TRUE} all results as a list, indexed as 1 .. -#' \code{nrep}. If \code{ret_extra} is not \code{FALSE}, then the final costs for all -#' \code{nrep} runs are also included in the return value list as a vector -#' called \code{all_costs}. In this case, if \code{keep_all} is \code{TRUE}, then -#' \code{all_costs} appears as an extra item on all results. Additionally, -#' each result will have an extra entry \code{best_rep}, giving the index of the -#' result with the lowest cost. -#' @examples -#' \dontrun{ -#' # Return best result out of five random initializations -#' tsne_iris_best <- smallvis_rep(nrep = 5, X = iris, perplexity = 50, method = "tsne", -#' ret_extra = TRUE) -#' # How much do the costs vary between runs? -#' range(tsne_iris_best$all_costs) -#' # Display best embedding found -#' plot(tsne_iris_best$Y) -#' -#' # Keep all results -#' # First result is in tsne_iris_rep[[1]], second in tsne_iris_rep[[2]] etc. -#' tsne_iris_rep <- smallvis_rep(nrep = 5, X = iris, perplexity = 50, method = "tsne", -#' ret_extra = TRUE, keep_all = TRUE) -#' # Index of result with smallest error is in special list item 'best_rep' -#' best_iris <- tsne_iris_rep[[tsne_iris_rep[[1]]$best_rep]] -#' -#' } -#' @export -smallvis_rep <- function(nrep = 10, keep_all = FALSE, ...) { - if (nrep < 1) { - stop("nrep must be 1 or greater") - } - varargs <- list(...) 
- best_res <- NULL - best_cost <- Inf - all_costs <- c() - # Keep requested return type for final result - ret_extra <- varargs$ret_extra - - # always return extra so we can find the cost - if (!should_ret_extra(ret_extra)) { - varargs$ret_extra <- TRUE - } - - varargs$Y_init <- "rand" - ret <- list() - - for (i in 1:nrep) { - # If verbose is not explicitly set to FALSE, it's TRUE by default - if (nnat(varargs$verbose) || is.null(varargs$verbose)) { - tsmessage("Starting embedding # ", i, " of ", nrep) - } - res <- do.call(smallvis, varargs) - - if (keep_all) { - ret[[i]] <- res - } - - final_cost <- res$itercosts[length(res$itercosts)] - names(final_cost) <- NULL - all_costs <- c(all_costs, final_cost) - - if (!keep_all) { - if (final_cost < best_cost) { - best_cost <- final_cost - best_res <- res - } - } - } - - if (keep_all) { - # if keep_all is TRUE and we asked for extra return info - # also add the final costs and the index of the best result to each result - if (should_ret_extra(ret_extra)) { - best_rep <- which.min(all_costs) - for (i in 1:nrep) { - ret[[i]]$all_costs <- all_costs - ret[[i]]$best_rep <- best_rep - } - } - # otherwise just store the Y-coordinates of each result - else { - for (i in 1:nrep) { - ret[[i]] <- ret[[i]]$Y - } - } - } - else { - # Only keeping one result - if (should_ret_extra(ret_extra)) { - # store info about other results on best result list - best_res$all_costs <- all_costs - ret <- best_res - } - else { - ret <- best_res$Y - } - } - ret -} - -# If ret_extra is NULL or FALSE, we aren't returning extra info -# If ret_extra is TRUE or a vector, we are returning extra info -should_ret_extra <- function(ret_extra) { - !is.null(ret_extra) && ((methods::is(ret_extra, "logical") && ret_extra) || - methods::is(ret_extra, "character")) -} - -#' Dimensionality Reduction With Perplexity Stepping -#' -#' Carry out dimensionality reduction of a (small) dataset using one of a -#' variety of neighbor embedding methods, using a decreasing value of -#' perplexity to avoid bad local minima. -#' -#' This function uses ideas similar to those in the NeRV (Venna et al., 2010) -#' and JSE (Lee et al., 2013), where to avoid local minima, the initial -#' optimization steps use affinities with larger bandwidths (NeRV) or larger -#' perplexity values (JSE). This implementation uses a series of decreasing -#' perplexity values, as in JSE. -#' -#' For details on the arguments that can be passed to the dimensionality -#' reduction routine, see the help text for \code{\link{smallvis}}. -#' -#' To avoid spending too much extra time in perplexity calibrations, the extra -#' perplexities start at the power of 2 closest to, but not greater than, -#' half the dataset size (in terms of number of objects). Further calibrations -#' are then carried out halving the perplexity each time, until the perplexity -#' specified by the user is reached. -#' -#' The number of iterations spent in the larger perplexity values is specified -#' by the \code{step_iter} parameter. This determines the total number -#' of iterations, e.g. if \code{step_iter = 250} and extra optimizations -#' at a perplexity of 1024, 512, 256, 128 and 64 will be carried out, these will -#' run for 50 iterations each. To keep the number of iterations equivalent to -#' that used by a single run of \code{\link{smallvis}}, the value of -#' \code{step_iter} is subtracted from the value of \code{max_iter} before -#' the optimization at the target perplexity is carried out, e.g. 
if -#' \code{max_iter = 1000} and \code{step_iter = 250}, the final -#' optimization will run for 750 iterations only. -#' -#' Any value of \code{tol}, \code{exaggeration_factor} and -#' \code{stop_lying_iter} provided is used only with the final optimization. -#' -#' @param step_iter Number of iterations to carry out the perplexity -#' stepping. Must be < the value of \code{max_iter}. -#' @param ... Arguments to be passed to \code{\link{smallvis}}. See 'Details' -#' for information on which arguments may be modified or ignored during certain -#' parts of the embedding. -#' @return The result of the final run of \code{\link{smallvis}} at the target -#' perplexity. -#' @examples -#' \dontrun{ -#' # t-SNE on the iris with L-BFGS optimization -#' # The 1000 max_iter is split between 250 iterations at perplexity = 64 -#' # and then 750 iterations at perplexity = 40. -#' iris_lbfgs_pstep <- smallvis_perpscale( -#' step_iter = 250, X = iris, scale = FALSE, verbose = TRUE, Y_init = "spca", -#' ret_extra = c("DX", "DY"), perplexity = 40, max_iter = 1000, opt = list("l-bfgs")) -#' } -#' @export -#' @references -#' Venna, J., Peltonen, J., Nybo, K., Aidos, H., & Kaski, S. (2010). -#' Information retrieval perspective to nonlinear dimensionality reduction for -#' data visualization. -#' \emph{Journal of Machine Learning Research}, \emph{11}, 451-490. -#' -#' Lee, J. A., Renard, E., Bernard, G., Dupont, P., & Verleysen, M. (2013). -#' Type 1 and 2 mixtures of Kullback-Leibler divergences as cost functions in -#' dimensionality reduction based on similarity preservation. -#' \emph{Neurocomputing}, \emph{112}, 92-108. -smallvis_perpstep <- function(step_iter = 250, ...) { - varargs <- list(...) - max_iter <- varargs$max_iter - if (is.null(max_iter)) { - max_iter <- 1000 - } - if (max_iter <= step_iter) { - stop("max_iter must be > step_iter") - } - - target_perplexity <- varargs$perplexity - if (is.null(target_perplexity)) { - target_perplexity <- 30 - } - - X <- varargs$X - if (methods::is(X, "dist")) { - n <- attr(X, "Size") - } - else { - n <- nrow(X) - } - perps <- scale_perps(n = n, target_perp = target_perplexity) - - nperps <- length(perps) - if (nperps > 0) { - max_iter_step <- max(1, floor(step_iter / nperps)) - max_iter_target <- max(1, max_iter - step_iter) - - # Save/Modify some options between step iterations and final optimization - ret_extra <- varargs$ret_extra - varargs$ret_extra <- FALSE - varargs$max_iter <- max_iter_step - tol <- varargs$tol - varargs$tol <- 0 - exaggeration_factor <- varargs$exaggeration_factor - varargs$exaggeration_factor <- 1 - - epoch <- varargs$epoch - varargs$epoch <- max_iter_step - - # Loop over initial perplexities - res <- NULL - for (i in 1:nperps) { - if (nnat(varargs$verbose)) { - tsmessage("Optimizing at step perplexity ", formatC(perps[i]), - " for ", max_iter_step, " iterations") - } - varargs$perplexity <- perps[i] - if (i > 1) { - varargs$Y_init <- res - } - res <- do.call(smallvis, varargs) - } - - varargs$Y_init <- res - varargs$max_iter <- max_iter_target - # Put the old arguments back before final optimization - varargs$perplexity <- target_perplexity - if (!is.null(ret_extra)) { - varargs$ret_extra <- ret_extra - } - if (!is.null(tol)) { - varargs$tol <- tol - } - if (!is.null(epoch)) { - varargs$epoch <- epoch - } - if (!is.null(exaggeration_factor)) { - varargs$exaggeration_factor <- exaggeration_factor - } - } - - if (nnat(varargs$verbose)) { - tsmessage("Optimizing at target perplexity ", formatC(target_perplexity), - " for ", 
max_iter_target, " iterations") - } - do.call(smallvis, varargs) -} - -# Utility function for perplexity step -scale_perps <- function(n, target_perp) { - max_perp <- n / 2 - max_perp <- 2 ^ floor(log(max_perp, 2)) - perp <- max_perp - - if (max_perp > target_perp) { - perps <- c() - while (perp > target_perp) { - perps <- c(perps, perp) - perp <- perp / 2 - } - } - - perps -} - - -# Input Preprocess -------------------------------------------------------- - -# Scale X according to various strategies -scale_input <- function(X, scale, verbose = FALSE) { - if (is.null(scale)) { - scale <- "none" - } - if (is.logical(scale)) { - if (scale) { - scale <- "scale" - } - else { - scale <- "none" - } - } - scale <- match.arg(tolower(scale), c("none", "scale", "range", "absmax")) - - switch(scale, - range = { - tsmessage("Range scaling X") - X <- as.matrix(X) - X <- X - min(X) - X <- X / max(X) - }, - absmax = { - tsmessage("Normalizing by abs-max") - X <- base::scale(X, scale = FALSE) - X <- X / abs(max(X)) - }, - scale = { - tsmessage("Scaling to zero mean and unit variance") - X <- Filter(stats::var, X) - tsmessage("Kept ", ncol(X), " non-zero-variance columns") - X <- base::scale(X, scale = TRUE) - }, - none = { - X <- as.matrix(X) - } + res <- ret_value( + Y, + ret_extra, + method, + X, + scale, + Y_init, + iter, + start_time, + cost_fn = cost_fn, + opt_res$G, + perplexity, + itercosts, + stop_lying_iter, + start_late_lying_iter, + opt_list, + opt, + exaggeration_factor, + late_exaggeration_factor, + optionals = ret_optionals, + pca = ifelse(pca && !whiten, initial_dims, 0), + whiten = ifelse(pca && whiten, initial_dims, 0), + use_cpp = use_cpp, + n_threads = n_threads ) - X -} - -# Reduce input dimensionality via PCA and also optionally whiten data -pca_preprocess <- function(X, pca, whiten, initial_dims, verbose = FALSE) { - # We won't do PCA if the rank of the input is less than the requested - # initial dimensionality - if (pca) { - pca <- min(nrow(X), ncol(X)) >= initial_dims - } - if (pca) { - if (whiten) { - tsmessage("Reducing initial dimensionality with PCA and ", - "whitening to ", initial_dims, " dims") - X <- pca_whiten(X = X, ncol = initial_dims, verbose = verbose) - } - else { - tsmessage("Reducing initial dimensionality with PCA to ", - initial_dims, " dims") - X <- pca_scores(X = X, ncol = initial_dims, verbose = verbose) - } - } - X -} - - - - -# Epoch Functions --------------------------------------------------------- - -do_epoch <- function(opt, cost, iter, Y, fn_val) { - if (!is.null(cost$epoch)) { - res <- cost$epoch(opt, cost, iter, Y, fn_val) - if (!is.null(res$opt)) { - opt <- res$opt - } - if (!is.null(res$cost)) { - cost <- res$cost - } - } - - list( - opt = opt, - cost = cost - ) -} - -# Helper function for epoch callback, allowing user to supply callbacks with -# multiple arities. 
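# Illustrative sketch (not part of the patch): do_callback() below dispatches
# on the callback's arity, so for example a three-argument callback
#
#   cb <- function(Y, iter, cost) {
#     message("iter ", iter, " cost: ", formatC(cost))
#   }
#   smallvis(X, perplexity = 30, epoch_callback = cb)
#
# is invoked as cb(Y, iter, cost) at each epoch; a one-argument function(Y)
# or the full function(Y, iter, cost, cost_fn, opt) form is matched the same
# way.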
-do_callback <- function(cb, Y, iter, cost = NULL, cost_fn = NULL, opt = NULL) { - nfs <- length(formals(cb)) - switch(nfs, - "1" = cb(Y), - "2" = cb(Y, iter), - "3" = cb(Y, iter, cost), - "4" = cb(Y, iter, cost, cost_fn), - "5" = cb(Y, iter, cost, cost_fn, opt) - ) -} - -# Create a callback for visualization -make_smallvis_cb <- function(df) { - force(df) - palette <- NULL - function(Y, iter, cost = NULL) { - if (is.null(palette)) { - palette <- vizier:::color_helper(df, color_scheme = grDevices::rainbow)$palette - } - title <- paste0("iter: ", iter) - if (!(is.null(cost) || is.na(cost))) { - title <- paste0(title, " cost = ", formatC(cost)) - } - vizier::embed_plot(Y, df, title = title, color_scheme = palette) - } + + res } # Result Export ----------------------------------------------------------- @@ -1805,19 +1769,33 @@ make_smallvis_cb <- function(df) { # If ret_extra is TRUE and iter > 0, then all the NULL-default parameters are # expected to be present. If iter == 0 then the return list will contain only # scaling and initialization information. -ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = NULL, +ret_value <- function(Y, + ret_extra, + method, + X, + scale, + Y_init, + iter, + start_time = NULL, cost_fn = NULL, G = NULL, - perplexity = NULL, pca = 0, whiten = 0, + perplexity = NULL, + pca = 0, + whiten = 0, itercosts = NULL, - stop_lying_iter = NULL, start_late_lying_iter = NULL, - opt_input = NULL, opt_res = NULL, - exaggeration_factor = 1, late_exaggeration_factor = 1, - optionals = c(), use_cpp = FALSE, n_threads = 1) { + stop_lying_iter = NULL, + start_late_lying_iter = NULL, + opt_input = NULL, + opt_res = NULL, + exaggeration_factor = 1, + late_exaggeration_factor = 1, + optionals = c(), + use_cpp = FALSE, + n_threads = 1) { attr(Y, "dimnames") <- NULL if (ret_extra) { end_time <- Sys.time() - + if (methods::is(X, "dist")) { N <- attr(X, "Size") origD <- NULL @@ -1826,7 +1804,7 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = N <- nrow(X) origD <- ncol(X) } - + res <- list( Y = Y, N = N, @@ -1837,24 +1815,24 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = iter = iter, time_secs = as.numeric(end_time - start_time, units = "secs") ) - + if (!is.null(G)) { res$G2norm <- norm2(G) } - + if (pca > 0) { res$pca_dims <- pca } else if (whiten > 0) { res$whiten_dims <- whiten } - + if (is.null(cost_fn$pcost)) { cost_fn <- cost_grad(cost_fn, Y) cost_fn <- cost_point(cost_fn, Y) } res$costs <- cost_fn$pcost - + if (!is.null(opt_input)) { res$opt <- opt_input if (!is.null(opt_res$counts)) { @@ -1862,14 +1840,14 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = } } - + # Don't report exaggeration settings if they didn't do anything if (exaggeration_factor == 1 || late_exaggeration_factor == 1) { if (exaggeration_factor == 1) { exaggeration_factor <- NULL stop_lying_iter <- NULL } - + if (late_exaggeration_factor == 1) { late_exaggeration_factor <- NULL start_late_lying_iter <- NULL @@ -1882,22 +1860,25 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = late_exaggeration_factor <- NULL } } - - res <- c(res, list( - perplexity = perplexity, - itercosts = itercosts, - exaggeration_factor = exaggeration_factor, - stop_lying_iter = stop_lying_iter, - late_exaggeration_factor = late_exaggeration_factor, - start_late_lying_iter = start_late_lying_iter - )) - + + res <- c( + res, + list( + perplexity = perplexity, + itercosts 
= itercosts, + exaggeration_factor = exaggeration_factor, + stop_lying_iter = stop_lying_iter, + late_exaggeration_factor = late_exaggeration_factor, + start_late_lying_iter = start_late_lying_iter + ) + ) + # If using the Intrinsic Dimensionality method, use the chosen perplexity if (!is.null(cost_fn$idp)) { res$perplexity <- cost_fn$idp } - - + + optionals <- tolower(unique(optionals)) for (o in optionals) { exported <- NULL @@ -1925,12 +1906,12 @@ ret_value <- function(Y, ret_extra, method, X, scale, Y_init, iter, start_time = else if (o == "dy") { res$DY <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads) } - + if (o == "x") { res$X <- X } } - + res <- remove_nulls(res) res } From c78fa66dc6d99138e07d28c2e3bf3c0c8cf62cde Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 12:51:41 -0700 Subject: [PATCH 20/26] reformatting --- smallvis/R/smallvis.R | 1591 ++++++++++++++++++++--------------------- 1 file changed, 791 insertions(+), 800 deletions(-) diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R index 241e7a0..0c5b12e 100644 --- a/smallvis/R/smallvis.R +++ b/smallvis/R/smallvis.R @@ -876,579 +876,203 @@ smallvis <- function(X, if (is.logical(epoch_callback)) { if (epoch_callback) { epoch_callback <- make_smallvis_cb(X) - } - else { + } else { epoch_callback <- NULL } - } - else if (is.function(epoch_callback)) { + } else if (is.function(epoch_callback)) { force(epoch_callback) } - + # The embedding method - method_names <- c( - "tsne", - "largevis", - "umap", - "tumap", - "ntumap", - "mmds", - "gmmds", - "asne", - "ssne", - "wtsne", - "wssne", - "hssne", - "ee", - "nerv", - "snerv", - "jse", - "sjse", - "smmds", - "sammon", - "tasne", - "trmsne", - "trsrsne", - "tmsne", - "arsrsne", - "rsrjse", - "rsrnerv", - "btsne", - "bssne", - "basne", - "btasne", - "bnerv", - "ballmmds", - "knnmmds", - "dhssne", - "pstsne", - "tsneu", - "skdtsne", - "usne", - "cetsne", - "tee", - "absne", - "chsne", - "hlsne", - "rklsne", - "jssne", - "gsne", - "abssne", - "bhssne" + cost_fn <- create_cost(method, perplexity, eps, n_threads, use_cpp) + + if (exaggeration_factor != 1) { + if (stop_lying_iter < 1) { + stop("stop_lying_iter must be >= 1") + } + } + + if (late_exaggeration_factor != 1) { + if (start_late_lying_iter < 1) { + stop("start_late_lying_iter must be >= 1") + } + } + + if (exaggeration_factor != 1 && late_exaggeration_factor != 1) { + if (start_late_lying_iter < stop_lying_iter) { + stop("start_late_lying_iter must be >= stop_lying_iter") + } + } + + if (methods::is(pca, "character") && pca == "whiten") { + pca <- TRUE + whiten <- TRUE + } else { + whiten <- FALSE + } + if (pca && initial_dims < k) { + stop( + "Initial PCA dimensionality must be larger than desired output ", + "dimension" + ) + } + + start_time <- NULL + ret_optionals <- c() + if (methods::is(ret_extra, "character")) { + ret_optionals <- ret_extra + ret_extra <- TRUE + } + + if (ret_extra) { + start_time <- Sys.time() + } + + if (methods::is(X, "dist")) { + n <- attr(X, "Size") + } else { + if (methods::is(X, "data.frame")) { + indexes <- which(vapply(X, is.numeric, logical(1))) + tsmessage("Found ", length(indexes), " numeric columns") + if (length(indexes) == 0) { + stop("No numeric columns found") + } + X <- X[, indexes] + } + + X <- scale_input(X, scale, verbose = verbose) + X <- pca_preprocess(X, pca, whiten, initial_dims, verbose = verbose) + n <- nrow(X) + } + + # Check for NA + if (any(is.na(X))) { + stop("Input data contains NA: missing data is not allowed") + } + + # Fail early as 
possible if matrix initializer is invalid + if (methods::is(Y_init, "matrix")) { + if (nrow(Y_init) != n || ncol(Y_init) != k) { + stop("Y_init matrix does not match necessary configuration for X") + } + } + + # Optimizer + if (opt[[1]] == "dbd" || opt[[1]] == "ndbd") { + if (eta == "optsne") { + eta <- n / exaggeration_factor + tsmessage("Using opt-SNE learning rate = ", formatC(eta)) + } + opt_list <- lmerge( + opt, + list( + momentum = momentum, + final_momentum = final_momentum, + mom_switch_iter = mom_switch_iter, + eta = eta, + min_gain = min_gain, + verbose = verbose + ) + ) + } else { + opt_list <- opt + } + opt <- opt_create(opt_list, verbose = verbose) + + # Initialize the cost function and create P + cost_fn <- cost_init( + cost_fn, + X, + max_iter = max_iter, + verbose = verbose, + ret_extra = ret_optionals ) - if (is.character(method)) { - method <- match.arg(tolower(method), method_names) - cost_fn <- switch( - method, - tsne = tsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - umap = umap( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - largevis = largevis( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - tumap = tumap( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - ntumap = ntumap( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - mmds = mmds( - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - gmmds = gmmds( - k = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - asne = asne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - ssne = ssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - wtsne = wtsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - wssne = wssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - hssne = hssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - ee = ee( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - nerv = nerv( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - snerv = snerv( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - jse = jse( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - sjse = sjse( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - smmds = smmds( - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - sammon = sammon( - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - tasne = tasne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - trmsne = trmsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - trsrsne = trsrsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - tmsne = tmsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - arsrsne = arsrsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - rsrjse = rsrjse( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - rsrnerv = rsrnerv( - 
perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - btsne = btsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - bssne = bssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - basne = basne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - btasne = btasne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - bnerv = bnerv( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - ballmmds = ballmmds( - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - knnmmds = knnmmds( - k = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - dhssne = dhssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - tsneu = tsneu( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - pstsne = pstsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - skdtsne = skdtsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - usne = usne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - cetsne = cetsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - tee = tee( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - absne = absne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - chsne = chsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - hlsne = hlsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - gsne = gsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - rklsne = rklsne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - jssne = jssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - abssne = abssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - bhssne = bhssne( - perplexity = perplexity, - n_threads = n_threads, - eps = eps, - use_cpp = use_cpp - ), - stop("BUG: someone forgot to implement option: '", method, "'") - ) - } - else { - if (is.list(method)) { - methodlist <- method - method_name <- methodlist[[1]] - methodlist[[1]] <- NULL - method <- match.arg(tolower(method_name), method_names) - if (exists(method_name)) { - fn <- get(method_name) - } - else { - stop("Unknown method: '", method_name, "'") + + # Output Initialization + if (!is.null(Y_init)) { + if (methods::is(Y_init, "matrix")) { + Y <- Y_init + Y_init <- "matrix" + } else { + Y_init <- match.arg( + tolower(Y_init), + c("rand", "pca", "spca", "laplacian", "normlaplacian") + ) + + if (!is_spectral_init(Y_init)) { + # We now handle scaling coordinates below, so spca is treated like pca + non_spectral_init <- Y_init + if (non_spectral_init == "spca") { + non_spectral_init <- "pca" + } + Y <- init_out(non_spectral_init, + X, + n, + k, + pca_preprocessed = pca, + verbose = verbose + ) + } else { + # Use normalized or unnormalized input weight for spectral initialization + # Pretty sure it doesn't matter, given the current options + A <- cost_fn$V + if (is.null(A)) { + A <- cost_fn$P + if 
(!is.null(A)) { + tsmessage("Using P for spectral initialization") + } + } else { + tsmessage("Using V for spectral initialization") + } + + if (is.null(A)) { + if (!is.null(cost_fn$knn)) { + tsmessage("Using knn 1 / 1 + R for spectral initialization") + A <- 1 / (1 + cost_fn$R) + A[cost_fn$knn == 0] <- 0 + } else if (!is.null(cost_fn$R)) { + tsmessage("Using 1/ (1 + R) for spectral initialization") + A <- 1 / (1 + cost_fn$R) + } else { + stop("No suitable input for spectral initialization") + } + } + + if (Y_init == "laplacian") { + tsmessage("Initializing from Laplacian Eigenmap") + Y <- laplacian_eigenmap(A, ndim = k) + } else { + tsmessage("Initializing from normalized Laplacian") + Y <- normalized_spectral_init(A, ndim = k) + } } } - else if (is.function(method)) { - fn <- method - methodlist <- list() - } - else { - stop("Bad method parameter type") - } - - param_names <- names(formals(fn)) - if ("perplexity" %in% param_names) { - methodlist$perplexity <- perplexity - } - if ("k" %in% param_names) { - methodlist$k <- perplexity - } - cost_fn <- do.call(fn, methodlist) - } - - if (exaggeration_factor != 1) { - if (stop_lying_iter < 1) { - stop("stop_lying_iter must be >= 1") + if (!is.null(Y_init_sdev) || + Y_init == "spca" || Y_init == "rand") { + if (is.null(Y_init_sdev)) { + Y_init_sdev <- 1e-4 + } + tsmessage("Scaling initial coords to sdev = ", formatC(Y_init_sdev)) + Y <- shrink_coords(Y, Y_init_sdev) } } - - if (late_exaggeration_factor != 1) { - if (start_late_lying_iter < 1) { - stop("start_late_lying_iter must be >= 1") - } - } - - if (exaggeration_factor != 1 && late_exaggeration_factor != 1) { - if (start_late_lying_iter < stop_lying_iter) { - stop("start_late_lying_iter must be >= stop_lying_iter") - } - } - - if (methods::is(pca, "character") && pca == "whiten") { - pca <- TRUE - whiten <- TRUE - } - else { - whiten <- FALSE - } - if (pca && initial_dims < k) { - stop("Initial PCA dimensionality must be larger than desired output ", - "dimension") - } - - start_time <- NULL - ret_optionals <- c() - if (methods::is(ret_extra, "character")) { - ret_optionals <- ret_extra - ret_extra <- TRUE - } - - if (ret_extra) { - start_time <- Sys.time() - } - - if (methods::is(X, "dist")) { - n <- attr(X, "Size") - } - else { - if (methods::is(X, "data.frame")) { - indexes <- which(vapply(X, is.numeric, logical(1))) - tsmessage("Found ", length(indexes), " numeric columns") - if (length(indexes) == 0) { - stop("No numeric columns found") - } - X <- X[, indexes] - } - - X <- scale_input(X, scale, verbose = verbose) - X <- pca_preprocess(X, pca, whiten, initial_dims, verbose = verbose) - n <- nrow(X) - } - - # Check for NA - if (any(is.na(X))) { - stop("Input data contains NA: missing data is not allowed") - } - - # Fail early as possible if matrix initializer is invalid - if (methods::is(Y_init, "matrix")) { - if (nrow(Y_init) != n || ncol(Y_init) != k) { - stop("Y_init matrix does not match necessary configuration for X") - } - } - - # Optimizer - if (opt[[1]] == "dbd" || opt[[1]] == "ndbd") { - if (eta == "optsne") { - eta <- n / exaggeration_factor - tsmessage("Using opt-SNE learning rate = ", formatC(eta)) - } - opt_list <- lmerge( - opt, - list( - momentum = momentum, - final_momentum = final_momentum, - mom_switch_iter = mom_switch_iter, - eta = eta, - min_gain = min_gain, - verbose = verbose - ) - ) - } - else { - opt_list <- opt - } - opt <- opt_create(opt_list, verbose = verbose) - - # Initialize the cost function and create P - cost_fn <- cost_init( - cost_fn, - X, - max_iter 
= max_iter, - verbose = verbose, - ret_extra = ret_optionals - ) - - # Output Initialization - if (!is.null(Y_init)) { - if (methods::is(Y_init, "matrix")) { - Y <- Y_init - Y_init <- "matrix" - } - else { - Y_init <- match.arg(tolower(Y_init), - c("rand", "pca", "spca", "laplacian", "normlaplacian")) - - if (!is_spectral_init(Y_init)) { - # We now handle scaling coordinates below, so spca is treated like pca - non_spectral_init <- Y_init - if (non_spectral_init == "spca") { - non_spectral_init <- "pca" - } - Y <- init_out(non_spectral_init, - X, - n, - k, - pca_preprocessed = pca, - verbose = verbose) - } - else { - # Use normalized or unnormalized input weight for spectral initialization - # Pretty sure it doesn't matter, given the current options - A <- cost_fn$V - if (is.null(A)) { - A <- cost_fn$P - if (!is.null(A)) { - tsmessage("Using P for spectral initialization") - } - } - else { - tsmessage("Using V for spectral initialization") - } - - if (is.null(A)) { - if (!is.null(cost_fn$knn)) { - tsmessage("Using knn 1 / 1 + R for spectral initialization") - A <- 1 / (1 + cost_fn$R) - A[cost_fn$knn == 0] <- 0 - } - else if (!is.null(cost_fn$R)) { - tsmessage("Using 1/ (1 + R) for spectral initialization") - A <- 1 / (1 + cost_fn$R) - } - else { - stop("No suitable input for spectral initialization") - } - } - - if (Y_init == "laplacian") { - tsmessage("Initializing from Laplacian Eigenmap") - Y <- laplacian_eigenmap(A, ndim = k) - } - else { - tsmessage("Initializing from normalized Laplacian") - Y <- normalized_spectral_init(A, ndim = k) - } - } - } - if (!is.null(Y_init_sdev) || - Y_init == "spca" || Y_init == "rand") { - if (is.null(Y_init_sdev)) { - Y_init_sdev <- 1e-4 - } - tsmessage("Scaling initial coords to sdev = ", formatC(Y_init_sdev)) - Y <- shrink_coords(Y, Y_init_sdev) - } - } - + cost <- NA itercosts <- c() if (iter0_cost && (verbose || ret_extra)) { cost_eval_res <- cost_eval(cost_fn, Y) cost_fn <- cost_eval_res$cost cost <- cost_eval_res$value - + tsmessage("Iteration #0 error: ", formatC(cost)) - + if (ret_extra) { names(cost) <- 0 itercosts <- c(itercosts, cost) } cost_fn <- cost_clear(cost_fn) } - + # Display initialization if (!is.null(epoch_callback)) { do_callback(epoch_callback, Y, 0, cost, cost_fn, opt) @@ -1469,13 +1093,13 @@ smallvis <- function(X, optionals = ret_optionals, pca = ifelse(pca && !whiten, initial_dims, 0), whiten = ifelse(pca && - whiten, initial_dims, 0), + whiten, initial_dims, 0), use_cpp = use_cpp, n_threads = n_threads ) ) } - + opt_stages <- c() if (exaggeration_factor != 1) { opt_stages <- c(opt_stages, "early") @@ -1485,7 +1109,7 @@ smallvis <- function(X, opt_stages <- c(opt_stages, "late") } opt_stage_idx <- 1 - + opt_epoch <- epoch opt_tol <- tol old_cdiffrc <- NULL @@ -1506,15 +1130,14 @@ smallvis <- function(X, " iterations" ) } - } - else { + } else { stop_lying_iter <- 0 } - + old_cost <- NULL tolval <- NULL tsmessage("Optimizing coordinates") - + # Use a while loop, so we can change max_iter inside the loop iter <- 0 while (iter < max_iter) { @@ -1523,18 +1146,18 @@ smallvis <- function(X, opt <- opt_res$opt cost_fn <- opt_res$cost_fn Y <- opt_res$Y - + if (!is.null(cost_fn$P) && iter == stop_lying_iter && - exaggeration_factor != 1) { + exaggeration_factor != 1) { tsmessage("Switching off exaggeration at iter ", iter) cost_fn <- stop_exaggerating(cost_fn, exaggeration_factor) opt_stage_idx <- opt_stage_idx + 1 epoch <- opt_epoch tol <- opt_tol } - + if (!is.null(cost_fn$P) && iter == start_late_lying_iter && - 
late_exaggeration_factor != 1) { + late_exaggeration_factor != 1) { tsmessage( "Starting late exaggeration = ", formatC(late_exaggeration_factor), @@ -1544,223 +1167,598 @@ smallvis <- function(X, max_iter ) cost_fn <- start_exaggerating(cost_fn, late_exaggeration_factor) - + opt_stage_idx <- opt_stage_idx + 1 } - - if (nnat(opt$is_terminated)) { - tsmessage( - "Iteration #", - iter, - " stopping early: optimizer reports convergence: ", - opt$terminate$what - ) - max_iter <- iter - break + + if (nnat(opt$is_terminated)) { + tsmessage( + "Iteration #", + iter, + " stopping early: optimizer reports convergence: ", + opt$terminate$what + ) + max_iter <- iter + break + } + + # Recenter after each iteration + Y <- sweep(Y, 2, colMeans(Y)) + + if ((epoch > 0 && iter %% epoch == 0) || iter == max_iter) { + stop_early <- FALSE + + cost_eval_res <- cost_eval(cost_fn, Y, opt_res) + cost_fn <- cost_eval_res$cost + cost <- cost_eval_res$value + + if (!is.null(old_cost)) { + tolval <- reltol(cost, old_cost) / epoch + } + + if (verbose) { + tsmessage( + "Iteration #", + iter, + " error: ", + formatC(cost), + " ||G||2 = ", + formatC(norm2(opt_res$G)), + appendLF = FALSE + ) + if (!is.null(tolval)) { + message(" tol = ", formatC(tolval), appendLF = FALSE) + } + + if (!is.null(opt$counts)) { + message(" nf = ", + opt$counts$fn, + " ng = ", + opt$counts$gr, + appendLF = FALSE + ) + } + + # special treatment for mize innards + if (!is.null(opt$stages$gradient_descent$step_size$value)) { + opt$stages$gradient_descent$step_size$value + message( + " alpha = ", + formatC(opt$stages$gradient_descent$step_size$value), + appendLF = FALSE + ) + } + + if (!is.null(old_cost) && cost > old_cost) { + message(" !", appendLF = FALSE) + } + + if (!is.null(opt$epoch)) { + opt <- opt$epoch(Y, iter, cost, cost_fn, opt) + } + + message() + utils::flush.console() + } + if (!is.null(epoch_callback)) { + do_callback(epoch_callback, Y, iter, cost, cost_fn, opt) + } + + if (ret_extra) { + names(cost) <- iter + itercosts <- c(itercosts, cost) + } + + # Early stopping tests + if (!is.null(ee_mon_epoch)) { + cdiff <- old_cost - cost + cdiffrc <- cdiff / old_cost + if (length(cdiffrc) > 0 && + opt_stages[opt_stage_idx] == "early") { + if (iter > ee_mon_wait && + length(cdiffrc) > 0 && length(old_cdiffrc) > 0 && + cdiffrc < old_cdiffrc) { + if (ee_mon_buffer < 1) { + stop_early <- TRUE + tsmessage("Stopping early: EE relative change reached maximum") + } else { + ee_mon_buffer <- ee_mon_buffer - 1 + } + } + } + old_cdiffrc <- cdiffrc + } + if (cost < min_cost) { + stop_early <- TRUE + tsmessage("Stopping early: cost fell below min_cost") + } + + if (!nnat(opt$is_terminated) && !is.null(tolval) && + tolval < tol && cost <= old_cost && + (iter > stop_lying_iter + tol_wait || + opt_stages[opt_stage_idx] != "opt")) { + stop_early <- TRUE + tsmessage( + "Stopping early: relative tolerance (", + formatC(tol), + ") met" + ) + } + + # Alternative tolerance grad 2norm doesn't need cost to decrease to stop + # (Use this for certain settings with e.g. 
LargeVis where numerical issues + # can cause the cost function to increase almost negligibly) + g2tolval <- (norm2(opt_res$G)) + if (!nnat(opt$is_terminated) && + !is.null(g2tol) && g2tolval < g2tol && + (iter > stop_lying_iter + tol_wait || + opt_stages[opt_stage_idx] != "opt")) { + stop_early <- TRUE + tsmessage( + "Stopping early: ||G||2 tolerance (", + formatC(g2tol), + ") met" + ) + } + + # Stop current stage early if we aren't making progress + if (stop_early) { + if (opt_stage_idx == length(opt_stages)) { + # run out of optimization stages, so give up + break + } + + opt_stage <- opt_stages[opt_stage_idx] + if (opt_stage == "early") { + # stop early exaggeration and adjust mom_switch_iter accordingly + # Only applies to DBD optimizer + n_low_mom_iters <- mom_switch_iter - stop_lying_iter + stop_lying_iter <- iter + 1 + mom_switch_iter <- stop_lying_iter + n_low_mom_iters + opt$mom_switch_iter <- mom_switch_iter + } + + # we know there is at least one more stage or we would have hit the + # break earlier + next_opt_stage <- opt_stages[opt_stage_idx + 1] + + switch(next_opt_stage, + opt = tsmessage("Proceeding to main optimization stage"), + late = { + n_late_exagg_iters <- max_iter - start_late_lying_iter + start_late_lying_iter <- iter + 1 + max_iter <- start_late_lying_iter + n_late_exagg_iters + tsmessage("Proceeding to late exaggeration stage") + }, + stop("BUG: unknown optimization stage '", next_opt_stage, "'") + ) + } + + if (nnat(opt$is_terminated)) { + break + } + + old_cost <- cost + + # Any special custom epoch stuff + epoch_res <- do_epoch(opt, cost_fn, iter, Y, cost) + opt <- epoch_res$opt + cost_fn <- epoch_res$cost + } + } + + if (opt_stages[opt_stage_idx] == "early") { + cost_fn <- stop_exaggerating(cost_fn, exaggeration_factor) + } + if (opt_stages[opt_stage_idx] == "late") { + cost_fn <- stop_exaggerating(cost_fn, late_exaggeration_factor) + } + + # Recenter before output + Y <- sweep(Y, 2, colMeans(Y)) + res <- ret_value( + Y, + ret_extra, + method, + X, + scale, + Y_init, + iter, + start_time, + cost_fn = cost_fn, + opt_res$G, + perplexity, + itercosts, + stop_lying_iter, + start_late_lying_iter, + opt_list, + opt, + exaggeration_factor, + late_exaggeration_factor, + optionals = ret_optionals, + pca = ifelse(pca && !whiten, initial_dims, 0), + whiten = ifelse(pca && whiten, initial_dims, 0), + use_cpp = use_cpp, + n_threads = n_threads + ) + + res +} + +create_cost <- function(method, + perplexity, + eps, + n_threads, + use_cpp) { + method_names <- c( + "tsne", + "largevis", + "umap", + "tumap", + "ntumap", + "mmds", + "gmmds", + "asne", + "ssne", + "wtsne", + "wssne", + "hssne", + "ee", + "nerv", + "snerv", + "jse", + "sjse", + "smmds", + "sammon", + "tasne", + "trmsne", + "trsrsne", + "tmsne", + "arsrsne", + "rsrjse", + "rsrnerv", + "btsne", + "bssne", + "basne", + "btasne", + "bnerv", + "ballmmds", + "knnmmds", + "dhssne", + "pstsne", + "tsneu", + "skdtsne", + "usne", + "cetsne", + "tee", + "absne", + "chsne", + "hlsne", + "rklsne", + "jssne", + "gsne", + "abssne", + "bhssne" + ) + if (is.character(method)) { + method <- match.arg(tolower(method), method_names) + cost_fn <- switch(method, + tsne = tsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + umap = umap( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + largevis = largevis( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tumap = tumap( + perplexity = perplexity, + n_threads 
= n_threads, + eps = eps, + use_cpp = use_cpp + ), + ntumap = ntumap( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + mmds = mmds( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + gmmds = gmmds( + k = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + asne = asne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + ssne = ssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + wtsne = wtsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + wssne = wssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + hssne = hssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + ee = ee( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + nerv = nerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + snerv = snerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + jse = jse( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + sjse = sjse( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + smmds = smmds( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + sammon = sammon( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tasne = tasne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + trmsne = trmsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + trsrsne = trsrsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tmsne = tmsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + arsrsne = arsrsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + rsrjse = rsrjse( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + rsrnerv = rsrnerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + btsne = btsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + bssne = bssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + basne = basne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + btasne = btasne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + bnerv = bnerv( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + ballmmds = ballmmds( + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + knnmmds = knnmmds( + k = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + dhssne = dhssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tsneu = tsneu( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + pstsne = pstsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + skdtsne = skdtsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + usne = usne( + perplexity = perplexity, + n_threads = n_threads, + eps = 
eps, + use_cpp = use_cpp + ), + cetsne = cetsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + tee = tee( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + absne = absne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + chsne = chsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + hlsne = hlsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + gsne = gsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + rklsne = rklsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + jssne = jssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + abssne = abssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + bhssne = bhssne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + use_cpp = use_cpp + ), + stop("BUG: someone forgot to implement option: '", method, "'") + ) + } else { + if (is.list(method)) { + methodlist <- method + method_name <- methodlist[[1]] + methodlist[[1]] <- NULL + method <- match.arg(tolower(method_name), method_names) + if (exists(method_name)) { + fn <- get(method_name) + } else { + stop("Unknown method: '", method_name, "'") + } + } else if (is.function(method)) { + fn <- method + methodlist <- list() + } else { + stop("Bad method parameter type") + } + + param_names <- names(formals(fn)) + if ("perplexity" %in% param_names) { + methodlist$perplexity <- perplexity } - - # Recenter after each iteration - Y <- sweep(Y, 2, colMeans(Y)) - - if ((epoch > 0 && iter %% epoch == 0) || iter == max_iter) { - stop_early <- FALSE - - cost_eval_res <- cost_eval(cost_fn, Y, opt_res) - cost_fn <- cost_eval_res$cost - cost <- cost_eval_res$value - - if (!is.null(old_cost)) { - tolval <- reltol(cost, old_cost) / epoch - } - - if (verbose) { - tsmessage( - "Iteration #", - iter, - " error: ", - formatC(cost) - , - " ||G||2 = ", - formatC(norm2(opt_res$G)) - , - appendLF = FALSE - ) - if (!is.null(tolval)) { - message(" tol = ", formatC(tolval), appendLF = FALSE) - } - - if (!is.null(opt$counts)) { - message(" nf = ", - opt$counts$fn, - " ng = ", - opt$counts$gr, - appendLF = FALSE) - } - - # special treatment for mize innards - if (!is.null(opt$stages$gradient_descent$step_size$value)) { - opt$stages$gradient_descent$step_size$value - message( - " alpha = ", - formatC(opt$stages$gradient_descent$step_size$value), - appendLF = FALSE - ) - } - - if (!is.null(old_cost) && cost > old_cost) { - message(" !", appendLF = FALSE) - } - - if (!is.null(opt$epoch)) { - opt <- opt$epoch(Y, iter, cost, cost_fn, opt) - } - - message() - utils::flush.console() - } - if (!is.null(epoch_callback)) { - do_callback(epoch_callback, Y, iter, cost, cost_fn, opt) - } - - if (ret_extra) { - names(cost) <- iter - itercosts <- c(itercosts, cost) - } - - # Early stopping tests - if (!is.null(ee_mon_epoch)) { - cdiff <- old_cost - cost - cdiffrc <- cdiff / old_cost - if (length(cdiffrc) > 0 && - opt_stages[opt_stage_idx] == "early") { - if (iter > ee_mon_wait && - length(cdiffrc) > 0 && length(old_cdiffrc) > 0 && - cdiffrc < old_cdiffrc) { - if (ee_mon_buffer < 1) { - stop_early <- TRUE - tsmessage("Stopping early: EE relative change reached maximum") - } - else { - ee_mon_buffer <- ee_mon_buffer - 1 - } - } - } - old_cdiffrc 
<- cdiffrc - } - if (cost < min_cost) { - stop_early <- TRUE - tsmessage("Stopping early: cost fell below min_cost") - } - - if (!nnat(opt$is_terminated) && !is.null(tolval) && - tolval < tol && cost <= old_cost && - (iter > stop_lying_iter + tol_wait - || opt_stages[opt_stage_idx] != "opt")) { - stop_early <- TRUE - tsmessage("Stopping early: relative tolerance (", - formatC(tol), - ") met") - } - - # Alternative tolerance grad 2norm doesn't need cost to decrease to stop - # (Use this for certain settings with e.g. LargeVis where numerical issues - # can cause the cost function to increase almost negligibly) - g2tolval <- (norm2(opt_res$G)) - if (!nnat(opt$is_terminated) && - !is.null(g2tol) && g2tolval < g2tol && - (iter > stop_lying_iter + tol_wait - || opt_stages[opt_stage_idx] != "opt")) { - stop_early <- TRUE - tsmessage("Stopping early: ||G||2 tolerance (", - formatC(g2tol), - ") met") - } - - # Stop current stage early if we aren't making progress - if (stop_early) { - if (opt_stage_idx == length(opt_stages)) { - # run out of optimization stages, so give up - break - } - - opt_stage <- opt_stages[opt_stage_idx] - if (opt_stage == "early") { - # stop early exaggeration and adjust mom_switch_iter accordingly - # Only applies to DBD optimizer - n_low_mom_iters <- mom_switch_iter - stop_lying_iter - stop_lying_iter <- iter + 1 - mom_switch_iter <- stop_lying_iter + n_low_mom_iters - opt$mom_switch_iter <- mom_switch_iter - } - - # we know there is at least one more stage or we would have hit the - # break earlier - next_opt_stage <- opt_stages[opt_stage_idx + 1] - - switch( - next_opt_stage, - opt = tsmessage("Proceeding to main optimization stage"), - late = { - n_late_exagg_iters <- max_iter - start_late_lying_iter - start_late_lying_iter <- iter + 1 - max_iter <- start_late_lying_iter + n_late_exagg_iters - tsmessage("Proceeding to late exaggeration stage") - }, - stop("BUG: unknown optimization stage '", next_opt_stage, "'") - ) - } - - if (nnat(opt$is_terminated)) { - break - } - - old_cost <- cost - - # Any special custom epoch stuff - epoch_res <- do_epoch(opt, cost_fn, iter, Y, cost) - opt <- epoch_res$opt - cost_fn <- epoch_res$cost + if ("k" %in% param_names) { + methodlist$k <- perplexity } + cost_fn <- do.call(fn, methodlist) } - - if (opt_stages[opt_stage_idx] == "early") { - cost_fn <- stop_exaggerating(cost_fn, exaggeration_factor) - } - if (opt_stages[opt_stage_idx] == "late") { - cost_fn <- stop_exaggerating(cost_fn, late_exaggeration_factor) - } - - # Recenter before output - Y <- sweep(Y, 2, colMeans(Y)) - res <- ret_value( - Y, - ret_extra, - method, - X, - scale, - Y_init, - iter, - start_time, - cost_fn = cost_fn, - opt_res$G, - perplexity, - itercosts, - stop_lying_iter, - start_late_lying_iter, - opt_list, - opt, - exaggeration_factor, - late_exaggeration_factor, - optionals = ret_optionals, - pca = ifelse(pca && !whiten, initial_dims, 0), - whiten = ifelse(pca && whiten, initial_dims, 0), - use_cpp = use_cpp, - n_threads = n_threads - ) - - res } + # Result Export ----------------------------------------------------------- # Prepare the return value. 
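# A minimal usage sketch of the return convention handled by ret_value()
# below (illustrative only, assuming the API documented in this file): with
# ret_extra = TRUE or a character vector of extras, a list is returned,
# otherwise just the coordinate matrix:
#
#   res <- smallvis(iris, perplexity = 30, ret_extra = c("DX", "DY"))
#   head(res$Y)    # the embedded coordinates
#   res$itercosts  # cost recorded at each epoch
#   res$DX         # input distances, one of the requested optional extras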
@@ -1795,16 +1793,15 @@ ret_value <- function(Y, attr(Y, "dimnames") <- NULL if (ret_extra) { end_time <- Sys.time() - + if (methods::is(X, "dist")) { N <- attr(X, "Size") origD <- NULL - } - else { + } else { N <- nrow(X) origD <- ncol(X) } - + res <- list( Y = Y, N = N, @@ -1815,52 +1812,50 @@ ret_value <- function(Y, iter = iter, time_secs = as.numeric(end_time - start_time, units = "secs") ) - + if (!is.null(G)) { res$G2norm <- norm2(G) } - + if (pca > 0) { res$pca_dims <- pca - } - else if (whiten > 0) { + } else if (whiten > 0) { res$whiten_dims <- whiten } - + if (is.null(cost_fn$pcost)) { cost_fn <- cost_grad(cost_fn, Y) cost_fn <- cost_point(cost_fn, Y) } res$costs <- cost_fn$pcost - + if (!is.null(opt_input)) { res$opt <- opt_input if (!is.null(opt_res$counts)) { res$opt$counts <- opt_res$counts } } - - + + # Don't report exaggeration settings if they didn't do anything if (exaggeration_factor == 1 || late_exaggeration_factor == 1) { if (exaggeration_factor == 1) { exaggeration_factor <- NULL stop_lying_iter <- NULL } - + if (late_exaggeration_factor == 1) { late_exaggeration_factor <- NULL start_late_lying_iter <- NULL } - } - else { + } else { # Don't report start_late_lying_iter if we never got there if (start_late_lying_iter > iter) { start_late_lying_iter <- NULL late_exaggeration_factor <- NULL } } - + res <- c( res, list( @@ -1872,13 +1867,13 @@ ret_value <- function(Y, start_late_lying_iter = start_late_lying_iter ) ) - + # If using the Intrinsic Dimensionality method, use the chosen perplexity if (!is.null(cost_fn$idp)) { res$perplexity <- cost_fn$idp } - - + + optionals <- tolower(unique(optionals)) for (o in optionals) { exported <- NULL @@ -1889,8 +1884,7 @@ ret_value <- function(Y, if (!is.null(exported)) { if (nchar(o) < 3) { res[[toupper(o)]] <- exported - } - else { + } else { res[[o]] <- exported } } @@ -1898,24 +1892,21 @@ ret_value <- function(Y, else if (o == "dx") { if (methods::is(X, "dist")) { res$DX <- X - } - else { + } else { res$DX <- calc_d(X, use_cpp = use_cpp, n_threads = n_threads) } - } - else if (o == "dy") { + } else if (o == "dy") { res$DY <- calc_d(Y, use_cpp = use_cpp, n_threads = n_threads) } - + if (o == "x") { res$X <- X } } - + res <- remove_nulls(res) res - } - else { + } else { Y } } From 9c67c6774ea9022f37a99f3fb095ad1975b9f922 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sat, 3 Aug 2024 20:44:26 -0700 Subject: [PATCH 21/26] perplexity using 3 * knn --- smallvis/R/RcppExports.R | 4 ++ smallvis/R/perplexity.R | 45 +++++++----- smallvis/src/RcppExports.cpp | 17 +++++ smallvis/src/perplexity.cpp | 129 +++++++++++++++++++++++++++++++++++ 4 files changed, 179 insertions(+), 16 deletions(-) diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R index 3728f9e..d41745b 100644 --- a/smallvis/R/RcppExports.R +++ b/smallvis/R/RcppExports.R @@ -25,6 +25,10 @@ mmds_grad_cpp <- function(R, D, Y, eps, n_threads) { .Call(`_smallvis_mmds_grad_cpp`, R, D, Y, eps, n_threads) } +find_beta_knn_cpp <- function(knn_distances, knn_indices, perplexity = 15, tol = 1e-5, max_tries = 50L, n_threads = 1L) { + .Call(`_smallvis_find_beta_knn_cpp`, knn_distances, knn_indices, perplexity, tol, max_tries, n_threads) +} + find_beta_cpp <- function(X, perplexity = 15, tol = 1e-5, max_tries = 50L, n_threads = 1L) { .Call(`_smallvis_find_beta_cpp`, X, perplexity, tol, max_tries, n_threads) } diff --git a/smallvis/R/perplexity.R b/smallvis/R/perplexity.R index d2e59f1..627c8aa 100644 --- a/smallvis/R/perplexity.R +++ b/smallvis/R/perplexity.R @@ -488,7 
+488,7 @@ sne_init <- function(cost, row_normalize = TRUE, normalize = TRUE, n_threads = 0, - use_cpp = use_cpp, + use_cpp = FALSE, verbose = FALSE, ret_extra = c()) { if (tolower(kernel) == "knn") { @@ -559,21 +559,34 @@ sne_init <- function(cost, if (!is.numeric(perplexity)) { stop("Unknown perplexity method, '", perplexity[[1]], "'") } - tsmessage( - "Commencing calibration for perplexity = ", - format_perps(perplexity) - ) - if (use_cpp) { - P <- find_beta_cpp(X, perplexity, tol = 1e-5, n_threads = n_threads)$W - } else { - x2ares <- x2aff( - X, - perplexity, - tol = 1e-5, - kernel = kernel, - verbose = verbose - ) - P <- x2ares$W + if (kernel == "perpknn") { + k <- 3 * perplexity + if (k > nrow(X) - 1) { + warning("Perplexity probably too high for number of points,", + " result may not be meaningful") + k <- nrow(X) - 1 + } + tsmessage("Commencing calibration for perplexity = ", + format_perps(perplexity), " with k = ", k) + knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads) + knn_dist <- knn$dist[, 2:(k + 1)] + knn_idx <- knn$idx[, 2:(k + 1)] + P <- find_beta_knn_cpp(knn_dist, knn_idx, perplexity = perplexity, + n_threads = n_threads)$P + } + else { + tsmessage("Commencing calibration for perplexity = ", + format_perps(perplexity)) + if (use_cpp) { + P <- find_beta_cpp(X, perplexity, tol = 1e-5, n_threads = n_threads)$W + } else { + x2ares <- x2aff(X, + perplexity, + tol = 1e-5, + kernel = kernel, + verbose = verbose) + P <- x2ares$W + } } } diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp index 93960dc..ef11c3b 100644 --- a/smallvis/src/RcppExports.cpp +++ b/smallvis/src/RcppExports.cpp @@ -88,6 +88,22 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// find_beta_knn_cpp +List find_beta_knn_cpp(const NumericMatrix& knn_distances, const IntegerMatrix& knn_indices, double perplexity, double tol, int max_tries, std::size_t n_threads); +RcppExport SEXP _smallvis_find_beta_knn_cpp(SEXP knn_distancesSEXP, SEXP knn_indicesSEXP, SEXP perplexitySEXP, SEXP tolSEXP, SEXP max_triesSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericMatrix& >::type knn_distances(knn_distancesSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type knn_indices(knn_indicesSEXP); + Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP); + Rcpp::traits::input_parameter< double >::type tol(tolSEXP); + Rcpp::traits::input_parameter< int >::type max_tries(max_triesSEXP); + Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(find_beta_knn_cpp(knn_distances, knn_indices, perplexity, tol, max_tries, n_threads)); + return rcpp_result_gen; +END_RCPP +} // find_beta_cpp List find_beta_cpp(const NumericMatrix& X, double perplexity, double tol, int max_tries, std::size_t n_threads); RcppExport SEXP _smallvis_find_beta_cpp(SEXP XSEXP, SEXP perplexitySEXP, SEXP tolSEXP, SEXP max_triesSEXP, SEXP n_threadsSEXP) { @@ -111,6 +127,7 @@ static const R_CallMethodDef CallEntries[] = { {"_smallvis_d2_to_tweight_cpp", (DL_FUNC) &_smallvis_d2_to_tweight_cpp, 2}, {"_smallvis_tsne_grad_cpp", (DL_FUNC) &_smallvis_tsne_grad_cpp, 5}, {"_smallvis_mmds_grad_cpp", (DL_FUNC) &_smallvis_mmds_grad_cpp, 5}, + {"_smallvis_find_beta_knn_cpp", (DL_FUNC) &_smallvis_find_beta_knn_cpp, 6}, {"_smallvis_find_beta_cpp", (DL_FUNC) &_smallvis_find_beta_cpp, 5}, {NULL, NULL, 0} }; diff --git a/smallvis/src/perplexity.cpp 
b/smallvis/src/perplexity.cpp
index b261897..af7c85d 100644
--- a/smallvis/src/perplexity.cpp
+++ b/smallvis/src/perplexity.cpp
@@ -9,6 +9,83 @@
 using namespace Rcpp;
 
+void find_beta_knn(std::vector<double> &knn_distances, std::size_t n,
+                   std::size_t k, double perplexity, double logU, double tol,
+                   int max_tries, std::vector<double> &W,
+                   std::vector<double> &beta, int &bad_perp,
+                   std::size_t start_row, std::size_t end_row) {
+
+  for (std::size_t i = start_row; i < end_row; ++i) {
+    const std::size_t idx = i * k;
+    double betamin = -std::numeric_limits<double>::infinity();
+    double betamax = std::numeric_limits<double>::infinity();
+
+    // Square the distances
+    for (std::size_t j = 0; j < k; ++j) {
+      knn_distances[idx + j] *= knn_distances[idx + j];
+    }
+
+    // Initial guess for beta: 0.5 * perplexity / mean(knn_distances)
+    double sum_d2i = std::accumulate(knn_distances.begin() + idx,
+                                     knn_distances.begin() + idx + k, 0.0);
+    beta[i] = (0.5 * perplexity * k) / sum_d2i;
+
+    double Z = 0.0;
+    double entropy = 0.0;
+    for (std::size_t j = 0; j < k; ++j) {
+      W[idx + j] = exp(-knn_distances[idx + j] * beta[i]);
+      entropy += knn_distances[idx + j] * W[idx + j];
+      Z += W[idx + j];
+    }
+    if (Z == 0.0) {
+      entropy = 0.0;
+    } else {
+      entropy = (entropy / Z) * beta[i] + log(Z);
+    }
+
+    double Hdiff = entropy - logU;
+    int tries = 0;
+    while (fabs(Hdiff) > tol && tries < max_tries) {
+      if (Hdiff > 0) {
+        betamin = beta[i];
+        if (std::isinf(betamax)) {
+          beta[i] *= 2;
+        } else {
+          beta[i] = (beta[i] + betamax) / 2;
+        }
+      } else {
+        betamax = beta[i];
+        if (std::isinf(betamin)) {
+          beta[i] /= 2;
+        } else {
+          beta[i] = (beta[i] + betamin) / 2;
+        }
+      }
+
+      Z = 0.0;
+      entropy = 0.0;
+      for (std::size_t j = 0; j < k; ++j) {
+        W[idx + j] = exp(-knn_distances[idx + j] * beta[i]);
+        entropy += knn_distances[idx + j] * W[idx + j];
+        Z += W[idx + j];
+      }
+      if (Z == 0.0) {
+        entropy = 0.0;
+      } else {
+        entropy = (entropy / Z) * beta[i] + log(Z);
+      }
+
+      Hdiff = entropy - logU;
+      tries++;
+    }
+
+    if (fabs(Hdiff) > tol) {
+      bad_perp++;
+      std::fill(W.begin() + idx, W.begin() + idx + k, 1.0 / k);
+    }
+  }
+}
+
 void find_beta(const std::vector<double> &data, std::size_t n, std::size_t d,
                double perplexity, double logU, double tol, int max_tries,
                std::vector<double> &W, std::vector<double> &beta,
                int &bad_perp,
@@ -114,6 +191,58 @@ void find_beta(const std::vector<double> &data, std::size_t n, std::size_t d,
   }
 }
 
+// [[Rcpp::export]]
+List find_beta_knn_cpp(const NumericMatrix &knn_distances,
+                       const IntegerMatrix &knn_indices, double perplexity = 15,
+                       double tol = 1e-5, int max_tries = 50,
+                       std::size_t n_threads = 1) {
+
+  const std::size_t n = knn_distances.nrow();
+  const std::size_t k = knn_distances.ncol();
+  const double logU = log(perplexity);
+
+  // Flatten the knn_distances matrix into a vector
+  std::vector<double> knn_distances_vec(n * k);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < k; ++j) {
+      knn_distances_vec[i * k + j] = knn_distances(i, j);
+    }
+  }
+
+  std::vector<double> W(n * k, 0.0);
+  std::vector<double> beta(n, 0.0);
+  int bad_perp = 0;
+
+  if (n_threads > 1) {
+    std::size_t chunk_size = (n + n_threads - 1) / n_threads;
+    std::vector<std::thread> threads;
+    for (std::size_t t = 0; t < n_threads; ++t) {
+      std::size_t start_row = t * chunk_size;
+      std::size_t end_row = std::min(start_row + chunk_size, n);
+      threads.emplace_back(find_beta_knn, std::ref(knn_distances_vec), n, k,
+                           perplexity, logU, tol, max_tries, std::ref(W),
+                           std::ref(beta), std::ref(bad_perp), start_row,
+                           end_row);
+    }
+    for (auto &thread : threads) {
+      thread.join();
+    }
+  } else {
+    find_beta_knn(knn_distances_vec, n, k, perplexity, logU, tol, max_tries, W,
+                  beta, bad_perp, 0, n);
+  }
+
+  NumericMatrix P(n, n);
+  for (std::size_t i = 0; i < n; ++i) {
+    for (std::size_t j = 0; j < k; ++j) {
+      P(i, knn_indices(i, j) - 1) = W[i * k + j];
+    }
+  }
+
+  return List::create(Named("P") = P, Named("beta") = beta,
+                      Named("bad_perp") = bad_perp);
+}
+
 // [[Rcpp::export]]
 List find_beta_cpp(const NumericMatrix &X, double perplexity = 15,
                    double tol = 1e-5, int max_tries = 50,

From e84cc1cbbb9754765bb6aa03089c379310ca469f Mon Sep 17 00:00:00 2001
From: James Melville
Date: Sat, 3 Aug 2024 21:50:47 -0700
Subject: [PATCH 22/26] unroll loops we know are 2D

---
 smallvis/src/gradients.cpp | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/smallvis/src/gradients.cpp b/smallvis/src/gradients.cpp
index 055202f..b0e4062 100644
--- a/smallvis/src/gradients.cpp
+++ b/smallvis/src/gradients.cpp
@@ -31,12 +31,12 @@ void mmds_grad(const std::vector<double> &R, const std::vector<double> &D,
 void tsne_grad(const std::vector<double> &P, const std::vector<double> &W,
                double Z, const std::vector<double> &Y,
                std::vector<double> &gradient, std::size_t start,
-               std::size_t end, std::size_t n, std::size_t d) {
+               std::size_t end, std::size_t n) {
   const double Z4 = 4.0 / Z;
 
   for (std::size_t i = start; i < end; ++i) {
-    const std::size_t i_d = i * d;
+    const std::size_t i_d = i + i;
     const std::size_t i_n = i * n;
     for (std::size_t j = 0; j < n; ++j) {
       if (i == j) {
@@ -44,9 +44,8 @@ void tsne_grad(const std::vector<double> &P, const std::vector<double> &W,
       }
       const std::size_t ij = i_n + j;
       double k_ij = Z4 * W[ij] * (Z * P[ij] - W[ij]);
-      for (std::size_t k = 0; k < d; ++k) {
-        gradient[i_d + k] += k_ij * (Y[i_d + k] - Y[j * d + k]);
-      }
+      gradient[i_d] += k_ij * (Y[i_d] - Y[j * 2]);
+      gradient[i_d + 1] += k_ij * (Y[i_d + 1] - Y[j * 2 + 1]);
     }
   }
 }
@@ -262,19 +261,17 @@ NumericMatrix tsne_grad_cpp(const NumericMatrix &P,
                             const NumericMatrix &Y,
                             std::size_t n_threads) {
   std::size_t n = Y.nrow();
-  std::size_t d = Y.ncol();
 
   std::vector<double> P_vec(P.begin(), P.end());
   std::vector<double> W_vec(W.begin(), W.end());
 
-  std::vector<double> Y_vec(n * d);
+  std::vector<double> Y_vec(n * 2);
   for (std::size_t i = 0; i < n; ++i) {
-    for (std::size_t j = 0; j < d; ++j) {
-      Y_vec[i * d + j] = Y(i, j);
-    }
+    Y_vec[i * 2] = Y(i, 0);
+    Y_vec[i * 2 + 1] = Y(i, 1);
   }
 
-  std::vector<double> gradient_vec(n * d, 0.0);
+  std::vector<double> gradient_vec(n * 2, 0.0);
 
   if (n_threads > 1) {
     std::size_t chunk_size = (n + n_threads - 1) / n_threads;
@@ -284,20 +281,19 @@ NumericMatrix tsne_grad_cpp(const NumericMatrix &P,
       std::size_t end_row = std::min(start_row + chunk_size, n);
       threads.emplace_back(tsne_grad, std::cref(P_vec), std::cref(W_vec), Z,
                            std::cref(Y_vec), std::ref(gradient_vec), start_row,
-                           end_row, n, d);
+                           end_row, n);
     }
     for (auto &thread : threads) {
      thread.join();
     }
   } else {
-    tsne_grad(P_vec, W_vec, Z, Y_vec, gradient_vec, 0, n, n, d);
+    tsne_grad(P_vec, W_vec, Z, Y_vec, gradient_vec, 0, n, n);
   }
 
-  NumericMatrix gradient(n, d);
+  NumericMatrix gradient(n, 2);
   for (std::size_t i = 0; i < n; ++i) {
-    for (std::size_t j = 0; j < d; ++j) {
-      gradient(i, j) = gradient_vec[i * d + j];
-    }
+    gradient(i, 0) = gradient_vec[i * 2];
+    gradient(i, 1) = gradient_vec[i * 2 + 1];
   }
 
   return gradient;

From 3023efbf3000ffbf1e44d246dce4bcce6392b305 Mon Sep 17 00:00:00 2001
From: James Melville
Date: Sun, 11 Aug 2024 20:55:00 -0700
Subject: [PATCH 23/26] Add Barnes-Hut, irlba PCA, perplexity on neighbors

---
 smallvis/R/RcppExports.R | 12 +-
 smallvis/R/cost.R | 2 +-
 smallvis/R/init.R | 46 +++++-
smallvis/R/perplexity.R | 186 +++++++++----- smallvis/R/smallvis.R | 23 +- smallvis/R/sne.R | 70 +++++ smallvis/man/smallvis.Rd | 100 ++++---- smallvis/man/smallvis_perpstep.Rd | 5 +- smallvis/man/smallvis_rep.Rd | 15 +- smallvis/src/RcppExports.cpp | 45 +++- smallvis/src/bh.cpp | 40 +++ smallvis/src/bh.h | 369 +++++++++++++++++++++++++++ smallvis/src/gradients.cpp | 39 ++- smallvis/src/perplexity.cpp | 22 +- smallvis/src/threads.h | 65 +++++ smallvis/tests/testthat/helper_api.R | 4 +- smallvis/tests/testthat/test_api.R | 12 +- 17 files changed, 882 insertions(+), 173 deletions(-) create mode 100644 smallvis/src/bh.cpp create mode 100644 smallvis/src/bh.h create mode 100644 smallvis/src/threads.h diff --git a/smallvis/R/RcppExports.R b/smallvis/R/RcppExports.R index d41745b..f96ef06 100644 --- a/smallvis/R/RcppExports.R +++ b/smallvis/R/RcppExports.R @@ -1,6 +1,14 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +bh_tsne_gradient_cpp <- function(indices, indptr, P_data, embedding, theta = 0.5, eps = 1e-16, n_threads = 1L) { + .Call(`_smallvis_bh_tsne_gradient_cpp`, indices, indptr, P_data, embedding, theta, eps, n_threads) +} + +bh_plogq_cpp <- function(indices, indptr, P_data, embedding, theta = 0.5, eps = 1e-16, n_threads = 1L) { + .Call(`_smallvis_bh_plogq_cpp`, indices, indptr, P_data, embedding, theta, eps, n_threads) +} + dist2_cpp <- function(input, n_threads = 1L) { .Call(`_smallvis_dist2_cpp`, input, n_threads) } @@ -25,8 +33,8 @@ mmds_grad_cpp <- function(R, D, Y, eps, n_threads) { .Call(`_smallvis_mmds_grad_cpp`, R, D, Y, eps, n_threads) } -find_beta_knn_cpp <- function(knn_distances, knn_indices, perplexity = 15, tol = 1e-5, max_tries = 50L, n_threads = 1L) { - .Call(`_smallvis_find_beta_knn_cpp`, knn_distances, knn_indices, perplexity, tol, max_tries, n_threads) +find_beta_knn_cpp <- function(knn_distances, knn_indices, perplexity = 15, tol = 1e-5, max_tries = 50L, ret_sparse = FALSE, n_threads = 1L) { + .Call(`_smallvis_find_beta_knn_cpp`, knn_distances, knn_indices, perplexity, tol, max_tries, ret_sparse, n_threads) } find_beta_cpp <- function(X, perplexity = 15, tol = 1e-5, max_tries = 50L, n_threads = 1L) { diff --git a/smallvis/R/cost.R b/smallvis/R/cost.R index de9219c..810d696 100644 --- a/smallvis/R/cost.R +++ b/smallvis/R/cost.R @@ -140,7 +140,7 @@ cost_point <- function(cost, Y) { cost_clear <- function(cost) { if (!is.null(cost$sentinel)) { cost[[cost$sentinel]] <- NULL - } else if (is.null(cost$clear)) { + } else if (!is.null(cost$clear)) { cost <- cost$clear(cost) } cost diff --git a/smallvis/R/init.R b/smallvis/R/init.R index 2272da4..35c9f57 100644 --- a/smallvis/R/init.R +++ b/smallvis/R/init.R @@ -54,7 +54,8 @@ laplacian_eigenmap <- function(A, # This effectively row-normalizes A: colSums is normally faster than rowSums # and because A is symmetric, they're equivalent M <- A / colSums(A) - if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, warn.conflicts = FALSE)) { + if (use_RSpectra && + requireNamespace("RSpectra", quietly = TRUE, warn.conflicts = FALSE)) { tsmessage("Using RSpectra for eigenvectors") Re(RSpectra::eigs(M, k = ndim + 1)$vectors[, 2:(ndim + 1)]) } else { @@ -78,7 +79,8 @@ normalized_spectral_init <- function(A, L <- -t(A / Dsq) / Dsq diag(L) <- 1 + diag(L) - if (use_RSpectra && requireNamespace("RSpectra", quietly = TRUE, warn.conflicts = FALSE)) { + if (use_RSpectra && + requireNamespace("RSpectra", quietly = TRUE, warn.conflicts = FALSE)) { 
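      # A minimal sketch of the decomposition this branch performs (assumed
      # shape only; the actual call below tunes ncv before invoking RSpectra):
      # take the smallest eigenvectors of the normalized Laplacian L and drop
      # the trivial first one, e.g.
      #   e <- RSpectra::eigs_sym(L, k = ndim + 1, which = "SM")
      #   coords <- e$vectors[, 2:(ndim + 1)]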
tsmessage("Using RSpectra for eigenvectors") k <- ndim + 1 ncv <- max(2 * k + 1, floor(sqrt(n))) @@ -143,23 +145,43 @@ pca_scores <- function(X, } scores <- res_mds$points } else { - X <- scale(X, center = TRUE, scale = FALSE) - # do SVD on X directly rather than forming covariance matrix - s <- svd(X, nu = ncol, nv = 0) - D <- diag(c(s$d[1:ncol])) - if (verbose || ret_extra) { - # calculate eigenvalues of covariance matrix from singular values - lambda <- (s$d^2) / (nrow(X) - 1) - varex <- sum(lambda[1:ncol]) / sum(lambda) + if (ncol < 0.5 * min(dim(X))) { + res <- irlba::prcomp_irlba( + X, + n = ncol, + retx = TRUE, + center = TRUE, + scale = FALSE + ) + scores <- res$x + ncol <- ncol(res$rotation) + varex <- sum(res$sdev[1:ncol]^2) / res$totalvar tsmessage( - "PCA: ", + "PCA (via irlba): ", ncol, " components explained ", formatC(varex * 100), "% variance" ) + } else { + X <- scale(X, center = TRUE, scale = FALSE) + # do SVD on X directly rather than forming covariance matrix + s <- svd(X, nu = ncol, nv = 0) + D <- diag(c(s$d[1:ncol])) + if (verbose || ret_extra) { + # calculate eigenvalues of covariance matrix from singular values + lambda <- (s$d^2) / (nrow(X) - 1) + varex <- sum(lambda[1:ncol]) / sum(lambda) + tsmessage( + "PCA (via SVD): ", + ncol, + " components explained ", + formatC(varex * 100), + "% variance" + ) + } + scores <- s$u %*% D } - scores <- s$u %*% D } if (ret_extra) { diff --git a/smallvis/R/perplexity.R b/smallvis/R/perplexity.R index 627c8aa..e2aaa29 100644 --- a/smallvis/R/perplexity.R +++ b/smallvis/R/perplexity.R @@ -455,20 +455,19 @@ scale_affinities <- function(P, # row normalization before anything else if (nnat(row_normalize)) { if (symmetrize == "rowsymm") { - P <- 0.5 * (P + t(P)) + P <- 0.5 * (P + Matrix::t(P)) symmetrize <- "none" } - P <- P / rowSums(P) + P <- P / Matrix::rowSums(P) } else if (is.numeric(row_normalize)) { - P <- row_normalize * P / rowSums(P) + P <- row_normalize * P / Matrix::rowSums(P) } - # Symmetrize P <- switch(symmetrize, none = P, - symmetric = 0.5 * (P + t(P)), - average = 0.5 * (P + t(P)), - mutual = sqrt(P * t(P)), + symmetric = 0.5 * (P + Matrix::t(P)), + average = 0.5 * (P + Matrix::t(P)), + mutual = sqrt(P * Matrix::t(P)), umap = fuzzy_set_union(P), fuzzy = fuzzy_set_union(P), stop("unknown symmetrization: ", symmetrize) @@ -559,32 +558,89 @@ sne_init <- function(cost, if (!is.numeric(perplexity)) { stop("Unknown perplexity method, '", perplexity[[1]], "'") } - if (kernel == "perpknn") { + + # perpnn options: + # perpnnks: use exact knn with sparse output + # perpnnas: use approximate knn with sparse output + # perpnnkd: use exact knn with dense output + # perpnnad: use approximate knn with dense output + if (startsWith(kernel, "perpnn")) { + n <- nrow(X) + if (perplexity > n - 1) { + stop("Perplexity too high for number of points") + } k <- 3 * perplexity - if (k > nrow(X) - 1) { - warning("Perplexity probably too high for number of points,", - " result may not be meaningful") - k <- nrow(X) - 1 + if (k > n - 1) { + warning( + "Perplexity probably too high for number of points,", + " result may not be meaningful" + ) + k <- n - 1 + } + if (kernel %in% c("perpnnks", "perpnnkd")) { + tsmessage( + "Finding exact nearest neighbors with k = ", + k, + " n_threads = ", + n_threads + ) + knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads) + } else { + tsmessage( + "Finding approximate nearest neighbors with k = ", + k, + " n_threads = ", + n_threads + ) + knn <- rnndescent::rnnd_knn(X, k = k + 1, n_threads = 
n_threads) } - tsmessage("Commencing calibration for perplexity = ", - format_perps(perplexity), " with k = ", k) - knn <- rnndescent::brute_force_knn(X, k = k + 1, n_threads = n_threads) knn_dist <- knn$dist[, 2:(k + 1)] knn_idx <- knn$idx[, 2:(k + 1)] - P <- find_beta_knn_cpp(knn_dist, knn_idx, perplexity = perplexity, - n_threads = n_threads)$P - } - else { - tsmessage("Commencing calibration for perplexity = ", - format_perps(perplexity)) + + ret_sparse <- kernel %in% c("perpnnks", "perpnnas") + tsmessage( + "Commencing calibration for perplexity = ", + format_perps(perplexity), + " n_threads = ", + n_threads + ) + P <- find_beta_knn_cpp( + knn_dist, + knn_idx, + perplexity = perplexity, + n_threads = n_threads, + ret_sparse = ret_sparse + )$P + if (ret_sparse) { + P <- Matrix::sparseMatrix( + i = rep(1:n, each = k), + j = as.vector(t(knn_idx)), + x = P, + dims = c(n, n), + repr = "C" + ) + } + } else { if (use_cpp) { + tsmessage( + "Commencing calibration for perplexity = ", + format_perps(perplexity), + " n_threads = ", + n_threads + ) P <- find_beta_cpp(X, perplexity, tol = 1e-5, n_threads = n_threads)$W } else { - x2ares <- x2aff(X, - perplexity, - tol = 1e-5, - kernel = kernel, - verbose = verbose) + tsmessage( + "Commencing calibration for perplexity = ", + format_perps(perplexity) + ) + x2ares <- x2aff( + X, + perplexity, + tol = 1e-5, + kernel = kernel, + verbose = verbose + ) P <- x2ares$W } } @@ -598,46 +654,48 @@ sne_init <- function(cost, ) cost$P <- P - if (is.logical(row_normalize)) { - tsmessage( - "Effective perplexity of P approx = ", - formatC(stats::median(perpp(P))) - ) - } + if (!methods::is(P, "sparseMatrix")) { + if (is.logical(row_normalize)) { + tsmessage( + "Effective perplexity of P approx = ", + formatC(stats::median(perpp(P))) + ) + } - for (r in unique(tolower(ret_extra))) { - switch(r, - v = { - cost$V <- x2ares$W - }, - dint = { - if (!is.null(x2ares$dint)) { - cost$dint <- x2ares$dint + for (r in unique(tolower(ret_extra))) { + switch(r, + v = { + cost$V <- x2ares$W + }, + dint = { + if (!is.null(x2ares$dint)) { + cost$dint <- x2ares$dint + } + }, + beta = { + if (!is.null(x2ares$beta)) { + cost$beta <- x2ares$beta + } + }, + adegc = { + cost$adegc <- 0.5 * rowSums(x2ares$W) + colSums(x2ares$W) + }, + adegin = { + cost$adegin <- rowSums(x2ares$W) + }, + adegout = { + cost$adegout <- colSums(x2ares$W) + }, + pdeg = { + cost$pdeg <- colSums(P) + }, + idp = { + if (!is.null(x2ares$idp)) { + cost$idp <- x2ares$idp + } } - }, - beta = { - if (!is.null(x2ares$beta)) { - cost$beta <- x2ares$beta - } - }, - adegc = { - cost$adegc <- 0.5 * rowSums(x2ares$W) + colSums(x2ares$W) - }, - adegin = { - cost$adegin <- rowSums(x2ares$W) - }, - adegout = { - cost$adegout <- colSums(x2ares$W) - }, - pdeg = { - cost$pdeg <- colSums(P) - }, - idp = { - if (!is.null(x2ares$idp)) { - cost$idp <- x2ares$idp - } - } - ) + ) + } } cost } diff --git a/smallvis/R/smallvis.R b/smallvis/R/smallvis.R index 0c5b12e..e2c93e7 100644 --- a/smallvis/R/smallvis.R +++ b/smallvis/R/smallvis.R @@ -8,6 +8,7 @@ #' \itemize{ #' \item \code{"tsne"} t-Distributed Stochastic Neighbor Embedding #' (van der Maaten and Hinton, 2008). +#' \item \code{"bhtsne"} Barnes-Hut t-SNE (van der Maaten, 2014). #' \item \code{"largevis"} the cost function of the LargeVis algorithm #' (Tang et al, 2016). Input affinities are calculated and symmetrized using #' the same perplexity calibration method as t-SNE, but are not normalized. 
@@ -535,6 +536,8 @@ #' \code{.Machine$double.eps}, but if you see inconsistent convergence results #' with optimizer that should be reducing the cost each iteration, then try #' setting this to a larger value, e.g. between \code{1e-3 - 1e-9}. +#' @param theta Barnes-Hut approximation accuracy. Default is 0.5. Set to 0.0 +#' for exact t-SNE. Applies only for \code{method = "bhtsne"}. #' @param verbose If \code{TRUE}, log progress messages to the console. #' @return If \code{ret_extra} is \code{FALSE}, the embedded output coordinates #' as a matrix. Otherwise, a list with the following items: @@ -807,6 +810,11 @@ #' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). #' \url{http://www.jmlr.org/papers/v9/vandermaaten08a.html} #' +#' Van der Maaten, L. (2014). +#' Accelerating t-SNE using tree-based algorithms. +#' \emph{Journal of Machine Learning Research}, \emph{15}(1) (3221-3245). +#' \url{https://jmlr.org/papers/v15/vandermaaten14a.html} +#' #' Venna, J., Peltonen, J., Nybo, K., Aidos, H., & Kaski, S. (2010). #' Information retrieval perspective to nonlinear dimensionality reduction for #' data visualization. @@ -871,6 +879,7 @@ smallvis <- function(X, ret_extra = FALSE, n_threads = 0, use_cpp = FALSE, + theta = 0.5, eps = .Machine$double.eps, verbose = TRUE) { if (is.logical(epoch_callback)) { @@ -884,7 +893,7 @@ smallvis <- function(X, } # The embedding method - cost_fn <- create_cost(method, perplexity, eps, n_threads, use_cpp) + cost_fn <- create_cost(method, perplexity, eps, n_threads, use_cpp, theta = theta) if (exaggeration_factor != 1) { if (stop_lying_iter < 1) { @@ -1389,7 +1398,8 @@ create_cost <- function(method, perplexity, eps, n_threads, - use_cpp) { + use_cpp, + theta) { method_names <- c( "tsne", "largevis", @@ -1438,7 +1448,8 @@ create_cost <- function(method, "jssne", "gsne", "abssne", - "bhssne" + "bhssne", + "bhtsne" ) if (is.character(method)) { method <- match.arg(tolower(method), method_names) @@ -1449,6 +1460,12 @@ create_cost <- function(method, eps = eps, use_cpp = use_cpp ), + bhtsne = bhtsne( + perplexity = perplexity, + n_threads = n_threads, + eps = eps, + theta = theta + ), umap = umap( perplexity = perplexity, n_threads = n_threads, diff --git a/smallvis/R/sne.R b/smallvis/R/sne.R index e6acda2..468cea9 100644 --- a/smallvis/R/sne.R +++ b/smallvis/R/sne.R @@ -79,6 +79,76 @@ tsne <- function(perplexity, ) } +bhtsne <- function(perplexity, + inp_kernel = "perpnnas", + eps = .Machine$double.eps, + theta = 0.5, + n_threads = 0) { + list( + init = function(cost, + X, + max_iter, + verbose = FALSE, + ret_extra = c()) { + cost <- sne_init( + cost, + X, + perplexity = perplexity, + kernel = inp_kernel, + symmetrize = "symmetric", + normalize = TRUE, + row_normalize = TRUE, + verbose = verbose, + ret_extra = ret_extra, + n_threads = n_threads, + use_cpp = TRUE + ) + cost$eps <- eps + cost + }, + cache_input = function(cost) { + cost$plogp <- sum(cost$P@x * log(cost$P@x)) + cost + }, + pfn = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$pcost <- cost$plogp - bh_plogq_cpp(cost$P@i, + cost$P@p, + cost$P@x, + Y, + theta = theta, + n_threads = n_threads + ) + # browser() + cost + }, + gr = function(cost, Y) { + cost <- cost_update(cost, Y) + cost$G <- bh_tsne_gradient_cpp(cost$P@i, + cost$P@p, + cost$P@x, + Y, + theta = theta, + n_threads = n_threads + ) + # browser() + cost$G <- 4 * cost$G + cost + }, + export = function(cost, val) { + res <- cost_export(cost, val) + res + }, + update = function(cost, Y) { + cost + }, + exaggerate = 
function(cost, exaggeration_factor) {
+      cost$P <- cost$P * exaggeration_factor
+      cost
+    }
+  )
+}
+
 # Cook, J., Sutskever, I., Mnih, A., & Hinton, G. E. (2007).
 # Visualizing similarity data with a mixture of maps.
 # In \emph{International Conference on Artificial Intelligence and Statistics} (pp. 67-74).
diff --git a/smallvis/man/smallvis.Rd b/smallvis/man/smallvis.Rd
index 284de3c..1991e0f 100644
--- a/smallvis/man/smallvis.Rd
+++ b/smallvis/man/smallvis.Rd
@@ -38,6 +38,7 @@ smallvis(
   ret_extra = FALSE,
   n_threads = 0,
   use_cpp = FALSE,
+  theta = 0.5,
   eps = .Machine$double.eps,
   verbose = TRUE
 )
@@ -65,7 +66,7 @@ standard deviation. See 'Output initialization' section.}
 \item{perplexity}{The target perplexity for parameterizing the input
 probabilities. For method \code{"umap"}, controls the neighborhood size for
 parameterizing the smoothed k-nearest neighbor distances. See also the
-'Intrinsic dimensionality perplexity' and 'Multiscale perplexities' 
+'Intrinsic dimensionality perplexity' and 'Multiscale perplexities'
 sections.}

 \item{max_iter}{Maximum number of iterations in the optimization.}
@@ -114,24 +115,24 @@ this case, use the gradient norm to stop early.}

 \item{momentum}{Initial momentum value.}

-\item{final_momentum}{Final momentum value. If 
+\item{final_momentum}{Final momentum value. If
 \code{late_exaggeration_factor > 1}, then during late exaggeration, the
 momentum is switched back to \code{momentum} from this value.}

 \item{mom_switch_iter}{Iteration at which the momentum will switch from
-\code{momentum} to \code{final_momentum}. If 
+\code{momentum} to \code{final_momentum}. If
 \code{exaggeration_factor > 1}, then this should occur at some point after
 \code{stop_lying_iter} (default is 150 iterations after). If the early
 exaggeration phase stops early, this value is treated as being relative to
 when early exaggeration stops, to avoid wasting iterations at the lower
 momentum value. For example, if \code{stop_lying_iter = 100} and
 \code{mom_switch_iter = 250} (the defaults), but early exaggeration
-converges at iteration 50, the switch iteration will occur at iteration 
+converges at iteration 50, the switch iteration will occur at iteration
 150.}

 \item{eta}{Learning rate value, a positive number. Or set to \code{"optsne"},
 to use the formula suggested by Belkina and co-workers (2018) in their
-opt-SNE package (the size of the dataset divided by the 
+opt-SNE package (the size of the dataset divided by the
 \code{exaggeration_factor}).}

 \item{min_gain}{Minimum gradient descent step size.}
@@ -179,8 +180,8 @@ number of iterations before monitoring the relative rate of change of the
 cost function during early exaggeration.}

 \item{ee_mon_buffer}{If \code{ee_mon_epoch} is non-\code{NULL}, then ignore
-this number of occurences of the relative rate of change of the cost 
-function decreasing, which would otherwise signal termination of 
+this number of occurrences of the relative rate of change of the cost
+function decreasing, which would otherwise signal termination of
 the early exaggeration stage. This is to prevent erroneous termination of
 early exaggeration under conditions when the cost can fluctuate noisily.}

@@ -197,7 +198,7 @@ to those value which are returned when this value is \code{TRUE}. See the

 \item{n_threads}{Number of threads to use in multi-threaded code. Default is
 0, which means no multi-threading.
Mainly affects the calculation of things -like distance matrices and perplexity calibration if you set +like distance matrices and perplexity calibration if you set \code{use_cpp = TRUE}. Otherwise, only methods that need to calculate nearest neighbors will be affected.} @@ -205,6 +206,9 @@ nearest neighbors will be affected.} calculations. Default is \code{FALSE}. This won't speed up all steps and you will want to use this in conjunction with \code{n_threads}.} +\item{theta}{Barnes-Hut approximation accuracy. Default is 0.5. Set to 0.0 +for exact t-SNE. Applies only for \code{method = "bhtsne"}.} + \item{eps}{Set epsilon for avoiding division-by-zero errors. Default is \code{.Machine$double.eps}, but if you see inconsistent convergence results with optimizer that should be reducing the cost each iteration, then try @@ -300,6 +304,7 @@ to the \code{method} parameter are: \itemize{ \item \code{"tsne"} t-Distributed Stochastic Neighbor Embedding (van der Maaten and Hinton, 2008). + \item \code{"bhtsne"} Barnes-Hut t-SNE (van der Maaten, 2014). \item \code{"largevis"} the cost function of the LargeVis algorithm (Tang et al, 2016). Input affinities are calculated and symmetrized using the same perplexity calibration method as t-SNE, but are not normalized. @@ -339,17 +344,17 @@ to the \code{method} parameter are: co-workers, use method \code{"bnerv"}. \item \code{"jse"} The Jensen-Shannon Embedding method of Lee and co-workers (2013). - \item \code{"absne"} The alpha-beta SNE method of Narayan and co-workers + \item \code{"absne"} The alpha-beta SNE method of Narayan and co-workers (2015). - \item \code{"chsne"} The chi-squared divergence version of t-SNE + \item \code{"chsne"} The chi-squared divergence version of t-SNE (Im and co-workers, 2018). - \item \code{"hlsne"} The Hellinger distance divergence version of t-SNE + \item \code{"hlsne"} The Hellinger distance divergence version of t-SNE (Im and co-workers, 2018). \item \code{"rklsne"} The reverse Kullback-Leibler divergence version of t-SNE (Im and co-workers, 2018). \item \code{"jssne"} The Jensen-Shannon divergence version of t-SNE (Im and co-workers, 2018). - \item \code{"gsne"}, The global SNE (g-SNE) method of Zhou and Sharpee + \item \code{"gsne"}, The global SNE (g-SNE) method of Zhou and Sharpee (2018). } @@ -381,9 +386,9 @@ Some parameters are available for all (nor nearly all) methods. sparsification is carried out with this kernel, so there are no memory or performance improvements to be had with this setting. \code{"skd"} uses the smooth knn distances method as used by UMAP. - \item{\code{symmetrize}} the type of symmetrization, used by symmetric + \item{\code{symmetrize}} the type of symmetrization, used by symmetric methods only. Can be one of: - \code{"symmetric"} symmetric nearest neighbor style, by arithmetic + \code{"symmetric"} symmetric nearest neighbor style, by arithmetic averaging, as in t-SNE. \code{"fuzzy"} symmetrization by fuzzy set union as used in UMAP. \code{"mutual"} mutual nearest neighbor style as suggested by Schubert @@ -394,10 +399,10 @@ Some parameters are available for all (nor nearly all) methods. \item \code{"LargeVis"} \itemize{ \item{\code{gamma}} Weighting term for the repulsive versus attractive - forces. Default is \code{1}. The implementation by the creators of - LargeVis uses a default \code{gamma = 7}, but note that this is for - stochastic gradient descent with limited sampling of the repulsive - contributions so it's unlikely to be a good choice with the + forces. Default is \code{1}. 
The implementation by the creators of + LargeVis uses a default \code{gamma = 7}, but note that this is for + stochastic gradient descent with limited sampling of the repulsive + contributions so it's unlikely to be a good choice with the implementation used in this package. \item{\code{gr_eps}} Epsilon used in the gradient to prevent division by zero. Default is \code{0.1}. @@ -456,9 +461,9 @@ Some parameters are available for all (nor nearly all) methods. } \item \code{"ABSNE"} \itemize{ - \item{\code{alpha}} Alpha value for the alpha-beta divergence. Set + \item{\code{alpha}} Alpha value for the alpha-beta divergence. Set \code{alpha < 1} to produce more smaller, finer-grained clusters, and - \code{alpha > 1} to produce fewer, larger clusters, with more emphasis + \code{alpha > 1} to produce fewer, larger clusters, with more emphasis on global structure. Default is \code{1.0}, to give t-SNE-like behavior. \item{\code{lambda}} Sum of alpha + beta, where beta is the beta value for the alpha-beta divergence. Set \code{lambda < 1} to increase cluster @@ -467,8 +472,8 @@ Some parameters are available for all (nor nearly all) methods. } \item \code{"gsne"} \itemize{ - \item{\code{lambda}} Weighting factor to put increasing emphasis on - preserving global similarities. Set to \code{0} to get t-SNE (no + \item{\code{lambda}} Weighting factor to put increasing emphasis on + preserving global similarities. Set to \code{0} to get t-SNE (no extra emphasis on global structure), and to \code{1.0} to get equal weighting between the local and global divergences. Default is \code{1.0}. } @@ -488,7 +493,7 @@ to one of the following: \item{A matrix}: which must have dimensions \code{n} by \code{k}, where \code{n} is the number of rows in \code{X}. \item{\code{"rand"}}: initialize from a Gaussian distribution with mean 0 - and standard deviation 1e-4, the default used by t-SNE. The standard + and standard deviation 1e-4, the default used by t-SNE. The standard deviation can be controlled with \code{Y_init_sdev} (see below). \item{\code{"pca"}}: use the first \code{k} scores of the PCA: columns are centered, but no scaling beyond that which is applied by @@ -527,8 +532,8 @@ Steinerberger, 2017): it may therefore unnecessary to use the The \code{Y_init_sdev} parameter, if provided, will scale the input coordinates such that the standard deviation of each dimension is the -provided value. The default is to do no scaling, except for -\code{Y_init = "spca"} and \code{Y_init = "rand"} where a scaling to a +provided value. The default is to do no scaling, except for +\code{Y_init = "spca"} and \code{Y_init = "rand"} where a scaling to a standard deviation of \code{1e-4} is used, as in t-SNE initialization. \code{Y_init = "spca"} is effectively an alias for \code{Y_init = "pca", Y_init_sdev = 1e-4}. @@ -585,19 +590,19 @@ increasing order. For more details on IDP, see Another technique to combine multiple perplexities is to use the multiscale -approach given by de Bodt and co-workers (2018). As with IDP, a series of +approach given by de Bodt and co-workers (2018). As with IDP, a series of candidate perplexities are used, but all the affinity matrices are used to create an average matrix which is used as the final probability matrix. -To use this method, set \code{perplexity = "multiscale"}. Default and custom -list of perplexities to use can be provided in the same way as with IDP. +To use this method, set \code{perplexity = "multiscale"}. 
Default and custom +list of perplexities to use can be provided in the same way as with IDP. Note that previous work by this group described a slightly more complex approach where the number of individual perplexity results are introduced into the average sequentially over the course of the optimization, and -the output probabilities are also generated by an averaging. Although also +the output probabilities are also generated by an averaging. Although also referred to as "multiscale", these variations are not implemented. Also, -if \code{ret_extra = TRUE} is used, extra data associated with a specific -perplexity (e.g. degree centrality, intrinsic dimensionality) will not +if \code{ret_extra = TRUE} is used, extra data associated with a specific +perplexity (e.g. degree centrality, intrinsic dimensionality) will not be returned. } @@ -816,14 +821,14 @@ In \emph{Advances in neural information processing systems} (pp. 585-591). \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} -Belkina, A. C., Ciccolella, C. O., Anno, R., Spidlen, J., Halpert, R., & Snyder-Cappione, J. (2018). -Automated optimal parameters for T-distributed stochastic neighbor embedding improve visualization and allow analysis of large datasets. +Belkina, A. C., Ciccolella, C. O., Anno, R., Spidlen, J., Halpert, R., & Snyder-Cappione, J. (2018). +Automated optimal parameters for T-distributed stochastic neighbor embedding improve visualization and allow analysis of large datasets. \emph{bioRxiv}, 451690. \url{https://www.biorxiv.org/content/10.1101/451690v2.abstract} -De Bodt, C., Mulders, D., Verleysen, M., & Lee, J. A. (2018). -Perplexity-free t-SNE and twice Student tt-SNE. -In \emph{European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning (ESANN 2018)} (pp. 123-128). +De Bodt, C., Mulders, D., Verleysen, M., & Lee, J. A. (2018). +Perplexity-free t-SNE and twice Student tt-SNE. +In \emph{European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning (ESANN 2018)} (pp. 123-128). \url{http://hdl.handle.net/2078.1/200844} Borg, I., & Groenen, P. J. (2005). @@ -842,13 +847,13 @@ Hinton, G. E., & Roweis, S. T. (2002). Stochastic neighbor embedding. In \emph{Advances in neural information processing systems} (pp. 833-840). -Im, D. J., Verma, N., & Branson, K. (2018). -Stochastic Neighbor Embedding under f-divergences. +Im, D. J., Verma, N., & Branson, K. (2018). +Stochastic Neighbor Embedding under f-divergences. \emph{arXiv preprint} \emph{arXiv}:1811.01247. \url{https://arxiv.org/abs/1811.01247} Kobak, D., & Berens, P. (2018). -The art of using t-SNE for single-cell transcriptomics. +The art of using t-SNE for single-cell transcriptomics. \emph{bioRxiv}, 453449. \url{https://doi.org/10.1101/453449} @@ -872,13 +877,13 @@ UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} -Narayan, K. S., Punjani, A., & Abbeel, P. (2015, June). -Alpha-Beta Divergences Discover Micro and Macro Structures in Data. +Narayan, K. S., Punjani, A., & Abbeel, P. (2015, June). +Alpha-Beta Divergences Discover Micro and Macro Structures in Data. In \emph{Proceedings of the 32nd International Conference on Machine Learning (ICML-14)} (pp 796-804). \url{http://proceedings.mlr.press/v37/narayan15.html} -Schubert, E., & Gertz, M. (2017, October). +Schubert, E., & Gertz, M. (2017, October). 
Intrinsic t-stochastic neighbor embedding for visualization and outlier detection. In \emph{International Conference on Similarity Search and Applications} (pp. 188-203). Springer, Cham. @@ -901,6 +906,11 @@ Visualizing data using t-SNE. \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). \url{http://www.jmlr.org/papers/v9/vandermaaten08a.html} +Van der Maaten, L. (2014). +Accelerating t-SNE using tree-based algorithms. +\emph{Journal of Machine Learning Research}, \emph{15}(1) (3221-3245). +\url{https://jmlr.org/papers/v15/vandermaaten14a.html} + Venna, J., Peltonen, J., Nybo, K., Aidos, H., & Kaski, S. (2010). Information retrieval perspective to nonlinear dimensionality reduction for data visualization. @@ -926,8 +936,8 @@ Majorization-Minimization for Manifold Embedding. In \emph{Proceedings of the 18th International Conference on Artificial Intelligence and Statistics (AISTATS 2015)} (pp. 1088-1097). -Zhou, Y., & Sharpee, T. (2018). -Using global t-SNE to preserve inter-cluster data structure. -\emph{bioRxiv}, 331611. +Zhou, Y., & Sharpee, T. (2018). +Using global t-SNE to preserve inter-cluster data structure. +\emph{bioRxiv}, 331611. \url{https://doi.org/10.1101/331611} } diff --git a/smallvis/man/smallvis_perpstep.Rd b/smallvis/man/smallvis_perpstep.Rd index 5015b24..ebafcce 100644 --- a/smallvis/man/smallvis_perpstep.Rd +++ b/smallvis/man/smallvis_perpstep.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smallvis.R +% Please edit documentation in R/iterated.R \name{smallvis_perpstep} \alias{smallvis_perpstep} \title{Dimensionality Reduction With Perplexity Stepping} @@ -60,7 +60,8 @@ Any value of \code{tol}, \code{exaggeration_factor} and # and then 750 iterations at perplexity = 40. iris_lbfgs_pstep <- smallvis_perpscale( step_iter = 250, X = iris, scale = FALSE, verbose = TRUE, Y_init = "spca", - ret_extra = c("DX", "DY"), perplexity = 40, max_iter = 1000, opt = list("l-bfgs")) + ret_extra = c("DX", "DY"), perplexity = 40, max_iter = 1000, opt = list("l-bfgs") +) } } \references{ diff --git a/smallvis/man/smallvis_rep.Rd b/smallvis/man/smallvis_rep.Rd index 81958bc..3245943 100644 --- a/smallvis/man/smallvis_rep.Rd +++ b/smallvis/man/smallvis_rep.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smallvis.R +% Please edit documentation in R/iterated.R \name{smallvis_rep} \alias{smallvis_rep} \title{Best t-SNE Result From Multiple Initializations} @@ -37,8 +37,10 @@ This function ignores any value of \code{Y_init} you set, and uses \examples{ \dontrun{ # Return best result out of five random initializations -tsne_iris_best <- smallvis_rep(nrep = 5, X = iris, perplexity = 50, method = "tsne", - ret_extra = TRUE) +tsne_iris_best <- smallvis_rep( + nrep = 5, X = iris, perplexity = 50, method = "tsne", + ret_extra = TRUE +) # How much do the costs vary between runs? range(tsne_iris_best$all_costs) # Display best embedding found @@ -46,10 +48,11 @@ plot(tsne_iris_best$Y) # Keep all results # First result is in tsne_iris_rep[[1]], second in tsne_iris_rep[[2]] etc. 
-tsne_iris_rep <- smallvis_rep(nrep = 5, X = iris, perplexity = 50, method = "tsne", - ret_extra = TRUE, keep_all = TRUE) +tsne_iris_rep <- smallvis_rep( + nrep = 5, X = iris, perplexity = 50, method = "tsne", + ret_extra = TRUE, keep_all = TRUE +) # Index of result with smallest error is in special list item 'best_rep' best_iris <- tsne_iris_rep[[tsne_iris_rep[[1]]$best_rep]] - } } diff --git a/smallvis/src/RcppExports.cpp b/smallvis/src/RcppExports.cpp index ef11c3b..9b849df 100644 --- a/smallvis/src/RcppExports.cpp +++ b/smallvis/src/RcppExports.cpp @@ -10,6 +10,40 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// bh_tsne_gradient_cpp +NumericMatrix bh_tsne_gradient_cpp(IntegerVector indices, IntegerVector indptr, NumericVector P_data, NumericMatrix embedding, double theta, double eps, int n_threads); +RcppExport SEXP _smallvis_bh_tsne_gradient_cpp(SEXP indicesSEXP, SEXP indptrSEXP, SEXP P_dataSEXP, SEXP embeddingSEXP, SEXP thetaSEXP, SEXP epsSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type indices(indicesSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type indptr(indptrSEXP); + Rcpp::traits::input_parameter< NumericVector >::type P_data(P_dataSEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type embedding(embeddingSEXP); + Rcpp::traits::input_parameter< double >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(bh_tsne_gradient_cpp(indices, indptr, P_data, embedding, theta, eps, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// bh_plogq_cpp +double bh_plogq_cpp(IntegerVector indices, IntegerVector indptr, NumericVector P_data, NumericMatrix embedding, double theta, double eps, int n_threads); +RcppExport SEXP _smallvis_bh_plogq_cpp(SEXP indicesSEXP, SEXP indptrSEXP, SEXP P_dataSEXP, SEXP embeddingSEXP, SEXP thetaSEXP, SEXP epsSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type indices(indicesSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type indptr(indptrSEXP); + Rcpp::traits::input_parameter< NumericVector >::type P_data(P_dataSEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type embedding(embeddingSEXP); + Rcpp::traits::input_parameter< double >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(bh_plogq_cpp(indices, indptr, P_data, embedding, theta, eps, n_threads)); + return rcpp_result_gen; +END_RCPP +} // dist2_cpp NumericMatrix dist2_cpp(NumericMatrix input, std::size_t n_threads); RcppExport SEXP _smallvis_dist2_cpp(SEXP inputSEXP, SEXP n_threadsSEXP) { @@ -89,8 +123,8 @@ BEGIN_RCPP END_RCPP } // find_beta_knn_cpp -List find_beta_knn_cpp(const NumericMatrix& knn_distances, const IntegerMatrix& knn_indices, double perplexity, double tol, int max_tries, std::size_t n_threads); -RcppExport SEXP _smallvis_find_beta_knn_cpp(SEXP knn_distancesSEXP, SEXP knn_indicesSEXP, SEXP perplexitySEXP, SEXP tolSEXP, SEXP max_triesSEXP, SEXP n_threadsSEXP) { +List find_beta_knn_cpp(const NumericMatrix& knn_distances, const IntegerMatrix& knn_indices, double perplexity, double tol, int 
max_tries, bool ret_sparse, std::size_t n_threads);
+RcppExport SEXP _smallvis_find_beta_knn_cpp(SEXP knn_distancesSEXP, SEXP knn_indicesSEXP, SEXP perplexitySEXP, SEXP tolSEXP, SEXP max_triesSEXP, SEXP ret_sparseSEXP, SEXP n_threadsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
@@ -99,8 +133,9 @@ BEGIN_RCPP
     Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP);
     Rcpp::traits::input_parameter< double >::type tol(tolSEXP);
     Rcpp::traits::input_parameter< int >::type max_tries(max_triesSEXP);
+    Rcpp::traits::input_parameter< bool >::type ret_sparse(ret_sparseSEXP);
     Rcpp::traits::input_parameter< std::size_t >::type n_threads(n_threadsSEXP);
-    rcpp_result_gen = Rcpp::wrap(find_beta_knn_cpp(knn_distances, knn_indices, perplexity, tol, max_tries, n_threads));
+    rcpp_result_gen = Rcpp::wrap(find_beta_knn_cpp(knn_distances, knn_indices, perplexity, tol, max_tries, ret_sparse, n_threads));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -121,13 +156,15 @@ END_RCPP
 }

 static const R_CallMethodDef CallEntries[] = {
+    {"_smallvis_bh_tsne_gradient_cpp", (DL_FUNC) &_smallvis_bh_tsne_gradient_cpp, 7},
+    {"_smallvis_bh_plogq_cpp", (DL_FUNC) &_smallvis_bh_plogq_cpp, 7},
     {"_smallvis_dist2_cpp", (DL_FUNC) &_smallvis_dist2_cpp, 2},
     {"_smallvis_dist_cpp", (DL_FUNC) &_smallvis_dist_cpp, 2},
     {"_smallvis_tweight_cpp", (DL_FUNC) &_smallvis_tweight_cpp, 2},
     {"_smallvis_d2_to_tweight_cpp", (DL_FUNC) &_smallvis_d2_to_tweight_cpp, 2},
     {"_smallvis_tsne_grad_cpp", (DL_FUNC) &_smallvis_tsne_grad_cpp, 5},
     {"_smallvis_mmds_grad_cpp", (DL_FUNC) &_smallvis_mmds_grad_cpp, 5},
-    {"_smallvis_find_beta_knn_cpp", (DL_FUNC) &_smallvis_find_beta_knn_cpp, 6},
+    {"_smallvis_find_beta_knn_cpp", (DL_FUNC) &_smallvis_find_beta_knn_cpp, 7},
     {"_smallvis_find_beta_cpp", (DL_FUNC) &_smallvis_find_beta_cpp, 5},
     {NULL, NULL, 0}
 };
diff --git a/smallvis/src/bh.cpp b/smallvis/src/bh.cpp
new file mode 100644
index 0000000..a246ecc
--- /dev/null
+++ b/smallvis/src/bh.cpp
@@ -0,0 +1,40 @@
+#include "bh.h"
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// [[Rcpp::export]]
+NumericMatrix bh_tsne_gradient_cpp(IntegerVector indices, IntegerVector indptr,
+                                   NumericVector P_data,
+                                   NumericMatrix embedding, double theta = 0.5,
+                                   double eps = 1e-16, int n_threads = 1) {
+  std::vector<std::size_t> indices_cpp = as<std::vector<std::size_t>>(indices);
+  std::vector<std::size_t> indptr_cpp = as<std::vector<std::size_t>>(indptr);
+  std::vector<double> P_data_cpp = as<std::vector<double>>(P_data);
+
+  std::vector<double> embedding_vec =
+      as<std::vector<double>>(transpose(embedding));
+
+  std::vector<double> gradient_cpp =
+      smallvis::bh_tsne_gradient(indices_cpp, indptr_cpp, P_data_cpp,
+                                 embedding_vec, theta, eps, n_threads);
+
+  std::size_t n_samples = embedding.nrow();
+  NumericMatrix gradient(2, n_samples, gradient_cpp.begin());
+  return transpose(gradient);
+}
+
+// [[Rcpp::export]]
+double bh_plogq_cpp(IntegerVector indices, IntegerVector indptr,
+                    NumericVector P_data, NumericMatrix embedding,
+                    double theta = 0.5, double eps = 1e-16, int n_threads = 1) {
+  std::vector<std::size_t> indices_cpp = as<std::vector<std::size_t>>(indices);
+  std::vector<std::size_t> indptr_cpp = as<std::vector<std::size_t>>(indptr);
+  std::vector<double> P_data_cpp = as<std::vector<double>>(P_data);
+
+  std::vector<double> embedding_vec =
+      as<std::vector<double>>(transpose(embedding));
+
+  return smallvis::bh_plogq(indices_cpp, indptr_cpp, P_data_cpp, embedding_vec,
+                            theta, eps, n_threads);
+}
diff --git a/smallvis/src/bh.h b/smallvis/src/bh.h
new file mode 100644
index 0000000..6d7519f
--- /dev/null
+++ b/smallvis/src/bh.h
@@ -0,0 +1,369 @@
+// BSD 3-Clause License
+//
+// Copyright (c) 2024, James Melville
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// * Neither the name of the copyright holder nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+// This is a rough translation of the Barnes-Hut t-SNE Cython code from the
+// OpenTSNE package originally written by Pavlin Poličar. The original code can
+// be found at https://github.com/pavlin-policar/openTSNE
+
+#ifndef SMALLVIS_BH_H
+#define SMALLVIS_BH_H
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "threads.h"
+
+namespace smallvis {
+
+class QuadTree {
+public:
+  // Node structure
+  struct Node {
+    // Center of mass of points within this node
+    double center_of_mass_x = 0.0;
+    double center_of_mass_y = 0.0;
+    std::size_t num_points = 0;
+    Node *children[4] = {nullptr, nullptr, nullptr, nullptr};
+
+    Node(double center_x, double center_y, double length) noexcept
+        : center_x(center_x), center_y(center_y), length(length),
+          length2(length * length) {}
+
+    ~Node() noexcept {
+      for (Node *child : children) {
+        delete child;
+      }
+    }
+
+    bool is_close(double point_x, double point_y, double eps) const noexcept {
+      return std::abs(center_of_mass_x - point_x) < eps &&
+             std::abs(center_of_mass_y - point_y) < eps;
+    }
+
+    bool is_skippable(double point_x, double point_y,
+                      double eps) const noexcept {
+      return num_points == 0 || (is_leaf && is_close(point_x, point_y, eps));
+    }
+
+    // Barnes-Hut criterion rearranged for squared distances and squared theta
+    // to avoid square roots and divisions
+    bool is_summary(double d2, double theta2) const noexcept {
+      return is_leaf || length2 < d2 * theta2;
+    }
+
+    void add_point(double point_x, double point_y, double eps) {
+      if ((is_leaf && num_points == 0) || is_close(point_x, point_y, eps)) {
+        update_center_of_mass(point_x, point_y);
+        return;
+      }
+
+      if (is_leaf) {
+        split();
+        std::size_t qid = find_quadrant(center_of_mass_x, center_of_mass_y);
+        children[qid]->add_point(center_of_mass_x, center_of_mass_y, eps);
+      }
+
+      update_center_of_mass(point_x, point_y);
+      std::size_t qid = find_quadrant(point_x, point_y);
+      children[qid]->add_point(point_x, point_y, eps);
+    }
+
+  private:
+    // Center of the node
+    double center_x;
+    double center_y;
+    // (Longest) side length of the node
+    double length;
+    // length squared used in Barnes-Hut criterion
+    double length2;
+    bool is_leaf = true;
+
+    std::size_t find_quadrant(double point_x, double point_y) const noexcept {
+      // Determine which quadrant the point is in -- 0: SW, 1: SE, 2: NW, 3: NE
+      if (point_x <= center_x) {
+        return point_y <= center_y ? 0 : 2;
+      }
+      return point_y <= center_y ? 1 : 3;
+    }
+
+    void update_center_of_mass(double point_x, double point_y) noexcept {
+      center_of_mass_x =
+          (center_of_mass_x * num_points + point_x) / (num_points + 1);
+      center_of_mass_y =
+          (center_of_mass_y * num_points + point_y) / (num_points + 1);
+      num_points += 1;
+    }
+
+    void split() {
+      is_leaf = false;
+      double new_len = length * 0.5;
+      double half_len = new_len * 0.5;
+
+      children[0] = new Node(center_x - half_len, center_y - half_len, new_len);
+      children[1] = new Node(center_x + half_len, center_y - half_len, new_len);
+      children[2] = new Node(center_x - half_len, center_y + half_len, new_len);
+      children[3] = new Node(center_x + half_len, center_y + half_len, new_len);
+    }
+  };
+  std::unique_ptr<Node> root;
+
+  QuadTree(const std::vector<double> &data, double eps = EPS) {
+    double coords_min_x = std::numeric_limits<double>::max();
+    double coords_max_x = std::numeric_limits<double>::lowest();
+    double coords_min_y = std::numeric_limits<double>::max();
+    double coords_max_y = std::numeric_limits<double>::lowest();
+
+    const std::size_t n_points = data.size() / 2;
+    for (std::size_t i = 0; i < n_points * 2; i += 2) {
+      double x = data[i];
+      double y = data[i + 1];
+
+      coords_min_x = std::min(coords_min_x, x);
+      coords_max_x = std::max(coords_max_x, x);
+      coords_min_y = std::min(coords_min_y, y);
+      coords_max_y = std::max(coords_max_y, y);
+    }
+
+    double center_x = (coords_max_x + coords_min_x) * 0.5;
+    double center_y = (coords_max_y + coords_min_y) * 0.5;
+    double length =
+        std::max(coords_max_x - coords_min_x, coords_max_y - coords_min_y);
+
+    root = std::make_unique<Node>(center_x, center_y, length);
+    for (std::size_t i = 0; i < n_points * 2; i += 2) {
+      root->add_point(data[i], data[i + 1], eps);
+    }
+  }
+
+private:
+  constexpr static double EPS = std::numeric_limits<double>::epsilon();
+};
+
+void _bh_tsne_negative_gradient_single(const QuadTree::Node &node,
+                                       double point_x, double point_y,
+                                       double theta2, double eps,
+                                       double &gradient_x, double &gradient_y,
+                                       double &Zi) {
+  // Ensure we do not process empty nodes or simple self-interactions
+  if (node.is_skippable(point_x, point_y, eps)) {
+    return;
+  }
+
+  // Compute the squared Euclidean distance between the point and the center of
+  // mass
+  double diff_x = node.center_of_mass_x - point_x;
+  double diff_y = node.center_of_mass_y - point_y;
+  double d2_ij = eps + diff_x * diff_x + diff_y * diff_y;
+
+  // Check if we can use this node as a summary
+  if (node.is_summary(d2_ij, theta2)) {
+    double w_ij = 1.0 / (1.0 + d2_ij);
+
+    Zi += node.num_points * w_ij;
+
+    // un-normalized -ve grad W*W*dY
+    w_ij *= w_ij;
+    gradient_x += node.num_points * w_ij * diff_x;
+    gradient_y += node.num_points * w_ij * diff_y;
+
+    return;
+  }
+
+  // Recursively apply Barnes-Hut to the children
+  for (const auto &child : node.children) {
+    _bh_tsne_negative_gradient_single(*child, point_x, point_y, theta2, eps,
+                                      gradient_x, gradient_y, Zi);
+  }
+}
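+
+// To see why is_summary above encodes the usual Barnes-Hut criterion: the
+// textbook acceptance test for summarizing a cell as a single point is
+//
+//   s / d < theta
+//
+// where s is the cell side length and d is the distance from the query point
+// to the cell's center of mass. Squaring both sides and rearranging gives
+//
+//   s^2 < d^2 * theta^2
+//
+// which is the `length2 < d2 * theta2` comparison, with no square root or
+// division needed. An accepted cell of N points then contributes N * w to Z
+// and N * w^2 * (y_i - y_cell) to the un-normalized repulsive gradient, with
+// w = 1 / (1 + d^2), as computed in _bh_tsne_negative_gradient_single above.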
+
+// Function to estimate the negative gradient using the Barnes-Hut
+// approximation
+void bh_tsne_negative_gradient(const QuadTree &tree,
+                               const std::vector<double> &embedding,
+                               double theta2, double eps,
+                               std::size_t n_threads,
+                               std::vector<double> &gradient) {
+  std::size_t num_points = embedding.size() / 2;
+  std::vector<double> Zi(std::max(static_cast<std::size_t>(1), n_threads), 0.0);
+
+  const QuadTree::Node *root = tree.root.get();
+
+  // Function to calculate the negative gradient for a single point
+  auto worker = [&](std::size_t start, std::size_t end, std::size_t thread_id) {
+    for (std::size_t i = start * 2; i < end * 2; i += 2) {
+      _bh_tsne_negative_gradient_single(*root, embedding[i], embedding[i + 1],
+                                        theta2, eps, gradient[i],
+                                        gradient[i + 1], Zi[thread_id]);
+    }
+  };
+  parallel_for(num_points, n_threads, worker);
+
+  double Z = std::accumulate(Zi.begin(), Zi.end(), eps);
+
+  // Normalize the gradient
+  for (std::size_t i = 0; i < num_points * 2; i += 2) {
+    gradient[i] /= Z;
+    gradient[i + 1] /= Z;
+  }
+}
+
+void bh_tsne_positive_gradient(const std::vector<std::size_t> &indices,
+                               const std::vector<std::size_t> &indptr,
+                               const std::vector<double> &P_data,
+                               const std::vector<double> &embedding,
+                               std::size_t n_threads,
+                               std::vector<double> &gradient) {
+
+  auto worker = [&](std::size_t start, std::size_t end) {
+    for (std::size_t i = start; i < end; ++i) {
+      const std::size_t i2 = i * 2;
+      for (std::size_t k = indptr[i]; k < indptr[i + 1]; ++k) {
+        std::size_t j2 = indices[k] * 2;
+
+        // Compute the direction of the points' attraction and the squared
+        // Euclidean distance
+        double diff_x = embedding[i2] - embedding[j2];
+        double diff_y = embedding[i2 + 1] - embedding[j2 + 1];
+        double w_ij_p_ij =
+            P_data[k] / (1.0 + diff_x * diff_x + diff_y * diff_y);
+
+        // Compute F_{attr} of point `j` on point `i`
+        // W x P x dY
+        gradient[i2] += w_ij_p_ij * diff_x;
+        gradient[i2 + 1] += w_ij_p_ij * diff_y;
+      }
+    }
+  };
+  parallel_for(gradient.size() / 2, n_threads, worker);
+}
+
+void _bh_Zi(const QuadTree::Node &node, double point_x, double point_y,
+            double theta2, double eps, double &Zi) {
+  // Ensure we do not process empty nodes or simple self-interactions
+  if (node.is_skippable(point_x, point_y, eps)) {
+    return;
+  }
+
+  // Compute the squared Euclidean distance between the point and the center of
+  // mass
+  double diff_x = node.center_of_mass_x - point_x;
+  double diff_y = node.center_of_mass_y - point_y;
+  double d2_ij = eps + diff_x * diff_x + diff_y * diff_y;
+
+  // Check if we can use this node as a summary
+  if (node.is_summary(d2_ij, theta2)) {
+    Zi += node.num_points / (1.0 + d2_ij);
+    return;
+  }
+
+  // Recursively apply Barnes-Hut to the children
+  for (const auto &child : node.children) {
+    _bh_Zi(*child, point_x, point_y, theta2, eps, Zi);
+  }
+}
+
+double bh_Z(const QuadTree &tree, const std::vector<double> &embedding,
+            double theta2, double eps, std::size_t n_threads) {
+  std::size_t num_points = embedding.size() / 2;
+  std::vector<double> Zi(std::max(static_cast<std::size_t>(1), n_threads), 0.0);
+
+  const QuadTree::Node *root = tree.root.get();
+
+  // Function to accumulate the per-thread partial sums of Z
+  auto worker = [&](std::size_t start, std::size_t end, std::size_t thread_id) {
+    for (std::size_t i = start * 2; i < end * 2; i += 2) {
+      _bh_Zi(*root, embedding[i], embedding[i + 1], theta2, eps, Zi[thread_id]);
+    }
+  };
+  parallel_for(num_points, n_threads, worker);
+
+  return std::accumulate(Zi.begin(), Zi.end(), eps);
+}
+
+double bh_plogq(const std::vector<std::size_t> &indices,
+                const std::vector<std::size_t> &indptr,
+                const std::vector<double> &P_data,
+                const std::vector<double> &embedding, double theta, double eps,
+                std::size_t n_threads) {
+  std::size_t n_samples = embedding.size() / 2;
+
+  QuadTree tree(embedding, eps);
+  double mlogZ =
+      -std::log(bh_Z(tree, embedding, theta * theta, eps, n_threads));
+
+  std::vector<double> partial_plogq(
+      std::max(static_cast<std::size_t>(1), n_threads));
+  auto worker = [&](std::size_t start, std::size_t end, std::size_t thread_id) {
+    double window_plogq = 0.0;
+    for (std::size_t i = start, i2 = start * 2; i < end; ++i, i2 += 2) {
+      for (std::size_t k = indptr[i]; k < indptr[i + 1]; ++k) {
+        std::size_t j2 = indices[k] * 2;
+
+        // p * log(q) = p * log(w/Z) = p * (log w - log Z) =
+        // p * (log(1/(1+d2)) - log Z) = p * (-log(1+d2) - log Z)
+        double diff_x = embedding[i2] - embedding[j2];
+        double diff_y = embedding[i2 + 1] - embedding[j2 + 1];
+
+        window_plogq +=
+            P_data[k] *
+            (mlogZ - std::log(1.0 + diff_x * diff_x + diff_y * diff_y));
+      }
+    }
+    partial_plogq[thread_id] += window_plogq;
+  };
+  parallel_for(n_samples, n_threads, worker);
+
+  return std::accumulate(partial_plogq.begin(), partial_plogq.end(), 0.0);
+}
+
+std::vector<double> bh_tsne_gradient(const std::vector<std::size_t> &indices,
+                                     const std::vector<std::size_t> &indptr,
+                                     const std::vector<double> &P_data,
+                                     const std::vector<double> &embedding,
+                                     double theta, double eps,
+                                     std::size_t n_threads) {
+
+  QuadTree tree(embedding, eps);
+  std::vector<double> gradient(embedding.size(), 0.0);
+
+  // don't re-arrange the order of the following two function calls! The
+  // negative pass divides the accumulated values by Z, so the positive
+  // contributions must be added afterwards.
+  bh_tsne_negative_gradient(tree, embedding, theta * theta, eps, n_threads,
+                            gradient);
+  bh_tsne_positive_gradient(indices, indptr, P_data, embedding, n_threads,
+                            gradient);
+
+  return gradient;
+}
+} // namespace smallvis
+
+#endif // SMALLVIS_BH_H
diff --git a/smallvis/src/gradients.cpp b/smallvis/src/gradients.cpp
index b0e4062..69ef34c 100644
--- a/smallvis/src/gradients.cpp
+++ b/smallvis/src/gradients.cpp
@@ -10,7 +10,7 @@ void mmds_grad(const std::vector<double> &R, const std::vector<double> &D,
                const std::vector<double> &Y, double eps,
                std::vector<double> &gradient, std::size_t start,
                std::size_t end, std::size_t n, std::size_t d) {
-  
+
   for (std::size_t i = start; i < end; ++i) {
     const std::size_t i_d = i * d;
     const std::size_t i_n = i * n;
@@ -27,14 +27,13 @@ void mmds_grad(const std::vector<double> &R, const std::vector<double> &D,
   }
 }

-
 void tsne_grad(const std::vector<double> &P, const std::vector<double> &W,
                double Z, const std::vector<double> &Y,
                std::vector<double> &gradient, std::size_t start,
                std::size_t end, std::size_t n) {
   const double Z4 = 4.0 / Z;
-  
+
   for (std::size_t i = start; i < end; ++i) {
     const std::size_t i_d = i + i;
     const std::size_t i_n = i * n;
@@ -256,10 +255,9 @@ NumericMatrix d2_to_tweight_cpp(NumericMatrix dist_matrix, int n_threads) {
 }

 // [[Rcpp::export]]
-NumericMatrix tsne_grad_cpp(const NumericMatrix &P,
-                            const NumericMatrix &W, double Z,
-                            const NumericMatrix &Y,
-                            std::size_t n_threads) {
+NumericMatrix tsne_grad_cpp(const NumericMatrix &P, const NumericMatrix &W,
+                            double Z, const NumericMatrix &Y,
+                            std::size_t n_threads) {
   std::size_t n = Y.nrow();

   std::vector<double> P_vec(P.begin(), P.end());
@@ -300,35 +298,33 @@ NumericMatrix tsne_grad_cpp(const NumericMatrix &P,
 }

 // [[Rcpp::export]]
-NumericMatrix mmds_grad_cpp(const NumericMatrix &R,
-                            const NumericMatrix &D,
-                            const NumericMatrix &Y,
-                            double eps,
+NumericMatrix mmds_grad_cpp(const NumericMatrix &R, const NumericMatrix &D,
+                            const NumericMatrix &Y, double eps,
                             std::size_t n_threads) {
   std::size_t n = Y.nrow();
   std::size_t d = Y.ncol();
-  
+
   std::vector<double> R_vec(R.begin(), R.end());
   std::vector<double> D_vec(D.begin(), D.end());
-  
+
   std::vector<double> Y_vec(n * d);
   for (std::size_t i = 0; i < n; ++i) {
     for (std::size_t j = 0; j < d; ++j) {
       Y_vec[i * d + j] = Y(i, j);
     }
   }
-  
+
   std::vector<double> gradient_vec(n * d, 0.0);
-  
+
   if (n_threads > 1) {
     std::size_t chunk_size = (n + n_threads - 1) / n_threads;
     std::vector<std::thread> threads;
     for (std::size_t t = 0; t < n_threads; ++t) {
       std::size_t start_row = t * chunk_size;
       std::size_t end_row = std::min(start_row + chunk_size, n);
-      threads.emplace_back(mmds_grad, std::cref(R_vec), std::cref(D_vec),
-                           std::cref(Y_vec), eps, std::ref(gradient_vec), start_row,
-                           end_row, n, d);
+      threads.emplace_back(mmds_grad, std::cref(R_vec), std::cref(D_vec),
+                           std::cref(Y_vec), eps, std::ref(gradient_vec),
+                           start_row, end_row, n, d);
     }
     for (auto &thread : threads) {
       thread.join();
@@ -336,16 +332,13 @@ NumericMatrix mmds_grad_cpp(const NumericMatrix &R,
   } else {
     mmds_grad(R_vec, D_vec, Y_vec, eps, gradient_vec, 0, n, n, d);
   }
-  
+
   NumericMatrix gradient(n, d);
   for (std::size_t i = 0; i < n; ++i) {
     for (std::size_t j = 0; j < d; ++j) {
       gradient(i, j) = gradient_vec[i * d + j];
     }
   }
-  
+
   return gradient;
 }
-
-
-
diff --git a/smallvis/src/perplexity.cpp b/smallvis/src/perplexity.cpp
index af7c85d..0acdf66 100644
--- a/smallvis/src/perplexity.cpp
+++ b/smallvis/src/perplexity.cpp
@@ -195,7 +195,7 @@ void find_beta(const std::vector<double> &data, std::size_t n, std::size_t d,
 List find_beta_knn_cpp(const NumericMatrix &knn_distances,
                        const IntegerMatrix &knn_indices,
                        double perplexity = 15, double tol = 1e-5, int max_tries = 50,
-                       std::size_t n_threads = 1) {
+                       bool ret_sparse = false, std::size_t n_threads = 1) {

   const std::size_t n = knn_distances.nrow();
   const std::size_t k = knn_distances.ncol();
@@ -232,15 +232,21 @@ List find_beta_knn_cpp(const NumericMatrix &knn_distances,
                   beta, bad_perp, 0, n);
   }

-  NumericMatrix P(n, n);
-  for (std::size_t i = 0; i < n; ++i) {
-    for (std::size_t j = 0; j < k; ++j) {
-      P(i, knn_indices(i, j) - 1) = W[i * k + j];
+  if (ret_sparse) {
+    NumericVector P(n * k);
+    std::copy(W.begin(), W.end(), P.begin());
+    return List::create(Named("P") = P, Named("beta") = beta,
+                        Named("bad_perp") = bad_perp);
+  } else {
+    NumericMatrix P(n, n);
+    for (std::size_t i = 0; i < n; ++i) {
+      for (std::size_t j = 0; j < k; ++j) {
+        P(i, knn_indices(i, j) - 1) = W[i * k + j];
+      }
     }
+    return List::create(Named("P") = P, Named("beta") = beta,
+                        Named("bad_perp") = bad_perp);
   }
-
-  return List::create(Named("P") = P, Named("beta") = beta,
-                      Named("bad_perp") = bad_perp);
 }

 // [[Rcpp::export]]
diff --git a/smallvis/src/threads.h b/smallvis/src/threads.h
new file mode 100644
index 0000000..d7c9c6e
--- /dev/null
+++ b/smallvis/src/threads.h
@@ -0,0 +1,65 @@
+#ifndef SMALLVIS_THREADS_H
+#define SMALLVIS_THREADS_H
+
+#include <functional>
+#include <thread>
+
+// Trait to detect the arity of the worker (2- or 3- arg)
+template <typename T> struct function_traits;
+
+template <typename R, typename... Args>
+struct function_traits<std::function<R(Args...)>> {
+  static constexpr std::size_t arity = sizeof...(Args);
+};
+
+template <typename T> auto make_function(T &&t) {
+  return std::function{std::forward<T>(t)};
+}
+
+// Enable if the callable has 3 arguments (start, end, thread_id)
+template <typename WorkerFunc>
+std::enable_if_t<function_traits<decltype(make_function(
+    std::declval<WorkerFunc>()))>::arity == 3>
+parallel_for(std::size_t N, std::size_t n_threads, WorkerFunc worker) {
+  if (n_threads > 1) {
+    std::size_t chunk_size = N / n_threads;
+    std::vector<std::thread> threads;
+
+    for (std::size_t t = 0; t < n_threads; ++t) {
+      std::size_t start = t * chunk_size;
+      std::size_t end = (t == n_threads - 1) ? N : (t + 1) * chunk_size;
+      threads.emplace_back(worker, start, end, t);
+    }
+
+    for (auto &t : threads) {
+      t.join();
+    }
+  } else {
+    worker(0, N, 0);
+  }
+}
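+
+// A hypothetical usage sketch of the 3-arg form (not part of this header):
+// each thread accumulates into its own slot of a per-thread buffer, indexed
+// by thread_id, so no locking is needed; `values` is a stand-in for any
+// std::vector<double>:
+//
+//   std::vector<double> partial(std::max<std::size_t>(1, n_threads), 0.0);
+//   parallel_for(values.size(), n_threads,
+//                [&](std::size_t start, std::size_t end, std::size_t tid) {
+//                  for (std::size_t i = start; i < end; ++i) {
+//                    partial[tid] += values[i];
+//                  }
+//                });
+//   double total = std::accumulate(partial.begin(), partial.end(), 0.0);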
+
+// Enable if the callable has 2 arguments (start, end)
+template <typename WorkerFunc>
+std::enable_if_t<function_traits<decltype(make_function(
+    std::declval<WorkerFunc>()))>::arity == 2>
+parallel_for(std::size_t N, std::size_t n_threads, WorkerFunc worker) {
+  if (n_threads > 1) {
+    std::size_t chunk_size = N / n_threads;
+    std::vector<std::thread> threads;
+
+    for (std::size_t t = 0; t < n_threads; ++t) {
+      std::size_t start = t * chunk_size;
+      std::size_t end = (t == n_threads - 1) ? N : (t + 1) * chunk_size;
+      threads.emplace_back(worker, start, end);
+    }
+
+    for (auto &t : threads) {
+      t.join();
+    }
+  } else {
+    worker(0, N);
+  }
+}
+
+#endif // SMALLVIS_THREADS_H
\ No newline at end of file
diff --git a/smallvis/tests/testthat/helper_api.R b/smallvis/tests/testthat/helper_api.R
index 6b679ad..d4ad980 100644
--- a/smallvis/tests/testthat/helper_api.R
+++ b/smallvis/tests/testthat/helper_api.R
@@ -1,7 +1,7 @@
-expect_api <- function(method, Y, cost, X = iris10, use_cpp = FALSE) {
+expect_api <- function(method, Y, cost, X = iris10, use_cpp = FALSE, perplexity = 5) {
   res <- smallvis(X, Y_init = iris10_Y, method = method, eta = 0.1,
-                  perplexity = 5, epoch_callback = NULL, verbose = FALSE,
+                  perplexity = perplexity, epoch_callback = NULL, verbose = FALSE,
                   ret_extra = TRUE, use_cpp = use_cpp)
   expect_equal(res$Y, c2y(Y), tolerance = 1e-3, info = paste0(method[[1]], " Y"))
   expect_equal(final_cost(res), cost, tolerance = 1e-4, info = paste0(method[[1]], " cost"))
diff --git a/smallvis/tests/testthat/test_api.R b/smallvis/tests/testthat/test_api.R
index 1ae7e7e..bc1d4cd 100644
--- a/smallvis/tests/testthat/test_api.R
+++ b/smallvis/tests/testthat/test_api.R
@@ -519,7 +519,17 @@ test_that("Miscellany", {
   expect_api(method = "tsne", Y = c(-4.617, 2.008, -0.8907, 4.044, 0.6146,
                                     -3.478, 0.4454, 1.967, 4.82, -4.913, 0.5231, 0.837, 0.02866, -1.871,
                                     -1.438, 1.273, 0.6842, -1.849, -0.6084, 2.42), X = ui10, use_cpp = TRUE,
-             cost = 0.02485) 
+             cost = 0.02485)
+  expect_api(method = list("bhtsne", perplexity = 3, inp_kernel = "perpnnks", theta = 0.0),
+             Y = c(-12.52, 4.749, -1.071, 10.54, 2.669, -10.44, 1.578, 5.019,
+                   13.21, -13.73, 3.153, 3.662, 1.081, -7.454, -3.565, 4.962,
+                   2.446, -5.197, -6.536, 7.447), X = ui10, use_cpp = TRUE, perplexity = 3,
+             cost = 0.05768)
+  expect_api(method = list("bhtsne", perplexity = 3, inp_kernel = "perpnnks", theta = 0.5),
+             Y = c(-11.78, 4.035, -1.215, 10.35, 2.76, -9.966, 1.177, 5.08,
+                   12.8, -13.23, 2.413, 3.66, 0.7804, -6.291, -3.386, 4.257,
+                   2.233, -4.78, -5.25, 6.362), X = ui10, use_cpp = TRUE, perplexity = 3,
+             cost = 0.05564)
 })


From 25fb184cf2311ea1118d117f7c21d7dec8924956 Mon Sep 17 00:00:00 2001
From: James Melville
Date: Sun, 11 Aug 2024 21:24:59 -0700
Subject: [PATCH 24/26] Add license to threading functions

---
 smallvis/src/threads.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/smallvis/src/threads.h b/smallvis/src/threads.h
index d7c9c6e..af38696 100644
--- a/smallvis/src/threads.h
+++ b/smallvis/src/threads.h
@@ -1,9 +1,42 @@
+// BSD 3-Clause License
+//
+// Copyright (c) 2024, James Melville
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// * Neither the name of the copyright holder nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
 #ifndef SMALLVIS_THREADS_H
 #define SMALLVIS_THREADS_H
 
 #include <functional>
 #include <thread>
 
+namespace smallvis {
+
 // Trait to detect the arity of the worker (2- or 3- arg)
 template <typename T> struct function_traits;
 
@@ -61,5 +94,6 @@ parallel_for(std::size_t N, std::size_t n_threads, WorkerFunc worker) {
     worker(0, N);
   }
 }
+} // namespace smallvis
 
 #endif // SMALLVIS_THREADS_H
\ No newline at end of file

From aea582f9feb4eceb7a921566303992eb2b1dc2dd Mon Sep 17 00:00:00 2001
From: James Melville
Date: Sun, 11 Aug 2024 21:26:07 -0700
Subject: [PATCH 25/26] Mention initial Barnes-Hut and related code

---
 README.md | 80 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 887b60d..d18cd49 100644
--- a/README.md
+++ b/README.md
@@ -2,28 +2,56 @@
 
 An R package for small-scale dimensionality reduction using
 neighborhood-preservation
-dimensionality reduction methods, including [t-Distributed Stochastic Neighbor Embedding](https://lvdmaaten.github.io/tsne/),
-[LargeVis](https://arxiv.org/abs/1602.00370) and
-[UMAP](https://arxiv.org/abs/1802.03426).
-
-LargeVis and UMAP are of particular interest because they seem to give
-visualizations which are very competitive with t-SNE, but can use stochastic
-gradient descent to give faster run times and/or better scaling with dataset
-size than the typical Barnes-Hut t-SNE implementation.
-
-This package is designed to make it easier to experiment with and compare these
-methods, by removing differences in implementation details.
-
-One way it does this is by abandoning the more advanced nearest-neighbor
-methods, distance approximations, sampling, and multi-threaded stochastic
-gradient descent techniques. The price paid for this simplification is that the
-algorithms are back to being O(N^2) in storage and computation costs (and being
-in pure R). Unlike UMAP,
-[the official implementation of LargeVis](https://github.com/lferry007/LargeVis),
-and the
-[Barnes-Hut implementation of t-SNE](https://github.com/lvdmaaten/bhtsne),
-this package is therefore *not* suitable for large scale visualization.
-Hence the name smallvis.
+dimensionality reduction methods, including
+[t-Distributed Stochastic Neighbor Embedding](https://lvdmaaten.github.io/tsne/),
+and a non-stochastic version of the [LargeVis](https://arxiv.org/abs/1602.00370)
+and [UMAP](https://arxiv.org/abs/1802.03426) cost functions.
+
+The purpose of this package is to make it easier to experiment with different
+dimensionality reduction methods while having more control over things like
+input scaling, nearest neighbor calculations, initialization and optimization,
+all of which can make comparisons between different packages difficult. Be
+warned: most implementations are not optimized for speed and scale as O(N^2),
+but see below for my dream to upgrade `smallvis` to more of a mediumvis.
+
+*August 11 2024* **The Turbo Championship Edition Update**. I have briefly
+brought `smallvis` back from the dead to speed it up a bit. I have added:
+
+* Barnes-Hut t-SNE. This scales to larger datasets and makes it feasible to
+run on the full MNIST digits dataset (i.e. 70,000 items). It uses a (2D only)
+C++ translation of the Cython implementation in the Python
+[openTSNE](https://github.com/pavlin-policar/openTSNE) package originally
+authored by [Pavlin Poličar](https://github.com/pavlin-policar). It's BSD
+3-clause licensed (and can be found in `src/bh.h`). Use it with
+`method = "bhtsne"`. The degree of approximation can be controlled with
+`theta`. Be aware that it's not as fast as e.g.
+[Rtsne](https://github.com/jkrijthe/Rtsne), at least during the optimization
+step. That package uses Laurens van der Maaten's original C++ code, which I am
+not at all sure can be redistributed with R code due to its BSD 4-clause
+license. I would love to be wrong about that though! The Quad Tree
+implementation could be used with other embedding methods, but I haven't got
+round to implementing that yet.
+* A multi-threaded C++ perplexity search, using only the nearest neighbors of
+each point (3 times the perplexity), is used with BH t-SNE.
+* My own [rnndescent](https://cran.r-project.org/package=rnndescent) package
+replaces FNN for nearest neighbor search. Apart from being a monument to my
+ego, it can be faster than FNN for brute force search because it can be
+multi-threaded (use `n_threads` to control this). Also, approximate nearest
+neighbor search becomes quite important with larger datasets.
+* For exact search, I have started adding multi-threaded C++ code to calculate
+the gradient. Set `use_cpp = TRUE` to use this (see the example below). The
+speed-up is not as big as you might hope, because the R code uses some very
+optimized linear algebra for some steps, which will blow my puny C++ code out
+of the water. However, in many cases the linear algebra libraries won't be
+multi-threaded, so sheer brute-force threading can overcome this. Just don't
+expect `n` threads to give you `n` times the speed. Like Barnes-Hut, this
+requires the gradient for each method to be implemented in C++, and I haven't
+done that for all of them yet.
+* [irlba](https://cran.r-project.org/package=irlba) is now a dependency for
+doing PCA on larger datasets.
+
+I will probably at least attempt to apply Barnes-Hut and multi-threading to
+some other methods. On the other hand, it's taken me five years to get back to
+this, so don't hold your breath.
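+
+For exact methods, the new speed-related settings combine like this; a minimal
+sketch (the result names, dataset and parameter values are illustrative only,
+not tuned recommendations):
+
+```R
+library(smallvis)
+
+# Exact t-SNE, with the C++ gradient spread over 4 threads
+tsne_cpp <- smallvis(iris, perplexity = 25, use_cpp = TRUE, n_threads = 4)
+
+# Barnes-Hut t-SNE: theta trades accuracy for speed (theta = 0 is exact)
+bhtsne_cpp <- smallvis(iris, perplexity = 25, method = "bhtsne", theta = 0.5,
+                       n_threads = 4)
+```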
## Prerequisites @@ -61,6 +89,10 @@ library(smallvis) # Automatically plots the results during optimization tsne_iris <- smallvis(iris, perplexity = 25, verbose = TRUE) +# Barnes-Hut: +bhtsne_iris <- smallvis(iris, perplexity = 25, method = "bhtsne", theta = 0.8) + + # Using a custom epoch_callback uniq_spec <- unique(iris$Species) colors <- rainbow(length(uniq_spec)) @@ -205,7 +237,7 @@ scale. Also of relevance are: * [UMAP](https://github.com/lmcinnes/umap) (in Python) -* [UWOT](https://github.com/jlmelville/uwot) a package implementing LargeVis +* [uwot](https://github.com/jlmelville/uwot) a package implementing LargeVis and UMAP. * [LargeVis](https://github.com/lferry007/LargeVis) (in C++) * [Spectra](http://spectralib.org/), the C++ library that RSpectra wraps. @@ -221,5 +253,5 @@ of Justin Donaldson's R package for [t-SNE](https://cran.r-project.org/package=t [GPLv2 or later](https://www.gnu.org/licenses/gpl-2.0.txt). Any LargeVis-specific code (e.g. cost and gradient calculation) can also be considered [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0). -Similarly, UMAP-related code is also licensed as +Similarly, Barnes-Hut and UMAP-related code is also licensed as [BSD 3-clause](https://opensource.org/licenses/BSD-3-Clause). From 18725dbb26dcff9c902f69c0f4ec40e733c1abb2 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sun, 11 Aug 2024 21:26:34 -0700 Subject: [PATCH 26/26] Time for a new version --- smallvis/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smallvis/DESCRIPTION b/smallvis/DESCRIPTION index 7470b56..1b1e135 100644 --- a/smallvis/DESCRIPTION +++ b/smallvis/DESCRIPTION @@ -1,7 +1,7 @@ Package: smallvis Type: Package Title: Small Scale Neighborhood Embedding Algorithms -Version: 0.0.1.9001 +Version: 0.0.2.9001 Authors@R: person("James", "Melville", email = "jlmelville@gmail.com", role = c("aut", "cre")) Description: Neighborhood embedding methods for small scale visualization, such
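
As a closing note on patch 24 above: `parallel_for` in `src/threads.h` splits
`[0, N)` into contiguous chunks and uses the arity trait to pick an overload,
so a worker only receives a thread id when it asks for one. A standalone
sketch of calling code follows. The worker lambdas and the sum reduction are
illustrative only, not package code; it assumes `threads.h` is on the include
path and that you link with `-pthread`:

```cpp
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

#include "threads.h" // smallvis::parallel_for and the arity-detection trait

int main() {
  const std::size_t N = 1000;
  const std::size_t n_threads = 4;
  std::vector<double> x(N, 1.0);

  // Three-argument worker (start, end, thread_id): the thread id selects a
  // per-thread accumulator, so the reduction needs no locking.
  std::vector<double> partial(n_threads, 0.0);
  smallvis::parallel_for(
      N, n_threads, [&](std::size_t start, std::size_t end, std::size_t t) {
        for (std::size_t i = start; i < end; ++i) {
          partial[t] += x[i];
        }
      });
  std::cout << "sum = "
            << std::accumulate(partial.begin(), partial.end(), 0.0) << "\n";

  // Two-argument worker (start, end): disjoint writes need no thread id, so
  // the trait dispatches to the two-argument overload automatically.
  std::vector<double> y(N);
  smallvis::parallel_for(N, n_threads,
                         [&](std::size_t start, std::size_t end) {
                           for (std::size_t i = start; i < end; ++i) {
                             y[i] = 2.0 * x[i];
                           }
                         });
  std::cout << "y[0] = " << y[0] << "\n";
}
```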