From 57830229824ad2cd75fdeb8044ad4caae06d045f Mon Sep 17 00:00:00 2001
From: huizezhang-sherry
Date: Thu, 23 May 2024 23:28:01 -0500
Subject: [PATCH] fixes

---
 DESCRIPTION            |  3 +-
 NAMESPACE              |  1 +
 R/calc-smoothness.R    |  8 ++---
 R/calc-squintability.R | 75 +++++++++++++++++++++++++++---------------
 R/ferrn-package.R      |  1 +
 man/data.Rd            |  2 +-
 man/optim.Rd           | 11 ++++---
 7 files changed, 65 insertions(+), 36 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8006c06..0a95f01 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -33,7 +33,8 @@ Imports:
     ggforce,
     tidyr,
     GpGp,
-    cli
+    cli,
+    progress
 RoxygenNote: 7.3.1
 Depends:
     R (>= 2.10)
diff --git a/NAMESPACE b/NAMESPACE
index ac57e12..ec6a494 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -51,6 +51,7 @@ importFrom(GpGp,fit_model)
 importFrom(cli,cli_abort)
 importFrom(ggplot2,"%+replace%")
 importFrom(magrittr,"%>%")
+importFrom(progress,progress_bar)
 importFrom(rlang,.data)
 importFrom(rlang,`:=`)
 importFrom(tidyr,unnest)
diff --git a/R/calc-smoothness.R b/R/calc-smoothness.R
index 38d42eb..02a882c 100644
--- a/R/calc-smoothness.R
+++ b/R/calc-smoothness.R
@@ -15,7 +15,7 @@ calc_smoothness <- function(idx, data = sine1000, n_basis = 300, n = 6, d = 2,
                             best = matrix(c(0, 0, 0, 0, 1, 0,
                                             0, 0, 0, 0, 0, 1), nrow = 6),
                             start_parms = c(0.001, 0.5, 2, 2),
-                            other_gp_params = list(NULL)
+                            other_gp_params = NULL
                             ){
 
   # sample basis
@@ -23,7 +23,7 @@ calc_smoothness <- function(idx, data = sine1000, n_basis = 300, n = 6, d = 2,
   set.seed(123)
   seed <- sample(1: 10000, size = n_basis)
   basis_df <- tibble::tibble(basis = lapply(1:n_basis, function(i){
-    set.seed(seed[i]); tourr::basis_random(n = n, d = d)}))
+    set.seed(seed[i]); tourr::basis_random(n = n, d = d)})) |>
     dplyr::rowwise() |>
     dplyr::mutate(proj_dist = tourr::proj_dist(best, basis),
                   index_val = get(idx)()(as.matrix(data) %*% basis))
@@ -34,13 +34,13 @@ calc_smoothness <- function(idx, data = sine1000, n_basis = 300, n = 6, d = 2,
     start_parms = start_parms, covfun_name = "matern_isotropic",
     other_gp_params
   )
-  fit <- do.call("GpGp::fit_model", gp_params)
+  fit <- do.call("fit_model", gp_params)
 
   cov_params <- tibble::as_tibble_row(fit$covparms, .name_repair = "unique")
   colnames(cov_params) <- c("variance", "range", "smoothness", "nugget", "convergence")
   cov_params <- cov_params |>
     dplyr::mutate(convergence = fit$conv, idx = as.character(idx))
 
   # return
-  list(basis_df = basis_df, gp_res = fit, measure = cov_params)
+  list(basis_df = basis_df, gp_res = list(fit), measure = cov_params)
 }
diff --git a/R/calc-squintability.R b/R/calc-squintability.R
index cb0750e..3d05e92 100644
--- a/R/calc-squintability.R
+++ b/R/calc-squintability.R
@@ -7,24 +7,26 @@
 #' @param bin_nobs_threshold numeric, only for squintability, the threshold
 #' number of observations for
 #' applying binning before fitting the kernel
-#' @param bin_size only for squintability, the bin size for binning the data
+#' @param bin_width only for squintability, the bin size for binning the data
 #' before fitting the kernel
+#' @param sampling_seed the seed used for sampling the random basis
 #' @rdname optim
 #' @export
 calc_squintability <- function(idx, data = sine1000,
-                               method = c("ks", "nls"), n_basis, n = 6, d = 2,
+                               method = c("ks", "nls"), n_basis = 100, n = 6, d = 2,
                                proj_dist_threshold = 1.5, step = 0.02,
                                best = matrix(c(0, 0, 0, 0, 1, 0,
                                                0, 0, 0, 0, 0, 1), nrow = 6),
-                               bin_nobs_threshold = 5000, bin_size = 0.02
+                               bin_nobs_threshold = 5000, bin_width = 0.02,
+                               sampling_seed = 123
                                ){
-
+  cli::cli_inform("sample random bases ...")
   ## sample basis
-  set.seed(123)
+  set.seed(sampling_seed)
   seed <- sample(1: 10000, size = 1000)
-  basis_lst <- list()
+  bb_lst <- list()
   i <- 1
-  while (length(basis_lst) < n_basis){
+  while (length(bb_lst) < n_basis){
     set.seed(seed[i])
     bb <- tourr::basis_random(n = n, d = d)
     if (tourr::proj_dist(best, bb) > proj_dist_threshold){
@@ -35,21 +37,37 @@ calc_squintability <- function(idx, data = sine1000,
 
   ## interpolate between the best and the random basis
   ## TODO: progress bar here
+  cli::cli_inform("interpolate between the best and the random bases ...")
   basis_df <- tibble::tibble(id = 1:n_basis) |>
     dplyr::mutate(res = lapply(bb_lst, function(bb){
       interp_bb_best(bb = bb, best = best, step = step)
     })) |>
     tidyr::unnest(res) |>
-    dplyr::rowwise() |>
-    dplyr::mutate(!!dplyr::sym(idx) := get(idx)()(data %*% basis)) |>
-    dplyr::ungroup()
+    unnest(dist)
+
+  df_add_idx_val <- function(data, idx, org_data){
+    pb$tick()
+    data |> dplyr::mutate(!!rlang::sym(idx) := get(idx)()(org_data %*% basis[[1]]))
+  }
+  cli::cli_inform("calculate index values for interpolated bases ...")
+  pb <- progress::progress_bar$new(total = nrow(basis_df))
+  basis_df <- basis_df |>
+    dplyr::group_split(aa = dplyr::row_number()) |>
+    purrr::map_dfr(~df_add_idx_val(.x, idx, data)) |>
+    dplyr::select(-aa)
+
+
+  cli::cli_inform("fit kernel smoothing or non-linear least square ...")
   res <- switch(method,
-                ks = fit_ks(basis_df, idx = idx, bin_nobs_threshold, bin_size),
-                nls = fit_nls(basis_df, idx = idx, bin_nobs_threshold, bin_size)
+                ks = fit_ks(basis_df, idx = idx, bin_width = bin_width,
+                            bin_nobs_threshold = bin_nobs_threshold),
+                nls = fit_nls(basis_df, idx = idx, bin_width = bin_width,
+                              bin_nobs_threshold = bin_nobs_threshold)
                 )
-  list(basis_df = basis_df, measure = res)
+  tibble::tibble(basis_df = list(basis_df), measure = res) |>
+    unnest(measure)
 }
 
@@ -71,12 +89,14 @@ interp_bb_best <- function(bb, best, step = 0.02){
   tibble::tibble(basis = tt_mat, dist = dist)
 }
 
-fit_ks <- function(basis_df, idx, bin_nobs_threshold, bin_size){
-
+fit_ks <- function(basis_df, idx, bin_nobs_threshold, bin_width){
   if (nrow(basis_df) > bin_nobs_threshold){
-    cli::cli_abort("apply binning before fitting the kernel smoother with bin_size = {bin_size}")
+    cli::cli_inform("apply binning: bin_width = {bin_width}")
+    dist_bin <- ceiling(basis_df$dist / bin_width) * bin_width
     basis_df <- basis_df |>
-      dplyr::mutate(dist_bin = ceiling(dist / bin_size) * bin_size)
+      dplyr::bind_cols(dist_bin = dist_bin) |>
+      dplyr::group_by(dist_bin) |>
+      dplyr::summarise(!!rlang::sym(idx) := mean(!!rlang::sym(idx)))
   } else{
     basis_df <- basis_df |> dplyr::mutate(dist_bin = dist)
   }
@@ -90,22 +110,25 @@ fit_ks <- function(basis_df, idx, bin_nobs_threshold, bin_size){
 }
 
-fit_nls <- function(basis_df, idx, bin_nobs_threshold, bin_size){
+fit_nls <- function(basis_df, idx, bin_nobs_threshold, bin_width){
   if (nrow(basis_df) > bin_nobs_threshold){
-    cli::cli_abort("apply binning before fitting the kernel smoother with bin_size = {bin_size}")
+    cli::cli_inform("apply binning: bin_width = {bin_width}")
+    dist_bin <- ceiling(basis_df$dist / bin_width) * bin_width
     basis_df <- basis_df |>
-      dplyr::mutate(dist_bin = ceiling(dist / bin_size) * bin_size,
-                    dist_bin = dist_bin / pi * 180)
+      dplyr::bind_cols(dist_bin = dist_bin / pi * 180) |>
+      dplyr::group_by(dist_bin) |>
+      dplyr::summarise(idx := mean(!!rlang::sym(idx)))
   } else{
-    basis_df <- basis_df |> dplyr::mutate(dist_bin = dist / pi * 180)
+    dist_bin <- ceiling(basis_df$dist / bin_width) * bin_width
+    basis_df <- basis_df |>
+      dplyr::bind_cols(dist_bin = dist_bin / pi * 180) |>
+      dplyr::rename(idx = !!dplyr::sym(idx))
   }
   model = stats::nls(idx ~ theta1/(1 + exp(-theta2 + theta3 * dist_bin)),
-                     start = list(theta1 = 1, theta2 = 5, theta3 = 0.1))
+                     data = basis_df, start = list(theta1 = 1, theta2 = 5, theta3 = 0.1))
   theta_params <- stats::coef(model)
-  colnames(theta_params) <- paste0("theta", 1:length(theta_params))
-  tibble::tibble(idx = idx) |> dplyr::bind_cols(theta_params)
+  tibble::tibble(idx = idx) |> dplyr::bind_cols(tibble::as_tibble_row(theta_params))
 }
 
-globalVariables(c("dist", "dist_bin", "dist", "y", "x", "dy", "max_dev", "max_x"))
+globalVariables(c("dist", "dist_bin", "dist", "y", "x", "dy", "max_dev", "max_x", "aa", "measure"))
diff --git a/R/ferrn-package.R b/R/ferrn-package.R
index 69a1da2..84929bc 100644
--- a/R/ferrn-package.R
+++ b/R/ferrn-package.R
@@ -4,4 +4,5 @@
 #' @importFrom cli cli_abort
 #' @importFrom tidyr unnest
 #' @importFrom rlang `:=`
+#' @importFrom progress progress_bar
 "_PACKAGE"
diff --git a/man/data.Rd b/man/data.Rd
index b774310..e113886 100644
--- a/man/data.Rd
+++ b/man/data.Rd
@@ -7,7 +7,7 @@
 \alias{holes_1d_jellyfish}
 \alias{holes_2d_better}
 \alias{holes_2d_better_max_tries}
-\title{Data object collected during the projection pursuit optimisation}
+\title{Data objects collected during the projection pursuit optimisation}
 \format{
 An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 416 rows
 and 8 columns.
diff --git a/man/optim.Rd b/man/optim.Rd
index 711a7a0..780c253 100644
--- a/man/optim.Rd
+++ b/man/optim.Rd
@@ -13,21 +13,22 @@ calc_smoothness(
   d = 2,
   best = matrix(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1), nrow = 6),
   start_parms = c(0.001, 0.5, 2, 2),
-  other_gp_params = list(NULL)
+  other_gp_params = NULL
 )
 
 calc_squintability(
   idx,
   data = sine1000,
   method = c("ks", "nls"),
-  n_basis,
+  n_basis = 100,
   n = 6,
   d = 2,
   proj_dist_threshold = 1.5,
   step = 0.02,
   best = matrix(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1), nrow = 6),
   bin_nobs_threshold = 5000,
-  bin_size = 0.02
+  bin_width = 0.02,
+  sampling_seed = 123
 )
 }
 \arguments{
@@ -64,8 +65,10 @@ recommend between 0.01 and 0.1}
 number of observations for
 applying binning before fitting the kernel}
 
-\item{bin_size}{only for squintability, the bin size for binning the data
+\item{bin_width}{only for squintability, the bin size for binning the data
 before fitting the kernel}
+
+\item{sampling_seed}{the seed used for sampling the random basis}
 }
 \description{
 Function to calculate smoothness and squintability
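
For orientation only, the sketch below (not part of the patch) shows how the two exported functions might be called once these changes are applied. It assumes the patched ferrn is installed, that the bundled sine1000 data is available, and that the index name passed as idx, here "holes" from tourr, can be resolved by get(idx)() into an index function, which is how both functions look the index up internally; "holes" is an assumed example, not something the patch prescribes.

library(ferrn)
library(tourr)   # assumed provider of the "holes" index generator

# Smoothness: after the patch this returns a list with the sampled bases
# (basis_df), the GpGp fit now wrapped in a list (gp_res), and a one-row
# tibble of covariance parameters (measure).
sm <- calc_smoothness("holes", data = sine1000)
sm$measure

# Squintability: bin_size is renamed to bin_width, n_basis defaults to 100,
# and sampling_seed controls which random bases are drawn; the result is a
# tibble with the interpolated bases in a list column next to the fitted
# measure.
sq <- calc_squintability("holes", data = sine1000, method = "ks",
                         n_basis = 50, bin_width = 0.02, sampling_seed = 123)
sq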